LibRegex: Treat pattern string characters as unsigned

For example, consider the following pattern: new RegExp('\ud834\udf06', 'u'). With this pattern, the regex parser should insert the UTF-8 encoded bytes 0xf0, 0x9d, 0x8c, and 0x86. However, because these characters are currently treated as normal char types, they have a negative value, since they are all > 0x7f. Then, when these characters are cast to u64, sign extension copies the sign bit into all of the upper bits. The result is that these bytes are inserted as 0xfffffffffffffff0, 0xffffffffffffff9d, etc. Fortunately, there are only a few places where we insert bytecode with the raw characters. In these places, be sure to treat the bytes as u8 before they are cast to u64.
2025-07-25 20:37:35 +00:00 · 2021-08-20 10:22:23 -04:00 · 2021-08-20 10:22:23 -04:00 · 562d4e497b
commit 562d4e497b
parent 7c54b6bd45
3 changed files with 9 additions and 7 deletions
--- a/Userland/Libraries/LibRegex/RegexParser.cpp
+++ b/Userland/Libraries/LibRegex/RegexParser.cpp
@@ -145,9 +145,9 @@ ALWAYS_INLINE bool Parser::lookahead_any(StringView str)
    return false;
 }

-ALWAYS_INLINE char Parser::skip()
+ALWAYS_INLINE unsigned char Parser::skip()
 {
-    char ch;
+    unsigned char ch;
    if (m_parser_state.current_token.value().length() == 1) {
        ch = m_parser_state.current_token.value()[0];
    } else {
@@ -1287,7 +1287,7 @@ bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bo
        // Also part of AtomEscape.
        auto token = consume();
        match_length_minimum += 1;
-        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[1] } });
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u8)token.value()[1] } });
        return true;
    }
    if (try_skip("\\")) {
@@ -1326,7 +1326,7 @@ bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bo
        if (m_should_use_browser_extended_grammar) {
            auto token = consume();
            match_length_minimum += 1;
-            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[0] } });
+            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u8)token.value()[0] } });
            return true;
        } else {
            return false;
@@ -1336,7 +1336,7 @@ bool ECMA262Parser::parse_atom(ByteCode& stack, size_t& match_length_minimum, bo
    if (match_ordinary_characters()) {
        auto token = consume().value();
        match_length_minimum += 1;
-        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token[0] } });
+        stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u8)token[0] } });
        return true;
    }

@@ -1594,7 +1594,7 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
            // Allow all SourceCharacter's as escapes here.
            auto token = consume();
            match_length_minimum += 1;
-            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)token.value()[0] } });
+            stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (u8)token.value()[0] } });
            return true;
        }