LibRegex: Support ECMA-262 Unicode escapes of the form "\u{code_point}"

When the Unicode flag is set, regular expressions may escape a code point by surrounding its hexadecimal value with curly braces, e.g. \u{41} is the character "A". When the Unicode flag is not set, the same sequence should instead be parsed as the character "u" followed by a repetition quantifier - \u{41} is the character "u" repeated 41 times. Handling of the non-Unicode case is left as a TODO for now.
2025-09-16 16:26:16 +00:00 · 2021-07-22 09:25:58 -04:00 · 2021-07-22 09:25:58 -04:00 · 345ef6abba
commit 345ef6abba
parent 0e6375558d
2 changed files with 27 additions and 0 deletions
--- a/Tests/LibRegex/Regex.cpp
+++ b/Tests/LibRegex/Regex.cpp
@ -510,6 +510,11 @@ TEST_CASE(ECMA262_parse)
        { "\\uxxxx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
        { "\\ud83d", regex::Error::NoError, ECMAScriptFlags::Unicode },
        { "\\ud83d\\uxxxx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
+        { "\\u{0}", regex::Error::NoError, ECMAScriptFlags::Unicode },
+        { "\\u{10ffff}", regex::Error::NoError, ECMAScriptFlags::Unicode },
+        { "\\u{10ffff", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
+        { "\\u{10ffffx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
+        { "\\u{110000}", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
    };

    for (auto& test : tests) {
@ -605,6 +610,7 @@ TEST_CASE(ECMA262_unicode_match)
        { "\\ude00", "😀", false, ECMAScriptFlags::Unicode },
        { "\\ud83d\\ude00", "😀", true },
        { "\\ud83d\\ude00", "😀", true, ECMAScriptFlags::Unicode },
+        { "\\u{1f600}", "😀", true, ECMAScriptFlags::Unicode },
        { "\\ud83d\\ud83d", "\xed\xa0\xbd\xed\xa0\xbd", true },
        { "\\ud83d\\ud83d", "\xed\xa0\xbd\xed\xa0\xbd", true, ECMAScriptFlags::Unicode },
    };