From 17087ac4a265356e7f635bcaa484b49d631f6672 Mon Sep 17 00:00:00 2001 From: Ali Mohammad Pur Date: Sat, 16 Sep 2023 16:03:54 +0330 Subject: [PATCH] LibJS: Unescape incorrectly escaped code units in regex patterns MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit We were translating the pattern [\⪾-\⫀] to [\\u2abe-\\u2ac0], which is a very different pattern; as a code unit converted to the \uhhhh format has no meaning when escaped, this commit makes us simply skip escaping it when translating the pattern. --- .../Libraries/LibJS/Runtime/RegExpObject.cpp | 19 ++++++++++++++++--- .../LibJS/Tests/builtins/RegExp/RegExp.js | 6 ++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp b/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp index 6d41cdf058..1ff6fc087e 100644 --- a/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp +++ b/Userland/Libraries/LibJS/Runtime/RegExpObject.cpp @@ -100,6 +100,7 @@ ErrorOr<DeprecatedString, ParseRegexPatternError> parse_regex_pattern(StringView // If the Unicode flag is set, append each code point to the pattern. Otherwise, append each // code unit. But unlike the spec, multi-byte code units must be escaped for LibRegex to parse. + auto previous_code_unit_was_backslash = false; for (size_t i = 0; i < utf16_pattern_view.length_in_code_units();) { if (unicode || unicode_sets) { auto code_point = code_point_at(utf16_pattern_view, i); @@ -111,10 +112,22 @@ ErrorOr<DeprecatedString, ParseRegexPatternError> parse_regex_pattern(StringView u16 code_unit = utf16_pattern_view.code_unit_at(i); ++i; - if (code_unit > 0x7f) - builder.appendff("\\u{:04x}", code_unit); - else + if (code_unit > 0x7f) { + // Incorrectly escaping this code unit will result in a wildly different regex than intended + // as we're converting <c> to <\uhhhh>, which would turn into <\\uhhhh> if (incorrectly) escaped again, + // leading to a matcher for the literal string "\uhhhh" instead of the intended code unit <c>.
+ // As such, we're going to remove the (invalid) backslash and pretend it never existed. + if (!previous_code_unit_was_backslash) + builder.append('\\'); + builder.appendff("u{:04x}", code_unit); + } else { builder.append_code_point(code_unit); + } + + if (code_unit == '\\') + previous_code_unit_was_backslash = !previous_code_unit_was_backslash; + else + previous_code_unit_was_backslash = false; } return builder.to_deprecated_string(); diff --git a/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.js b/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.js index ccd2509b6e..0aa6e95ef1 100644 --- a/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.js +++ b/Userland/Libraries/LibJS/Tests/builtins/RegExp/RegExp.js @@ -60,3 +60,9 @@ test("regexp literals are re-useable", () => { expect(re.test("test")).toBeTrue(); } }); + +test("Incorrectly escaped code units not converted to invalid patterns", () => { + const re = /[\⪾-\⫀]/; + expect(re.test("⫀")).toBeTrue(); + expect(re.test("\\u2abe")).toBeFalse(); // ⪾ is \u2abe +});