mirror of
https://github.com/RGBCube/serenity
synced 2025-07-25 16:57:35 +00:00
LibJS: Unescape incorrectly escaped code units in regex patterns
We were translating the pattern [\⪾-\⫀] to [\\u2abe-\\u2ac0], which is a very different pattern. A code unit converted to the \uhhhh format has no meaning when escaped, so this commit makes us simply skip escaping it when translating the pattern.
This commit is contained in:
parent
35ff38aaea
commit
17087ac4a2
2 changed files with 22 additions and 3 deletions
|
@ -100,6 +100,7 @@ ErrorOr<DeprecatedString, ParseRegexPatternError> parse_regex_pattern(StringView
|
||||||
|
|
||||||
// If the Unicode flag is set, append each code point to the pattern. Otherwise, append each
|
// If the Unicode flag is set, append each code point to the pattern. Otherwise, append each
|
||||||
// code unit. But unlike the spec, multi-byte code units must be escaped for LibRegex to parse.
|
// code unit. But unlike the spec, multi-byte code units must be escaped for LibRegex to parse.
|
||||||
|
auto previous_code_unit_was_backslash = false;
|
||||||
for (size_t i = 0; i < utf16_pattern_view.length_in_code_units();) {
|
for (size_t i = 0; i < utf16_pattern_view.length_in_code_units();) {
|
||||||
if (unicode || unicode_sets) {
|
if (unicode || unicode_sets) {
|
||||||
auto code_point = code_point_at(utf16_pattern_view, i);
|
auto code_point = code_point_at(utf16_pattern_view, i);
|
||||||
|
@ -111,10 +112,22 @@ ErrorOr<DeprecatedString, ParseRegexPatternError> parse_regex_pattern(StringView
|
||||||
u16 code_unit = utf16_pattern_view.code_unit_at(i);
|
u16 code_unit = utf16_pattern_view.code_unit_at(i);
|
||||||
++i;
|
++i;
|
||||||
|
|
||||||
if (code_unit > 0x7f)
|
if (code_unit > 0x7f) {
|
||||||
builder.appendff("\\u{:04x}", code_unit);
|
// Incorrectly escaping this code unit will result in a wildly different regex than intended
|
||||||
else
|
// as we're converting <c> to <\uhhhh>, which would turn into <\\uhhhh> if (incorrectly) escaped again,
|
||||||
|
// leading to a matcher for the literal string "\uhhhh" instead of the intended code unit <c>.
|
||||||
|
// As such, we're going to remove the (invalid) backslash and pretend it never existed.
|
||||||
|
if (!previous_code_unit_was_backslash)
|
||||||
|
builder.append('\\');
|
||||||
|
builder.appendff("u{:04x}", code_unit);
|
||||||
|
} else {
|
||||||
builder.append_code_point(code_unit);
|
builder.append_code_point(code_unit);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (code_unit == '\\')
|
||||||
|
previous_code_unit_was_backslash = !previous_code_unit_was_backslash;
|
||||||
|
else
|
||||||
|
previous_code_unit_was_backslash = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
return builder.to_deprecated_string();
|
return builder.to_deprecated_string();
|
||||||
|
|
|
@ -60,3 +60,9 @@ test("regexp literals are re-useable", () => {
|
||||||
expect(re.test("test")).toBeTrue();
|
expect(re.test("test")).toBeTrue();
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
test("Incorrectly escaped code units not converted to invalid patterns", () => {
|
||||||
|
const re = /[\⪾-\⫀]/;
|
||||||
|
expect(re.test("⫀")).toBeTrue();
|
||||||
|
expect(re.test("\\u2abe")).toBeFalse(); // ⪾ is \u2abe
|
||||||
|
});
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue