From 48cb15283a3e3a1378e6874dc5cf669edffaa17f Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Mon, 29 Aug 2022 11:05:12 -0400 Subject: [PATCH] LibRegex: Explicitly check if a character falls into a table-based range Previously, for a regex such as /[a-sy-z]/i, we would incorrectly think the character "u" fell into the range "a-s" because neither of the conditions "u > s && U > s" or "u < a && U < a" would be true, resulting in the lookup falling back to assuming the character is in the range. Instead, first explicitly check if the character falls into the range, rather than checking if it falls outside the range. If the explicit checks fail, then we know the character is outside the range. --- Tests/LibRegex/Regex.cpp | 5 ++++- Userland/Libraries/LibRegex/RegexByteCode.cpp | 11 +++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index daeb623c43..2b4da92c98 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -690,7 +690,10 @@ TEST_CASE(ECMA262_match) { "a|$"sv, "x"sv, true, (ECMAScriptFlags)regex::AllFlags::Global }, // #11940, Global (not the 'g' flag) regexps should attempt to match the zero-length end of the string too. { "foo\nbar"sv, "foo\nbar"sv, true }, // #12126, ECMA262 regexp should match literal newlines without the 's' flag. { "foo[^]bar"sv, "foo\nbar"sv, true }, // #12126, ECMA262 regexp should match newline with [^]. - { "^[_A-Z]+$"sv, "_aA"sv, true, ECMAScriptFlags::Insensitive } // Insensitive lookup table: characters in a range do not necessarily lie in the same range after being converted to lowercase. + { "^[_A-Z]+$"sv, "_aA"sv, true, ECMAScriptFlags::Insensitive }, // Insensitive lookup table: characters in a range do not necessarily lie in the same range after being converted to lowercase. + { "^[a-sy-z]$"sv, "b"sv, true, ECMAScriptFlags::Insensitive }, + { "^[a-sy-z]$"sv, "y"sv, true, ECMAScriptFlags::Insensitive }, + { "^[a-sy-z]$"sv, "u"sv, false, ECMAScriptFlags::Insensitive }, }; // clang-format on diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index ace32620fc..e0c0ca4a27 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -557,11 +557,14 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M upper_case_needle = to_ascii_uppercase(needle); lower_case_needle = to_ascii_lowercase(needle); } - if (lower_case_needle > range.to && upper_case_needle > range.to) + + if (lower_case_needle >= range.from && lower_case_needle <= range.to) + return 0; + if (upper_case_needle >= range.from && upper_case_needle <= range.to) + return 0; + if (lower_case_needle > range.to || upper_case_needle > range.to) return 1; - if (lower_case_needle < range.from && upper_case_needle < range.from) - return -1; - return 0; + return -1; }); if (matching_range) {