diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index b8a0a3201e..5d9672faba 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -648,7 +648,7 @@ TEST_CASE(ECMA262_match) { "^[\\0-\\x1f]$"sv, "\n"sv }, { .pattern = "\\bhello\\B"sv, .subject = "hello1"sv, .options = ECMAScriptFlags::Global }, { "\\b.*\\b"sv, "hello1"sv }, - { "[^\\D\\S]{2}"sv, "1 "sv }, + { "[^\\D\\S]{2}"sv, "1 "sv, false }, { "bar(?=f.)foo"sv, "barfoo"sv }, { "bar(?=foo)bar"sv, "barbar"sv, false }, { "bar(?!foo)bar"sv, "barbar"sv, true }, @@ -1174,6 +1174,14 @@ TEST_CASE(inversion_state_in_char_class) EXPECT_EQ(result.capture_group_matches.first()[0].view.to_byte_string(), "slideNumbers"sv); EXPECT_EQ(result.capture_group_matches.first()[1].view.to_byte_string(), "}"sv); } + { + // #21786, /[^\S\n]/.exec("\n") should be null, not [ "\n" ]. + // This was a general confusion between the inversion state and the negation state (temp inverse). + Regex re("[^\\S\\n]", ECMAScriptFlags::Global | (ECMAScriptFlags)regex::AllFlags::SingleMatch); + + auto result = re.match("\n"sv); + EXPECT_EQ(result.success, false); + } } TEST_CASE(mismatching_brackets) diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index 77467d7b93..c2205d39ab 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -426,6 +426,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M bool active { false }; bool is_conjunction { false }; bool fail { false }; + bool inverse_matched { false }; size_t initial_position; size_t initial_code_unit_position; Optional last_accepted_position {}; @@ -623,8 +624,9 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M case CharacterCompareType::And: disjunction_states.append({ .active = true, - .is_conjunction = false, - .fail = false, + .is_conjunction = current_inversion_state(), + .fail = current_inversion_state(), + .inverse_matched = current_inversion_state(), .initial_position = state.string_position, .initial_code_unit_position = state.string_position_in_code_units, }); @@ -632,8 +634,9 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M case CharacterCompareType::Or: disjunction_states.append({ .active = true, - .is_conjunction = true, - .fail = true, + .is_conjunction = !current_inversion_state(), + .fail = !current_inversion_state(), + .inverse_matched = !current_inversion_state(), .initial_position = state.string_position, .initial_code_unit_position = state.string_position_in_code_units, }); @@ -644,6 +647,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M state.string_position = disjunction_state.last_accepted_position.value_or(disjunction_state.initial_position); state.string_position_in_code_units = disjunction_state.last_accepted_code_unit_position.value_or(disjunction_state.initial_code_unit_position); } + inverse_matched = disjunction_state.inverse_matched || disjunction_state.fail; break; } default: @@ -664,6 +668,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M if (!failed) { new_disjunction_state.last_accepted_position = state.string_position; new_disjunction_state.last_accepted_code_unit_position = state.string_position_in_code_units; + new_disjunction_state.inverse_matched |= inverse_matched; } if (new_disjunction_state.is_conjunction) @@ -673,6 +678,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M state.string_position = new_disjunction_state.initial_position; state.string_position_in_code_units = new_disjunction_state.initial_code_unit_position; + inverse_matched = false; } } diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index ae97620623..64fbbb7889 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -1777,10 +1777,12 @@ bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_ Vector compares; + auto uses_explicit_or_semantics = false; if (match(TokenType::Circumflex)) { // Negated charclass consume(); compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 }); + uses_explicit_or_semantics = true; } // ClassContents :: [empty] @@ -1800,6 +1802,11 @@ bool ECMA262Parser::parse_character_class(ByteCode& stack, size_t& match_length_ if (flags.unicode_sets && !parse_class_set_expression(compares)) return false; + if (uses_explicit_or_semantics && compares.size() > 2) { + compares.insert(1, CompareTypeAndValuePair { CharacterCompareType::Or, 0 }); + compares.empend(CompareTypeAndValuePair { CharacterCompareType::EndAndOr, 0 }); + } + match_length_minimum += 1; stack.insert_bytecode_compare_values(move(compares)); return true; @@ -2466,9 +2473,9 @@ DeprecatedFlyString ECMA262Parser::read_capture_group_specifier(bool take_starti { static auto id_start_category = Unicode::property_from_string("ID_Start"sv); static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv); - static constexpr const u32 REPLACEMENT_CHARACTER = 0xFFFD; - constexpr const u32 ZERO_WIDTH_NON_JOINER { 0x200C }; - constexpr const u32 ZERO_WIDTH_JOINER { 0x200D }; + static constexpr u32 const REPLACEMENT_CHARACTER = 0xFFFD; + constexpr u32 const ZERO_WIDTH_NON_JOINER { 0x200C }; + constexpr u32 const ZERO_WIDTH_JOINER { 0x200D }; if (take_starting_angle_bracket && !consume("<")) return {};