diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index d35d6e722b..4776528e13 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -697,6 +697,7 @@ TEST_CASE(ECMA262_match) { "^[a-sy-z]$"sv, "b"sv, true, ECMAScriptFlags::Insensitive }, { "^[a-sy-z]$"sv, "y"sv, true, ECMAScriptFlags::Insensitive }, { "^[a-sy-z]$"sv, "u"sv, false, ECMAScriptFlags::Insensitive }, + { "."sv, "\n\r\u2028\u2029"sv, false }, // Dot should not match any of CR/LF/LS/PS in ECMA262 mode without DotAll. }; // clang-format on diff --git a/Userland/Libraries/LibC/bits/regex_defs.h b/Userland/Libraries/LibC/bits/regex_defs.h index db1e52c13b..497a4cc231 100644 --- a/Userland/Libraries/LibC/bits/regex_defs.h +++ b/Userland/Libraries/LibC/bits/regex_defs.h @@ -31,24 +31,25 @@ enum __Regex_Error { }; enum __RegexAllFlags { - __Regex_Global = 1, // All matches (don't return after first match) - __Regex_Insensitive = __Regex_Global << 1, // Case insensitive match (ignores case of [a-zA-Z]) - __Regex_Ungreedy = __Regex_Global << 2, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy - __Regex_Unicode = __Regex_Global << 3, // Enable all unicode features and interpret all unicode escape sequences as such - __Regex_Extended = __Regex_Global << 4, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored - __Regex_Extra = __Regex_Global << 5, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted - __Regex_MatchNotBeginOfLine = __Regex_Global << 6, // Pattern is not forced to ^ -> search in whole string! - __Regex_MatchNotEndOfLine = __Regex_Global << 7, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. 
This option is ignored if the Multiline-flag is set - __Regex_SkipSubExprResults = __Regex_Global << 8, // Do not return sub expressions in the result - __Regex_StringCopyMatches = __Regex_Global << 9, // Do explicitly copy results into new allocated string instead of StringView to original string. - __Regex_SingleLine = __Regex_Global << 10, // Dot matches newline characters - __Regex_Sticky = __Regex_Global << 11, // Force the pattern to only match consecutive matches from where the previous match ended. - __Regex_Multiline = __Regex_Global << 12, // Handle newline characters. Match each line, one by one. - __Regex_SkipTrimEmptyMatches = __Regex_Global << 13, // Do not remove empty capture group results. - __Regex_SingleMatch = __Regex_Global << 14, // Stop after acquiring a single match. - __Regex_UnicodeSets = __Regex_Global << 15, // ECMA262 Parser specific: Allow set operations in char classes. - __Regex_Internal_Stateful = __Regex_Global << 16, // Internal flag; enables stateful matches. - __Regex_Internal_BrowserExtended = __Regex_Global << 17, // Internal flag; enable browser-specific ECMA262 extensions. - __Regex_Internal_ConsiderNewline = __Regex_Global << 18, // Internal flag; allow matchers to consider newlines as line separators. - __Regex_Last = __Regex_UnicodeSets, + __Regex_Global = 1, // All matches (don't return after first match) + __Regex_Insensitive = __Regex_Global << 1, // Case insensitive match (ignores case of [a-zA-Z]) + __Regex_Ungreedy = __Regex_Global << 2, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy + __Regex_Unicode = __Regex_Global << 3, // Enable all unicode features and interpret all unicode escape sequences as such + __Regex_Extended = __Regex_Global << 4, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored + __Regex_Extra = __Regex_Global << 5, // Disallow meaningless escapes. 
A \ followed by a letter with no special meaning is faulted + __Regex_MatchNotBeginOfLine = __Regex_Global << 6, // Pattern is not forced to ^ -> search in whole string! + __Regex_MatchNotEndOfLine = __Regex_Global << 7, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set + __Regex_SkipSubExprResults = __Regex_Global << 8, // Do not return sub expressions in the result + __Regex_StringCopyMatches = __Regex_Global << 9, // Do explicitly copy results into new allocated string instead of StringView to original string. + __Regex_SingleLine = __Regex_Global << 10, // Dot matches newline characters + __Regex_Sticky = __Regex_Global << 11, // Force the pattern to only match consecutive matches from where the previous match ended. + __Regex_Multiline = __Regex_Global << 12, // Handle newline characters. Match each line, one by one. + __Regex_SkipTrimEmptyMatches = __Regex_Global << 13, // Do not remove empty capture group results. + __Regex_SingleMatch = __Regex_Global << 14, // Stop after acquiring a single match. + __Regex_UnicodeSets = __Regex_Global << 15, // ECMA262 Parser specific: Allow set operations in char classes. + __Regex_Internal_Stateful = __Regex_Global << 16, // Internal flag; enables stateful matches. + __Regex_Internal_BrowserExtended = __Regex_Global << 17, // Internal flag; enable browser-specific ECMA262 extensions. + __Regex_Internal_ConsiderNewline = __Regex_Global << 18, // Internal flag; allow matchers to consider newlines as line separators. + __Regex_Internal_ECMA262DotSemantics = __Regex_Global << 19, // Internal flag; use ECMA262 semantics for dot ('.') - disallow CR/LF/LS/PS instead of just LF. 
+ __Regex_Last = __Regex_Internal_ECMA262DotSemantics, }; diff --git a/Userland/Libraries/LibRegex/RegexByteCode.cpp b/Userland/Libraries/LibRegex/RegexByteCode.cpp index 07e22f5ae8..6bc1ee1f11 100644 --- a/Userland/Libraries/LibRegex/RegexByteCode.cpp +++ b/Userland/Libraries/LibRegex/RegexByteCode.cpp @@ -499,8 +499,18 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M if (input.view.length() <= state.string_position) return ExecutionResult::Failed_ExecuteLowPrioForks; + // U+2028 LINE SEPARATOR + constexpr static u32 const LineSeparator { 0x2028 }; + // U+2029 PARAGRAPH SEPARATOR + constexpr static u32 const ParagraphSeparator { 0x2029 }; + auto input_view = input.view.substring_view(state.string_position, 1)[0]; - if (input_view != '\n' || (input.regex_options.has_flag_set(AllFlags::SingleLine) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline))) { + auto is_equivalent_to_newline = input_view == '\n' + || (input.regex_options.has_flag_set(AllFlags::Internal_ECMA262DotSemantics) + ? (input_view == '\r' || input_view == LineSeparator || input_view == ParagraphSeparator) + : false); + + if (!is_equivalent_to_newline || (input.regex_options.has_flag_set(AllFlags::SingleLine) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline))) { if (current_inversion_state()) inverse_matched = true; else diff --git a/Userland/Libraries/LibRegex/RegexOptions.h b/Userland/Libraries/LibRegex/RegexOptions.h index faffa51cd0..c6a6291fb1 100644 --- a/Userland/Libraries/LibRegex/RegexOptions.h +++ b/Userland/Libraries/LibRegex/RegexOptions.h @@ -19,29 +19,32 @@ namespace regex { using FlagsUnderlyingType = u32; enum class AllFlags { - Global = __Regex_Global, // All matches (don't return after first match) - Insensitive = __Regex_Insensitive, // Case insensitive match (ignores case of [a-zA-Z]) - Ungreedy = __Regex_Ungreedy, // The match becomes lazy by default. Now a ? 
following a quantifier makes it greedy - Unicode = __Regex_Unicode, // Enable all unicode features and interpret all unicode escape sequences as such - Extended = __Regex_Extended, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored - Extra = __Regex_Extra, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted - MatchNotBeginOfLine = __Regex_MatchNotBeginOfLine, // Pattern is not forced to ^ -> search in whole string! - MatchNotEndOfLine = __Regex_MatchNotEndOfLine, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set - SkipSubExprResults = __Regex_SkipSubExprResults, // Do not return sub expressions in the result - StringCopyMatches = __Regex_StringCopyMatches, // Do explicitly copy results into new allocated string instead of StringView to original string. - SingleLine = __Regex_SingleLine, // Dot matches newline characters - Sticky = __Regex_Sticky, // Force the pattern to only match consecutive matches from where the previous match ended. - Multiline = __Regex_Multiline, // Handle newline characters. Match each line, one by one. - SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches, // Do not remove empty capture group results. - SingleMatch = __Regex_SingleMatch, // Stop after acquiring a single match. - UnicodeSets = __Regex_UnicodeSets, // Only for ECMA262, Allow set operations in character classes. - Internal_Stateful = __Regex_Internal_Stateful, // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off. - Internal_BrowserExtended = __Regex_Internal_BrowserExtended, // Only for ECMA262, Enable the behaviors defined in section B.1.4. of the ECMA262 spec. - Internal_ConsiderNewline = __Regex_Internal_ConsiderNewline, // Only for ECMA262, Allow multiline matches to consider newlines as line boundaries. 
+ Default = 0, + Global = __Regex_Global, // All matches (don't return after first match) + Insensitive = __Regex_Insensitive, // Case insensitive match (ignores case of [a-zA-Z]) + Ungreedy = __Regex_Ungreedy, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy + Unicode = __Regex_Unicode, // Enable all unicode features and interpret all unicode escape sequences as such + Extended = __Regex_Extended, // Ignore whitespaces. Spaces and text after a # in the pattern are ignored + Extra = __Regex_Extra, // Disallow meaningless escapes. A \ followed by a letter with no special meaning is faulted + MatchNotBeginOfLine = __Regex_MatchNotBeginOfLine, // Pattern is not forced to ^ -> search in whole string! + MatchNotEndOfLine = __Regex_MatchNotEndOfLine, // Don't Force the dollar sign, $, to always match end of the string, instead of end of the line. This option is ignored if the Multiline-flag is set + SkipSubExprResults = __Regex_SkipSubExprResults, // Do not return sub expressions in the result + StringCopyMatches = __Regex_StringCopyMatches, // Do explicitly copy results into new allocated string instead of StringView to original string. + SingleLine = __Regex_SingleLine, // Dot matches newline characters + Sticky = __Regex_Sticky, // Force the pattern to only match consecutive matches from where the previous match ended. + Multiline = __Regex_Multiline, // Handle newline characters. Match each line, one by one. + SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches, // Do not remove empty capture group results. + SingleMatch = __Regex_SingleMatch, // Stop after acquiring a single match. + UnicodeSets = __Regex_UnicodeSets, // Only for ECMA262, Allow set operations in character classes. + Internal_Stateful = __Regex_Internal_Stateful, // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off. 
+ Internal_BrowserExtended = __Regex_Internal_BrowserExtended, // Only for ECMA262, Enable the behaviors defined in section B.1.4. of the ECMA262 spec. + Internal_ConsiderNewline = __Regex_Internal_ConsiderNewline, // Only for ECMA262, Allow multiline matches to consider newlines as line boundaries. + Internal_ECMA262DotSemantics = __Regex_Internal_ECMA262DotSemantics, // Use ECMA262 dot semantics: disallow matching CR/LF/LS/PS instead of just LF. Last = Internal_BrowserExtended, }; enum class PosixFlags : FlagsUnderlyingType { + Default = 0, Global = (FlagsUnderlyingType)AllFlags::Global, Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive, Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy, @@ -58,6 +61,7 @@ enum class PosixFlags : FlagsUnderlyingType { }; enum class ECMAScriptFlags : FlagsUnderlyingType { + Default = (FlagsUnderlyingType)AllFlags::Internal_ECMA262DotSemantics, Global = (FlagsUnderlyingType)AllFlags::Global | (FlagsUnderlyingType)AllFlags::Internal_Stateful, // Note: ECMAScript "Global" creates a stateful regex. Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive, Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy, @@ -80,13 +84,13 @@ public: RegexOptions() = default; constexpr RegexOptions(T flags) - : m_flags(flags) + : m_flags(static_cast(to_underlying(flags) | to_underlying(T::Default))) { } template constexpr RegexOptions(RegexOptions other) - : m_flags((T) static_cast(other.value())) + : RegexOptions(static_cast(to_underlying(other.value()))) { } @@ -115,7 +119,7 @@ public: T value() const { return m_flags; } private: - T m_flags { 0 }; + T m_flags { T::Default }; }; template