1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-23 13:57:35 +00:00

LibRegex: Make '.' reject matching CR / LF / LS / PS as per the ECMA262 spec

Previously '.' was allowed to match some of these line terminators, but
the ECMA262 spec disallows matching any of them (except in DotAll mode).
This commit is contained in:
Ali Mohammad Pur 2023-02-15 09:55:43 +03:30 committed by Andreas Kling
parent 1e022295c4
commit 936a9fd759
4 changed files with 59 additions and 43 deletions

View file

@ -697,6 +697,7 @@ TEST_CASE(ECMA262_match)
{ "^[a-sy-z]$"sv, "b"sv, true, ECMAScriptFlags::Insensitive },
{ "^[a-sy-z]$"sv, "y"sv, true, ECMAScriptFlags::Insensitive },
{ "^[a-sy-z]$"sv, "u"sv, false, ECMAScriptFlags::Insensitive },
{ "."sv, "\n\r\u2028\u2029"sv, false }, // Dot should not match any of CR/LF/LS/PS in ECMA262 mode without DotAll.
};
// clang-format on

View file

@ -50,5 +50,6 @@ enum __RegexAllFlags {
__Regex_Internal_Stateful = __Regex_Global << 16, // Internal flag; enables stateful matches.
__Regex_Internal_BrowserExtended = __Regex_Global << 17, // Internal flag; enable browser-specific ECMA262 extensions.
__Regex_Internal_ConsiderNewline = __Regex_Global << 18, // Internal flag; allow matchers to consider newlines as line separators.
__Regex_Last = __Regex_UnicodeSets,
__Regex_Internal_ECMA262DotSemantics = __Regex_Global << 19, // Internal flag; use ECMA262 semantics for dot ('.') - disallow CR/LF/LS/PS instead of just CR.
__Regex_Last = __Regex_Internal_ECMA262DotSemantics,
};

View file

@ -499,8 +499,18 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
if (input.view.length() <= state.string_position)
return ExecutionResult::Failed_ExecuteLowPrioForks;
// U+2028 LINE SEPARATOR
constexpr static u32 const LineSeparator { 0x2028 };
// U+2029 PARAGRAPH SEPARATOR
constexpr static u32 const ParagraphSeparator { 0x2029 };
auto input_view = input.view.substring_view(state.string_position, 1)[0];
if (input_view != '\n' || (input.regex_options.has_flag_set(AllFlags::SingleLine) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline))) {
auto is_equivalent_to_newline = input_view == '\n'
|| (input.regex_options.has_flag_set(AllFlags::Internal_ECMA262DotSemantics)
? (input_view == '\r' || input_view == LineSeparator || input_view == ParagraphSeparator)
: false);
if (!is_equivalent_to_newline || (input.regex_options.has_flag_set(AllFlags::SingleLine) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline))) {
if (current_inversion_state())
inverse_matched = true;
else

View file

@ -19,6 +19,7 @@ namespace regex {
using FlagsUnderlyingType = u32;
enum class AllFlags {
Default = 0,
Global = __Regex_Global, // All matches (don't return after first match)
Insensitive = __Regex_Insensitive, // Case insensitive match (ignores case of [a-zA-Z])
Ungreedy = __Regex_Ungreedy, // The match becomes lazy by default. Now a ? following a quantifier makes it greedy
@ -38,10 +39,12 @@ enum class AllFlags {
Internal_Stateful = __Regex_Internal_Stateful, // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off.
Internal_BrowserExtended = __Regex_Internal_BrowserExtended, // Only for ECMA262, Enable the behaviors defined in section B.1.4. of the ECMA262 spec.
Internal_ConsiderNewline = __Regex_Internal_ConsiderNewline, // Only for ECMA262, Allow multiline matches to consider newlines as line boundaries.
Internal_ECMA262DotSemantics = __Regex_Internal_ECMA262DotSemantics, // Use ECMA262 dot semantics: disallow matching CR/LF/LS/PS instead of just CR.
Last = Internal_BrowserExtended,
};
enum class PosixFlags : FlagsUnderlyingType {
Default = 0,
Global = (FlagsUnderlyingType)AllFlags::Global,
Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive,
Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy,
@ -58,6 +61,7 @@ enum class PosixFlags : FlagsUnderlyingType {
};
enum class ECMAScriptFlags : FlagsUnderlyingType {
Default = (FlagsUnderlyingType)AllFlags::Internal_ECMA262DotSemantics,
Global = (FlagsUnderlyingType)AllFlags::Global | (FlagsUnderlyingType)AllFlags::Internal_Stateful, // Note: ECMAScript "Global" creates a stateful regex.
Insensitive = (FlagsUnderlyingType)AllFlags::Insensitive,
Ungreedy = (FlagsUnderlyingType)AllFlags::Ungreedy,
@ -80,13 +84,13 @@ public:
RegexOptions() = default;
constexpr RegexOptions(T flags)
: m_flags(flags)
: m_flags(static_cast<T>(to_underlying(flags) | to_underlying(T::Default)))
{
}
template<class U>
constexpr RegexOptions(RegexOptions<U> other)
: m_flags((T) static_cast<FlagsUnderlyingType>(other.value()))
: RegexOptions(static_cast<T>(to_underlying(other.value())))
{
}
@ -115,7 +119,7 @@ public:
T value() const { return m_flags; }
private:
T m_flags { 0 };
T m_flags { T::Default };
};
template<class T>