mirror of
https://github.com/RGBCube/serenity
synced 2025-07-24 15:47:42 +00:00
LibRegex: Implement ECMA262 multiline matching without splitting lines
As ECMA262 regex allows `[^]` and literal newlines to match newlines in the input string, we shouldn't split the input string into lines; instead, we simply make boundaries and catch-all patterns capable of checking for these conditions specifically.
This commit is contained in:
parent
98183ef572
commit
5fac41f733
7 changed files with 55 additions and 20 deletions
|
@ -684,6 +684,8 @@ TEST_CASE(ECMA262_match)
|
||||||
{ "(\0|a)"sv, "a"sv, true }, // #9686, Should allow null bytes in pattern
|
{ "(\0|a)"sv, "a"sv, true }, // #9686, Should allow null bytes in pattern
|
||||||
{ "(.*?)a(?!(a+)b\\2c)\\2(.*)"sv, "baaabaac"sv, true }, // #6042, Groups inside lookarounds may be referenced outside, but their contents appear empty if the pattern in the lookaround fails.
|
{ "(.*?)a(?!(a+)b\\2c)\\2(.*)"sv, "baaabaac"sv, true }, // #6042, Groups inside lookarounds may be referenced outside, but their contents appear empty if the pattern in the lookaround fails.
|
||||||
{ "a|$"sv, "x"sv, true, (ECMAScriptFlags)regex::AllFlags::Global }, // #11940, Global (not the 'g' flag) regexps should attempt to match the zero-length end of the string too.
|
{ "a|$"sv, "x"sv, true, (ECMAScriptFlags)regex::AllFlags::Global }, // #11940, Global (not the 'g' flag) regexps should attempt to match the zero-length end of the string too.
|
||||||
|
{ "foo\nbar"sv, "foo\nbar"sv, true }, // #12126, ECMA262 regexp should match literal newlines without the 's' flag.
|
||||||
|
{ "foo[^]bar"sv, "foo\nbar"sv, true }, // #12126, ECMA262 regexp should match newline with [^].
|
||||||
};
|
};
|
||||||
// clang-format on
|
// clang-format on
|
||||||
|
|
||||||
|
|
|
@ -83,6 +83,7 @@ enum __RegexAllFlags {
|
||||||
__Regex_SkipTrimEmptyMatches = __Regex_Global << 13, // Do not remove empty capture group results.
|
__Regex_SkipTrimEmptyMatches = __Regex_Global << 13, // Do not remove empty capture group results.
|
||||||
__Regex_Internal_Stateful = __Regex_Global << 14, // Internal flag; enables stateful matches.
|
__Regex_Internal_Stateful = __Regex_Global << 14, // Internal flag; enables stateful matches.
|
||||||
__Regex_Internal_BrowserExtended = __Regex_Global << 15, // Internal flag; enable browser-specific ECMA262 extensions.
|
__Regex_Internal_BrowserExtended = __Regex_Global << 15, // Internal flag; enable browser-specific ECMA262 extensions.
|
||||||
|
__Regex_Internal_ConsiderNewline = __Regex_Global << 16, // Internal flag; allow matchers to consider newlines as line separators.
|
||||||
__Regex_Last = __Regex_SkipTrimEmptyMatches
|
__Regex_Last = __Regex_SkipTrimEmptyMatches
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -97,7 +98,6 @@ enum __RegexAllFlags {
|
||||||
#define REG_NOTBOL __Regex_MatchNotBeginOfLine // The circumflex character (^), when taken as a special character, will not match the beginning of string.
|
#define REG_NOTBOL __Regex_MatchNotBeginOfLine // The circumflex character (^), when taken as a special character, will not match the beginning of string.
|
||||||
#define REG_NOTEOL __Regex_MatchNotEndOfLine // The dollar sign ($), when taken as a special character, will not match the end of string.
|
#define REG_NOTEOL __Regex_MatchNotEndOfLine // The dollar sign ($), when taken as a special character, will not match the end of string.
|
||||||
|
|
||||||
//static_assert (sizeof(FlagsUnderlyingType) * 8 >= regex::POSIXFlags::Last << 1), "flags type too small")
|
|
||||||
#define REG_SEARCH __Regex_Last << 1
|
#define REG_SEARCH __Regex_Last << 1
|
||||||
|
|
||||||
int regcomp(regex_t*, const char*, int);
|
int regcomp(regex_t*, const char*, int);
|
||||||
|
|
|
@ -273,12 +273,23 @@ ALWAYS_INLINE ExecutionResult OpCode_ForkReplaceStay::execute(MatchInput const&
|
||||||
|
|
||||||
ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(MatchInput const& input, MatchState& state) const
|
ALWAYS_INLINE ExecutionResult OpCode_CheckBegin::execute(MatchInput const& input, MatchState& state) const
|
||||||
{
|
{
|
||||||
if (0 == state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
|
auto is_at_line_boundary = [&] {
|
||||||
|
if (state.string_position == 0)
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (input.regex_options.has_flag_set(AllFlags::Multiline) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline)) {
|
||||||
|
auto input_view = input.view.substring_view(state.string_position - 1, 1)[0];
|
||||||
|
return input_view == '\n';
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}();
|
||||||
|
if (is_at_line_boundary && (input.regex_options & AllFlags::MatchNotBeginOfLine))
|
||||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||||
|
|
||||||
if ((0 == state.string_position && !(input.regex_options & AllFlags::MatchNotBeginOfLine))
|
if ((is_at_line_boundary && !(input.regex_options & AllFlags::MatchNotBeginOfLine))
|
||||||
|| (0 != state.string_position && (input.regex_options & AllFlags::MatchNotBeginOfLine))
|
|| (!is_at_line_boundary && (input.regex_options & AllFlags::MatchNotBeginOfLine))
|
||||||
|| (0 == state.string_position && (input.regex_options & AllFlags::Global)))
|
|| (is_at_line_boundary && (input.regex_options & AllFlags::Global)))
|
||||||
return ExecutionResult::Continue;
|
return ExecutionResult::Continue;
|
||||||
|
|
||||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||||
|
@ -315,11 +326,22 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckBoundary::execute(MatchInput const& in
|
||||||
|
|
||||||
ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(MatchInput const& input, MatchState& state) const
|
ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(MatchInput const& input, MatchState& state) const
|
||||||
{
|
{
|
||||||
if (state.string_position == input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine))
|
auto is_at_line_boundary = [&] {
|
||||||
|
if (state.string_position == input.view.length())
|
||||||
|
return true;
|
||||||
|
|
||||||
|
if (input.regex_options.has_flag_set(AllFlags::Multiline) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline)) {
|
||||||
|
auto input_view = input.view.substring_view(state.string_position, 1)[0];
|
||||||
|
return input_view == '\n';
|
||||||
|
}
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}();
|
||||||
|
if (is_at_line_boundary && (input.regex_options & AllFlags::MatchNotEndOfLine))
|
||||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||||
|
|
||||||
if ((state.string_position == input.view.length() && !(input.regex_options & AllFlags::MatchNotEndOfLine))
|
if ((is_at_line_boundary && !(input.regex_options & AllFlags::MatchNotEndOfLine))
|
||||||
|| (state.string_position != input.view.length() && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine)))
|
|| (!is_at_line_boundary && (input.regex_options & AllFlags::MatchNotEndOfLine || input.regex_options & AllFlags::MatchNotBeginOfLine)))
|
||||||
return ExecutionResult::Continue;
|
return ExecutionResult::Continue;
|
||||||
|
|
||||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||||
|
@ -461,8 +483,9 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
||||||
if (input.view.length() <= state.string_position)
|
if (input.view.length() <= state.string_position)
|
||||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||||
|
|
||||||
VERIFY(!current_inversion_state());
|
auto input_view = input.view.substring_view(state.string_position, 1)[0];
|
||||||
advance_string_position(state, input.view);
|
if (input_view != '\n' || (input.regex_options.has_flag_set(AllFlags::SingleLine) && input.regex_options.has_flag_set(AllFlags::Internal_ConsiderNewline)))
|
||||||
|
advance_string_position(state, input.view, input_view);
|
||||||
|
|
||||||
} else if (compare_type == CharacterCompareType::String) {
|
} else if (compare_type == CharacterCompareType::String) {
|
||||||
VERIFY(!current_inversion_state());
|
VERIFY(!current_inversion_state());
|
||||||
|
|
|
@ -41,7 +41,7 @@ Regex<Parser>::Regex(String pattern, typename ParserTraits<Parser>::OptionsType
|
||||||
|
|
||||||
run_optimization_passes();
|
run_optimization_passes();
|
||||||
if (parser_result.error == regex::Error::NoError)
|
if (parser_result.error == regex::Error::NoError)
|
||||||
matcher = make<Matcher<Parser>>(this, regex_options);
|
matcher = make<Matcher<Parser>>(this, static_cast<decltype(regex_options.value())>(parser_result.options.value()));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Parser>
|
template<class Parser>
|
||||||
|
@ -51,7 +51,7 @@ Regex<Parser>::Regex(regex::Parser::Result parse_result, String pattern, typenam
|
||||||
{
|
{
|
||||||
run_optimization_passes();
|
run_optimization_passes();
|
||||||
if (parser_result.error == regex::Error::NoError)
|
if (parser_result.error == regex::Error::NoError)
|
||||||
matcher = make<Matcher<Parser>>(this, regex_options);
|
matcher = make<Matcher<Parser>>(this, regex_options | static_cast<decltype(regex_options.value())>(parse_result.options.value()));
|
||||||
}
|
}
|
||||||
|
|
||||||
template<class Parser>
|
template<class Parser>
|
||||||
|
@ -104,8 +104,10 @@ RegexResult Matcher<Parser>::match(RegexStringView view, Optional<typename Parse
|
||||||
{
|
{
|
||||||
AllOptions options = m_regex_options | regex_options.value_or({}).value();
|
AllOptions options = m_regex_options | regex_options.value_or({}).value();
|
||||||
|
|
||||||
if (options.has_flag_set(AllFlags::Multiline))
|
if constexpr (!IsSame<Parser, ECMA262>) {
|
||||||
return match(view.lines(), regex_options); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string...
|
if (options.has_flag_set(AllFlags::Multiline))
|
||||||
|
return match(view.lines(), regex_options); // FIXME: how do we know, which line ending a line has (1char or 2char)? This is needed to get the correct match offsets from start of string...
|
||||||
|
}
|
||||||
|
|
||||||
Vector<RegexStringView> views;
|
Vector<RegexStringView> views;
|
||||||
views.append(view);
|
views.append(view);
|
||||||
|
|
|
@ -16,7 +16,7 @@
|
||||||
|
|
||||||
namespace regex {
|
namespace regex {
|
||||||
|
|
||||||
using FlagsUnderlyingType = u16;
|
using FlagsUnderlyingType = u32;
|
||||||
|
|
||||||
enum class AllFlags {
|
enum class AllFlags {
|
||||||
Global = __Regex_Global, // All matches (don't return after first match)
|
Global = __Regex_Global, // All matches (don't return after first match)
|
||||||
|
@ -35,6 +35,7 @@ enum class AllFlags {
|
||||||
SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches, // Do not remove empty capture group results.
|
SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches, // Do not remove empty capture group results.
|
||||||
Internal_Stateful = __Regex_Internal_Stateful, // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off.
|
Internal_Stateful = __Regex_Internal_Stateful, // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off.
|
||||||
Internal_BrowserExtended = __Regex_Internal_BrowserExtended, // Only for ECMA262, Enable the behaviors defined in section B.1.4. of the ECMA262 spec.
|
Internal_BrowserExtended = __Regex_Internal_BrowserExtended, // Only for ECMA262, Enable the behaviors defined in section B.1.4. of the ECMA262 spec.
|
||||||
|
Internal_ConsiderNewline = __Regex_Internal_ConsiderNewline, // Only for ECMA262, Allow multiline matches to consider newlines as line boundaries.
|
||||||
Last = Internal_BrowserExtended,
|
Last = Internal_BrowserExtended,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -194,7 +194,8 @@ Parser::Result Parser::parse(Optional<AllOptions> regex_options)
|
||||||
move(m_parser_state.match_length_minimum),
|
move(m_parser_state.match_length_minimum),
|
||||||
move(m_parser_state.error),
|
move(m_parser_state.error),
|
||||||
move(m_parser_state.error_token),
|
move(m_parser_state.error_token),
|
||||||
m_parser_state.named_capture_groups.keys()
|
m_parser_state.named_capture_groups.keys(),
|
||||||
|
m_parser_state.regex_options,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -54,6 +54,7 @@ public:
|
||||||
Error error;
|
Error error;
|
||||||
Token error_token;
|
Token error_token;
|
||||||
Vector<FlyString> capture_groups;
|
Vector<FlyString> capture_groups;
|
||||||
|
AllOptions options;
|
||||||
};
|
};
|
||||||
|
|
||||||
explicit Parser(Lexer& lexer)
|
explicit Parser(Lexer& lexer)
|
||||||
|
@ -71,6 +72,7 @@ public:
|
||||||
Result parse(Optional<AllOptions> regex_options = {});
|
Result parse(Optional<AllOptions> regex_options = {});
|
||||||
bool has_error() const { return m_parser_state.error != Error::NoError; }
|
bool has_error() const { return m_parser_state.error != Error::NoError; }
|
||||||
Error error() const { return m_parser_state.error; }
|
Error error() const { return m_parser_state.error; }
|
||||||
|
AllOptions options() const { return m_parser_state.regex_options; }
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
|
virtual bool parse_internal(ByteCode&, size_t& match_length_minimum) = 0;
|
||||||
|
@ -170,14 +172,16 @@ private:
|
||||||
};
|
};
|
||||||
|
|
||||||
class PosixExtendedParser final : public AbstractPosixParser {
|
class PosixExtendedParser final : public AbstractPosixParser {
|
||||||
|
constexpr static auto default_options = static_cast<PosixFlags>(AllFlags::SingleLine) | static_cast<PosixFlags>(AllFlags::Internal_ConsiderNewline);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
explicit PosixExtendedParser(Lexer& lexer)
|
explicit PosixExtendedParser(Lexer& lexer)
|
||||||
: AbstractPosixParser(lexer)
|
: AbstractPosixParser(lexer, default_options)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
|
PosixExtendedParser(Lexer& lexer, Optional<typename ParserTraits<PosixExtendedParser>::OptionsType> regex_options)
|
||||||
: AbstractPosixParser(lexer, regex_options.value_or({}))
|
: AbstractPosixParser(lexer, regex_options.value_or({}) | default_options.value())
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -195,15 +199,17 @@ private:
|
||||||
};
|
};
|
||||||
|
|
||||||
class ECMA262Parser final : public Parser {
|
class ECMA262Parser final : public Parser {
|
||||||
|
constexpr static ECMAScriptOptions default_options = static_cast<ECMAScriptFlags>(AllFlags::Internal_ConsiderNewline);
|
||||||
|
|
||||||
public:
|
public:
|
||||||
explicit ECMA262Parser(Lexer& lexer)
|
explicit ECMA262Parser(Lexer& lexer)
|
||||||
: Parser(lexer)
|
: Parser(lexer, default_options)
|
||||||
{
|
{
|
||||||
m_capture_groups_in_scope.empend();
|
m_capture_groups_in_scope.empend();
|
||||||
}
|
}
|
||||||
|
|
||||||
ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
|
ECMA262Parser(Lexer& lexer, Optional<typename ParserTraits<ECMA262Parser>::OptionsType> regex_options)
|
||||||
: Parser(lexer, regex_options.value_or({}))
|
: Parser(lexer, regex_options.value_or({}) | default_options.value())
|
||||||
{
|
{
|
||||||
m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
|
m_should_use_browser_extended_grammar = regex_options.has_value() && regex_options->has_flag_set(ECMAScriptFlags::BrowserExtended);
|
||||||
m_capture_groups_in_scope.empend();
|
m_capture_groups_in_scope.empend();
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue