From 4f2cbe119b675ad669d48ddbd2cef317c6d616de Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 18 Aug 2021 17:17:18 -0400 Subject: [PATCH] LibRegex: Allow Unicode escape sequences in capture group names Unfortunately, this requires a slight divergence in the way the capture group names are stored. Previously, the generated byte code would simply store a view into the regex pattern string, so no string copying was required. Now, the escape sequences are decoded into a new string, and all parsed capture group names are stored in a vector in the parser result structure. The byte code then stores a view into the corresponding string in that vector. --- Tests/LibRegex/Regex.cpp | 3 +++ .../builtins/String/String.prototype.match.js | 15 +++++++++++ Userland/Libraries/LibRegex/RegexParser.cpp | 26 +++++++++++++------ Userland/Libraries/LibRegex/RegexParser.h | 3 ++- 4 files changed, 38 insertions(+), 9 deletions(-) diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index 8cb3a87c9e..7a14f30eb9 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -698,6 +698,9 @@ TEST_CASE(ECMA262_unicode_match) { "\\ud83d\\ud83d"sv, "\xed\xa0\xbd\xed\xa0\xbd"sv, true, ECMAScriptFlags::Unicode }, { "(?<=.{3})f"sv, "abcdef"sv, true, ECMAScriptFlags::Unicode }, { "(?<=.{3})f"sv, "abc😀ef"sv, true, ECMAScriptFlags::Unicode }, + { "(?<𝓑𝓻𝓸𝔀𝓷>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode }, + { "(?<\\u{1d4d1}\\u{1d4fb}\\u{1d4f8}\\u{1d500}\\u{1d4f7}>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode }, + { "(?<\\ud835\\udcd1\\ud835\\udcfb\\ud835\\udcf8\\ud835\\udd00\\ud835\\udcf7>brown)"sv, "brown"sv, true, ECMAScriptFlags::Unicode }, }; for (auto& test : tests) { diff --git a/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.match.js b/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.match.js index 9a04eae26c..859fd94b1e 100644 --- a/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.match.js 
+++ b/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.match.js @@ -55,3 +55,18 @@ test("UTF-16", () => { expect("😀😀".match(/\ud83d/g)).toEqual(["\ud83d", "\ud83d"]); expect("😀😀".match(/\ude00/g)).toEqual(["\ude00", "\ude00"]); }); + +test("escaped code points", () => { + var string = "The quick brown fox jumped over the lazy dog's back"; + + var re = /(?<𝓑𝓻𝓸𝔀𝓷>brown)/u; + expect(string.match(re).groups.𝓑𝓻𝓸𝔀𝓷).toBe("brown"); + + re = /(?<\u{1d4d1}\u{1d4fb}\u{1d4f8}\u{1d500}\u{1d4f7}>brown)/u; + expect(string.match(re).groups.𝓑𝓻𝓸𝔀𝓷).toBe("brown"); + expect(string.match(re).groups.𝓑𝓻𝓸𝔀𝓷).toBe("brown"); + + re = /(?<\ud835\udcd1\ud835\udcfb\ud835\udcf8\ud835\udd00\ud835\udcf7>brown)/u; + expect(string.match(re).groups.𝓑𝓻𝓸𝔀𝓷).toBe("brown"); + expect(string.match(re).groups.𝓑𝓻𝓸𝔀𝓷).toBe("brown"); +}); diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index 3617f0ec5c..02598f5da9 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -195,7 +195,8 @@ Parser::Result Parser::parse(Optional regex_options) move(m_parser_state.named_capture_groups_count), move(m_parser_state.match_length_minimum), move(m_parser_state.error), - move(m_parser_state.error_token) + move(m_parser_state.error_token), + m_parser_state.named_capture_groups.keys() }; } @@ -2009,21 +2010,30 @@ bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool [](Empty&) -> bool { VERIFY_NOT_REACHED(); }); } -StringView ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket) +FlyString ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket) { if (take_starting_angle_bracket && !consume("<")) return {}; - auto start_token = m_parser_state.current_token; - size_t offset = 0; - while (match(TokenType::Char) || match(TokenType::Dollar)) { + StringBuilder builder; + while (match(TokenType::Char) || 
match(TokenType::Dollar) || match(TokenType::LeftCurly) || match(TokenType::RightCurly)) { auto c = m_parser_state.current_token.value(); if (c == ">") break; - offset += consume().value().length(); + + if (try_skip("\\u"sv)) { + if (auto code_point = consume_escaped_code_point(true); code_point.has_value()) { + builder.append_code_point(*code_point); + } else { + set_error(Error::InvalidNameForCaptureGroup); + return {}; + } + } else { + builder.append(consume().value()); + } } - StringView name { start_token.value().characters_without_null_termination(), offset }; + FlyString name = builder.build(); if (!consume(">") || name.is_empty()) set_error(Error::InvalidNameForCaptureGroup); @@ -2146,7 +2156,7 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi stack.insert_bytecode_group_capture_left(group_index); stack.extend(move(capture_group_bytecode)); - stack.insert_bytecode_group_capture_right(group_index, name); + stack.insert_bytecode_group_capture_right(group_index, name.view()); match_length_minimum += length; diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index ee3fd648aa..480320037a 100644 --- a/Userland/Libraries/LibRegex/RegexParser.h +++ b/Userland/Libraries/LibRegex/RegexParser.h @@ -53,6 +53,7 @@ public: size_t match_length_minimum; Error error; Token error_token; + Vector capture_groups; }; explicit Parser(Lexer& lexer) @@ -218,7 +219,7 @@ private: }; StringView read_digits_as_string(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1); Optional read_digits(ReadDigitsInitialZeroState initial_zero = ReadDigitsInitialZeroState::Allow, bool hex = false, int max_count = -1, int min_count = -1); - StringView read_capture_group_specifier(bool take_starting_angle_bracket = false); + FlyString read_capture_group_specifier(bool take_starting_angle_bracket = false); struct Script { Unicode::Script 
script {};