1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-24 17:47:43 +00:00

LibRegex: Support UTF-16 RegexStringView and improve Unicode matching

When the Unicode option is not set, regular expressions should match
based on code units; when it is set, they should match based on code
points. To do so, the regex parser must combine surrogate pairs when
the Unicode option is set. Further, RegexStringView needs to know whether
the flag is set so that it can return code-point-based or code-unit-based
string lengths and substrings.
This commit is contained in:
Timothy Flynn 2021-07-20 22:33:00 -04:00 committed by Linus Groh
parent 2e45e52993
commit 47f6bb38a1
5 changed files with 167 additions and 21 deletions

View file

@ -506,10 +506,14 @@ TEST_CASE(ECMA262_parse)
{ ",(?", regex::Error::InvalidCaptureGroup }, // #4583 { ",(?", regex::Error::InvalidCaptureGroup }, // #4583
{ "{1}", regex::Error::InvalidPattern }, { "{1}", regex::Error::InvalidPattern },
{ "{1,2}", regex::Error::InvalidPattern }, { "{1,2}", regex::Error::InvalidPattern },
{ "\\uxxxx", regex::Error::NoError },
{ "\\uxxxx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
{ "\\ud83d", regex::Error::NoError, ECMAScriptFlags::Unicode },
{ "\\ud83d\\uxxxx", regex::Error::InvalidPattern, ECMAScriptFlags::Unicode },
}; };
for (auto& test : tests) { for (auto& test : tests) {
Regex<ECMA262> re(test.pattern); Regex<ECMA262> re(test.pattern, test.flags);
EXPECT_EQ(re.parser_result.error, test.expected_error); EXPECT_EQ(re.parser_result.error, test.expected_error);
if constexpr (REGEX_DEBUG) { if constexpr (REGEX_DEBUG) {
dbgln("\n"); dbgln("\n");
@ -586,6 +590,45 @@ TEST_CASE(ECMA262_match)
} }
} }
// Checks how \uXXXX escapes match a UTF-16 subject with and without the
// ECMAScript Unicode flag: lone surrogate escapes match a code unit of an
// encoded pair only when the flag is absent, while a full surrogate-pair
// escape matches in both modes.
TEST_CASE(ECMA262_unicode_match)
{
    struct _test {
        char const* pattern;
        char const* subject;
        bool matches { true };
        ECMAScriptFlags options {};
    };

    _test cases[] {
        // A lone high surrogate escape against an encoded surrogate pair.
        { "\\ud83d", "😀", true },
        { "\\ud83d", "😀", false, ECMAScriptFlags::Unicode },
        // A lone low surrogate escape against an encoded surrogate pair.
        { "\\ude00", "😀", true },
        { "\\ude00", "😀", false, ECMAScriptFlags::Unicode },
        // Both halves escaped: expected to match in either mode.
        { "\\ud83d\\ude00", "😀", true },
        { "\\ud83d\\ude00", "😀", true, ECMAScriptFlags::Unicode },
        // Two high surrogates in a row (not a valid pair) against the same two code units.
        { "\\ud83d\\ud83d", "\xed\xa0\xbd\xed\xa0\xbd", true },
        { "\\ud83d\\ud83d", "\xed\xa0\xbd\xed\xa0\xbd", true, ECMAScriptFlags::Unicode },
    };

    for (auto& test_case : cases) {
        auto const flags = static_cast<ECMAScriptFlags>(regex::AllFlags::Global) | test_case.options;
        Regex<ECMA262> compiled(test_case.pattern, flags);

        // The subject is supplied as UTF-8 and converted so matching runs on a Utf16View.
        auto utf16_subject = AK::utf8_to_utf16(test_case.subject);
        Utf16View subject_view { utf16_subject };

        if constexpr (REGEX_DEBUG) {
            dbgln("\n");
            RegexDebug debug_printer(stderr);
            debug_printer.print_raw_bytecode(compiled);
            debug_printer.print_header();
            debug_printer.print_bytecode(compiled);
            dbgln("\n");
        }

        EXPECT_EQ(compiled.parser_result.error, Error::NoError);
        EXPECT_EQ(compiled.match(subject_view).success, test_case.matches);
    }
}
TEST_CASE(replace) TEST_CASE(replace)
{ {
struct _test { struct _test {

View file

@ -465,12 +465,13 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
return ExecutionResult::Failed_ExecuteLowPrioForks; return ExecutionResult::Failed_ExecuteLowPrioForks;
Optional<String> str; Optional<String> str;
Vector<u16> utf16;
Vector<u32> data; Vector<u32> data;
data.ensure_capacity(length); data.ensure_capacity(length);
for (size_t i = offset; i < offset + length; ++i) for (size_t i = offset; i < offset + length; ++i)
data.unchecked_append(m_bytecode->at(i)); data.unchecked_append(m_bytecode->at(i));
auto view = input.view.construct_as_same(data, str); auto view = input.view.construct_as_same(data, str, utf16);
offset += length; offset += length;
if (!compare_string(input, state, view, had_zero_length_match)) if (!compare_string(input, state, view, had_zero_length_match))
return ExecutionResult::Failed_ExecuteLowPrioForks; return ExecutionResult::Failed_ExecuteLowPrioForks;
@ -553,7 +554,8 @@ ALWAYS_INLINE void OpCode_Compare::compare_char(MatchInput const& input, MatchSt
auto input_view = input.view.substring_view(state.string_position, 1); auto input_view = input.view.substring_view(state.string_position, 1);
Optional<String> str; Optional<String> str;
auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str); Vector<u16> utf16;
auto compare_view = input_view.construct_as_same({ &ch1, 1 }, str, utf16);
bool equal; bool equal;
if (input.regex_options & AllFlags::Insensitive) if (input.regex_options & AllFlags::Insensitive)
equal = input_view.equals_ignoring_case(compare_view); equal = input_view.equals_ignoring_case(compare_view);

View file

@ -14,6 +14,7 @@
#include <AK/String.h> #include <AK/String.h>
#include <AK/StringBuilder.h> #include <AK/StringBuilder.h>
#include <AK/StringView.h> #include <AK/StringView.h>
#include <AK/Utf16View.h>
#include <AK/Utf32View.h> #include <AK/Utf32View.h>
#include <AK/Utf8View.h> #include <AK/Utf8View.h>
#include <AK/Variant.h> #include <AK/Variant.h>
@ -43,6 +44,11 @@ public:
{ {
} }
// Constructs a RegexStringView backed by a UTF-16 view; the view is stored in m_view.
RegexStringView(Utf16View view)
: m_view(view)
{
}
RegexStringView(Utf8View view) RegexStringView(Utf8View view)
: m_view(view) : m_view(view)
{ {
@ -58,11 +64,19 @@ public:
return m_view.get<Utf32View>(); return m_view.get<Utf32View>();
} }
// Returns the stored view as a Utf16View.
// NOTE(review): presumably only valid while m_view actually holds a Utf16View -
// confirm how Variant::get() behaves for a mismatched alternative.
Utf16View const& u16_view() const
{
return m_view.get<Utf16View>();
}
Utf8View const& u8_view() const Utf8View const& u8_view() const
{ {
return m_view.get<Utf8View>(); return m_view.get<Utf8View>();
} }
// Whether this view is in Unicode mode; length() and substring_view() branch on this flag.
bool unicode() const { return m_unicode; }
// Sets the Unicode-mode flag returned by unicode().
void set_unicode(bool unicode) { m_unicode = unicode; }
bool is_empty() const bool is_empty() const
{ {
return m_view.visit([](auto& view) { return view.is_empty(); }); return m_view.visit([](auto& view) { return view.is_empty(); });
@ -75,12 +89,21 @@ public:
size_t length() const size_t length() const
{ {
return m_view.visit([](auto& view) { return view.length(); }); if (unicode()) {
return m_view.visit(
[](Utf16View const& view) { return view.length_in_code_points(); },
[](auto const& view) { return view.length(); });
}
return m_view.visit(
[](Utf16View const& view) { return view.length_in_code_units(); },
[](Utf8View const& view) { return view.byte_length(); },
[](auto const& view) { return view.length(); });
} }
RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage) const RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage, Vector<u16>& optional_utf16_storage) const
{ {
return m_view.visit( auto view = m_view.visit(
[&]<typename T>(T const&) { [&]<typename T>(T const&) {
StringBuilder builder; StringBuilder builder;
for (auto ch : data) for (auto ch : data)
@ -90,7 +113,14 @@ public:
}, },
[&](Utf32View) { [&](Utf32View) {
return RegexStringView { Utf32View { data.data(), data.size() } }; return RegexStringView { Utf32View { data.data(), data.size() } };
},
[&](Utf16View) {
optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() });
return RegexStringView { Utf16View { optional_utf16_storage } };
}); });
view.set_unicode(unicode());
return view;
} }
Vector<RegexStringView> lines() const Vector<RegexStringView> lines() const
@ -118,6 +148,21 @@ public:
views.empend(view); views.empend(view);
return views; return views;
}, },
[](Utf16View view) {
    // Splits a UTF-16 view into lines at each '\n' code unit. The newline itself is
    // not included in any line, and a trailing empty remainder is not emitted.
    //
    // The scan walks whole code units rather than searching the raw byte image of the
    // buffer: a byte-wise search for the two-byte newline can match at an odd byte
    // offset that straddles two adjacent code units (e.g. U+0A05 followed by U+4E00),
    // which would produce a bogus split and drop a code unit.
    Vector<RegexStringView> views;
    u16 const newline = '\n';
    while (!view.is_empty()) {
        auto const* code_units = view.data();
        auto const code_unit_count = view.length_in_code_units();

        size_t offset = 0;
        while (offset < code_unit_count && code_units[offset] != newline)
            ++offset;

        // No newline left: the remainder (if any) is emitted after the loop.
        if (offset == code_unit_count)
            break;

        views.empend(view.substring_view(0, offset));
        view = view.substring_view(offset + 1, code_unit_count - offset - 1);
    }
    if (!view.is_empty())
        views.empend(view);
    return views;
},
[](Utf8View& view) { [](Utf8View& view) {
Vector<RegexStringView> views; Vector<RegexStringView> views;
auto it = view.begin(); auto it = view.begin();
@ -147,15 +192,26 @@ public:
RegexStringView substring_view(size_t offset, size_t length) const RegexStringView substring_view(size_t offset, size_t length) const
{ {
return m_view.visit( if (unicode()) {
[&](auto view) { return RegexStringView { view.substring_view(offset, length) }; }, auto view = m_view.visit(
[&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; }); [&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
[&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; },
[&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
view.set_unicode(unicode());
return view;
}
auto view = m_view.visit([&](auto view) { return RegexStringView { view.substring_view(offset, length) }; });
view.set_unicode(unicode());
return view;
} }
String to_string() const String to_string() const
{ {
return m_view.visit( return m_view.visit(
[](StringView view) { return view.to_string(); }, [](StringView view) { return view.to_string(); },
[](Utf16View view) { return view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); },
[](auto& view) { [](auto& view) {
StringBuilder builder; StringBuilder builder;
for (auto it = view.begin(); it != view.end(); ++it) for (auto it = view.begin(); it != view.end(); ++it)
@ -173,8 +229,8 @@ public:
return 256u + ch; return 256u + ch;
return ch; return ch;
}, },
[&](auto view) -> u32 { return view[index]; }, [&](Utf32View& view) -> u32 { return view[index]; },
[&](Utf8View& view) -> u32 { [&](auto& view) -> u32 {
size_t i = index; size_t i = index;
for (auto it = view.begin(); it != view.end(); ++it, --i) { for (auto it = view.begin(); it != view.end(); ++it, --i) {
if (i == 0) if (i == 0)
@ -188,6 +244,7 @@ public:
{ {
return m_view.visit( return m_view.visit(
[&](Utf32View) { return to_string() == cstring; }, [&](Utf32View) { return to_string() == cstring; },
[&](Utf16View) { return to_string() == cstring; },
[&](Utf8View const& view) { return view.as_string() == cstring; }, [&](Utf8View const& view) { return view.as_string() == cstring; },
[&](StringView view) { return view == cstring; }); [&](StringView view) { return view == cstring; });
} }
@ -201,6 +258,7 @@ public:
{ {
return m_view.visit( return m_view.visit(
[&](Utf32View) { return to_string() == string; }, [&](Utf32View) { return to_string() == string; },
[&](Utf16View) { return to_string() == string; },
[&](Utf8View const& view) { return view.as_string() == string; }, [&](Utf8View const& view) { return view.as_string() == string; },
[&](StringView view) { return view == string; }); [&](StringView view) { return view == string; });
} }
@ -209,6 +267,7 @@ public:
{ {
return m_view.visit( return m_view.visit(
[&](Utf32View) { return to_string() == string; }, [&](Utf32View) { return to_string() == string; },
[&](Utf16View) { return to_string() == string; },
[&](Utf8View const& view) { return view.as_string() == string; }, [&](Utf8View const& view) { return view.as_string() == string; },
[&](StringView view) { return view == string; }); [&](StringView view) { return view == string; });
} }
@ -224,6 +283,7 @@ public:
[&](Utf32View view) { [&](Utf32View view) {
return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0; return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0;
}, },
[&](Utf16View) { return to_string() == RegexStringView { other }.to_string(); },
[&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); }, [&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
[&](StringView view) { return view == RegexStringView { other }.to_string(); }); [&](StringView view) { return view == RegexStringView { other }.to_string(); });
} }
@ -233,12 +293,25 @@ public:
return !(*this == other); return !(*this == other);
} }
// Equality against a UTF-16 view. When this object already holds a Utf16View the two
// views are compared directly; for every other representation both sides are compared
// through their String forms.
bool operator==(Utf16View const& other) const
{
    // Built on demand so the Utf16View branch never performs the conversion.
    auto other_as_string = [&other] { return RegexStringView { other }.to_string(); };

    return m_view.visit(
        [&](Utf32View) { return to_string() == other_as_string(); },
        [&](Utf16View const& view) { return view == other; },
        [&](Utf8View const& view) { return view.as_string() == other_as_string(); },
        [&](StringView view) { return view == other_as_string(); });
}
// Inequality against a UTF-16 view; the exact negation of the matching operator==.
bool operator!=(Utf16View const& other) const
{
    bool const is_equal = (*this == other);
    return !is_equal;
}
bool operator==(Utf8View const& other) const bool operator==(Utf8View const& other) const
{ {
return m_view.visit( return m_view.visit(
[&](Utf32View) { [&](Utf32View) { return to_string() == other.as_string(); },
return to_string() == other.as_string(); [&](Utf16View) { return to_string() == other.as_string(); },
},
[&](Utf8View const& view) { return view.as_string() == other.as_string(); }, [&](Utf8View const& view) { return view.as_string() == other.as_string(); },
[&](StringView view) { return other.as_string() == view; }); [&](StringView view) { return other.as_string() == view; });
} }
@ -271,6 +344,9 @@ public:
[&](Utf32View) -> bool { [&](Utf32View) -> bool {
TODO(); TODO();
}, },
[&](Utf16View) -> bool {
TODO();
},
[&](Utf8View const& view) { return view.as_string().starts_with(str); }, [&](Utf8View const& view) { return view.as_string().starts_with(str); },
[&](StringView view) { return view.starts_with(str); }); [&](StringView view) { return view.starts_with(str); });
} }
@ -289,6 +365,7 @@ public:
} }
return true; return true;
}, },
[&](Utf16View) -> bool { TODO(); },
[&](Utf8View const& view) { [&](Utf8View const& view) {
auto it = view.begin(); auto it = view.begin();
for (auto code_point : str) { for (auto code_point : str) {
@ -304,7 +381,8 @@ public:
} }
private: private:
Variant<StringView, Utf8View, Utf32View> m_view; Variant<StringView, Utf8View, Utf16View, Utf32View> m_view;
bool m_unicode { false };
}; };
class Match final { class Match final {

View file

@ -84,6 +84,10 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const views, Optional
output.operations = 0; output.operations = 0;
size_t lines_to_skip = 0; size_t lines_to_skip = 0;
bool unicode = input.regex_options.has_flag_set(AllFlags::Unicode);
for (auto& view : views)
const_cast<RegexStringView&>(view).set_unicode(unicode);
if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) { if (input.regex_options.has_flag_set(AllFlags::Internal_Stateful)) {
if (views.size() > 1 && input.start_offset > views.first().length()) { if (views.size() > 1 && input.start_offset > views.first().length()) {
dbgln_if(REGEX_DEBUG, "Started with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip); dbgln_if(REGEX_DEBUG, "Started with start={}, goff={}, skip={}", input.start_offset, input.global_offset, lines_to_skip);

View file

@ -10,6 +10,7 @@
#include <AK/String.h> #include <AK/String.h>
#include <AK/StringBuilder.h> #include <AK/StringBuilder.h>
#include <AK/StringUtils.h> #include <AK/StringUtils.h>
#include <AK/Utf16View.h>
namespace regex { namespace regex {
@ -1440,13 +1441,31 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
if (try_skip("u")) { if (try_skip("u")) {
if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); code_point.has_value()) { if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); code_point.has_value()) {
// FIXME: The minimum length depends on the mode - should be utf8-length in u8 mode. // In Unicode mode, we need to combine surrogate pairs into a single code point. But we also need to be
// rather forgiving if the surrogate pairs are invalid. So if a second code unit follows this code unit,
// but doesn't form a valid surrogate pair, insert bytecode for both code units individually.
Optional<u32> low_surrogate;
if (unicode && Utf16View::is_high_surrogate(*code_point) && try_skip("\\u")) {
low_surrogate = read_digits(ReadDigitsInitialZeroState::Allow, true, 4);
if (!low_surrogate.has_value()) {
set_error(Error::InvalidPattern);
return false;
}
if (Utf16View::is_low_surrogate(*low_surrogate)) {
*code_point = Utf16View::decode_surrogate_pair(*code_point, *low_surrogate);
low_surrogate.clear();
}
}
match_length_minimum += 1; match_length_minimum += 1;
StringBuilder builder; stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
builder.append_code_point(code_point.value());
// FIXME: This isn't actually correct for ECMAScript. if (low_surrogate.has_value()) {
auto u8_encoded = builder.string_view(); match_length_minimum += 1;
stack.insert_bytecode_compare_string(u8_encoded); stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)low_surrogate.value() } });
}
return true; return true;
} else if (!unicode) { } else if (!unicode) {
// '\u' is allowed in non-unicode mode, just matches 'u'. // '\u' is allowed in non-unicode mode, just matches 'u'.