mirror of
https://github.com/RGBCube/serenity
synced 2025-07-27 03:37:45 +00:00
LibRegex+Everywhere: Make LibRegex more unicode-aware
This commit makes LibRegex (mostly) capable of operating on any of the three main string views:
- StringView for raw strings
- Utf8View for UTF-8 encoded strings
- Utf32View for raw Unicode strings

As a result, regexps with Unicode strings should be able to properly handle UTF-8 and not stop in the middle of a code point. A future commit will update LibJS to use the correct type of string depending on the flags.
This commit is contained in:
parent
e5af15a6e9
commit
f364fcec5d
8 changed files with 310 additions and 207 deletions
|
@ -15,6 +15,8 @@
|
|||
#include <AK/StringBuilder.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Utf32View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <AK/Variant.h>
|
||||
#include <AK/Vector.h>
|
||||
|
||||
namespace regex {
|
||||
|
@ -22,124 +24,172 @@ namespace regex {
|
|||
class RegexStringView {
|
||||
public:
|
||||
RegexStringView(const char* chars)
|
||||
: m_u8view(chars)
|
||||
: m_view(StringView { chars })
|
||||
{
|
||||
}
|
||||
|
||||
RegexStringView(const String& string)
|
||||
: m_u8view(string)
|
||||
: m_view(string.view())
|
||||
{
|
||||
}
|
||||
|
||||
RegexStringView(const StringView view)
|
||||
: m_u8view(view)
|
||||
{
|
||||
}
|
||||
RegexStringView(const Utf32View view)
|
||||
: m_u32view(view)
|
||||
: m_view(view)
|
||||
{
|
||||
}
|
||||
|
||||
bool is_u8_view() const { return m_u8view.has_value(); }
|
||||
bool is_u32_view() const { return m_u32view.has_value(); }
|
||||
|
||||
const StringView& u8view() const
|
||||
RegexStringView(Utf32View view)
|
||||
: m_view(view)
|
||||
{
|
||||
VERIFY(m_u8view.has_value());
|
||||
return m_u8view.value();
|
||||
};
|
||||
}
|
||||
|
||||
const Utf32View& u32view() const
|
||||
RegexStringView(Utf8View view)
|
||||
: m_view(view)
|
||||
{
|
||||
VERIFY(m_u32view.has_value());
|
||||
return m_u32view.value();
|
||||
};
|
||||
}
|
||||
|
||||
// Returns the wrapped view as a StringView, fetched with m_view.get<StringView>().
// NOTE(review): presumably only meaningful while m_view currently holds a StringView;
// confirm how Variant::get<T>() reacts to a mismatched alternative.
const StringView& string_view() const { return m_view.get<StringView>(); }
|
||||
|
||||
// Returns the wrapped view as a Utf32View, fetched with m_view.get<Utf32View>().
// NOTE(review): presumably only meaningful while m_view currently holds a Utf32View;
// confirm how Variant::get<T>() reacts to a mismatched alternative.
const Utf32View& u32_view() const { return m_view.get<Utf32View>(); }
|
||||
|
||||
// Returns the wrapped view as a Utf8View, fetched with m_view.get<Utf8View>().
// NOTE(review): presumably only meaningful while m_view currently holds a Utf8View;
// confirm how Variant::get<T>() reacts to a mismatched alternative.
const Utf8View& u8_view() const { return m_view.get<Utf8View>(); }
|
||||
|
||||
bool is_empty() const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return m_u8view.value().is_empty();
|
||||
else
|
||||
return m_u32view.value().is_empty();
|
||||
return m_view.visit([](auto& view) { return view.is_empty(); });
|
||||
}
|
||||
|
||||
bool is_null() const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return m_u8view.value().is_null();
|
||||
else
|
||||
return m_u32view.value().code_points() == nullptr;
|
||||
return m_view.visit([](auto& view) { return view.is_null(); });
|
||||
}
|
||||
|
||||
size_t length() const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return m_u8view.value().length();
|
||||
else
|
||||
return m_u32view.value().length();
|
||||
return m_view.visit([](auto& view) { return view.length(); });
|
||||
}
|
||||
|
||||
// Wraps `data` in a RegexStringView that holds the same kind of view as this object.
//
//  - When this object holds a Utf32View, the result is a Utf32View built straight from
//    `data`'s pointer and element count.
//  - For any other alternative T, every element of `data` is appended to a StringBuilder,
//    the built string is stored into `optional_string_storage`, and the result is a T
//    constructed from that stored string.
//
// NOTE(review): the returned view presumably borrows from `data` or from
// `optional_string_storage`, so callers would need to keep those alive — confirm.
RegexStringView construct_as_same(Span<u32> data, Optional<String>& optional_string_storage) const
{
    return m_view.visit(
        [&](Utf32View) {
            return RegexStringView { Utf32View { data.data(), data.size() } };
        },
        [&]<typename ViewType>(ViewType const&) {
            StringBuilder flattened;
            for (auto unit : data)
                flattened.append(unit); // Note: The type conversion is intentional.
            optional_string_storage = flattened.build();
            return RegexStringView { ViewType { *optional_string_storage } };
        });
}
|
||||
|
||||
Vector<RegexStringView> lines() const
|
||||
{
|
||||
if (is_u8_view()) {
|
||||
auto views = u8view().lines(false);
|
||||
Vector<RegexStringView> new_views;
|
||||
for (auto& view : views)
|
||||
new_views.append(move(view));
|
||||
return new_views;
|
||||
}
|
||||
|
||||
Vector<RegexStringView> views;
|
||||
auto view = u32view();
|
||||
u32 newline = '\n';
|
||||
while (!view.is_empty()) {
|
||||
auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
|
||||
if (!position.has_value())
|
||||
break;
|
||||
auto offset = position.value() / sizeof(u32);
|
||||
views.append(view.substring_view(0, offset));
|
||||
view = view.substring_view(offset + 1, view.length() - offset - 1);
|
||||
}
|
||||
if (!view.is_empty())
|
||||
views.append(view);
|
||||
return views;
|
||||
return m_view.visit(
|
||||
[](StringView view) {
|
||||
auto views = view.lines(false);
|
||||
Vector<RegexStringView> new_views;
|
||||
for (auto& view : views)
|
||||
new_views.empend(view);
|
||||
return new_views;
|
||||
},
|
||||
[](Utf32View view) {
|
||||
Vector<RegexStringView> views;
|
||||
u32 newline = '\n';
|
||||
while (!view.is_empty()) {
|
||||
auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
|
||||
if (!position.has_value())
|
||||
break;
|
||||
auto offset = position.value() / sizeof(u32);
|
||||
views.empend(view.substring_view(0, offset));
|
||||
view = view.substring_view(offset + 1, view.length() - offset - 1);
|
||||
}
|
||||
if (!view.is_empty())
|
||||
views.empend(view);
|
||||
return views;
|
||||
},
|
||||
[](Utf8View& view) {
|
||||
Vector<RegexStringView> views;
|
||||
auto it = view.begin();
|
||||
auto previous_newline_position_it = it;
|
||||
for (;;) {
|
||||
if (*it == '\n') {
|
||||
auto previous_offset = view.byte_offset_of(previous_newline_position_it);
|
||||
auto new_offset = view.byte_offset_of(it);
|
||||
auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
|
||||
views.empend(slice);
|
||||
++it;
|
||||
previous_newline_position_it = it;
|
||||
}
|
||||
if (it.done())
|
||||
break;
|
||||
++it;
|
||||
}
|
||||
if (it != previous_newline_position_it) {
|
||||
auto previous_offset = view.byte_offset_of(previous_newline_position_it);
|
||||
auto new_offset = view.byte_offset_of(it);
|
||||
auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
|
||||
views.empend(slice);
|
||||
}
|
||||
return views;
|
||||
});
|
||||
}
|
||||
|
||||
RegexStringView substring_view(size_t offset, size_t length) const
|
||||
{
|
||||
if (is_u8_view()) {
|
||||
return u8view().substring_view(offset, length);
|
||||
}
|
||||
return u32view().substring_view(offset, length);
|
||||
return m_view.visit(
|
||||
[&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
|
||||
[&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
|
||||
}
|
||||
|
||||
String to_string() const
|
||||
{
|
||||
if (is_u8_view()) {
|
||||
return u8view().to_string();
|
||||
}
|
||||
|
||||
StringBuilder builder;
|
||||
builder.append(u32view());
|
||||
return builder.to_string();
|
||||
return m_view.visit(
|
||||
[](StringView view) { return view.to_string(); },
|
||||
[](auto& view) {
|
||||
StringBuilder builder;
|
||||
for (auto it = view.begin(); it != view.end(); ++it)
|
||||
builder.append_code_point(*it);
|
||||
return builder.to_string();
|
||||
});
|
||||
}
|
||||
|
||||
u32 operator[](size_t index) const
|
||||
{
|
||||
if (is_u8_view()) {
|
||||
i8 ch = u8view()[index];
|
||||
u8 value = *reinterpret_cast<u8*>(&ch);
|
||||
return static_cast<u32>(value);
|
||||
}
|
||||
return u32view().code_points()[index];
|
||||
return m_view.visit(
|
||||
[&](StringView view) -> u32 {
|
||||
auto ch = view[index];
|
||||
if (ch < 0)
|
||||
return 256u + ch;
|
||||
return ch;
|
||||
},
|
||||
[&](auto view) -> u32 { return view[index]; },
|
||||
[&](Utf8View& view) -> u32 {
|
||||
size_t i = index;
|
||||
for (auto it = view.begin(); it != view.end(); ++it, --i) {
|
||||
if (i == 0)
|
||||
return *it;
|
||||
}
|
||||
VERIFY_NOT_REACHED();
|
||||
});
|
||||
}
|
||||
|
||||
bool operator==(const char* cstring) const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return u8view() == cstring;
|
||||
|
||||
return to_string() == cstring;
|
||||
return m_view.visit(
|
||||
[&](Utf32View) { return to_string() == cstring; },
|
||||
[&](Utf8View const& view) { return view.as_string() == cstring; },
|
||||
[&](StringView view) { return view == cstring; });
|
||||
}
|
||||
|
||||
bool operator!=(const char* cstring) const
|
||||
|
@ -149,18 +199,18 @@ public:
|
|||
|
||||
bool operator==(const String& string) const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return u8view() == string;
|
||||
|
||||
return to_string() == string;
|
||||
return m_view.visit(
|
||||
[&](Utf32View) { return to_string() == string; },
|
||||
[&](Utf8View const& view) { return view.as_string() == string; },
|
||||
[&](StringView view) { return view == string; });
|
||||
}
|
||||
|
||||
bool operator==(const StringView& other) const
|
||||
bool operator==(const StringView& string) const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return u8view() == other;
|
||||
|
||||
return false;
|
||||
return m_view.visit(
|
||||
[&](Utf32View) { return to_string() == string; },
|
||||
[&](Utf8View const& view) { return view.as_string() == string; },
|
||||
[&](StringView view) { return view == string; });
|
||||
}
|
||||
|
||||
bool operator!=(const StringView& other) const
|
||||
|
@ -170,13 +220,12 @@ public:
|
|||
|
||||
bool operator==(const Utf32View& other) const
|
||||
{
|
||||
if (is_u32_view()) {
|
||||
StringBuilder builder;
|
||||
builder.append(other);
|
||||
return to_string() == builder.to_string();
|
||||
}
|
||||
|
||||
return false;
|
||||
return m_view.visit(
|
||||
[&](Utf32View view) {
|
||||
return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0;
|
||||
},
|
||||
[&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_string(); },
|
||||
[&](StringView view) { return view == RegexStringView { other }.to_string(); });
|
||||
}
|
||||
|
||||
bool operator!=(const Utf32View& other) const
|
||||
|
@ -184,34 +233,78 @@ public:
|
|||
return !(*this == other);
|
||||
}
|
||||
|
||||
const char* characters_without_null_termination() const
|
||||
bool operator==(const Utf8View& other) const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return u8view().characters_without_null_termination();
|
||||
return m_view.visit(
|
||||
[&](Utf32View) {
|
||||
return to_string() == other.as_string();
|
||||
},
|
||||
[&](Utf8View const& view) { return view.as_string() == other.as_string(); },
|
||||
[&](StringView view) { return other.as_string() == view; });
|
||||
}
|
||||
|
||||
return to_string().characters(); // FIXME: it contains the null termination, does that actually matter?
|
||||
bool operator!=(const Utf8View& other) const
|
||||
{
|
||||
return !(*this == other);
|
||||
}
|
||||
|
||||
bool equals(const RegexStringView& other) const
|
||||
{
|
||||
return other.m_view.visit([&](auto const& view) { return operator==(view); });
|
||||
}
|
||||
|
||||
bool equals_ignoring_case(const RegexStringView& other) const
|
||||
{
|
||||
// FIXME: Implement equals_ignoring_case() for unicode.
|
||||
return m_view.visit(
|
||||
[&](StringView view) {
|
||||
return other.m_view.visit(
|
||||
[&](StringView other_view) { return view.equals_ignoring_case(other_view); },
|
||||
[](auto&) -> bool { TODO(); });
|
||||
},
|
||||
[](auto&) -> bool { TODO(); });
|
||||
}
|
||||
|
||||
bool starts_with(const StringView& str) const
|
||||
{
|
||||
if (is_u32_view())
|
||||
return false;
|
||||
return u8view().starts_with(str);
|
||||
return m_view.visit(
|
||||
[&](Utf32View) -> bool {
|
||||
TODO();
|
||||
},
|
||||
[&](Utf8View const& view) { return view.as_string().starts_with(str); },
|
||||
[&](StringView view) { return view.starts_with(str); });
|
||||
}
|
||||
|
||||
bool starts_with(const Utf32View& str) const
|
||||
{
|
||||
if (is_u8_view())
|
||||
return false;
|
||||
|
||||
StringBuilder builder;
|
||||
builder.append(str);
|
||||
return to_string().starts_with(builder.to_string());
|
||||
return m_view.visit(
|
||||
[&](Utf32View view) -> bool {
|
||||
if (str.length() > view.length())
|
||||
return false;
|
||||
if (str.length() == view.length())
|
||||
return operator==(str);
|
||||
for (size_t i = 0; i < str.length(); ++i) {
|
||||
if (str.at(i) != view.at(i))
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
},
|
||||
[&](Utf8View const& view) {
|
||||
auto it = view.begin();
|
||||
for (auto code_point : str) {
|
||||
if (it.done())
|
||||
return false;
|
||||
if (code_point != *it)
|
||||
return false;
|
||||
++it;
|
||||
}
|
||||
return true;
|
||||
},
|
||||
[&](StringView) -> bool { TODO(); });
|
||||
}
|
||||
|
||||
private:
|
||||
Optional<StringView> m_u8view;
|
||||
Optional<Utf32View> m_u32view;
|
||||
Variant<StringView, Utf8View, Utf32View> m_view;
|
||||
};
|
||||
|
||||
class Match final {
|
||||
|
@ -271,6 +364,9 @@ struct MatchState {
|
|||
size_t string_position { 0 };
|
||||
size_t instruction_position { 0 };
|
||||
size_t fork_at_position { 0 };
|
||||
Vector<Match> matches;
|
||||
Vector<Vector<Match>> capture_group_matches;
|
||||
Vector<HashMap<String, Match>> named_capture_group_matches;
|
||||
};
|
||||
|
||||
struct MatchOutput {
|
||||
|
@ -288,6 +384,7 @@ template<>
|
|||
struct AK::Formatter<regex::RegexStringView> : Formatter<StringView> {
|
||||
void format(FormatBuilder& builder, const regex::RegexStringView& value)
|
||||
{
|
||||
return Formatter<StringView>::format(builder, { value.characters_without_null_termination(), value.length() });
|
||||
auto string = value.to_string();
|
||||
return Formatter<StringView>::format(builder, string);
|
||||
}
|
||||
};
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue