diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp index 44a2a89f73..5b7e2eb6ad 100644 --- a/AK/Utf16View.cpp +++ b/AK/Utf16View.cpp @@ -111,6 +111,23 @@ u16 Utf16View::code_unit_at(size_t index) const return m_code_units[index]; } /* Returns the code point that begins at code unit index. index must be less than length_in_code_units(); this is asserted with VERIFY. When the code unit at index is a high surrogate immediately followed by a low surrogate, the two are combined through decode_surrogate_pair; in every other case the single code unit is returned unchanged. */ +u32 Utf16View::code_point_at(size_t index) const +{ + VERIFY(index < length_in_code_units()); + + u32 code_point = code_unit_at(index); /* Neither kind of surrogate: the code unit is returned as the code point. */ + if (!is_high_surrogate(code_point) && !is_low_surrogate(code_point)) + return code_point; /* A low surrogate at index, or a high surrogate that is the last code unit, has no pair to decode. */ + if (is_low_surrogate(code_point) || (index + 1 == length_in_code_units())) + return code_point; + /* From here on the unit is a high surrogate with at least one code unit after it. */ + auto second = code_unit_at(index + 1); /* High surrogate not followed by a low surrogate: return it unpaired. */ + if (!is_low_surrogate(second)) + return code_point; + + return decode_surrogate_pair(code_point, second); +} + size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const { size_t code_point_offset = 0; diff --git a/AK/Utf16View.h b/AK/Utf16View.h index 5f58c12036..054c9f4043 100644 --- a/AK/Utf16View.h +++ b/AK/Utf16View.h @@ -87,6 +87,7 @@ public: u16 const* data() const { return m_code_units.data(); } u16 code_unit_at(size_t index) const; /* Code point beginning at code unit index; the definition is added in AK/Utf16View.cpp by this same change. */ + u32 code_point_at(size_t index) const; size_t code_point_offset_of(size_t code_unit_offset) const; size_t code_unit_offset_of(size_t code_point_offset) const; diff --git a/Userland/Libraries/LibRegex/RegexMatch.h b/Userland/Libraries/LibRegex/RegexMatch.h index 28e68b6daf..7147938085 100644 --- a/Userland/Libraries/LibRegex/RegexMatch.h +++ b/Userland/Libraries/LibRegex/RegexMatch.h @@ -240,7 +240,10 @@ public: return ch; }, [&](Utf32View& view) -> u32 { return view[index]; }, /* Utf16View case: index is handed straight to code_point_at, with no iteration over the view. */ + [&](Utf16View& view) -> u32 { return view.code_point_at(index); }, [&](auto& view) -> u32 { + // FIXME: Iterating to the code point is inefficient, particularly for very large + // strings. Implement something like code_point_at to Utf8View. size_t i = index; for (auto it = view.begin(); it != view.end(); ++it, --i) { if (i == 0)