From 6070df40f35ebcab2f219440e81f785a594a0bca Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 8 Nov 2023 10:13:40 -0500 Subject: [PATCH] LibUnicode: Define case-insensitive string comparison more generically The only user is currently String::equals_ignoring_case, but LibRegex will need to do the same case-folded comparison with UTF-32 data. As it turns out, the comparison works with all Unicode view types without much fuss. --- .../Libraries/LibUnicode/CharacterTypes.cpp | 64 +++++++++++++++++++ .../Libraries/LibUnicode/CharacterTypes.h | 3 + Userland/Libraries/LibUnicode/String.cpp | 54 +--------------- 3 files changed, 68 insertions(+), 53 deletions(-) diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 4f55760243..faecd51be8 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -8,6 +8,8 @@ #include #include #include +#include +#include #include #include #include @@ -67,6 +69,68 @@ ErrorOr to_unicode_casefold_full(StringView string) return builder.to_string(); } +template +class CasefoldStringComparator { +public: + explicit CasefoldStringComparator(ViewType string) + : m_string(string) + , m_it(m_string.begin()) + { + } + + bool has_more_data() const + { + return !m_casefolded_code_points.is_empty() || (m_it != m_string.end()); + } + + u32 next_code_point() + { + VERIFY(has_more_data()); + + if (m_casefolded_code_points.is_empty()) { + m_current_code_point = *m_it; + ++m_it; + + m_casefolded_code_points = Unicode::Detail::casefold_code_point(m_current_code_point); + VERIFY(!m_casefolded_code_points.is_empty()); // Must at least contain the provided code point. + } + + auto code_point = m_casefolded_code_points[0]; + m_casefolded_code_points = m_casefolded_code_points.substring_view(1); + + return code_point; + } + +private: + ViewType m_string; + typename ViewType::Iterator m_it; + + u32 m_current_code_point { 0 }; + Utf32View m_casefolded_code_points; +}; + +// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34145 +template +bool equals_ignoring_case(ViewType lhs, ViewType rhs) +{ + // A string X is a caseless match for a string Y if and only if: + // toCasefold(X) = toCasefold(Y) + + CasefoldStringComparator lhs_comparator { lhs }; + CasefoldStringComparator rhs_comparator { rhs }; + + while (lhs_comparator.has_more_data() && rhs_comparator.has_more_data()) { + if (lhs_comparator.next_code_point() != rhs_comparator.next_code_point()) + return false; + } + + return !lhs_comparator.has_more_data() && !rhs_comparator.has_more_data(); +} + +template bool equals_ignoring_case(Utf8View, Utf8View); +template bool equals_ignoring_case(Utf16View, Utf16View); +template bool equals_ignoring_case(Utf32View, Utf32View); + Optional __attribute__((weak)) general_category_from_string(StringView) { return {}; } bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; } Optional __attribute__((weak)) property_from_string(StringView) { return {}; } diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h index 29a61b0f61..43219ca468 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.h +++ b/Userland/Libraries/LibUnicode/CharacterTypes.h @@ -60,6 +60,9 @@ ErrorOr to_unicode_uppercase_full(StringView, Optional to_unicode_titlecase_full(StringView, Optional const& locale = {}, TrailingCodePointTransformation trailing_code_point_transformation = TrailingCodePointTransformation::Lowercase); ErrorOr to_unicode_casefold_full(StringView); +template +bool equals_ignoring_case(ViewType, ViewType); + Optional general_category_from_string(StringView); bool code_point_has_general_category(u32 code_point, GeneralCategory general_category); diff --git a/Userland/Libraries/LibUnicode/String.cpp b/Userland/Libraries/LibUnicode/String.cpp index 9bbd3fb7fc..464600ce23 100644 --- a/Userland/Libraries/LibUnicode/String.cpp +++ b/Userland/Libraries/LibUnicode/String.cpp @@ -6,7 +6,6 @@ #include #include -#include #include #include #include @@ -43,60 +42,9 @@ ErrorOr String::to_casefold() const return builder.to_string(); } -class CasefoldStringComparator { -public: - explicit CasefoldStringComparator(Utf8View string) - : m_string(string) - , m_it(m_string.begin()) - { - } - - bool has_more_data() const - { - return !m_casefolded_code_points.is_empty() || (m_it != m_string.end()); - } - - u32 next_code_point() - { - VERIFY(has_more_data()); - - if (m_casefolded_code_points.is_empty()) { - m_current_code_point = *m_it; - ++m_it; - - m_casefolded_code_points = Unicode::Detail::casefold_code_point(m_current_code_point); - VERIFY(!m_casefolded_code_points.is_empty()); // Must at least contain the provided code point. - } - - auto code_point = m_casefolded_code_points[0]; - m_casefolded_code_points = m_casefolded_code_points.substring_view(1); - - return code_point; - } - -private: - Utf8View m_string; - Utf8CodePointIterator m_it; - - u32 m_current_code_point { 0 }; - Utf32View m_casefolded_code_points; -}; - -// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34145 bool String::equals_ignoring_case(String const& other) const { - // A string X is a caseless match for a string Y if and only if: - // toCasefold(X) = toCasefold(Y) - - CasefoldStringComparator lhs { code_points() }; - CasefoldStringComparator rhs { other.code_points() }; - - while (lhs.has_more_data() && rhs.has_more_data()) { - if (lhs.next_code_point() != rhs.next_code_point()) - return false; - } - - return !lhs.has_more_data() && !rhs.has_more_data(); + return Unicode::equals_ignoring_case(code_points(), other.code_points()); } }