// NOTE(review): This text is a unified git diff, not a compilable translation unit. The patch's
// original line breaks appear to have been collapsed, and text inside angle brackets (template
// arguments, #include targets) appears to have been stripped, e.g. "ErrorOr to_casefold()" and
// the bare "#include" tokens. The patch text below is left untouched.
//
// Summary of the change, as shown by the hunks below:
//  * AK/String.h: String::equals_ignoring_case() changes from an ErrorOr-returning method to a
//    [[nodiscard]] bool.
//  * Tests/AK/TestString.cpp: the MUST(...) wrappers around equals_ignoring_case() are dropped;
//    negative cases are added for a differing last character ("abce") and a shorter string
//    ("abc"); a new group mixes U+00DF with its "ss"/"SS" spellings inside longer strings.
//  * LibJS/Runtime/Intl/AbstractOperations.cpp: the TRY_OR_THROW_OOM wrapper around the call is
//    removed, since the call no longer returns an error.
//  * LibUnicode/String.cpp: adds CasefoldStringComparator, which walks a Utf8View and hands out
//    case-folded code points one at a time. equals_ignoring_case() now compares two such streams
//    and requires both to be exhausted together, instead of building two case-folded Strings via
//    to_casefold() and comparing those.
//  * LibUnicode/UnicodeUtils.{h,cpp}: adds casefold_code_point(u32 const&), returning a Utf32View;
//    build_casefold_string() is rewritten to append that view for each code point.
//
// Why-notes:
//  * casefold_code_point() takes its argument by const reference because, when no case folding is
//    found (or ENABLE_UNICODE_DATA is off), the returned view is { &code_point, 1 }, i.e. it points
//    at the caller's own object. The view is only usable while that object is alive.
//    CasefoldStringComparator stores the code point in m_current_code_point before calling,
//    presumably for exactly this reason. Passing a temporary would leave the view dangling.
//  * With ENABLE_UNICODE_DATA off, the removed build_casefold_string() body returned the error
//    "Unicode data has been disabled"; the added code instead appends every code point unchanged.
//  * NOTE(review): in the AbstractOperations.cpp context, `variants.size() - 1` would wrap around if
//    `variants` could be empty at that point. Any guard is outside this hunk; confirm.
diff --git a/AK/String.h b/AK/String.h index 71bb6e7ef2..abf792ea09 100644 --- a/AK/String.h +++ b/AK/String.h @@ -109,7 +109,7 @@ public: ErrorOr to_casefold() const; // Compare this String against another string with caseless matching. Using this method requires linking LibUnicode into your application. - ErrorOr equals_ignoring_case(String const&) const; + [[nodiscard]] bool equals_ignoring_case(String const&) const; [[nodiscard]] bool starts_with(u32 code_point) const; [[nodiscard]] bool starts_with_bytes(StringView) const; diff --git a/Tests/AK/TestString.cpp b/Tests/AK/TestString.cpp index 06318756ad..61cfbcdc9f 100644 --- a/Tests/AK/TestString.cpp +++ b/Tests/AK/TestString.cpp @@ -330,25 +330,33 @@ TEST_CASE(equals_ignoring_case) String string1 {}; String string2 {}; - EXPECT(MUST(string1.equals_ignoring_case(string2))); + EXPECT(string1.equals_ignoring_case(string2)); } { auto string1 = MUST("abcd"_string); auto string2 = MUST("ABCD"_string); auto string3 = MUST("AbCd"_string); auto string4 = MUST("dcba"_string); + auto string5 = MUST("abce"_string); + auto string6 = MUST("abc"_string); - EXPECT(MUST(string1.equals_ignoring_case(string2))); - EXPECT(MUST(string1.equals_ignoring_case(string3))); - EXPECT(!MUST(string1.equals_ignoring_case(string4))); + EXPECT(string1.equals_ignoring_case(string2)); + EXPECT(string1.equals_ignoring_case(string3)); + EXPECT(!string1.equals_ignoring_case(string4)); + EXPECT(!string1.equals_ignoring_case(string5)); + EXPECT(!string1.equals_ignoring_case(string6)); - EXPECT(MUST(string2.equals_ignoring_case(string1))); - EXPECT(MUST(string2.equals_ignoring_case(string3))); - EXPECT(!MUST(string2.equals_ignoring_case(string4))); + EXPECT(string2.equals_ignoring_case(string1)); + EXPECT(string2.equals_ignoring_case(string3)); + EXPECT(!string2.equals_ignoring_case(string4)); + EXPECT(!string2.equals_ignoring_case(string5)); + EXPECT(!string2.equals_ignoring_case(string6)); - EXPECT(MUST(string3.equals_ignoring_case(string1))); - 
EXPECT(MUST(string3.equals_ignoring_case(string2))); - EXPECT(!MUST(string3.equals_ignoring_case(string4))); + EXPECT(string3.equals_ignoring_case(string1)); + EXPECT(string3.equals_ignoring_case(string2)); + EXPECT(!string3.equals_ignoring_case(string4)); + EXPECT(!string3.equals_ignoring_case(string5)); + EXPECT(!string3.equals_ignoring_case(string6)); } { auto string1 = MUST("\u00DF"_string); // LATIN SMALL LETTER SHARP S @@ -358,29 +366,66 @@ TEST_CASE(equals_ignoring_case) auto string5 = MUST("S"_string); auto string6 = MUST("s"_string); - EXPECT(MUST(string1.equals_ignoring_case(string2))); - EXPECT(MUST(string1.equals_ignoring_case(string3))); - EXPECT(MUST(string1.equals_ignoring_case(string4))); - EXPECT(!MUST(string1.equals_ignoring_case(string5))); - EXPECT(!MUST(string1.equals_ignoring_case(string6))); + EXPECT(string1.equals_ignoring_case(string2)); + EXPECT(string1.equals_ignoring_case(string3)); + EXPECT(string1.equals_ignoring_case(string4)); + EXPECT(!string1.equals_ignoring_case(string5)); + EXPECT(!string1.equals_ignoring_case(string6)); - EXPECT(MUST(string2.equals_ignoring_case(string1))); - EXPECT(MUST(string2.equals_ignoring_case(string3))); - EXPECT(MUST(string2.equals_ignoring_case(string4))); - EXPECT(!MUST(string2.equals_ignoring_case(string5))); - EXPECT(!MUST(string2.equals_ignoring_case(string6))); + EXPECT(string2.equals_ignoring_case(string1)); + EXPECT(string2.equals_ignoring_case(string3)); + EXPECT(string2.equals_ignoring_case(string4)); + EXPECT(!string2.equals_ignoring_case(string5)); + EXPECT(!string2.equals_ignoring_case(string6)); - EXPECT(MUST(string3.equals_ignoring_case(string1))); - EXPECT(MUST(string3.equals_ignoring_case(string2))); - EXPECT(MUST(string3.equals_ignoring_case(string4))); - EXPECT(!MUST(string3.equals_ignoring_case(string5))); - EXPECT(!MUST(string3.equals_ignoring_case(string6))); + EXPECT(string3.equals_ignoring_case(string1)); + EXPECT(string3.equals_ignoring_case(string2)); + 
EXPECT(string3.equals_ignoring_case(string4)); + EXPECT(!string3.equals_ignoring_case(string5)); + EXPECT(!string3.equals_ignoring_case(string6)); - EXPECT(MUST(string4.equals_ignoring_case(string1))); - EXPECT(MUST(string4.equals_ignoring_case(string2))); - EXPECT(MUST(string4.equals_ignoring_case(string3))); - EXPECT(!MUST(string4.equals_ignoring_case(string5))); - EXPECT(!MUST(string4.equals_ignoring_case(string6))); + EXPECT(string4.equals_ignoring_case(string1)); + EXPECT(string4.equals_ignoring_case(string2)); + EXPECT(string4.equals_ignoring_case(string3)); + EXPECT(!string4.equals_ignoring_case(string5)); + EXPECT(!string4.equals_ignoring_case(string6)); + } + { + + auto string1 = MUST("Ab\u00DFCd\u00DFeF"_string); + auto string2 = MUST("ABSSCDSSEF"_string); + auto string3 = MUST("absscdssef"_string); + auto string4 = MUST("aBSscDsSEf"_string); + auto string5 = MUST("Ab\u00DFCd\u00DFeg"_string); + auto string6 = MUST("Ab\u00DFCd\u00DFe"_string); + + EXPECT(string1.equals_ignoring_case(string1)); + EXPECT(string1.equals_ignoring_case(string2)); + EXPECT(string1.equals_ignoring_case(string3)); + EXPECT(string1.equals_ignoring_case(string4)); + EXPECT(!string1.equals_ignoring_case(string5)); + EXPECT(!string1.equals_ignoring_case(string6)); + + EXPECT(string2.equals_ignoring_case(string1)); + EXPECT(string2.equals_ignoring_case(string2)); + EXPECT(string2.equals_ignoring_case(string3)); + EXPECT(string2.equals_ignoring_case(string4)); + EXPECT(!string2.equals_ignoring_case(string5)); + EXPECT(!string2.equals_ignoring_case(string6)); + + EXPECT(string3.equals_ignoring_case(string1)); + EXPECT(string3.equals_ignoring_case(string2)); + EXPECT(string3.equals_ignoring_case(string3)); + EXPECT(string3.equals_ignoring_case(string4)); + EXPECT(!string3.equals_ignoring_case(string5)); + EXPECT(!string3.equals_ignoring_case(string6)); + + EXPECT(string4.equals_ignoring_case(string1)); + EXPECT(string4.equals_ignoring_case(string2)); + 
EXPECT(string4.equals_ignoring_case(string3)); + EXPECT(string4.equals_ignoring_case(string4)); + EXPECT(!string4.equals_ignoring_case(string5)); + EXPECT(!string4.equals_ignoring_case(string6)); } } diff --git a/Userland/Libraries/LibJS/Runtime/Intl/AbstractOperations.cpp b/Userland/Libraries/LibJS/Runtime/Intl/AbstractOperations.cpp index 6562a2b3ed..68e57b5e43 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/AbstractOperations.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/AbstractOperations.cpp @@ -29,7 +29,7 @@ ThrowCompletionOr> is_structurally_valid_language_t quick_sort(variants); for (size_t i = 0; i < variants.size() - 1; ++i) { - if (TRY_OR_THROW_OOM(vm, variants[i].equals_ignoring_case(variants[i + 1]))) + if (variants[i].equals_ignoring_case(variants[i + 1])) return true; } diff --git a/Userland/Libraries/LibUnicode/String.cpp b/Userland/Libraries/LibUnicode/String.cpp index e198058a70..f6636a1575 100644 --- a/Userland/Libraries/LibUnicode/String.cpp +++ b/Userland/Libraries/LibUnicode/String.cpp @@ -6,6 +6,8 @@ #include #include +#include +#include #include // This file contains definitions of AK::String methods which require UCD data. @@ -40,12 +42,60 @@ ErrorOr String::to_casefold() const return builder.to_string(); } +class CasefoldStringComparator { +public: + explicit CasefoldStringComparator(Utf8View string) + : m_string(string) + , m_it(m_string.begin()) + { + } + + bool has_more_data() const + { + return !m_casefolded_code_points.is_empty() || (m_it != m_string.end()); + } + + u32 next_code_point() + { + VERIFY(has_more_data()); + + if (m_casefolded_code_points.is_empty()) { + m_current_code_point = *m_it; + ++m_it; + + m_casefolded_code_points = Unicode::Detail::casefold_code_point(m_current_code_point); + VERIFY(!m_casefolded_code_points.is_empty()); // Must at least contain the provided code point. 
+ } + + auto code_point = m_casefolded_code_points[0]; + m_casefolded_code_points = m_casefolded_code_points.substring_view(1); + + return code_point; + } + +private: + Utf8View m_string; + Utf8CodePointIterator m_it; + + u32 m_current_code_point { 0 }; + Utf32View m_casefolded_code_points; +}; + // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34145 -ErrorOr String::equals_ignoring_case(String const& other) const +bool String::equals_ignoring_case(String const& other) const { // A string X is a caseless match for a string Y if and only if: // toCasefold(X) = toCasefold(Y) - return TRY(to_casefold()) == TRY(other.to_casefold()); + + CasefoldStringComparator lhs { code_points() }; + CasefoldStringComparator rhs { other.code_points() }; + + while (lhs.has_more_data() && rhs.has_more_data()) { + if (lhs.next_code_point() != rhs.next_code_point()) + return false; + } + + return !lhs.has_more_data() && !rhs.has_more_data(); } } diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp index 7d4672aac0..7c5ed2393a 100644 --- a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp +++ b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp @@ -330,31 +330,32 @@ ErrorOr build_titlecase_string([[maybe_unused]] Utf8View code_points, [[ma } // https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253 -ErrorOr build_casefold_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder) +ErrorOr build_casefold_string(Utf8View code_points, StringBuilder& builder) { -#if ENABLE_UNICODE_DATA // toCasefold(X): Map each character C in X to Case_Folding(C). - // - // Case_Folding(C) uses the mappings with the status field value “C” or “F” in the data file - // CaseFolding.txt in the Unicode Character Database. 
- - using enum CaseFoldingStatus; - for (auto code_point : code_points) { - auto const* case_folding = find_matching_case_folding(code_point); - if (!case_folding) { - TRY(builder.try_append_code_point(code_point)); - continue; - } - - for (size_t i = 0; i < case_folding->mapping_size; ++i) - TRY(builder.try_append_code_point(case_folding->mapping[i])); + auto case_folding = casefold_code_point(code_point); + TRY(builder.try_append(case_folding)); } return {}; -#else - return Error::from_string_literal("Unicode data has been disabled"); +} + +// https://www.unicode.org/reports/tr44/#CaseFolding.txt +// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253 +Utf32View casefold_code_point(u32 const& code_point) +{ +#if ENABLE_UNICODE_DATA + // Case_Folding(C) uses the mappings with the status field value “C” or “F” in the data file + // CaseFolding.txt in the Unicode Character Database. + using enum CaseFoldingStatus; + + if (auto const* case_folding = find_matching_case_folding(code_point)) + return Utf32View { case_folding->mapping, case_folding->mapping_size }; #endif + + // The case foldings are omitted in the data file if they are the same as the code point itself. + return Utf32View { &code_point, 1 }; } } diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.h b/Userland/Libraries/LibUnicode/UnicodeUtils.h index af7702abbc..a3f3b0cc82 100644 --- a/Userland/Libraries/LibUnicode/UnicodeUtils.h +++ b/Userland/Libraries/LibUnicode/UnicodeUtils.h @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -18,5 +19,6 @@ ErrorOr build_lowercase_string(Utf8View code_points, StringBuilder& builde ErrorOr build_uppercase_string(Utf8View code_points, StringBuilder& builder, Optional const& locale); ErrorOr build_titlecase_string(Utf8View code_points, StringBuilder& builder, Optional const& locale); ErrorOr build_casefold_string(Utf8View code_points, StringBuilder& builder); +Utf32View casefold_code_point(u32 const& code_point); }