mirror of
https://github.com/RGBCube/serenity
synced 2025-07-27 07:17:35 +00:00
AK+LibUnicode: Implement String::equals_ignoring_case without allocating
We currently fully casefold the left- and right-hand sides to compare two strings with case-insensitivity. Now, we casefold one code point at a time, storing the result in a view for comparison, until we exhaust both strings.
This commit is contained in:
parent
4aee4e80bd
commit
1393ed2000
6 changed files with 150 additions and 52 deletions
|
@ -6,6 +6,8 @@
|
|||
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/Utf32View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibUnicode/UnicodeUtils.h>
|
||||
|
||||
// This file contains definitions of AK::String methods which require UCD data.
|
||||
|
@ -40,12 +42,60 @@ ErrorOr<String> String::to_casefold() const
|
|||
return builder.to_string();
|
||||
}
|
||||
|
||||
class CasefoldStringComparator {
|
||||
public:
|
||||
explicit CasefoldStringComparator(Utf8View string)
|
||||
: m_string(string)
|
||||
, m_it(m_string.begin())
|
||||
{
|
||||
}
|
||||
|
||||
bool has_more_data() const
|
||||
{
|
||||
return !m_casefolded_code_points.is_empty() || (m_it != m_string.end());
|
||||
}
|
||||
|
||||
u32 next_code_point()
|
||||
{
|
||||
VERIFY(has_more_data());
|
||||
|
||||
if (m_casefolded_code_points.is_empty()) {
|
||||
m_current_code_point = *m_it;
|
||||
++m_it;
|
||||
|
||||
m_casefolded_code_points = Unicode::Detail::casefold_code_point(m_current_code_point);
|
||||
VERIFY(!m_casefolded_code_points.is_empty()); // Must at least contain the provided code point.
|
||||
}
|
||||
|
||||
auto code_point = m_casefolded_code_points[0];
|
||||
m_casefolded_code_points = m_casefolded_code_points.substring_view(1);
|
||||
|
||||
return code_point;
|
||||
}
|
||||
|
||||
private:
|
||||
Utf8View m_string;
|
||||
Utf8CodePointIterator m_it;
|
||||
|
||||
u32 m_current_code_point { 0 };
|
||||
Utf32View m_casefolded_code_points;
|
||||
};
|
||||
|
||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34145
|
||||
ErrorOr<bool> String::equals_ignoring_case(String const& other) const
|
||||
bool String::equals_ignoring_case(String const& other) const
|
||||
{
|
||||
// A string X is a caseless match for a string Y if and only if:
|
||||
// toCasefold(X) = toCasefold(Y)
|
||||
return TRY(to_casefold()) == TRY(other.to_casefold());
|
||||
|
||||
CasefoldStringComparator lhs { code_points() };
|
||||
CasefoldStringComparator rhs { other.code_points() };
|
||||
|
||||
while (lhs.has_more_data() && rhs.has_more_data()) {
|
||||
if (lhs.next_code_point() != rhs.next_code_point())
|
||||
return false;
|
||||
}
|
||||
|
||||
return !lhs.has_more_data() && !rhs.has_more_data();
|
||||
}
|
||||
|
||||
}
|
||||
|
|
|
@ -330,31 +330,32 @@ ErrorOr<void> build_titlecase_string([[maybe_unused]] Utf8View code_points, [[ma
|
|||
}
|
||||
|
||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253
|
||||
// Appends the case folding of every code point in `code_points` to `builder`.
// Returns an error only if appending to the builder fails.
ErrorOr<void> build_casefold_string(Utf8View code_points, StringBuilder& builder)
{
    // toCasefold(X): Map each character C in X to Case_Folding(C).
    for (auto code_point : code_points) {
        // The returned view may refer to the loop variable itself, so it is consumed
        // before the next iteration.
        auto case_folding = casefold_code_point(code_point);
        TRY(builder.try_append(case_folding));
    }

    return {};
}
|
||||
|
||||
// https://www.unicode.org/reports/tr44/#CaseFolding.txt
|
||||
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G53253
|
||||
// Returns the case folding of a single code point as a view of one or more code points.
//
// When no folding entry is found (or Unicode data is compiled out), the result is a one-element
// view over `code_point` itself. The argument is taken by reference for exactly that reason:
// the caller must keep the referenced value alive for as long as the returned view is used.
Utf32View casefold_code_point(u32 const& code_point)
{
#if ENABLE_UNICODE_DATA
    // Case_Folding(C) uses the mappings with the status field value “C” or “F” in the data file
    // CaseFolding.txt in the Unicode Character Database.
    using enum CaseFoldingStatus;

    auto const* folding_entry = find_matching_case_folding<Common, Full>(code_point);
    if (folding_entry != nullptr)
        return Utf32View { folding_entry->mapping, folding_entry->mapping_size };
#endif

    // Code points that fold to themselves have no entry in the data file; hand back the input.
    return Utf32View { &code_point, 1 };
}
|
||||
|
||||
}
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
#include <AK/Error.h>
|
||||
#include <AK/Forward.h>
|
||||
#include <AK/Optional.h>
|
||||
#include <AK/Utf32View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibUnicode/Forward.h>
|
||||
|
||||
|
@ -18,5 +19,6 @@ ErrorOr<void> build_lowercase_string(Utf8View code_points, StringBuilder& builde
|
|||
ErrorOr<void> build_uppercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
|
||||
ErrorOr<void> build_titlecase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
|
||||
ErrorOr<void> build_casefold_string(Utf8View code_points, StringBuilder& builder);
|
||||
// Returns the case folding of `code_point` as a view. The view may refer to `code_point` itself
// (when the code point folds to itself), so the referenced value must outlive the returned view.
Utf32View casefold_code_point(u32 const& code_point);
|
||||
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue