1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-31 05:08:13 +00:00

LibUnicode: Define case-insensitive string comparison more generically

The only user is currently String::equals_ignoring_case, but LibRegex
will need to do the same case-folded comparison with UTF-32 data. As it
turns out, the comparison works with all Unicode view types without much
fuss.
This commit is contained in:
Timothy Flynn 2023-11-08 10:13:40 -05:00 committed by Tim Flynn
parent 370ea9441c
commit 6070df40f3
3 changed files with 68 additions and 53 deletions

View file

@@ -6,7 +6,6 @@
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/Utf32View.h>
#include <AK/Utf8View.h>
#include <LibUnicode/CharacterTypes.h>
#include <LibUnicode/UnicodeUtils.h>
@@ -43,60 +42,9 @@ ErrorOr<String> String::to_casefold() const
return builder.to_string();
}
class CasefoldStringComparator {
public:
explicit CasefoldStringComparator(Utf8View string)
: m_string(string)
, m_it(m_string.begin())
{
}
bool has_more_data() const
{
return !m_casefolded_code_points.is_empty() || (m_it != m_string.end());
}
u32 next_code_point()
{
VERIFY(has_more_data());
if (m_casefolded_code_points.is_empty()) {
m_current_code_point = *m_it;
++m_it;
m_casefolded_code_points = Unicode::Detail::casefold_code_point(m_current_code_point);
VERIFY(!m_casefolded_code_points.is_empty()); // Must at least contain the provided code point.
}
auto code_point = m_casefolded_code_points[0];
m_casefolded_code_points = m_casefolded_code_points.substring_view(1);
return code_point;
}
private:
Utf8View m_string;
Utf8CodePointIterator m_it;
u32 m_current_code_point { 0 };
Utf32View m_casefolded_code_points;
};
// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G34145
// Returns true when this string and `other` are a caseless match. The two
// strings are compared one case-folded code point at a time, so no folded
// copy of either string is built here.
bool String::equals_ignoring_case(String const& other) const
{
// A string X is a caseless match for a string Y if and only if:
// toCasefold(X) = toCasefold(Y)
CasefoldStringComparator lhs { code_points() };
CasefoldStringComparator rhs { other.code_points() };
// Walk both folded sequences in lock-step; the first mismatching pair of
// code points decides the result.
while (lhs.has_more_data() && rhs.has_more_data()) {
if (lhs.next_code_point() != rhs.next_code_point())
return false;
}
// The strings match only if both sides were exhausted together; leftover
// data on either side means the folded sequences differ in length.
return !lhs.has_more_data() && !rhs.has_more_data();
// NOTE(review): the statement below directly follows an unconditional return
// and is therefore unreachable as written. This text looks like a rendered
// diff with removed and added lines interleaved; presumably the delegate call
// is meant to replace the loop above - confirm which implementation is
// intended before relying on either.
return Unicode::equals_ignoring_case(code_points(), other.code_points());
}
}