LibUnicode: Support full case folding for titlecasing a string

Unicode declares that to titlecase a string, the first cased code point after each word boundary should be transformed to its titlecase mapping. All other code points are transformed to their lowercase mapping.
2026-01-12 23:11:00 +00:00 · 2023-01-16 11:22:01 -05:00 · 2023-01-16 11:22:01 -05:00 · bc51017a03
commit bc51017a03
parent b562348d31
5 changed files with 165 additions and 0 deletions
--- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp
@ -57,6 +57,13 @@ ErrorOr<DeprecatedString> to_unicode_uppercase_full(StringView string, Optional<
    return builder.to_deprecated_string();
 }

+// Produces the titlecased form of `string` by delegating to Detail::build_titlecase_string().
+// `locale` is forwarded untouched to that helper. Any error raised while building the result
+// (or while converting the builder's contents to a String) is propagated to the caller.
+ErrorOr<String> to_unicode_titlecase_full(StringView string, Optional<StringView> const& locale)
+{
+    Utf8View const code_points { string };
+
+    StringBuilder titlecased;
+    TRY(Detail::build_titlecase_string(code_points, titlecased, locale));
+
+    return titlecased.to_string();
+}
+
 Optional<GeneralCategory> __attribute__((weak)) general_category_from_string(StringView) { return {}; }
 bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; }
 Optional<Property> __attribute__((weak)) property_from_string(StringView) { return {}; }
--- a/Userland/Libraries/LibUnicode/CharacterTypes.h
+++ b/Userland/Libraries/LibUnicode/CharacterTypes.h
@ -10,6 +10,7 @@
 #include <AK/Forward.h>
 #include <AK/Optional.h>
 #include <AK/Span.h>
+#include <AK/String.h>
 #include <AK/Types.h>
 #include <AK/Vector.h>
 #include <LibUnicode/Forward.h>
@ -42,6 +43,7 @@ u32 to_unicode_titlecase(u32 code_point);

 ErrorOr<DeprecatedString> to_unicode_lowercase_full(StringView, Optional<StringView> const& locale = {});
 ErrorOr<DeprecatedString> to_unicode_uppercase_full(StringView, Optional<StringView> const& locale = {});
+ErrorOr<String> to_unicode_titlecase_full(StringView, Optional<StringView> const& locale = {});

 Optional<GeneralCategory> general_category_from_string(StringView);
 bool code_point_has_general_category(u32 code_point, GeneralCategory general_category);
--- a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp
+++ b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp
@ -249,4 +249,66 @@ ErrorOr<void> build_uppercase_string([[maybe_unused]] Utf8View code_points, [[ma
 #endif
 }

+// Appends the titlecased form of `code_points` to `builder`.
+//
+// The input is split at the offsets returned by find_word_segmentation_boundaries(). Within each
+// pair of adjacent boundaries, the first code point carrying the Cased property is emitted using
+// its titlecase mapping, anything before it is copied through verbatim, and everything after it is
+// handed to build_lowercase_string(). When ENABLE_UNICODE_DATA is not set, an error is returned.
+ErrorOr<void> build_titlecase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional<StringView> const& locale)
+{
+#if ENABLE_UNICODE_DATA
+    // This follows the Unicode definition of toTitlecase(X): locate the word boundaries of X (per
+    // Unicode Standard Annex #29), and for each boundary map the first cased character F after it
+    // to Titlecase_Mapping(F), then map every character C between F and the next boundary to
+    // Lowercase_Mapping(C).
+
+    auto word_boundaries = find_word_segmentation_boundaries(code_points);
+    if (word_boundaries.is_empty())
+        return {};
+
+    // Scans the byte range [segment_start, segment_end) for the first code point that has the
+    // Cased property. Yields an empty Optional if the range contains none.
+    auto find_first_cased = [&](auto segment_start, auto segment_end) -> Optional<Utf8CodePointIterator> {
+        auto segment_stop = code_points.iterator_at_byte_offset_without_validation(segment_end);
+
+        for (auto cursor = code_points.iterator_at_byte_offset_without_validation(segment_start); cursor != segment_stop; ++cursor) {
+            if (code_point_has_property(*cursor, Property::Cased))
+                return cursor;
+        }
+
+        return {};
+    };
+
+    // Writes the titlecase form of a single code point. If find_matching_special_case() yields an
+    // entry, every code point of its titlecase mapping is appended; otherwise the result of the
+    // single-code-point to_unicode_titlecase() is appended.
+    auto emit_titlecased = [&](auto code_point, auto offset, auto length) -> ErrorOr<void> {
+        auto const* special_casing = find_matching_special_case(code_point, code_points, locale, offset, length);
+
+        if (special_casing) {
+            for (size_t index = 0; index < special_casing->titlecase_mapping_size; ++index)
+                TRY(builder.try_append_code_point(special_casing->titlecase_mapping[index]));
+        } else {
+            TRY(builder.try_append_code_point(to_unicode_titlecase(code_point)));
+        }
+
+        return {};
+    };
+
+    // Walk each pair of adjacent boundaries: (boundaries[0], boundaries[1]), (boundaries[1], boundaries[2]), ...
+    for (size_t index = 1; index < word_boundaries.size(); ++index) {
+        auto segment_start = word_boundaries[index - 1];
+        auto segment_end = word_boundaries[index];
+
+        auto first_cased = find_first_cased(segment_start, segment_end);
+
+        if (first_cased.has_value()) {
+            auto cased_code_point = *first_cased.value();
+            auto cased_offset = code_points.byte_offset_of(first_cased.value());
+            auto cased_length = first_cased->underlying_code_point_length_in_bytes();
+
+            // Nothing ahead of the first cased code point is cased, so copy those bytes as-is.
+            auto caseless_prefix = code_points.substring_view(segment_start, cased_offset - segment_start);
+            TRY(builder.try_append(caseless_prefix.as_string()));
+
+            TRY(emit_titlecased(cased_code_point, cased_offset, cased_length));
+
+            // The lowercase pass below must start just past the code point we titlecased.
+            segment_start = cased_offset + cased_length;
+        }
+
+        // NOTE(review): the remainder is passed to build_lowercase_string() as a standalone view, so
+        // any context-sensitive rules applied there presumably cannot see text outside this slice
+        // (e.g. the code point titlecased above) - confirm this is acceptable.
+        auto remainder = code_points.substring_view(segment_start, segment_end - segment_start);
+        TRY(build_lowercase_string(remainder, builder, locale));
+    }
+
+    return {};
+#else
+    return Error::from_string_literal("Unicode data has been disabled");
+#endif
+}
+
 }
--- a/Userland/Libraries/LibUnicode/UnicodeUtils.h
+++ b/Userland/Libraries/LibUnicode/UnicodeUtils.h
@ -16,5 +16,6 @@ namespace Unicode::Detail {

 ErrorOr<void> build_lowercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
 ErrorOr<void> build_uppercase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);
+ErrorOr<void> build_titlecase_string(Utf8View code_points, StringBuilder& builder, Optional<StringView> const& locale);

 }