// Patch overview (git unified diff, flattened onto a few long lines; #include targets and
// template arguments are not visible in this copy, so they are not described here).
//
// What the patch does, as shown by the hunks below:
//   * GenerateUnicodeData.cpp: adds a `special_case_mapping(u32 code_point)` declaration next to
//     `locale_from_string` in the text emitted by the generator (presumably the generated header —
//     TODO confirm against the full generator source).
//   * LibUnicode/CMakeLists.txt: adds UnicodeUtils.cpp to SOURCES.
//   * CharacterTypes.cpp: removes the weak `special_case_mapping` stub and the static helpers
//     is_after_uppercase_i, is_after_soft_dotted_code_point, is_final_code_point,
//     is_followed_by_combining_class_above, is_followed_by_combining_dot_above and
//     find_matching_special_case. to_unicode_lowercase_full / to_unicode_uppercase_full now take
//     `locale` by const reference, delegate to Detail::build_lowercase_string /
//     Detail::build_uppercase_string, and return builder.to_deprecated_string() instead of
//     builder.build().
//   * CharacterTypes.h: drops the `special_case_mapping` declaration and updates the two *_full
//     signatures to match.
//   * UnicodeUtils.cpp / UnicodeUtils.h (new files): the removed helpers reappear in namespace
//     Unicode::Detail. The two "is_after_*" helpers fold the separate `== 0` / `== 230` branches
//     into one `||` condition; the remaining helpers are carried over as they were. The new entry
//     points build_lowercase_string / build_uppercase_string append the converted code points to a
//     caller-supplied StringBuilder: for each code point they use the matching SpecialCasing entry's
//     lowercase_mapping / uppercase_mapping when find_matching_special_case returns one, and
//     to_unicode_lowercase / to_unicode_uppercase otherwise.
//
// NOTE(review): when ENABLE_UNICODE_DATA is not set, the removed code returned
// string.to_lowercase_string() / string.to_uppercase_string(); the added build_* helpers instead
// return Error::from_string_literal("Unicode data has been disabled"), which the *_full functions
// propagate through TRY. Confirm this behavior change is intended for builds without Unicode data.
//
// NOTE(review): find_matching_special_case still takes `locale` by value and calls
// locale_from_string(*locale) on every invocation, i.e. once per code point of the input; the
// lookup does not depend on the code point and could be hoisted into the build_* loops.
//
// NOTE(review): the "preceding" helpers iterate from the start of the string up to `index` each
// time they are consulted (see the FIXME about reverse iteration carried over in the code).
diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index 972ddde92a..8a3cbac7c6 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -781,6 +781,8 @@ struct CodePointDecomposition { Optional locale_from_string(StringView locale); +Span special_case_mapping(u32 code_point); + } )~~~"); diff --git a/Userland/Libraries/LibUnicode/CMakeLists.txt b/Userland/Libraries/LibUnicode/CMakeLists.txt index 9060fad0b0..109307cad9 100644 --- a/Userland/Libraries/LibUnicode/CMakeLists.txt +++ b/Userland/Libraries/LibUnicode/CMakeLists.txt @@ -5,6 +5,7 @@ set(SOURCES CurrencyCode.cpp Emoji.cpp Normalize.cpp + UnicodeUtils.cpp ${UNICODE_DATA_SOURCES} ) set(GENERATED_SOURCES ${CURRENT_LIB_GENERATED}) diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index fb7bd919a5..fba41ee30b 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -11,6 +11,7 @@ #include #include #include +#include #if ENABLE_UNICODE_DATA # include @@ -25,192 +26,8 @@ Optional __attribute__((weak)) code_point_display_name(u32) { Optional __attribute__((weak)) code_point_block_display_name(u32) { return {}; } Optional __attribute__((weak)) code_point_abbreviation(u32) { return {}; } u32 __attribute__((weak)) canonical_combining_class(u32) { return {}; } -Span __attribute__((weak)) special_case_mapping(u32) { return {}; } Span __attribute__((weak)) block_display_names() { return {}; } -#if ENABLE_UNICODE_DATA - -static bool is_after_uppercase_i(Utf8View const& string, size_t index) -{ - // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0. 
- auto preceding_view = string.substring_view(0, index); - bool found_uppercase_i = false; - - // FIXME: Would be better if Utf8View supported reverse iteration. - for (auto code_point : preceding_view) { - if (code_point == 'I') { - found_uppercase_i = true; - continue; - } - - u32 combining_class = canonical_combining_class(code_point); - - if (combining_class == 0) - found_uppercase_i = false; - else if (combining_class == 230) - found_uppercase_i = false; - } - - return found_uppercase_i; -} - -static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index) -{ - // There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above). - auto preceding_view = string.substring_view(0, index); - bool found_soft_dotted_code_point = false; - - // FIXME: Would be better if Utf8View supported reverse iteration. - for (auto code_point : preceding_view) { - if (code_point_has_property(code_point, Property::Soft_Dotted)) { - found_soft_dotted_code_point = true; - continue; - } - - u32 combining_class = canonical_combining_class(code_point); - - if (combining_class == 0) - found_soft_dotted_code_point = false; - else if (combining_class == 230) - found_soft_dotted_code_point = false; - } - - return found_soft_dotted_code_point; -} - -static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length) -{ - // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable - // characters, and C is not followed by a sequence consisting of zero or more case-ignorable - // characters and then a cased letter. - auto preceding_view = string.substring_view(0, index); - auto following_view = ((index + byte_length) < string.byte_length()) - ? 
string.substring_view(index + byte_length) - : Utf8View {}; - - size_t cased_letter_count = 0; - - for (auto code_point : preceding_view) { - bool is_cased = code_point_has_property(code_point, Property::Cased); - bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable); - - if (is_cased && !is_case_ignorable) - ++cased_letter_count; - else if (!is_case_ignorable) - cased_letter_count = 0; - } - - if (cased_letter_count == 0) - return false; - - for (auto code_point : following_view) { - bool is_cased = code_point_has_property(code_point, Property::Cased); - bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable); - - if (is_case_ignorable) - continue; - if (is_cased) - return false; - - break; - } - - return true; -} - -static bool is_followed_by_combining_class_above(Utf8View const& string, size_t index, size_t byte_length) -{ - // C is followed by a character of combining class 230 (Above) with no intervening character of combining class 0 or 230 (Above). - auto following_view = ((index + byte_length) < string.byte_length()) - ? string.substring_view(index + byte_length) - : Utf8View {}; - - for (auto code_point : following_view) { - u32 combining_class = canonical_combining_class(code_point); - - if (combining_class == 0) - return false; - if (combining_class == 230) - return true; - } - - return false; -} - -static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length) -{ - // C is followed by combining dot above (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may - // intervene between the current character and the combining dot above. - auto following_view = ((index + byte_length) < string.byte_length()) - ? 
string.substring_view(index + byte_length) - : Utf8View {}; - - for (auto code_point : following_view) { - if (code_point == 0x307) - return true; - - u32 combining_class = canonical_combining_class(code_point); - - if (combining_class == 0) - return false; - if (combining_class == 230) - return false; - } - - return false; -} - -static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View const& string, Optional locale, size_t index, size_t byte_length) -{ - auto requested_locale = Locale::None; - - if (locale.has_value()) { - if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value()) - requested_locale = *maybe_locale; - } - - auto special_casings = special_case_mapping(code_point); - - for (auto const* special_casing : special_casings) { - if (special_casing->locale != Locale::None && special_casing->locale != requested_locale) - continue; - - switch (special_casing->condition) { - case Condition::None: - return special_casing; - - case Condition::AfterI: - if (is_after_uppercase_i(string, index)) - return special_casing; - break; - - case Condition::AfterSoftDotted: - if (is_after_soft_dotted_code_point(string, index)) - return special_casing; - break; - - case Condition::FinalSigma: - if (is_final_code_point(string, index, byte_length)) - return special_casing; - break; - - case Condition::MoreAbove: - if (is_followed_by_combining_class_above(string, index, byte_length)) - return special_casing; - break; - - case Condition::NotBeforeDot: - if (!is_followed_by_combining_dot_above(string, index, byte_length)) - return special_casing; - break; - } - } - - return nullptr; -} - -#endif - u32 __attribute__((weak)) to_unicode_lowercase(u32 code_point) { return to_ascii_lowercase(code_point); @@ -221,62 +38,18 @@ u32 __attribute__((weak)) to_unicode_uppercase(u32 code_point) return to_ascii_uppercase(code_point); } -ErrorOr to_unicode_lowercase_full(StringView string, [[maybe_unused]] Optional locale) +ErrorOr 
to_unicode_lowercase_full(StringView string, Optional const& locale) { -#if ENABLE_UNICODE_DATA - Utf8View view { string }; StringBuilder builder; - - size_t index = 0; - size_t byte_length = 0; - - for (auto it = view.begin(); it != view.end(); ++it, index += byte_length) { - u32 code_point = *it; - byte_length = it.underlying_code_point_length_in_bytes(); - - auto const* special_casing = find_matching_special_case(code_point, view, locale, index, byte_length); - if (!special_casing) { - TRY(builder.try_append_code_point(to_unicode_lowercase(code_point))); - continue; - } - - for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i) - TRY(builder.try_append_code_point(special_casing->lowercase_mapping[i])); - } - - return builder.build(); -#else - return string.to_lowercase_string(); -#endif + TRY(Detail::build_lowercase_string(Utf8View { string }, builder, locale)); + return builder.to_deprecated_string(); } -ErrorOr to_unicode_uppercase_full(StringView string, [[maybe_unused]] Optional locale) +ErrorOr to_unicode_uppercase_full(StringView string, Optional const& locale) { -#if ENABLE_UNICODE_DATA - Utf8View view { string }; StringBuilder builder; - - size_t index = 0; - size_t byte_length = 0; - - for (auto it = view.begin(); it != view.end(); ++it, index += byte_length) { - u32 code_point = *it; - byte_length = it.underlying_code_point_length_in_bytes(); - - auto const* special_casing = find_matching_special_case(code_point, view, locale, index, byte_length); - if (!special_casing) { - TRY(builder.try_append_code_point(to_unicode_uppercase(code_point))); - continue; - } - - for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i) - TRY(builder.try_append_code_point(special_casing->uppercase_mapping[i])); - } - - return builder.build(); -#else - return string.to_uppercase_string(); -#endif + TRY(Detail::build_uppercase_string(Utf8View { string }, builder, locale)); + return builder.to_deprecated_string(); } Optional __attribute__((weak)) 
general_category_from_string(StringView) { return {}; } diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h index 25d48ff889..788d567b91 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.h +++ b/Userland/Libraries/LibUnicode/CharacterTypes.h @@ -32,15 +32,14 @@ Optional code_point_abbreviation(u32 code_point); Span block_display_names(); u32 canonical_combining_class(u32 code_point); -Span special_case_mapping(u32 code_point); // Note: The single code point case conversions only perform simple case folding. // Use the full-string transformations for full case folding. u32 to_unicode_lowercase(u32 code_point); u32 to_unicode_uppercase(u32 code_point); -ErrorOr to_unicode_lowercase_full(StringView, Optional locale = {}); -ErrorOr to_unicode_uppercase_full(StringView, Optional locale = {}); +ErrorOr to_unicode_lowercase_full(StringView, Optional const& locale = {}); +ErrorOr to_unicode_uppercase_full(StringView, Optional const& locale = {}); Optional general_category_from_string(StringView); bool code_point_has_general_category(u32 code_point, GeneralCategory general_category); diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.cpp b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp new file mode 100644 index 0000000000..992122690a --- /dev/null +++ b/Userland/Libraries/LibUnicode/UnicodeUtils.cpp @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2023, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include +#include +#include +#include + +#if ENABLE_UNICODE_DATA +# include +#endif + +// For details on the algorithms used here, see Section 3.13 Default Case Algorithms +// https://www.unicode.org/versions/Unicode13.0.0/ch03.pdf + +namespace Unicode::Detail { + +#if ENABLE_UNICODE_DATA + +static bool is_after_uppercase_i(Utf8View const& string, size_t index) +{ + // There is an uppercase I before C, and there is no intervening combining character class 230 (Above) or 0. 
+ auto preceding_view = string.substring_view(0, index); + bool found_uppercase_i = false; + + // FIXME: Would be better if Utf8View supported reverse iteration. + for (auto code_point : preceding_view) { + if (code_point == 'I') { + found_uppercase_i = true; + continue; + } + + auto combining_class = canonical_combining_class(code_point); + if (combining_class == 0 || combining_class == 230) + found_uppercase_i = false; + } + + return found_uppercase_i; +} + +static bool is_after_soft_dotted_code_point(Utf8View const& string, size_t index) +{ + // There is a Soft_Dotted character before C, with no intervening character of combining class 0 or 230 (Above). + auto preceding_view = string.substring_view(0, index); + bool found_soft_dotted_code_point = false; + + // FIXME: Would be better if Utf8View supported reverse iteration. + for (auto code_point : preceding_view) { + if (code_point_has_property(code_point, Property::Soft_Dotted)) { + found_soft_dotted_code_point = true; + continue; + } + + auto combining_class = canonical_combining_class(code_point); + if (combining_class == 0 || combining_class == 230) + found_soft_dotted_code_point = false; + } + + return found_soft_dotted_code_point; +} + +static bool is_final_code_point(Utf8View const& string, size_t index, size_t byte_length) +{ + // C is preceded by a sequence consisting of a cased letter and then zero or more case-ignorable + // characters, and C is not followed by a sequence consisting of zero or more case-ignorable + // characters and then a cased letter. + auto preceding_view = string.substring_view(0, index); + auto following_view = ((index + byte_length) < string.byte_length()) + ? 
string.substring_view(index + byte_length) + : Utf8View {}; + + size_t cased_letter_count = 0; + + for (auto code_point : preceding_view) { + bool is_cased = code_point_has_property(code_point, Property::Cased); + bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable); + + if (is_cased && !is_case_ignorable) + ++cased_letter_count; + else if (!is_case_ignorable) + cased_letter_count = 0; + } + + if (cased_letter_count == 0) + return false; + + for (auto code_point : following_view) { + bool is_cased = code_point_has_property(code_point, Property::Cased); + bool is_case_ignorable = code_point_has_property(code_point, Property::Case_Ignorable); + + if (is_case_ignorable) + continue; + if (is_cased) + return false; + + break; + } + + return true; +} + +static bool is_followed_by_combining_class_above(Utf8View const& string, size_t index, size_t byte_length) +{ + // C is followed by a character of combining class 230 (Above) with no intervening character of combining class 0 or 230 (Above). + auto following_view = ((index + byte_length) < string.byte_length()) + ? string.substring_view(index + byte_length) + : Utf8View {}; + + for (auto code_point : following_view) { + u32 combining_class = canonical_combining_class(code_point); + + if (combining_class == 0) + return false; + if (combining_class == 230) + return true; + } + + return false; +} + +static bool is_followed_by_combining_dot_above(Utf8View const& string, size_t index, size_t byte_length) +{ + // C is followed by combining dot above (U+0307). Any sequence of characters with a combining class that is neither 0 nor 230 may + // intervene between the current character and the combining dot above. + auto following_view = ((index + byte_length) < string.byte_length()) + ? 
string.substring_view(index + byte_length) + : Utf8View {}; + + for (auto code_point : following_view) { + if (code_point == 0x307) + return true; + + u32 combining_class = canonical_combining_class(code_point); + + if (combining_class == 0) + return false; + if (combining_class == 230) + return false; + } + + return false; +} + +static SpecialCasing const* find_matching_special_case(u32 code_point, Utf8View const& string, Optional locale, size_t index, size_t byte_length) +{ + auto requested_locale = Locale::None; + + if (locale.has_value()) { + if (auto maybe_locale = locale_from_string(*locale); maybe_locale.has_value()) + requested_locale = *maybe_locale; + } + + auto special_casings = special_case_mapping(code_point); + + for (auto const* special_casing : special_casings) { + if (special_casing->locale != Locale::None && special_casing->locale != requested_locale) + continue; + + switch (special_casing->condition) { + case Condition::None: + return special_casing; + + case Condition::AfterI: + if (is_after_uppercase_i(string, index)) + return special_casing; + break; + + case Condition::AfterSoftDotted: + if (is_after_soft_dotted_code_point(string, index)) + return special_casing; + break; + + case Condition::FinalSigma: + if (is_final_code_point(string, index, byte_length)) + return special_casing; + break; + + case Condition::MoreAbove: + if (is_followed_by_combining_class_above(string, index, byte_length)) + return special_casing; + break; + + case Condition::NotBeforeDot: + if (!is_followed_by_combining_dot_above(string, index, byte_length)) + return special_casing; + break; + } + } + + return nullptr; +} + +#endif + +ErrorOr build_lowercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional const& locale) +{ +#if ENABLE_UNICODE_DATA + size_t index = 0; + size_t byte_length = 0; + + for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) { + u32 code_point = *it; 
+ byte_length = it.underlying_code_point_length_in_bytes(); + + auto const* special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length); + if (!special_casing) { + TRY(builder.try_append_code_point(to_unicode_lowercase(code_point))); + continue; + } + + for (size_t i = 0; i < special_casing->lowercase_mapping_size; ++i) + TRY(builder.try_append_code_point(special_casing->lowercase_mapping[i])); + } + + return {}; +#else + return Error::from_string_literal("Unicode data has been disabled"); +#endif +} + +ErrorOr build_uppercase_string([[maybe_unused]] Utf8View code_points, [[maybe_unused]] StringBuilder& builder, [[maybe_unused]] Optional const& locale) +{ +#if ENABLE_UNICODE_DATA + size_t index = 0; + size_t byte_length = 0; + + for (auto it = code_points.begin(); it != code_points.end(); ++it, index += byte_length) { + u32 code_point = *it; + byte_length = it.underlying_code_point_length_in_bytes(); + + auto const* special_casing = find_matching_special_case(code_point, code_points, locale, index, byte_length); + if (!special_casing) { + TRY(builder.try_append_code_point(to_unicode_uppercase(code_point))); + continue; + } + + for (size_t i = 0; i < special_casing->uppercase_mapping_size; ++i) + TRY(builder.try_append_code_point(special_casing->uppercase_mapping[i])); + } + + return {}; +#else + return Error::from_string_literal("Unicode data has been disabled"); +#endif +} + +} diff --git a/Userland/Libraries/LibUnicode/UnicodeUtils.h b/Userland/Libraries/LibUnicode/UnicodeUtils.h new file mode 100644 index 0000000000..1770c385a7 --- /dev/null +++ b/Userland/Libraries/LibUnicode/UnicodeUtils.h @@ -0,0 +1,20 @@ +/* + * Copyright (c) 2023, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include +#include +#include +#include +#include + +namespace Unicode::Detail { + +ErrorOr build_lowercase_string(Utf8View code_points, StringBuilder& builder, Optional const& locale); +ErrorOr 
build_uppercase_string(Utf8View code_points, StringBuilder& builder, Optional const& locale); + +}