From 70d0c1616ff47c6c928b0222866fe4db2b9f0f5b Mon Sep 17 00:00:00 2001
From: matcool <26722564+matcool@users.noreply.github.com>
Date: Sun, 2 Oct 2022 22:57:22 -0300
Subject: [PATCH] LibUnicode: Add decomposition mappings and Unicode normalization

The mappings are exposed via `Unicode::code_point_decomposition(u32)`
and `Unicode::code_point_decompositions()`, the latter being useful for
reverse searching a code point from its decomposition.

The normalization code does not make use of `Quick_Check` props
(https://www.unicode.org/reports/tr44/#Decompositions_and_Normalization),
meaning no quick check optimizations.
---
 .../LibUnicode/GenerateUnicodeData.cpp        |  82 ++++-
 Userland/Libraries/LibUnicode/CMakeLists.txt  |   1 +
 Userland/Libraries/LibUnicode/Forward.h       |   1 +
 Userland/Libraries/LibUnicode/Normalize.cpp   | 292 ++++++++++++++++++
 Userland/Libraries/LibUnicode/Normalize.h     |  30 ++
 5 files changed, 404 insertions(+), 2 deletions(-)
 create mode 100644 Userland/Libraries/LibUnicode/Normalize.cpp
 create mode 100644 Userland/Libraries/LibUnicode/Normalize.h

diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
index 6abb646a19..3aab2b4b96 100644
--- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
+++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp
@@ -44,6 +44,14 @@ struct SpecialCasing {
     String condition;
 };
 
+// Field descriptions: https://www.unicode.org/reports/tr44/#Character_Decomposition_Mappings
+struct CodePointDecomposition {
+    // `tag` is a string since it's used for codegen as an enum value.
+    String tag { "Canonical"sv };
+    size_t decomposition_index { 0 };
+    size_t decomposition_size { 0 };
+};
+
 // PropList source: https://www.unicode.org/Public/13.0.0/ucd/PropList.txt
 // Property descriptions: https://www.unicode.org/reports/tr44/tr44-13.html#PropList.txt
 using PropList = HashMap<String, Vector<CodePointRange>>;
@@ -78,7 +86,7 @@ struct CodePointData {
     Optional<StringIndexType> abbreviation;
     u8 canonical_combining_class { 0 };
     String bidi_class;
-    String decomposition_type;
+    Optional<CodePointDecomposition> decomposition_mapping;
     Optional<i8> numeric_value_decimal;
     Optional<i8> numeric_value_digit;
     Optional<i8> numeric_value_numeric;
@@ -101,6 +109,10 @@ struct UnicodeData {
 
     u32 code_points_with_non_zero_combining_class { 0 };
 
+    u32 code_points_with_decomposition_mapping { 0 };
+    Vector<u32> decomposition_mappings;
+    Vector<String> compatibility_tags;
+
     u32 simple_uppercase_mapping_size { 0 };
     u32 simple_lowercase_mapping_size { 0 };
 
@@ -534,6 +546,35 @@ static void add_canonical_code_point_name(CodePointRange range, StringView name,
     unicode_data.code_point_display_names.append({ range, index });
 }
 
+static Optional<CodePointDecomposition> parse_decomposition_mapping(StringView string, UnicodeData& unicode_data)
+{
+    if (string.is_empty())
+        return {};
+
+    CodePointDecomposition mapping;
+
+    auto parts = string.split_view(' ');
+
+    VERIFY(parts.size() > 0);
+
+    if (parts.first().starts_with('<')) {
+        auto const tag = parts.take_first().trim("<>"sv);
+
+        mapping.tag = String::formatted("{:c}{}", to_ascii_uppercase(tag[0]), tag.substring_view(1));
+
+        if (!unicode_data.compatibility_tags.contains_slow(mapping.tag))
+            unicode_data.compatibility_tags.append(mapping.tag);
+    }
+
+    mapping.decomposition_index = unicode_data.decomposition_mappings.size();
+    mapping.decomposition_size = parts.size();
+    for (auto part : parts) {
+        unicode_data.decomposition_mappings.append(AK::StringUtils::convert_to_uint_from_hex<u32>(part).value());
+    }
+
+    return mapping;
+}
+
 static ErrorOr<void> parse_block_display_names(Core::Stream::BufferedFile& file, UnicodeData& unicode_data)
 {
     Array<u8, 1024> buffer;
@@ -581,7 +622,7 @@ static ErrorOr<void> parse_unicode_data(Core::Stream::BufferedFile& file, Unicod
         data.name = segments[1];
         data.canonical_combining_class = AK::StringUtils::convert_to_uint<u8>(segments[3]).value();
         data.bidi_class = segments[4];
-        data.decomposition_type = segments[5];
+        data.decomposition_mapping = parse_decomposition_mapping(segments[5], unicode_data);
         data.numeric_value_decimal = AK::StringUtils::convert_to_int<i8>(segments[6]);
         data.numeric_value_digit = AK::StringUtils::convert_to_int<i8>(segments[7]);
         data.numeric_value_numeric = AK::StringUtils::convert_to_int<i8>(segments[8]);
@@ -639,6 +680,7 @@ static ErrorOr<void> parse_unicode_data(Core::Stream::BufferedFile& file, Unicod
         unicode_data.code_points_with_non_zero_combining_class += data.canonical_combining_class != 0;
         unicode_data.simple_uppercase_mapping_size += data.simple_uppercase_mapping.has_value();
         unicode_data.simple_lowercase_mapping_size += data.simple_lowercase_mapping.has_value();
+        unicode_data.code_points_with_decomposition_mapping += data.decomposition_mapping.has_value();
 
         unicode_data.code_points_with_special_casing += has_special_casing;
         unicode_data.largest_special_casing_size = max(unicode_data.largest_special_casing_size, data.special_casing_indices.size());
@@ -714,6 +756,7 @@ namespace Unicode {
     generate_enum("GraphemeBreakProperty"sv, {}, unicode_data.grapheme_break_props.keys());
     generate_enum("WordBreakProperty"sv, {}, unicode_data.word_break_props.keys());
     generate_enum("SentenceBreakProperty"sv, {}, unicode_data.sentence_break_props.keys());
+    generate_enum("CompatibilityFormattingTag"sv, "Canonical"sv, unicode_data.compatibility_tags);
 
     generator.append(R"~~~(
 struct SpecialCasing {
@@ -732,6 +775,12 @@ struct SpecialCasing {
     Condition condition { Condition::None };
 };
 
+struct CodePointDecomposition {
+    u32 code_point { 0 };
+    CompatibilityFormattingTag tag { CompatibilityFormattingTag::Canonical };
+    Span<u32 const> decomposition;
+};
+
 Optional<Locale> locale_from_string(StringView locale);
 
 }
@@ -760,6 +809,7 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::Stream::Buffered
 #include <AK/StringView.h>
 #include <LibUnicode/CharacterTypes.h>
 #include <LibUnicode/UnicodeData.h>
+#include <LibUnicode/Normalize.h>
 
 namespace Unicode {
 )~~~");
@@ -863,6 +913,11 @@ struct CodePointNameComparator : public CodePointRangeComparator {
 };
 )~~~");
 
+    generator.set("decomposition_mappings_size", String::number(unicode_data.decomposition_mappings.size()));
+    generator.append("\nstatic constexpr Array<u32, @decomposition_mappings_size@> s_decomposition_mappings_data { ");
+    generator.append(String::join(", "sv, unicode_data.decomposition_mappings, "{:#x}"sv));
+    generator.append(" };\n");
+
     auto append_code_point_mappings = [&](StringView name, StringView mapping_type, u32 size, auto mapping_getter) {
         generator.set("name", name);
         generator.set("mapping_type", mapping_type);
@@ -895,6 +950,11 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
             if constexpr (IsSame<decltype(mapping), Optional<u32>> || IsSame<decltype(mapping), Optional<StringIndexType>>) {
                 generator.set("mapping", String::formatted("{:#x}", *mapping));
                 generator.append(", @mapping@ },");
+            } else if constexpr (IsSame<decltype(mapping), Optional<CodePointDecomposition>>) {
+                generator.set("tag", mapping->tag);
+                generator.set("start", String::number(mapping->decomposition_index));
+                generator.set("size", String::number(mapping->decomposition_size));
+                generator.append(", CompatibilityFormattingTag::@tag@, Span<u32 const> { s_decomposition_mappings_data.data() + @start@, @size@ } },");
             } else {
                 append_list_and_size(data.special_casing_indices, "&s_special_casing[{}]"sv);
                 generator.append(" },");
@@ -921,6 +981,11 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
     append_code_point_mappings("special_case"sv, "SpecialCaseMapping"sv, unicode_data.code_points_with_special_casing, [](auto const& data) { return data.special_casing_indices; });
     append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; });
 
+    append_code_point_mappings("decomposition"sv, "CodePointDecomposition"sv, unicode_data.code_points_with_decomposition_mapping,
+        [](auto const& data) {
+            return data.decomposition_mapping;
+        });
+
     auto append_code_point_range_list = [&](String name, Vector<CodePointRange> const& ranges) {
         generator.set("name", name);
         generator.set("size", String::number(ranges.size()));
@@ -1094,6 +1159,19 @@ Optional<StringView> code_point_abbreviation(u32 code_point)
 
     return decode_string(mapping->abbreviation);
 }
+
+Optional<CodePointDecomposition const> code_point_decomposition(u32 code_point)
+{
+    auto const* mapping = binary_search(s_decomposition_mappings, code_point, nullptr, CodePointComparator<CodePointDecomposition> {});
+    if (mapping == nullptr)
+        return {};
+    return *mapping;
+}
+
+Span<CodePointDecomposition const> code_point_decompositions()
+{
+    return s_decomposition_mappings;
+}
 )~~~");
 
     auto append_prop_search = [&](StringView enum_title, StringView enum_snake, StringView collection_name) {
diff --git a/Userland/Libraries/LibUnicode/CMakeLists.txt b/Userland/Libraries/LibUnicode/CMakeLists.txt
index c6210ab217..96a35f058d 100644
--- a/Userland/Libraries/LibUnicode/CMakeLists.txt
+++ b/Userland/Libraries/LibUnicode/CMakeLists.txt
@@ -4,6 +4,7 @@ set(SOURCES
     CharacterTypes.cpp
     CurrencyCode.cpp
     Emoji.cpp
+    Normalize.cpp
     ${UNICODE_DATA_SOURCES}
 )
 
diff --git a/Userland/Libraries/LibUnicode/Forward.h b/Userland/Libraries/LibUnicode/Forward.h
index 22cf7c698e..5b6126fad4 100644
--- a/Userland/Libraries/LibUnicode/Forward.h
+++ b/Userland/Libraries/LibUnicode/Forward.h
@@ -19,6 +19,7 @@ enum class Script : u8;
 enum class SentenceBreakProperty : u8;
 enum class WordBreakProperty : u8;
 
+struct CodePointDecomposition;
 struct CurrencyCode;
 struct Emoji;
 struct SpecialCasing;
diff --git a/Userland/Libraries/LibUnicode/Normalize.cpp b/Userland/Libraries/LibUnicode/Normalize.cpp
new file mode 100644
index 0000000000..c6792f5121
--- /dev/null
+++ b/Userland/Libraries/LibUnicode/Normalize.cpp
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2022, mat
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Find.h>
+#include <AK/StringBuilder.h>
+#include <AK/Utf8View.h>
+#include <AK/Vector.h>
+#include <LibUnicode/CharacterTypes.h>
+#include <LibUnicode/Normalize.h>
+#include <LibUnicode/UnicodeData.h>
+
+namespace Unicode {
+
+Optional<CodePointDecomposition const> __attribute__((weak)) code_point_decomposition(u32) { return {}; }
+Span<CodePointDecomposition const> __attribute__((weak)) code_point_decompositions() { return {}; }
+
+ALWAYS_INLINE static bool is_starter(u32 code_point)
+{
+    return Unicode::canonical_combining_class(code_point) == 0;
+}
+
+// From https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
+static constexpr u32 HANGUL_SYLLABLE_BASE = 0xAC00;
+static constexpr u32 HANGUL_LEADING_BASE = 0x1100;
+static constexpr u32 HANGUL_VOWEL_BASE = 0x1161;
+static constexpr u32 HANGUL_TRAILING_BASE = 0x11A7;
+static constexpr u32 HANGUL_LEADING_COUNT = 19;
+static constexpr u32 HANGUL_VOWEL_COUNT = 21;
+static constexpr u32 HANGUL_TRAILING_COUNT = 28;
+// NCount in the standard.
+static constexpr u32 HANGUL_BLOCK_COUNT = HANGUL_VOWEL_COUNT * HANGUL_TRAILING_COUNT;
+static constexpr u32 HANGUL_SYLLABLE_COUNT = HANGUL_LEADING_COUNT * HANGUL_BLOCK_COUNT;
+
+ALWAYS_INLINE static bool is_hangul_code_point(u32 code_point)
+{
+    return code_point >= HANGUL_SYLLABLE_BASE && code_point < HANGUL_SYLLABLE_BASE + HANGUL_SYLLABLE_COUNT;
+}
+
+ALWAYS_INLINE static bool is_hangul_leading(u32 code_point)
+{
+    return code_point >= HANGUL_LEADING_BASE && code_point < HANGUL_LEADING_BASE + HANGUL_LEADING_COUNT;
+}
+
+ALWAYS_INLINE static bool is_hangul_vowel(u32 code_point)
+{
+    return code_point >= HANGUL_VOWEL_BASE && code_point < HANGUL_VOWEL_BASE + HANGUL_VOWEL_COUNT;
+}
+
+ALWAYS_INLINE static bool is_hangul_trailing(u32 code_point)
+{
+    return code_point >= HANGUL_TRAILING_BASE && code_point < HANGUL_TRAILING_BASE + HANGUL_TRAILING_COUNT;
+}
+
+// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G56669
+static void decompose_hangul_code_point(u32 code_point, Vector<u32>& code_points_output)
+{
+    auto const index = code_point - HANGUL_SYLLABLE_BASE;
+
+    auto const leading_index = index / HANGUL_BLOCK_COUNT;
+    auto const vowel_index = (index % HANGUL_BLOCK_COUNT) / HANGUL_TRAILING_COUNT;
+    auto const trailing_index = index % HANGUL_TRAILING_COUNT;
+
+    auto const leading_part = HANGUL_LEADING_BASE + leading_index;
+    auto const vowel_part = HANGUL_VOWEL_BASE + vowel_index;
+    auto const trailing_part = HANGUL_TRAILING_BASE + trailing_index;
+
+    code_points_output.append(leading_part);
+    code_points_output.append(vowel_part);
+    if (trailing_index != 0)
+        code_points_output.append(trailing_part);
+}
+
+// L, V and LV, T Hangul Syllable Composition
+// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G59688
+static u32 combine_hangul_code_points(u32 a, u32 b)
+{
+    if (is_hangul_leading(a) && is_hangul_vowel(b)) {
+        auto const leading_index = a - HANGUL_LEADING_BASE;
+        auto const vowel_index = b - HANGUL_VOWEL_BASE;
+        auto const leading_vowel_index = leading_index * HANGUL_BLOCK_COUNT + vowel_index * HANGUL_TRAILING_COUNT;
+        return HANGUL_SYLLABLE_BASE + leading_vowel_index;
+    }
+    if (is_hangul_code_point(a) && is_hangul_trailing(b)) {
+        return a + b - HANGUL_TRAILING_BASE;
+    }
+    return 0;
+}
+
+// Returns the non-excluded code point whose canonical decomposition is exactly <a, b>, or 0 if there is none.
+static u32 combine_code_points(u32 a, u32 b)
+{
+    Array<u32, 2> const points { a, b };
+    // FIXME: Do something better than linear search to find reverse mappings.
+    for (auto const& mapping : Unicode::code_point_decompositions()) {
+        if (mapping.tag == CompatibilityFormattingTag::Canonical && mapping.decomposition == points) {
+            if (code_point_has_property(mapping.code_point, Property::Full_Composition_Exclusion))
+                continue;
+            return mapping.code_point;
+        }
+    }
+    return 0;
+}
+
+enum class UseCompatibility {
+    Yes,
+    No
+};
+
+// Recursively appends the full decomposition of `code_point` to `code_points_output`.
+// Compatibility mappings are only followed when `use_compatibility` is UseCompatibility::Yes.
+static void decompose_code_point(u32 code_point, Vector<u32>& code_points_output, UseCompatibility use_compatibility)
+{
+    if (is_hangul_code_point(code_point)) {
+        decompose_hangul_code_point(code_point, code_points_output);
+        return;
+    }
+    auto const mapping = Unicode::code_point_decomposition(code_point);
+    if (mapping.has_value() && (mapping->tag == CompatibilityFormattingTag::Canonical || use_compatibility == UseCompatibility::Yes)) {
+        for (auto component : mapping->decomposition) {
+            decompose_code_point(component, code_points_output, use_compatibility);
+        }
+    } else {
+        code_points_output.append(code_point);
+    }
+}
+
+// This can be any sorting algorithm that maintains order (like std::stable_sort),
+// however bubble sort is easier to implement, so go with it (for now).
+// `less_than(a, b)` must return true when `a` may stay in front of `b`; pass a
+// non-strict comparison (<=) so that equal elements are never swapped.
+template<typename T, typename LessThan>
+void bubble_sort(Span<T> span, LessThan less_than)
+{
+    // Guard against size_t underflow in `span.size() - 1` below.
+    if (span.size() < 2)
+        return;
+
+    for (size_t i = 0; i < span.size() - 1; ++i) {
+        for (size_t j = 0; j < span.size() - 1 - i; ++j) {
+            if (!less_than(span[j], span[j + 1]))
+                swap(span[j], span[j + 1]);
+        }
+    }
+}
+
+// The Canonical Ordering Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
+// See Section 3.11, D109; and UAX #15 https://unicode.org/reports/tr15
+// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G49591
+static void canonical_ordering_algorithm(Span<u32> code_points)
+{
+    for (size_t i = 0; i < code_points.size(); ++i) {
+        if (!is_starter(code_points[i])) {
+            auto starter = find_if(code_points.begin() + i, code_points.end(), is_starter);
+            auto const span_size = static_cast<size_t>(starter - (code_points.begin() + i));
+            // Nothing to reorder, so continue.
+            if (span_size <= 1)
+                continue;
+            Span<u32> const span { code_points.data() + i, span_size };
+
+            bubble_sort(span, [](u32 a, u32 b) {
+                // Use <= to keep ordering.
+                return Unicode::canonical_combining_class(a) <= Unicode::canonical_combining_class(b);
+            });
+
+            // Skip over span we just sorted.
+            i += span_size - 1;
+        }
+    }
+}
+
+// See Section 3.11, D115 of Version 15.0.0 of the Unicode Standard.
+static bool is_blocked(Span<u32> code_points, size_t a, size_t c)
+{
+    if (!is_starter(code_points[a]) || a == c - 1)
+        return false;
+    auto const c_combining_class = Unicode::canonical_combining_class(code_points[c]);
+    auto const b_combining_class = Unicode::canonical_combining_class(code_points[c - 1]);
+    return b_combining_class == 0 || b_combining_class >= c_combining_class;
+}
+
+// The Canonical Composition Algorithm, as specified in Version 15.0.0 of the Unicode Standard.
+// See Section 3.11, D117; and UAX #15 https://unicode.org/reports/tr15
+// https://www.unicode.org/versions/Unicode15.0.0/ch03.pdf#G50628
+static void canonical_composition_algorithm(Vector<u32>& code_points)
+{
+    for (size_t i = 1; i < code_points.size(); ++i) {
+        auto const current_character = code_points[i];
+
+        // R1. Seek back (left) to find the last Starter L preceding C in the character sequence.
+        //     Only that starter is a candidate; never look past it at earlier starters.
+        ssize_t last_starter = static_cast<ssize_t>(i) - 1;
+        while (last_starter >= 0 && !is_starter(code_points[last_starter]))
+            --last_starter;
+        if (last_starter < 0)
+            continue;
+
+        // R2. If there is such an L, and C is not blocked from L,
+        //     and there exists a Primary Composite P which is canonically equivalent to <L, C>,
+        //     then replace L by P in the sequence and delete C from the sequence.
+        if (is_blocked(code_points.span(), last_starter, i))
+            continue;
+
+        auto composite = combine_hangul_code_points(code_points[last_starter], current_character);
+        if (composite == 0)
+            composite = combine_code_points(code_points[last_starter], current_character);
+        if (composite == 0)
+            continue;
+
+        code_points[last_starter] = composite;
+        code_points.remove(i);
+        // Re-examine the character that moved into slot `i` on the next iteration.
+        --i;
+    }
+}
+
+static Vector<u32> normalize_nfd(Utf8View string)
+{
+    Vector<u32> result;
+
+    for (auto const code_point : string) {
+        decompose_code_point(code_point, result, UseCompatibility::No);
+    }
+
+    canonical_ordering_algorithm(result);
+
+    return result;
+}
+
+static Vector<u32> normalize_nfc(Utf8View string)
+{
+    auto result = normalize_nfd(string);
+
+    canonical_composition_algorithm(result);
+
+    return result;
+}
+
+static Vector<u32> normalize_nfkd(Utf8View string)
+{
+    Vector<u32> result;
+
+    for (auto const code_point : string) {
+        decompose_code_point(code_point, result, UseCompatibility::Yes);
+    }
+
+    canonical_ordering_algorithm(result);
+
+    return result;
+}
+
+static Vector<u32> normalize_nfkc(Utf8View string)
+{
+    auto result = normalize_nfkd(string);
+
+    canonical_composition_algorithm(result);
+
+    return result;
+}
+
+static Vector<u32> normalize_implementation(Utf8View string, NormalizationForm form)
+{
+    switch (form) {
+    case NormalizationForm::NFD:
+        return normalize_nfd(string);
+    case NormalizationForm::NFC:
+        return normalize_nfc(string);
+    case NormalizationForm::NFKD:
+        return normalize_nfkd(string);
+    case NormalizationForm::NFKC:
+        return normalize_nfkc(string);
+    }
+    VERIFY_NOT_REACHED();
+}
+
+String normalize(StringView string, NormalizationForm form)
+{
+    Utf8View const view { string };
+
+    auto const code_points = normalize_implementation(view, form);
+
+    StringBuilder builder;
+    for (auto code_point : code_points) {
+        builder.append_code_point(code_point);
+    }
+
+    return builder.to_string();
+}
+
+}
diff --git a/Userland/Libraries/LibUnicode/Normalize.h b/Userland/Libraries/LibUnicode/Normalize.h
new file mode 100644
index 0000000000..57ae03426a
--- /dev/null
+++ b/Userland/Libraries/LibUnicode/Normalize.h
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2022, mat
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/Forward.h>
+#include <AK/Optional.h>
+#include <AK/Span.h>
+#include <AK/String.h>
+#include <AK/StringView.h>
+#include <LibUnicode/Forward.h>
+
+namespace Unicode {
+
+Optional<CodePointDecomposition const> code_point_decomposition(u32 code_point);
+Span<CodePointDecomposition const> code_point_decompositions();
+
+enum class NormalizationForm {
+    NFD,
+    NFC,
+    NFKD,
+    NFKC
+};
+
+[[nodiscard]] String normalize(StringView string, NormalizationForm form);
+
+}