mirror of
https://github.com/RGBCube/serenity
synced 2025-05-14 11:34:59 +00:00
LibUnicode: Support code point names that apply to ranges of code points
For example, consider the following adjacent entries in UnicodeData.txt:

    3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
    4DBF;<CJK Ideograph Extension A, Last>;Lo;0;L;;;;;N;;;;;

Our current implementation would assign the display name "CJK Ideograph Extension A" to code points U+3400 & U+4DBF, but not to the code points in between. Not only should those code points be assigned a name, but the Unicode spec also has formatting rules on what the names should be (the names for these ranged code points are not as they appear in UnicodeData.txt).

The spec also defines names for code point ranges that actually are listed individually in UnicodeData.txt. For example:

    2F800;CJK COMPATIBILITY IDEOGRAPH-2F800;Lo;0;L;4E3D;;;;N;;;;;
    2F801;CJK COMPATIBILITY IDEOGRAPH-2F801;Lo;0;L;4E38;;;;N;;;;;
    2F802;CJK COMPATIBILITY IDEOGRAPH-2F802;Lo;0;L;4E41;;;;N;;;;;

Code points are only coalesced into a range if all fields after the name are equivalent. Our parser will insert the range and its name formatting pattern when it comes across the first code point in that range, then ignore other code points in that range. This reduces the number of names we generate by nearly 2,000.
This commit is contained in:
parent
f2f4980f15
commit
7e6ad172a4
4 changed files with 137 additions and 51 deletions
|
@ -629,3 +629,34 @@ TEST_CASE(script_extension)
|
|||
EXPECT(Unicode::code_point_has_script(0x101fd, script_inherited));
|
||||
EXPECT(Unicode::code_point_has_script_extension(0x101fd, script_inherited));
|
||||
}
|
||||
|
||||
// Exercises Unicode::code_point_display_name() for code points that are named individually and for
// code points whose names are derived from a range pattern ("<PREFIX>-<HEX CODE POINT>").
TEST_CASE(code_point_display_name)
{
    // Helper that unwraps the optional display name. VERIFY fires if the lookup produced no value,
    // so the EXPECT_EQ checks below only ever compare actual names.
    auto code_point_display_name = [](u32 code_point) {
        auto name = Unicode::code_point_display_name(code_point);
        VERIFY(name.has_value());
        return name.release_value();
    };

    // Control code points.
    EXPECT_EQ(code_point_display_name(0), "NULL"sv);
    EXPECT_EQ(code_point_display_name(1), "START OF HEADING"sv);
    EXPECT_EQ(code_point_display_name(0xa), "LINE FEED"sv);

    // Ideographic code points (which already appeared in a range in UnicodeData.txt).
    // Each range is probed at its first three code points and at its last code point.
    EXPECT_EQ(code_point_display_name(0x3400), "CJK UNIFIED IDEOGRAPH-3400"sv);
    EXPECT_EQ(code_point_display_name(0x3401), "CJK UNIFIED IDEOGRAPH-3401"sv);
    EXPECT_EQ(code_point_display_name(0x3402), "CJK UNIFIED IDEOGRAPH-3402"sv);
    EXPECT_EQ(code_point_display_name(0x4dbf), "CJK UNIFIED IDEOGRAPH-4DBF"sv);

    EXPECT_EQ(code_point_display_name(0x20000), "CJK UNIFIED IDEOGRAPH-20000"sv);
    EXPECT_EQ(code_point_display_name(0x20001), "CJK UNIFIED IDEOGRAPH-20001"sv);
    EXPECT_EQ(code_point_display_name(0x20002), "CJK UNIFIED IDEOGRAPH-20002"sv);
    // NOTE(review): assumes U+2A6DF is the last code point of this range in the bundled
    // UnicodeData.txt (true as of Unicode 14.0) - confirm against the UCD version in use.
    // Like U+4DBF above, the last code point of the range must receive the derived name.
    EXPECT_EQ(code_point_display_name(0x2a6df), "CJK UNIFIED IDEOGRAPH-2A6DF"sv);
    // The first code point past the range is presumably unassigned and must not inherit the
    // range's name pattern.
    EXPECT(!Unicode::code_point_display_name(0x2a6e0).has_value());

    // Ideographic code points (which appeared individually in UnicodeData.txt and were coalesced into a range).
    EXPECT_EQ(code_point_display_name(0x2f800), "CJK COMPATIBILITY IDEOGRAPH-2F800"sv);
    EXPECT_EQ(code_point_display_name(0x2f801), "CJK COMPATIBILITY IDEOGRAPH-2F801"sv);
    EXPECT_EQ(code_point_display_name(0x2f802), "CJK COMPATIBILITY IDEOGRAPH-2F802"sv);
    EXPECT_EQ(code_point_display_name(0x2fa1d), "CJK COMPATIBILITY IDEOGRAPH-2FA1D"sv);
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue