mirror of
				https://github.com/RGBCube/serenity
				synced 2025-10-26 23:32:07 +00:00 
			
		
		
		
	 d7ffa51424
			
		
	
	
		d7ffa51424
		
	
	
	
	
		
			
			Before, this was getting included as part of the output text, which was confusing the HTML parser. Nobody needs the BOM after we have identified the codec, so now we remove it when converting to UTF-8.
		
			
				
	
	
		
			437 lines
		
	
	
	
		
			16 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
			
		
		
	
	
			437 lines
		
	
	
	
		
			16 KiB
		
	
	
	
		
			C++
		
	
	
	
	
	
| /*
 | |
|  * Copyright (c) 2020, Andreas Kling <kling@serenityos.org>
 | |
|  *
 | |
|  * SPDX-License-Identifier: BSD-2-Clause
 | |
|  */
 | |
| 
 | |
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/Utf8View.h>
#include <LibTextCodec/Decoder.h>
 | |
| 
 | |
| namespace TextCodec {
 | |
| 
 | |
| namespace {
 | |
| Latin1Decoder& latin1_decoder()
 | |
| {
 | |
|     static Latin1Decoder* decoder = nullptr;
 | |
|     if (!decoder)
 | |
|         decoder = new Latin1Decoder;
 | |
|     return *decoder;
 | |
| }
 | |
| 
 | |
| UTF8Decoder& utf8_decoder()
 | |
| {
 | |
|     static UTF8Decoder* decoder = nullptr;
 | |
|     if (!decoder)
 | |
|         decoder = new UTF8Decoder;
 | |
|     return *decoder;
 | |
| }
 | |
| 
 | |
| UTF16BEDecoder& utf16be_decoder()
 | |
| {
 | |
|     static UTF16BEDecoder* decoder = nullptr;
 | |
|     if (!decoder)
 | |
|         decoder = new UTF16BEDecoder;
 | |
|     return *decoder;
 | |
| }
 | |
| 
 | |
| Latin2Decoder& latin2_decoder()
 | |
| {
 | |
|     static Latin2Decoder* decoder = nullptr;
 | |
|     if (!decoder)
 | |
|         decoder = new Latin2Decoder;
 | |
|     return *decoder;
 | |
| }
 | |
| 
 | |
| HebrewDecoder& hebrew_decoder()
 | |
| {
 | |
|     static HebrewDecoder* decoder = nullptr;
 | |
|     if (!decoder)
 | |
|         decoder = new HebrewDecoder;
 | |
|     return *decoder;
 | |
| }
 | |
| 
 | |
| CyrillicDecoder& cyrillic_decoder()
 | |
| {
 | |
|     static CyrillicDecoder* decoder = nullptr;
 | |
|     if (!decoder)
 | |
|         decoder = new CyrillicDecoder;
 | |
|     return *decoder;
 | |
| }
 | |
| 
 | |
| Latin9Decoder& latin9_decoder()
 | |
| {
 | |
|     static Latin9Decoder* decoder = nullptr;
 | |
|     if (!decoder)
 | |
|         decoder = new Latin9Decoder;
 | |
|     return *decoder;
 | |
| }
 | |
| 
 | |
| TurkishDecoder& turkish_decoder()
 | |
| {
 | |
|     static TurkishDecoder* decoder = nullptr;
 | |
|     if (!decoder)
 | |
|         decoder = new TurkishDecoder;
 | |
|     return *decoder;
 | |
| }
 | |
| 
 | |
| }
 | |
| 
 | |
| Decoder* decoder_for(const String& a_encoding)
 | |
| {
 | |
|     auto encoding = get_standardized_encoding(a_encoding);
 | |
|     if (encoding.has_value()) {
 | |
|         if (encoding.value().equals_ignoring_case("windows-1252"))
 | |
|             return &latin1_decoder();
 | |
|         if (encoding.value().equals_ignoring_case("utf-8"))
 | |
|             return &utf8_decoder();
 | |
|         if (encoding.value().equals_ignoring_case("utf-16be"))
 | |
|             return &utf16be_decoder();
 | |
|         if (encoding.value().equals_ignoring_case("iso-8859-2"))
 | |
|             return &latin2_decoder();
 | |
|         if (encoding.value().equals_ignoring_case("windows-1255"))
 | |
|             return &hebrew_decoder();
 | |
|         if (encoding.value().equals_ignoring_case("windows-1251"))
 | |
|             return &cyrillic_decoder();
 | |
|         if (encoding.value().equals_ignoring_case("iso-8859-15"))
 | |
|             return &latin9_decoder();
 | |
|         if (encoding.value().equals_ignoring_case("windows-1254"))
 | |
|             return &turkish_decoder();
 | |
|     }
 | |
|     dbgln("TextCodec: No decoder implemented for encoding '{}'", a_encoding);
 | |
|     return nullptr;
 | |
| }
 | |
| 
 | |
| // https://encoding.spec.whatwg.org/#concept-encoding-get
 | |
| Optional<String> get_standardized_encoding(const String& encoding)
 | |
| {
 | |
|     String trimmed_lowercase_encoding = encoding.trim_whitespace().to_lowercase();
 | |
| 
 | |
|     if (trimmed_lowercase_encoding.is_one_of("unicode-1-1-utf-8", "unicode11utf8", "unicode20utf8", "utf-8", "utf8", "x-unicode20utf8"))
 | |
|         return "UTF-8";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("866", "cp866", "csibm866", "ibm866"))
 | |
|         return "IBM866";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("csisolatin2", "iso-8859-2", "iso-ir-101", "iso8859-2", "iso88592", "iso_8859-2", "iso_8859-2:1987", "l2", "latin2"))
 | |
|         return "ISO-8859-2";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("csisolatin3", "iso-8859-3", "iso-ir-109", "iso8859-3", "iso88593", "iso_8859-3", "iso_8859-3:1988", "l3", "latin3"))
 | |
|         return "ISO-8859-3";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("csisolatin4", "iso-8859-4", "iso-ir-110", "iso8859-4", "iso88594", "iso_8859-4", "iso_8859-4:1989", "l4", "latin4"))
 | |
|         return "ISO-8859-4";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("csisolatincyrillic", "cyrillic", "iso-8859-5", "iso-ir-144", "iso8859-5", "iso88595", "iso_8859-5", "iso_8859-5:1988"))
 | |
|         return "ISO-8859-5";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("arabic", "asmo-708", "csiso88596e", "csiso88596i", "csisolatinarabic", "ecma-114", "iso-8859-6", "iso-8859-6-e", "iso-8859-6-i", "iso-ir-127", "iso8859-6", "iso88596", "iso_8859-6", "iso_8859-6:1987"))
 | |
|         return "ISO-8859-6";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("csisolatingreek", "ecma-118", "elot_928", "greek", "greek8", "iso-8859-7", "iso-ir-126", "iso8859-7", "iso88597", "iso_8859-7", "iso_8859-7:1987", "sun_eu_greek"))
 | |
|         return "ISO-8859-7";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("csiso88598e", "csisolatinhebrew", "hebrew", "iso-8859-8", "iso-8859-8-e", "iso-ir-138", "iso8859-8", "iso88598", "iso_8859-8", "iso_8859-8:1988", "visual"))
 | |
|         return "ISO-8859-8";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("csiso88598i", "iso-8859-8-i", "logical"))
 | |
|         return "ISO-8859-8-I";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("csisolatin6", "iso8859-10", "iso-ir-157", "iso8859-10", "iso885910", "l6", "latin6"))
 | |
|         return "ISO-8859-10";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("iso-8859-13", "iso8859-13", "iso885913"))
 | |
|         return "ISO-8859-13";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("iso-8859-14", "iso8859-14", "iso885914"))
 | |
|         return "ISO-8859-14";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("csisolatin9", "iso-8859-15", "iso8859-15", "iso885915", "iso_8859-15", "l9"))
 | |
|         return "ISO-8859-15";
 | |
|     if (trimmed_lowercase_encoding == "iso-8859-16")
 | |
|         return "ISO-8859-16";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("cskoi8r", "koi", "koi8", "koi8-r", "koi8_r"))
 | |
|         return "KOI8-R";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("koi8-ru", "koi8-u"))
 | |
|         return "KOI8-U";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("csmacintosh", "mac", "macintosh", "x-mac-roman"))
 | |
|         return "macintosh";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("dos-874", "iso-8859-11", "iso8859-11", "iso885911", "tis-620", "windows-874"))
 | |
|         return "windows-874";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("cp1250", "windows-1250", "x-cp1250"))
 | |
|         return "windows-1250";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("cp1251", "windows-1251", "x-cp1251"))
 | |
|         return "windows-1251";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("ansi_x3.4-1968", "ascii", "cp1252", "cp819", "csisolatin1", "ibm819", "iso-8859-1", "iso-ir-100", "iso8859-1", "iso88591", "iso_8859-1", "iso_8859-1:1987", "l1", "latin1", "us-ascii", "windows-1252", "x-cp1252"))
 | |
|         return "windows-1252";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("cp1253", "windows-1253", "x-cp1253"))
 | |
|         return "windows-1253";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("cp1254", "csisolatin5", "iso-8859-9", "iso-ir-148", "iso-8859-9", "iso-88599", "iso_8859-9", "iso_8859-9:1989", "l5", "latin5", "windows-1254", "x-cp1254"))
 | |
|         return "windows-1254";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("cp1255", "windows-1255", "x-cp1255"))
 | |
|         return "windows-1255";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("cp1256", "windows-1256", "x-cp1256"))
 | |
|         return "windows-1256";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("cp1257", "windows-1257", "x-cp1257"))
 | |
|         return "windows-1257";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("cp1258", "windows-1258", "x-cp1258"))
 | |
|         return "windows-1258";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("x-mac-cyrillic", "x-mac-ukrainian"))
 | |
|         return "x-mac-cyrillic";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("chinese", "csgb2312", "csiso58gb231280", "gb2312", "gb_2312", "gb_2312-80", "gbk", "iso-ir-58", "x-gbk"))
 | |
|         return "GBK";
 | |
|     if (trimmed_lowercase_encoding == "gb18030")
 | |
|         return "gb18030";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("big5", "big5-hkscs", "cn-big5", "csbig5", "x-x-big5"))
 | |
|         return "Big5";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("cseucpkdfmtjapanese", "euc-jp", "x-euc-jp"))
 | |
|         return "EUC-JP";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("csiso2022jp", "iso-2022-jp"))
 | |
|         return "ISO-2022-JP";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("csshiftjis", "ms932", "ms_kanji", "shift-jis", "shift_jis", "sjis", "windows-31j", "x-sjis"))
 | |
|         return "Shift_JIS";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("cseuckr", "csksc56011987", "euc-kr", "iso-ir-149", "korean", "ks_c_5601-1987", "ks_c_5601-1989", "ksc5601", "ksc_5601", "windows-949"))
 | |
|         return "EUC-KR";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("csiso2022kr", "hz-gb-2312", "iso-2022-cn", "iso-2022-cn-ext", "iso-2022-kr", "replacement"))
 | |
|         return "replacement";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("unicodefffe", "utf-16be"))
 | |
|         return "UTF-16BE";
 | |
|     if (trimmed_lowercase_encoding.is_one_of("csunicode", "iso-10646-ucs-2", "ucs-2", "unicode", "unicodefeff", "utf-16", "utf-16le"))
 | |
|         return "UTF-16LE";
 | |
|     if (trimmed_lowercase_encoding == "x-user-defined")
 | |
|         return "x-user-defined";
 | |
| 
 | |
|     dbgln("TextCodec: Unrecognized encoding: {}", encoding);
 | |
|     return {};
 | |
| }
 | |
| 
 | |
| String Decoder::to_utf8(const StringView& input)
 | |
| {
 | |
|     StringBuilder builder(input.length());
 | |
|     process(input, [&builder](u32 c) { builder.append_code_point(c); });
 | |
|     return builder.to_string();
 | |
| }
 | |
| 
 | |
| void UTF8Decoder::process(const StringView& input, Function<void(u32)> on_code_point)
 | |
| {
 | |
|     for (auto c : input) {
 | |
|         on_code_point(c);
 | |
|     }
 | |
| }
 | |
| 
 | |
| String UTF8Decoder::to_utf8(const StringView& input)
 | |
| {
 | |
|     // Discard the BOM
 | |
|     auto bomless_input = input;
 | |
|     if (auto bytes = input.bytes(); bytes.size() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF) {
 | |
|         bomless_input = input.substring_view(3);
 | |
|     }
 | |
| 
 | |
|     return bomless_input;
 | |
| }
 | |
| 
 | |
| void UTF16BEDecoder::process(const StringView& input, Function<void(u32)> on_code_point)
 | |
| {
 | |
|     size_t utf16_length = input.length() - (input.length() % 2);
 | |
|     for (size_t i = 0; i < utf16_length; i += 2) {
 | |
|         u16 code_point = (input[i] << 8) | input[i + 1];
 | |
|         on_code_point(code_point);
 | |
|     }
 | |
| }
 | |
| 
 | |
| String UTF16BEDecoder::to_utf8(const StringView& input)
 | |
| {
 | |
|     // Discard the BOM
 | |
|     auto bomless_input = input;
 | |
|     if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) {
 | |
|         bomless_input = input.substring_view(2);
 | |
|     }
 | |
| 
 | |
|     StringBuilder builder(bomless_input.length() / 2);
 | |
|     process(bomless_input, [&builder](u32 c) { builder.append_code_point(c); });
 | |
|     return builder.to_string();
 | |
| }
 | |
| 
 | |
| void Latin1Decoder::process(const StringView& input, Function<void(u32)> on_code_point)
 | |
| {
 | |
|     for (size_t i = 0; i < input.length(); ++i) {
 | |
|         u8 ch = input[i];
 | |
|         // Latin1 is the same as the first 256 Unicode code_points, so no mapping is needed, just utf-8 encoding.
 | |
|         on_code_point(ch);
 | |
|     }
 | |
| }
 | |
| 
 | |
| namespace {
 | |
| u32 convert_latin2_to_utf8(u8 in)
 | |
| {
 | |
|     switch (in) {
 | |
| 
 | |
| #define MAP(X, Y) \
 | |
|     case X:       \
 | |
|         return Y
 | |
| 
 | |
|         MAP(0xA1, 0x104);
 | |
|         MAP(0xA2, 0x2D8);
 | |
|         MAP(0xA3, 0x141);
 | |
|         MAP(0xA5, 0x13D);
 | |
|         MAP(0xA6, 0x15A);
 | |
|         MAP(0xA9, 0x160);
 | |
|         MAP(0xAA, 0x15E);
 | |
|         MAP(0xAB, 0x164);
 | |
|         MAP(0xAC, 0x179);
 | |
|         MAP(0xAE, 0x17D);
 | |
|         MAP(0xAF, 0x17B);
 | |
| 
 | |
|         MAP(0xB1, 0x105);
 | |
|         MAP(0xB2, 0x2DB);
 | |
|         MAP(0xB3, 0x142);
 | |
|         MAP(0xB5, 0x13E);
 | |
|         MAP(0xB6, 0x15B);
 | |
|         MAP(0xB7, 0x2C7);
 | |
|         MAP(0xB9, 0x161);
 | |
|         MAP(0xBA, 0x15F);
 | |
|         MAP(0xBB, 0x165);
 | |
|         MAP(0xBC, 0x17A);
 | |
|         MAP(0xBD, 0x2DD);
 | |
|         MAP(0xBE, 0x17E);
 | |
|         MAP(0xBF, 0x17C);
 | |
| 
 | |
|         MAP(0xC0, 0x154);
 | |
|         MAP(0xC3, 0x102);
 | |
|         MAP(0xC5, 0x139);
 | |
|         MAP(0xC6, 0x106);
 | |
|         MAP(0xC8, 0x10C);
 | |
|         MAP(0xCA, 0x118);
 | |
|         MAP(0xCC, 0x11A);
 | |
|         MAP(0xCF, 0x10E);
 | |
| 
 | |
|         MAP(0xD0, 0x110);
 | |
|         MAP(0xD1, 0x143);
 | |
|         MAP(0xD2, 0x147);
 | |
|         MAP(0xD5, 0x150);
 | |
|         MAP(0xD8, 0x158);
 | |
|         MAP(0xD9, 0x16E);
 | |
|         MAP(0xDB, 0x170);
 | |
|         MAP(0xDE, 0x162);
 | |
| 
 | |
|         MAP(0xE0, 0x155);
 | |
|         MAP(0xE3, 0x103);
 | |
|         MAP(0xE5, 0x13A);
 | |
|         MAP(0xE6, 0x107);
 | |
|         MAP(0xE8, 0x10D);
 | |
|         MAP(0xEA, 0x119);
 | |
|         MAP(0xEC, 0x11B);
 | |
|         MAP(0xEF, 0x10F);
 | |
| 
 | |
|         MAP(0xF0, 0x111);
 | |
|         MAP(0xF1, 0x144);
 | |
|         MAP(0xF2, 0x148);
 | |
|         MAP(0xF5, 0x151);
 | |
|         MAP(0xF8, 0x159);
 | |
|         MAP(0xF9, 0x16F);
 | |
|         MAP(0xFB, 0x171);
 | |
|         MAP(0xFE, 0x163);
 | |
|         MAP(0xFF, 0x2D9);
 | |
| #undef MAP
 | |
| 
 | |
|     default:
 | |
|         return in;
 | |
|     }
 | |
| }
 | |
| }
 | |
| 
 | |
| void Latin2Decoder::process(const StringView& input, Function<void(u32)> on_code_point)
 | |
| {
 | |
|     for (auto c : input) {
 | |
|         on_code_point(convert_latin2_to_utf8(c));
 | |
|     }
 | |
| }
 | |
| 
 | |
| void HebrewDecoder::process(const StringView& input, Function<void(u32)> on_code_point)
 | |
| {
 | |
|     static constexpr Array<u32, 128> translation_table = {
 | |
|         0x20AC, 0xFFFD, 0x201A, 0x192, 0x201E, 0x2026, 0x2020, 0x2021, 0x2C6, 0x2030, 0xFFFD, 0x2039, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
 | |
|         0xFFFD, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0x2DC, 0x2122, 0xFFFD, 0x203A, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
 | |
|         0xA0, 0xA1, 0xA2, 0xA3, 0x20AA, 0xA5, 0xA6, 0xA7, 0xA8, 0xA9, 0xD7, 0xAB, 0xAC, 0xAD, 0xAE, 0xAF,
 | |
|         0xB0, 0xB1, 0xB2, 0xB3, 0xB4, 0xB5, 0xB6, 0xB7, 0xB8, 0xB9, 0xF7, 0xBB, 0xBC, 0xBD, 0xBE, 0xBF,
 | |
|         0x5B0, 0x5B1, 0x5B2, 0x5B3, 0x5B4, 0x5B5, 0x5B6, 0x5B7, 0x5B8, 0x5B9, 0x5BA, 0x5BB, 0x5BC, 0x5BD, 0x5BE, 0x5BF,
 | |
|         0x5C0, 0x5C1, 0x5C2, 0x5C3, 0x5F0, 0x5F1, 0x5F2, 0x5F3, 0x5F4, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD, 0xFFFD,
 | |
|         0x5D0, 0x5D1, 0x5D2, 0x5D3, 0x5D4, 0x5D5, 0x5D6, 0x5D7, 0x5D8, 0x5D9, 0x5DA, 0x5DB, 0x5DC, 0x5DD, 0x5DE, 0x5DF,
 | |
|         0x5E0, 0x5E1, 0x5E2, 0x5E3, 0x5E4, 0x5E5, 0x5E6, 0x5E7, 0x5E8, 0x5E9, 0x5EA, 0xFFFD, 0xFFFD, 0x200E, 0x200F, 0xFFFD
 | |
|     };
 | |
|     for (unsigned char ch : input) {
 | |
|         if (ch < 0x80) { // Superset of ASCII
 | |
|             on_code_point(ch);
 | |
|         } else {
 | |
|             on_code_point(translation_table[ch - 0x80]);
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| void CyrillicDecoder::process(const StringView& input, Function<void(u32)> on_code_point)
 | |
| {
 | |
|     static constexpr Array<u32, 128> translation_table = {
 | |
|         0x402, 0x403, 0x201A, 0x453, 0x201E, 0x2026, 0x2020, 0x2021, 0x20AC, 0x2030, 0x409, 0x2039, 0x40A, 0x40C, 0x40B, 0x40F,
 | |
|         0x452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0xFFFD, 0x2122, 0x459, 0x203A, 0x45A, 0x45C, 0x45B, 0x45F,
 | |
|         0xA0, 0x40E, 0x45E, 0x408, 0xA4, 0x490, 0xA6, 0xA7, 0x401, 0xA9, 0x404, 0xAB, 0xAC, 0xAD, 0xAE, 0x407,
 | |
|         0xB0, 0xB1, 0x406, 0x456, 0x491, 0xB5, 0xB6, 0xB7, 0x451, 0x2116, 0x454, 0xBB, 0x458, 0x405, 0x455, 0x457,
 | |
|         0x410, 0x411, 0x412, 0x413, 0x414, 0x415, 0x416, 0x417, 0x418, 0x419, 0x41A, 0x41B, 0x41C, 0x41D, 0x41E, 0x41F,
 | |
|         0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427, 0x428, 0x429, 0x42A, 0x42B, 0x42C, 0x42D, 0x42E, 0x42F,
 | |
|         0x430, 0x431, 0x432, 0x433, 0x434, 0x435, 0x436, 0x437, 0x438, 0x439, 0x43A, 0x43B, 0x43C, 0x43D, 0x43E, 0x43F,
 | |
|         0x440, 0x441, 0x442, 0x443, 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x44A, 0x44B, 0x44C, 0x44D, 0x44E, 0x44F
 | |
|     };
 | |
|     for (unsigned char ch : input) {
 | |
|         if (ch < 0x80) { // Superset of ASCII
 | |
|             on_code_point(ch);
 | |
|         } else {
 | |
|             on_code_point(translation_table[ch - 0x80]);
 | |
|         }
 | |
|     }
 | |
| }
 | |
| 
 | |
| void Latin9Decoder::process(const StringView& input, Function<void(u32)> on_code_point)
 | |
| {
 | |
|     auto convert_latin9_to_utf8 = [](u8 ch) -> u32 {
 | |
|         // Latin9 is the same as the first 256 Unicode code points, except for 8 characters.
 | |
|         switch (ch) {
 | |
|         case 0xA4:
 | |
|             return 0x20AC;
 | |
|         case 0xA6:
 | |
|             return 0x160;
 | |
|         case 0xA8:
 | |
|             return 0x161;
 | |
|         case 0xB4:
 | |
|             return 0x17D;
 | |
|         case 0xB8:
 | |
|             return 0x17E;
 | |
|         case 0xBC:
 | |
|             return 0x152;
 | |
|         case 0xBD:
 | |
|             return 0x153;
 | |
|         case 0xBE:
 | |
|             return 0x178;
 | |
|         default:
 | |
|             return ch;
 | |
|         }
 | |
|     };
 | |
| 
 | |
|     for (auto ch : input) {
 | |
|         on_code_point(convert_latin9_to_utf8(ch));
 | |
|     }
 | |
| }
 | |
| 
 | |
| void TurkishDecoder::process(const StringView& input, Function<void(u32)> on_code_point)
 | |
| {
 | |
|     auto convert_turkish_to_utf8 = [](u8 ch) -> u32 {
 | |
|         // Turkish (aka ISO-8859-9, Windows-1254) is the same as the first 256 Unicode code points, except for 6 characters.
 | |
|         switch (ch) {
 | |
|         case 0xD0:
 | |
|             return 0x11E;
 | |
|         case 0xDD:
 | |
|             return 0x130;
 | |
|         case 0xDE:
 | |
|             return 0x15E;
 | |
|         case 0xF0:
 | |
|             return 0x11F;
 | |
|         case 0xFD:
 | |
|             return 0x131;
 | |
|         case 0xFE:
 | |
|             return 0x15F;
 | |
|         default:
 | |
|             return ch;
 | |
|         }
 | |
|     };
 | |
| 
 | |
|     for (auto ch : input) {
 | |
|         on_code_point(convert_turkish_to_utf8(ch));
 | |
|     }
 | |
| }
 | |
| 
 | |
| }
 |