1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-31 11:58:12 +00:00

LibTextCodec: Implement a Windows-1251 decoder

This encoding (a superset of ASCII that adds the Cyrillic alphabet)
is currently the third most used encoding on the web, and because
Cyrillic glyphs were recently added by Dmitrii Trifonov, we can now
support it as well :^)
This commit is contained in:
Idan Horowitz 2021-05-01 18:18:26 +03:00 committed by Linus Groh
parent 4b0098e52f
commit 87cabda80d
2 changed files with 38 additions and 0 deletions

View file

@ -51,6 +51,14 @@ HebrewDecoder& hebrew_decoder()
return *decoder;
}
// Returns the shared CyrillicDecoder, allocating it on the first call.
// The instance is created with `new` and is not released by this function,
// so the returned reference stays usable for subsequent calls.
CyrillicDecoder& cyrillic_decoder()
{
    static CyrillicDecoder* s_instance = nullptr;
    if (s_instance)
        return *s_instance;
    s_instance = new CyrillicDecoder;
    return *s_instance;
}
}
Decoder* decoder_for(const String& a_encoding)
@ -66,6 +74,8 @@ Decoder* decoder_for(const String& a_encoding)
return &latin2_decoder();
if (encoding.equals_ignoring_case("windows-1255"))
return &hebrew_decoder();
if (encoding.equals_ignoring_case("windows-1251"))
return &cyrillic_decoder();
dbgln("TextCodec: No decoder implemented for encoding '{}'", a_encoding);
return nullptr;
}
@ -304,4 +314,27 @@ String HebrewDecoder::to_utf8(const StringView& input)
return builder.to_string();
}
// Decodes Windows-1251 encoded bytes into a String.
//
// Bytes below 0x80 are appended unchanged. Every other byte is looked up in a
// 128-entry table (indexed by `byte - 0x80`) and the resulting code point is
// appended via StringBuilder::append_code_point().
String CyrillicDecoder::to_utf8(const StringView& input)
{
    // Code points for the bytes 0x80..0xFF, sixteen entries per row.
    static constexpr Array<u32, 128> high_half_code_points = {
        // 0x80..0x8F
        0x402, 0x403, 0x201A, 0x453, 0x201E, 0x2026, 0x2020, 0x2021, 0x20AC, 0x2030, 0x409, 0x2039, 0x40A, 0x40C, 0x40B, 0x40F,
        // 0x90..0x9F (0x98 has no assignment and yields U+FFFD)
        0x452, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014, 0xFFFD, 0x2122, 0x459, 0x203A, 0x45A, 0x45C, 0x45B, 0x45F,
        // 0xA0..0xAF
        0xA0, 0x40E, 0x45E, 0x408, 0xA4, 0x490, 0xA6, 0xA7, 0x401, 0xA9, 0x404, 0xAB, 0xAC, 0xAD, 0xAE, 0x407,
        // 0xB0..0xBF
        0xB0, 0xB1, 0x406, 0x456, 0x491, 0xB5, 0xB6, 0xB7, 0x451, 0x2116, 0x454, 0xBB, 0x458, 0x405, 0x455, 0x457,
        // 0xC0..0xCF
        0x410, 0x411, 0x412, 0x413, 0x414, 0x415, 0x416, 0x417, 0x418, 0x419, 0x41A, 0x41B, 0x41C, 0x41D, 0x41E, 0x41F,
        // 0xD0..0xDF
        0x420, 0x421, 0x422, 0x423, 0x424, 0x425, 0x426, 0x427, 0x428, 0x429, 0x42A, 0x42B, 0x42C, 0x42D, 0x42E, 0x42F,
        // 0xE0..0xEF
        0x430, 0x431, 0x432, 0x433, 0x434, 0x435, 0x436, 0x437, 0x438, 0x439, 0x43A, 0x43B, 0x43C, 0x43D, 0x43E, 0x43F,
        // 0xF0..0xFF
        0x440, 0x441, 0x442, 0x443, 0x444, 0x445, 0x446, 0x447, 0x448, 0x449, 0x44A, 0x44B, 0x44C, 0x44D, 0x44E, 0x44F
    };

    StringBuilder output(input.length());
    for (unsigned char byte : input) {
        // The lower half is identical to ASCII and is passed through as-is.
        if (byte < 0x80) {
            output.append(byte);
            continue;
        }

        // Upper half: translate through the table.
        output.append_code_point(high_half_code_points[byte - 0x80]);
    }
    return output.to_string();
}
}