mirror of
https://github.com/RGBCube/serenity
synced 2025-05-16 19:15:07 +00:00
LibTextCodec: Add BOM sniffer
This takes the input and sniffs it for a BOM. If it has the UTF-8 or UTF-16BE BOM, it will return their respective decoder. Currently we don't have a UTF-16LE decoder, so it will assert TODO if it detects a UTF-16LE BOM. If there is no recognisable BOM, it will return no decoder.
This commit is contained in:
parent
4ccade42b7
commit
94965ba28d
2 changed files with 38 additions and 0 deletions
|
@ -141,6 +141,41 @@ Optional<String> get_standardized_encoding(const String& encoding)
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// https://encoding.spec.whatwg.org/#bom-sniff
|
||||||
|
Decoder* bom_sniff_to_decoder(StringView input)
{
    // Inspects the leading bytes of `input` for a Unicode byte order mark and
    // returns a pointer to the matching decoder, or nullptr if `input` does not
    // start with a recognised BOM.
    //
    // 1. Let BOM be the result of peeking 3 bytes from ioQueue, converted to a byte sequence.
    // 2. For each of the rows in the table below, starting with the first one and going down,
    //    if BOM starts with the bytes given in the first column, then return the encoding given
    //    in the cell in the second column of that row. Otherwise, return null.

    // Byte Order Mark | Encoding
    // --------------------------
    // 0xEF 0xBB 0xBF  | UTF-8
    // 0xFE 0xFF       | UTF-16BE
    // 0xFF 0xFE       | UTF-16LE

    auto bytes = input.bytes();

    // The shortest BOM in the table is two bytes long.
    if (bytes.size() < 2)
        return nullptr;

    auto first_byte = bytes[0];

    switch (first_byte) {
    case 0xEF: // UTF-8
        // The UTF-8 BOM is three bytes long, so a two-byte input cannot match.
        if (bytes.size() < 3)
            return nullptr;
        return bytes[1] == 0xBB && bytes[2] == 0xBF ? &s_utf8_decoder : nullptr;
    case 0xFE: // UTF-16BE
        return bytes[1] == 0xFF ? &s_utf16be_decoder : nullptr;
    case 0xFF: // UTF-16LE
        // Only a full 0xFF 0xFE sequence is a UTF-16LE BOM. A lone leading 0xFF
        // followed by any other byte is not a BOM and must yield "no decoder"
        // rather than reaching the TODO() below.
        if (bytes[1] != 0xFE)
            return nullptr;
        // FIXME: There is currently no UTF-16LE decoder.
        TODO();
    }

    return nullptr;
}
|
||||||
|
|
||||||
String Decoder::to_utf8(StringView input)
|
String Decoder::to_utf8(StringView input)
|
||||||
{
|
{
|
||||||
StringBuilder builder(input.length());
|
StringBuilder builder(input.length());
|
||||||
|
|
|
@ -70,4 +70,7 @@ public:
|
||||||
Decoder* decoder_for(String const& encoding);
|
Decoder* decoder_for(String const& encoding);
|
||||||
Optional<String> get_standardized_encoding(const String& encoding);
|
Optional<String> get_standardized_encoding(const String& encoding);
|
||||||
|
|
||||||
|
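// This returns the appropriate Unicode decoder for the sniffed BOM, or nullptr if the input does not start with a recognised BOM.
// NOTE: A UTF-16LE BOM currently hits TODO(), as there is no UTF-16LE decoder yet.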
|
||||||
|
Decoder* bom_sniff_to_decoder(StringView);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue