LibTextCodec: Add BOM sniffer

This takes the input and sniffs it for a BOM. If the input starts with the UTF-8 or UTF-16BE BOM, it returns the corresponding decoder. We currently don't have a UTF-16LE decoder, so it will assert TODO if it detects a UTF-16LE BOM. If there is no recognisable BOM, it returns no decoder.
2025-10-22 18:32:07 +00:00 · 2022-02-11 20:58:06 +00:00 · 2022-02-11 20:58:06 +00:00 · 94965ba28d
commit 94965ba28d
parent 4ccade42b7
2 changed files with 38 additions and 0 deletions
--- a/Userland/Libraries/LibTextCodec/Decoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Decoder.cpp
@ -141,6 +141,41 @@ Optional<String> get_standardized_encoding(const String& encoding)
    return {};
 }
 // https://encoding.spec.whatwg.org/#bom-sniff
 // Inspects the leading bytes of `input` for a byte order mark and returns the matching
 // decoder, or nullptr when no BOM is recognised (including input too short to hold one).
 Decoder* bom_sniff_to_decoder(StringView input)
 {
    // 1. Let BOM be the result of peeking 3 bytes from ioQueue, converted to a byte sequence.
    // 2. For each of the rows in the table below, starting with the first one and going down,
    //    if BOM starts with the bytes given in the first column, then return the encoding given
    //    in the cell in the second column of that row. Otherwise, return null.
    // Byte Order Mark | Encoding
    // --------------------------
    // 0xEF 0xBB 0xBF  | UTF-8
    // 0xFE 0xFF       | UTF-16BE
    // 0xFF 0xFE       | UTF-16LE
    auto bytes = input.bytes();
    // Every BOM in the table is at least two bytes long.
    if (bytes.size() < 2)
        return nullptr;
    auto first_byte = bytes[0];
    switch (first_byte) {
    case 0xEF: // UTF-8
        // The UTF-8 BOM is the only three-byte entry, so it needs its own length check.
        if (bytes.size() < 3)
            return nullptr;
        return bytes[1] == 0xBB && bytes[2] == 0xBF ? &s_utf8_decoder : nullptr;
    case 0xFE: // UTF-16BE
        return bytes[1] == 0xFF ? &s_utf16be_decoder : nullptr;
    case 0xFF: // UTF-16LE
        // Only the full 0xFF 0xFE sequence is a BOM; a leading 0xFF followed by anything
        // else is ordinary data and must not trip the TODO below.
        if (bytes[1] != 0xFE)
            return nullptr;
        // FIXME: There is currently no UTF-16LE decoder.
        TODO();
    }
    return nullptr;
 }
 String Decoder::to_utf8(StringView input)
 {
    StringBuilder builder(input.length());
--- a/Userland/Libraries/LibTextCodec/Decoder.h
+++ b/Userland/Libraries/LibTextCodec/Decoder.h
@ -70,4 +70,7 @@ public:
 Decoder* decoder_for(String const& encoding);
 Optional<String> get_standardized_encoding(const String& encoding);
 // This returns the appropriate Unicode decoder for the sniffed BOM or nullptr if there is no appropriate decoder.
 // NOTE: There is no UTF-16LE decoder yet, so input that starts with a UTF-16LE BOM currently hits TODO().
 Decoder* bom_sniff_to_decoder(StringView);
 }