LibAudio: Account for 0xFF start byte in FLAC UTF-8 decoder

This specialized UTF-8 decoder is already more powerful than a normal UTF-8 decoder, but it couldn't handle the 0xFF start byte, which is never spec-compliant. This commit makes that byte behave as one would expect when taking UTF-8 to its logical extreme, even if that is a little silly and likely not relevant for real applications.
2025-09-15 02:06:17 +00:00 · 2023-06-27 20:28:37 +02:00 · 2023-06-27 20:28:37 +02:00 · e82bee86dd
commit e82bee86dd
parent 8bc56c7fb0
1 changed file with 11 additions and 4 deletions
--- a/Userland/Libraries/LibAudio/FlacLoader.cpp
+++ b/Userland/Libraries/LibAudio/FlacLoader.cpp
@@ -958,13 +958,20 @@ ErrorOr<u64> read_utf8_char(BigEndianInputBitStream& input)
    } else if ((start_byte & 0b11000000) == 0b10000000) {
        return Error::from_string_literal("Illegal continuation byte");
    }
-    // This algorithm is too good and supports the theoretical max 0xFF start byte
+    // This algorithm supports the theoretical max 0xFF start byte, which is not part of the regular UTF-8 spec.
    u8 length = 1;
    while (((start_byte << length) & 0b10000000) == 0b10000000)
        ++length;
-    u8 bits_from_start_byte = 8 - (length + 1);
-    u8 start_byte_bitmask = AK::exp2(bits_from_start_byte) - 1;
-    character = start_byte_bitmask & start_byte;
+
+    // This is technically not spec-compliant, but if we take UTF-8 to its logical extreme,
+    // we can say 0xFF means there's 7 following continuation bytes and no data at all in the leading character.
+    if (length == 8) [[unlikely]] {
+        character = 0;
+    } else {
+        u8 bits_from_start_byte = 8 - (length + 1);
+        u8 start_byte_bitmask = AK::exp2(bits_from_start_byte) - 1;
+        character = start_byte_bitmask & start_byte;
+    }
    for (u8 i = length - 1; i > 0; --i) {
        u8 current_byte = TRY(input.read_value<u8>());
        character = (character << 6) | (current_byte & 0b00111111);