From e82bee86dd06a5c78d29632b5edbc9063a3e8785 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?kleines=20Filmr=C3=B6llchen?= Date: Tue, 27 Jun 2023 20:28:37 +0200 Subject: [PATCH] LibAudio: Account for 0xFF start byte in FLAC UTF-8 decoder This specialized UTF-8 decoder is more powerful than a normal UTF-8 decoder anyway, but it couldn't account for the 0xFF start byte, which is never spec-compliant. This commit makes that byte behave as expected when UTF-8 is taken to its logical extreme, even if it is a little silly and likely not relevant for real applications. --- Userland/Libraries/LibAudio/FlacLoader.cpp | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/Userland/Libraries/LibAudio/FlacLoader.cpp b/Userland/Libraries/LibAudio/FlacLoader.cpp index c935310adc..93cddc443e 100644 --- a/Userland/Libraries/LibAudio/FlacLoader.cpp +++ b/Userland/Libraries/LibAudio/FlacLoader.cpp @@ -958,13 +958,20 @@ ErrorOr read_utf8_char(BigEndianInputBitStream& input) } else if ((start_byte & 0b11000000) == 0b10000000) { return Error::from_string_literal("Illegal continuation byte"); } - // This algorithm is too good and supports the theoretical max 0xFF start byte + // This algorithm supports the theoretical max 0xFF start byte, which is not part of the regular UTF-8 spec. u8 length = 1; while (((start_byte << length) & 0b10000000) == 0b10000000) ++length; - u8 bits_from_start_byte = 8 - (length + 1); - u8 start_byte_bitmask = AK::exp2(bits_from_start_byte) - 1; - character = start_byte_bitmask & start_byte; + + // This is technically not spec-compliant, but if we take UTF-8 to its logical extreme, + // we can say 0xFF means there's 7 following continuation bytes and no data at all in the leading character. 
+ if (length == 8) [[unlikely]] { + character = 0; + } else { + u8 bits_from_start_byte = 8 - (length + 1); + u8 start_byte_bitmask = AK::exp2(bits_from_start_byte) - 1; + character = start_byte_bitmask & start_byte; + } for (u8 i = length - 1; i > 0; --i) { u8 current_byte = TRY(input.read_value()); character = (character << 6) | (current_byte & 0b00111111);