1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 17:07:34 +00:00

AK: Invalidate UTF-8 encoded code points larger than U+10ffff

On oss-fuzz, the LibJS REPL is provided a file encoded with Windows-1252
with the following contents:

    /ô¡°½/

In Windows-1252, the above is represented as the bytes
[0x2f 0xf4 0xa1 0xb0 0xbd 0x2f], but the REPL assumes the input file is
UTF-8. The inner 4 bytes are actually a valid UTF-8 encoding if we only
look at the most significant bits to parse leading/continuation bytes.
However, they decode to the code point U+121c3d, which is not a valid
code point because it lies beyond the Unicode maximum of U+10ffff.

This commit adds additional validation to ensure the decoded code point
itself is also valid.
This commit is contained in:
Timothy Flynn 2022-04-03 09:03:14 -04:00 committed by Linus Groh
parent 119873b822
commit 9e5abec6f1
2 changed files with 20 additions and 3 deletions

View file

@ -70,6 +70,16 @@ TEST_CASE(validate_invalid_ut8)
Utf8View utf8_4 { StringView { invalid_utf8_4 } };
EXPECT(!utf8_4.validate(valid_bytes));
EXPECT(valid_bytes == 0);
char invalid_utf8_5[] = { (char)0xf4, (char)0x8f, (char)0xbf, (char)0xc0, 0 }; // U+110000
Utf8View utf8_5 { StringView { invalid_utf8_5 } };
EXPECT(!utf8_5.validate(valid_bytes));
EXPECT(valid_bytes == 0);
char invalid_utf8_6[] = { (char)0xf4, (char)0xa1, (char)0xb0, (char)0xbd, 0 }; // U+121c3d
Utf8View utf8_6 { StringView { invalid_utf8_6 } };
EXPECT(!utf8_6.validate(valid_bytes));
EXPECT(valid_bytes == 0);
}
TEST_CASE(iterate_utf8)