1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 00:37:34 +00:00

AK: Invalidate UTF-8 encoded code points larger than U+10ffff

On oss-fuzz, the LibJS REPL is provided a file encoded with Windows-1252
with the following contents:

    /ô¡°½/

In Windows-1252, the above is represented as the bytes
[0x2f 0xf4 0xa1 0xb0 0xbd 0x2f], but the REPL assumes the input file is
UTF-8. The inner 4 bytes are actually a valid UTF-8 encoding if we only
look at the most significant bits to parse leading/continuation bytes.
However, they decode to the code point U+121c3d, which is not a valid
code point because it is larger than the maximum, U+10ffff.

This commit adds additional validation to ensure the decoded code point
itself is also valid.
This commit is contained in:
Timothy Flynn 2022-04-03 09:03:14 -04:00 committed by Linus Groh
parent 119873b822
commit 9e5abec6f1
2 changed files with 20 additions and 3 deletions

View file

@ -6,6 +6,7 @@
*/
#include <AK/Assertions.h>
#include <AK/CharacterTypes.h>
#include <AK/Format.h>
#include <AK/Utf8View.h>
@ -100,9 +101,9 @@ bool Utf8View::validate(size_t& valid_bytes) const
{
valid_bytes = 0;
for (auto ptr = begin_ptr(); ptr < end_ptr(); ptr++) {
size_t code_point_length_in_bytes;
u32 value;
bool first_byte_makes_sense = decode_first_byte(*ptr, code_point_length_in_bytes, value);
size_t code_point_length_in_bytes = 0;
u32 code_point = 0;
bool first_byte_makes_sense = decode_first_byte(*ptr, code_point_length_in_bytes, code_point);
if (!first_byte_makes_sense)
return false;
@ -112,8 +113,14 @@ bool Utf8View::validate(size_t& valid_bytes) const
return false;
if (*ptr >> 6 != 2)
return false;
code_point <<= 6;
code_point |= *ptr & 63;
}
if (!is_unicode(code_point))
return false;
valid_bytes += code_point_length_in_bytes;
}