1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 18:47:34 +00:00

AK: Invalidate overlong UTF-8 code point encodings

For example, the code point U+002F could be encoded as UTF-8 with the
overlong two-byte sequence 0xC0 0xAF. This trick has historically been
used to bypass security checks.
This commit is contained in:
Timothy Flynn 2023-03-03 08:40:12 -05:00 committed by Tim Flynn
parent 796a615bc1
commit c4d78c29a2
2 changed files with 58 additions and 6 deletions

View file

@ -7,7 +7,6 @@
#pragma once
#include <AK/CharacterTypes.h>
#include <AK/DeprecatedString.h>
#include <AK/Format.h>
#include <AK/StringView.h>
@ -142,7 +141,7 @@ public:
code_point |= code_point_bits;
}
if (!is_unicode(code_point))
if (!is_valid_code_point(code_point, byte_length))
return false;
valid_bytes += byte_length;
@ -162,13 +161,15 @@ private:
size_t byte_length { 0 };
u8 encoding_bits { 0 };
u8 encoding_mask { 0 };
u32 first_code_point { 0 };
u32 last_code_point { 0 };
};
// Per-length UTF-8 encoding table: one row per encoded sequence length (1 to 4 bytes).
// Values follow the Utf8EncodedByteData field order: byte_length, encoding_bits,
// encoding_mask, first_code_point, last_code_point (the last two appear only on the
// five-value rows).
// is_valid_code_point() consults first_code_point/last_code_point to require that a
// decoded code point falls in the range belonging to the byte_length it was encoded with.
// NOTE(review): encoding_bits/encoding_mask look like the leading-byte bit pattern and
// the mask used to match it - confirm against the leading-byte decoding code.
static constexpr Array<Utf8EncodedByteData, 4> utf8_encoded_byte_data { {
{ 1, 0b0000'0000, 0b1000'0000 },
{ 2, 0b1100'0000, 0b1110'0000 },
{ 3, 0b1110'0000, 0b1111'0000 },
{ 4, 0b1111'0000, 0b1111'1000 },
{ 1, 0b0000'0000, 0b1000'0000, 0x0000, 0x007F },
{ 2, 0b1100'0000, 0b1110'0000, 0x0080, 0x07FF },
{ 3, 0b1110'0000, 0b1111'0000, 0x0800, 0xFFFF },
{ 4, 0b1111'0000, 0b1111'1000, 0x10000, 0x10FFFF },
} };
struct LeadingByte {
@ -208,6 +209,16 @@ private:
return { .is_valid = false };
}
// Checks that `code_point` was encoded with the sequence length its value requires.
//
// Walks utf8_encoded_byte_data looking for the first entry whose
// [first_code_point, last_code_point] range contains `code_point`, and reports whether
// that entry's byte_length equals the `byte_length` actually used. A longer-than-needed
// (overlong) encoding therefore yields false, as does a code point that no entry covers.
static constexpr bool is_valid_code_point(u32 code_point, size_t byte_length)
{
    for (auto const& entry : utf8_encoded_byte_data) {
        bool const below_range = code_point < entry.first_code_point;
        bool const above_range = code_point > entry.last_code_point;

        // Not this entry's range; keep searching.
        if (below_range || above_range)
            continue;

        // The first matching range decides the result.
        return entry.byte_length == byte_length;
    }

    // No range contains the code point (e.g. it is above U+10FFFF).
    return false;
}
StringView m_string;
mutable size_t m_length { 0 };
mutable bool m_have_length { false };