mirror of
https://github.com/RGBCube/serenity
synced 2025-07-25 15:37:46 +00:00
AK+Tests: Avoid creating invalid code points from malformed UTF-8
Instead of doing anything reasonable, Utf8CodePointIterator returned invalid code points, for example U+123456. However, many callers of this iterator assume that a code point is always at most 0x10FFFF. In fact, this is one of two reasons for the following OSS Fuzz issue: https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=49184 This is probably a very old bug. In the particular case of URLParser, AK::is_url_code_point got confused: return /* ... */ || code_point >= 0xA0; If code_point is a "code point" beyond 0x10FFFF, this violates the condition given in the preceding comment, but satisfies the given condition, which eventually causes URLParser to crash. This commit fixes *only* the erroneous UTF-8 decoding, and does not fully resolve OSS-Fuzz#49184.
This commit is contained in:
parent
3aeb57ed09
commit
ff8f3814cc
2 changed files with 31 additions and 0 deletions
|
@ -274,6 +274,10 @@ u32 Utf8CodePointIterator::operator*() const
|
||||||
code_point_value_so_far |= m_ptr[offset] & 63;
|
code_point_value_so_far |= m_ptr[offset] & 63;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (code_point_value_so_far > 0x10FFFF) {
|
||||||
|
dbgln_if(UTF8_DEBUG, "Multi-byte sequence is otherwise valid, but code point {:#x} is not permissible.", code_point_value_so_far);
|
||||||
|
return 0xFFFD;
|
||||||
|
}
|
||||||
return code_point_value_so_far;
|
return code_point_value_so_far;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -191,6 +191,33 @@ TEST_CASE(decode_invalid_ut8)
|
||||||
}
|
}
|
||||||
VERIFY(i == expected_size);
|
VERIFY(i == expected_size);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Test case 5 : Oversized four-byte sequence (e.g. U+123456)
|
||||||
|
{
|
||||||
|
// Want to encode: (000)1 0010 0011 0100 0101 0110
|
||||||
|
// Into mask: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
|
||||||
|
// Shifted: 100 100011 010001 010110
|
||||||
|
// Result: 11110100 10100011 10010001 10010110
|
||||||
|
char raw_data[] = { 'a', (char)0xF4, (char)0xA3, (char)0x91, (char)0x96, 'b' };
|
||||||
|
Utf8View view { StringView { raw_data, 6 } };
|
||||||
|
// This definition seems to suggest that we should instead output multiple replacement characters:
|
||||||
|
// https://encoding.spec.whatwg.org/#ref-for-concept-stream-prepend②
|
||||||
|
// This is supported by the plaintext description and example collection, which annoyingly does not give an example of how to deal with this:
|
||||||
|
// https://www.unicode.org/versions/Unicode14.0.0/ch03.pdf , section "U+FFFD Substitution of Maximal Subparts"
|
||||||
|
// However, that would go against how we deal with several other kinds of errors, so we stick to emitting only one U+FFFD.
|
||||||
|
u32 expected_characters[] = { 'a', 0xFFFD, 'b' };
|
||||||
|
String expected_underlying_bytes[] = { "a", "\xF4\xA3\x91\x96", "b" };
|
||||||
|
size_t expected_size = sizeof(expected_characters) / sizeof(expected_characters[0]);
|
||||||
|
size_t i = 0;
|
||||||
|
for (auto it = view.begin(); it != view.end(); ++it) {
|
||||||
|
u32 code_point = *it;
|
||||||
|
VERIFY(i < expected_size);
|
||||||
|
EXPECT_EQ(code_point, expected_characters[i]);
|
||||||
|
EXPECT_EQ(it.underlying_code_point_bytes(), expected_underlying_bytes[i].bytes());
|
||||||
|
i++;
|
||||||
|
}
|
||||||
|
VERIFY(i == expected_size);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE(trim)
|
TEST_CASE(trim)
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue