// Patch overview: adds Unicode escape decoding to AK::GenericLexer.
//
// Files touched by this diff:
//   * AK/GenericLexer.cpp         - three new member function definitions.
//   * AK/GenericLexer.h           - new UnicodeEscapeError enum, one public and
//                                   two private declarations.
//   * Tests/AK/TestGenericLexer.cpp - new TEST_CASE(consume_escaped_code_point).
//
// NOTE(review): in this copy of the patch the `#include` lines have no header
// names and `Result` / `Optional` have no template arguments. That text was
// presumably lost when the patch was extracted; confirm against the upstream
// commit before applying.
//
// GenericLexer::consume_escaped_code_point(combine_surrogate_pairs = true)
//   Entry point. The input at the current position must start with a backslash
//   followed by `u`; otherwise MalformedUnicodeEscape is returned. If the next
//   character is `{` the braced form is decoded by decode_code_point();
//   otherwise the fixed-width form is decoded by
//   decode_single_or_paired_surrogate().
//
// GenericLexer::decode_code_point()  (private)
//   Braced form. Consumes the `{` (its presence is asserted with VERIFY), then
//   loops: each iteration requires a hex digit (else MalformedUnicodeEscape),
//   shifts the accumulator left by 4 bits and ORs the digit in. If the new
//   value is smaller than the previous one the shift wrapped around, and
//   UnicodeEscapeOverflow is returned. The loop ends when `}` is consumed.
//   Because a digit is required before `}` is looked for, an empty `{}` is
//   malformed. The final value is returned only if is_unicode() accepts it;
//   otherwise UnicodeEscapeOverflow is returned.
//
// GenericLexer::decode_single_or_paired_surrogate(combine_surrogate_pairs)  (private)
//   Fixed-width form. A local lambda reads exactly surrogate_length (4) hex
//   digits into a u16, yielding an empty Optional if a non-hex character is
//   met first. Steps:
//     1. Read the first unit; failure -> MalformedUnicodeEscape.
//     2. If it is not a high surrogate, return it as is.
//     3. If combining is disabled, or no second backslash-u prefix follows,
//        return the high surrogate as is.
//     4. Read the second unit; failure -> MalformedUnicodeEscape.
//     5. If it is a low surrogate, return
//        Utf16View::decode_surrogate_pair(high, low).
//     6. Otherwise retreat(6) - the 2-character prefix plus 4 hex digits just
//        consumed - so the second escape stays unread, and return the high
//        surrogate alone.
//
// Header changes
//   UnicodeEscapeError has two enumerators: MalformedUnicodeEscape and
//   UnicodeEscapeOverflow. The two decode_* helpers are declared in a new
//   `private:` section placed after the existing protected members.
//
// Test coverage (TEST_CASE(consume_escaped_code_point))
//   The helper lambda builds a lexer over the input, calls
//   consume_escaped_code_point(), checks that actual and expected agree on
//   is_error(), then compares either the errors or the values. Cases cover
//   malformed braced escapes, overflow (`110000`, `f00000000`), valid braced
//   values up to 10ffff, truncated fixed-width escapes, a high surrogate
//   followed by a truncated second escape, plain 4-digit values, a lone high
//   surrogate, a high surrogate followed by a non-low unit, a combined pair
//   (0x1f600), and the same pair with combining disabled (0xd83d).
diff --git a/AK/GenericLexer.cpp b/AK/GenericLexer.cpp index 2cbabf3d3e..7a0372d6a0 100644 --- a/AK/GenericLexer.cpp +++ b/AK/GenericLexer.cpp @@ -5,9 +5,11 @@ */ #include +#include #include #include #include +#include namespace AK { // Consume a number of characters @@ -128,4 +130,74 @@ String GenericLexer::consume_and_unescape_string(char escape_char) return builder.to_string(); } +auto GenericLexer::consume_escaped_code_point(bool combine_surrogate_pairs) -> Result +{ + if (!consume_specific("\\u"sv)) + return UnicodeEscapeError::MalformedUnicodeEscape; + + if (next_is('{')) + return decode_code_point(); + return decode_single_or_paired_surrogate(combine_surrogate_pairs); +} + +auto GenericLexer::decode_code_point() -> Result +{ + bool starts_with_open_bracket = consume_specific('{'); + VERIFY(starts_with_open_bracket); + + u32 code_point = 0; + + while (true) { + if (!next_is(is_ascii_hex_digit)) + return UnicodeEscapeError::MalformedUnicodeEscape; + + auto new_code_point = (code_point << 4u) | parse_ascii_hex_digit(consume()); + if (new_code_point < code_point) + return UnicodeEscapeError::UnicodeEscapeOverflow; + + code_point = new_code_point; + if (consume_specific('}')) + break; + } + + if (is_unicode(code_point)) + return code_point; + return UnicodeEscapeError::UnicodeEscapeOverflow; +} + +auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pairs) -> Result +{ + constexpr size_t surrogate_length = 4; + + auto decode_one_surrogate = [&]() -> Optional { + u16 surrogate = 0; + + for (size_t i = 0; i < surrogate_length; ++i) { + if (!next_is(is_ascii_hex_digit)) + return {}; + + surrogate = (surrogate << 4u) | parse_ascii_hex_digit(consume()); + } + + return surrogate; + }; + + auto high_surrogate = decode_one_surrogate(); + if (!high_surrogate.has_value()) + return UnicodeEscapeError::MalformedUnicodeEscape; + if (!Utf16View::is_high_surrogate(*high_surrogate)) + return *high_surrogate; + if (!combine_surrogate_pairs || 
!consume_specific("\\u"sv)) + return *high_surrogate; + + auto low_surrogate = decode_one_surrogate(); + if (!low_surrogate.has_value()) + return UnicodeEscapeError::MalformedUnicodeEscape; + if (Utf16View::is_low_surrogate(*low_surrogate)) + return Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate); + + retreat(6); + return *high_surrogate; +} + } diff --git a/AK/GenericLexer.h b/AK/GenericLexer.h index e824f473a8..7a01060068 100644 --- a/AK/GenericLexer.h +++ b/AK/GenericLexer.h @@ -6,6 +6,7 @@ #pragma once +#include #include namespace AK { @@ -115,6 +116,13 @@ public: StringView consume_quoted_string(char escape_char = 0); String consume_and_unescape_string(char escape_char = '\\'); + enum class UnicodeEscapeError { + MalformedUnicodeEscape, + UnicodeEscapeOverflow, + }; + + Result consume_escaped_code_point(bool combine_surrogate_pairs = true); + constexpr void ignore(size_t count = 1) { count = min(count, m_input.length() - m_index); @@ -201,6 +209,10 @@ public: protected: StringView m_input; size_t m_index { 0 }; + +private: + Result decode_code_point(); + Result decode_single_or_paired_surrogate(bool combine_surrogate_pairs); }; constexpr auto is_any_of(const StringView& values) diff --git a/Tests/AK/TestGenericLexer.cpp b/Tests/AK/TestGenericLexer.cpp index 29089ec5d4..86e2383ed5 100644 --- a/Tests/AK/TestGenericLexer.cpp +++ b/Tests/AK/TestGenericLexer.cpp @@ -156,3 +156,51 @@ TEST_CASE(should_constexpr_ignore_until_pred) }(); static_assert(sut.peek() == 'c'); } + +TEST_CASE(consume_escaped_code_point) +{ + auto test = [](StringView test, Result expected, bool combine_surrogate_pairs = true) { + GenericLexer lexer(test); + + auto actual = lexer.consume_escaped_code_point(combine_surrogate_pairs); + EXPECT_EQ(actual.is_error(), expected.is_error()); + + if (actual.is_error() && expected.is_error()) + EXPECT_EQ(actual.error(), expected.error()); + else + EXPECT_EQ(actual.value(), expected.value()); + }; + + test("\\u"sv, 
GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape); + test("\\u{"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape); + test("\\u{1"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape); + test("\\u{}"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape); + test("\\u{x}"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape); + + test("\\u{110000}"sv, GenericLexer::UnicodeEscapeError::UnicodeEscapeOverflow); + test("\\u{f00000000}"sv, GenericLexer::UnicodeEscapeError::UnicodeEscapeOverflow); + + test("\\u{0}"sv, 0); + test("\\u{41}"sv, 0x41); + test("\\u{ffff}"sv, 0xffff); + test("\\u{10ffff}"sv, 0x10ffff); + + test("\\u1"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape); + test("\\u11"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape); + test("\\u111"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape); + test("\\u111x"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape); + test("\\ud800\\u"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape); + test("\\ud800\\u1"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape); + test("\\ud800\\u11"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape); + test("\\ud800\\u111"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape); + test("\\ud800\\u111x"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape); + + test("\\u0000"sv, 0x0); + test("\\u0041"sv, 0x41); + test("\\uffff"sv, 0xffff); + + test("\\ud83d"sv, 0xd83d); + test("\\ud83d\\u1111"sv, 0xd83d); + test("\\ud83d\\ude00"sv, 0x1f600); + test("\\ud83d\\ude00"sv, 0xd83d, false); +}