1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-05-28 22:05:06 +00:00

AK: Add GenericLexer API to consume an escaped Unicode code point

This parsing is already duplicated between LibJS and LibRegex, and will
shortly be needed in more places in those libraries. Move it to AK to
prevent further duplication.

This API will consume escaped Unicode code points of the form:
    \\u{code point}
    \\unnnn (where each n is a hexadecimal digit)
    \\unnnn\\unnnn (where the two escaped values are a surrogate pair)
This commit is contained in:
Timothy Flynn 2021-08-17 22:20:04 -04:00 committed by Andreas Kling
parent 02e3633b7f
commit fd8ccedf2b
3 changed files with 132 additions and 0 deletions

View file

@ -5,9 +5,11 @@
*/ */
#include <AK/Assertions.h> #include <AK/Assertions.h>
#include <AK/CharacterTypes.h>
#include <AK/GenericLexer.h> #include <AK/GenericLexer.h>
#include <AK/String.h> #include <AK/String.h>
#include <AK/StringBuilder.h> #include <AK/StringBuilder.h>
#include <AK/Utf16View.h>
namespace AK { namespace AK {
// Consume a number of characters // Consume a number of characters
@ -128,4 +130,74 @@ String GenericLexer::consume_and_unescape_string(char escape_char)
return builder.to_string(); return builder.to_string();
} }
// Consumes one escaped Unicode code point at the current position.
//
// The input must start with a backslash followed by 'u'; otherwise MalformedUnicodeEscape is returned.
// If the next character is '{', the braced form is decoded. Anything else is decoded as a
// four-hex-digit escape, which may be combined with a second such escape into one code point
// when combine_surrogate_pairs is true.
auto GenericLexer::consume_escaped_code_point(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
{
    if (!consume_specific("\\u"sv))
        return UnicodeEscapeError::MalformedUnicodeEscape;

    // The character right after the prefix selects which of the two escape forms to decode.
    return next_is('{')
        ? decode_code_point()
        : decode_single_or_paired_surrogate(combine_surrogate_pairs);
}
// Decodes the braced form of an escape, i.e. the "{hex digits}" part that follows the escape prefix.
// The caller must already have checked that the next character is '{' (this is VERIFY'd).
//
// Returns:
//   - the decoded value, if one or more hex digits are followed by '}' and is_unicode() accepts the value
//   - MalformedUnicodeEscape, if something other than a hex digit is found where a digit is required
//     (this covers "{}" as well as input that ends before the closing '}')
//   - UnicodeEscapeOverflow, if the value does not fit in 32 bits or is rejected by is_unicode()
auto GenericLexer::decode_code_point() -> Result<u32, UnicodeEscapeError>
{
    bool starts_with_open_bracket = consume_specific('{');
    VERIFY(starts_with_open_bracket);

    u32 code_point = 0;

    while (true) {
        if (!next_is(is_ascii_hex_digit))
            return UnicodeEscapeError::MalformedUnicodeEscape;

        auto digit = parse_ascii_hex_digit(consume());

        // Appending a digit shifts the top four bits out of the accumulator, so any bit set there means
        // the value no longer fits. Testing those bits directly catches every overflow. Comparing the
        // shifted result against the previous value does not: the truncated result can still be the
        // larger of the two (0x11111111 followed by 'f' wraps to 0x1111111f).
        if ((code_point >> 28u) != 0)
            return UnicodeEscapeError::UnicodeEscapeOverflow;

        code_point = (code_point << 4u) | digit;

        if (consume_specific('}'))
            break;
    }

    // A value that fits in 32 bits may still lie outside the Unicode range.
    if (is_unicode(code_point))
        return code_point;
    return UnicodeEscapeError::UnicodeEscapeOverflow;
}
// Decodes the unbraced form of an escape: exactly four hex digits, read right after the escape prefix.
//
// If the decoded value is a high surrogate, combine_surrogate_pairs is true, and another escape prefix
// follows immediately, a second four-digit value is read. When that value is a low surrogate, the two
// are combined into a single code point; otherwise the lexer steps back over the second escape and
// only the first value is returned.
//
// Returns MalformedUnicodeEscape if either four-digit group is cut short by a non-hex-digit.
auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pairs) -> Result<u32, UnicodeEscapeError>
{
    constexpr size_t hex_digits_per_code_unit = 4;

    // Reads four hex digits into a 16-bit value. Yields an empty Optional as soon as a character that
    // is not a hex digit is seen; digits consumed before that point stay consumed.
    auto read_code_unit = [&]() -> Optional<u16> {
        u16 code_unit = 0;
        size_t digits_read = 0;

        while (digits_read < hex_digits_per_code_unit) {
            if (!next_is(is_ascii_hex_digit))
                return {};

            code_unit = (code_unit << 4u) | parse_ascii_hex_digit(consume());
            ++digits_read;
        }

        return code_unit;
    };

    auto lead = read_code_unit();
    if (!lead.has_value())
        return UnicodeEscapeError::MalformedUnicodeEscape;

    // Evaluation order matters: the second escape prefix is only consumed when the first value is a
    // high surrogate and the caller asked for pairs to be combined.
    if (!Utf16View::is_high_surrogate(*lead) || !combine_surrogate_pairs || !consume_specific("\\u"sv))
        return *lead;

    auto trail = read_code_unit();
    if (!trail.has_value())
        return UnicodeEscapeError::MalformedUnicodeEscape;

    if (!Utf16View::is_low_surrogate(*trail)) {
        // Not a pair after all. Step back over the second escape (two prefix characters plus four
        // digits) so that it is left in the input, and report the first value on its own.
        retreat(6);
        return *lead;
    }

    return Utf16View::decode_surrogate_pair(*lead, *trail);
}
} }

View file

@ -6,6 +6,7 @@
#pragma once #pragma once
#include <AK/Result.h>
#include <AK/StringView.h> #include <AK/StringView.h>
namespace AK { namespace AK {
@ -115,6 +116,13 @@ public:
StringView consume_quoted_string(char escape_char = 0); StringView consume_quoted_string(char escape_char = 0);
String consume_and_unescape_string(char escape_char = '\\'); String consume_and_unescape_string(char escape_char = '\\');
// Reasons for which consume_escaped_code_point() can fail.
enum class UnicodeEscapeError {
// The escape prefix is missing, or a hex digit was expected but something else (or nothing) was found.
MalformedUnicodeEscape,
// The value of a braced escape overflowed while being accumulated, or is not a Unicode code point.
UnicodeEscapeOverflow,
};
// Consumes an escaped Unicode code point: either a braced sequence of hex digits, or exactly four hex
// digits. With combine_surrogate_pairs set (the default), a four-digit high surrogate that is directly
// followed by an escaped low surrogate is combined with it into a single code point.
Result<u32, UnicodeEscapeError> consume_escaped_code_point(bool combine_surrogate_pairs = true);
constexpr void ignore(size_t count = 1) constexpr void ignore(size_t count = 1)
{ {
count = min(count, m_input.length() - m_index); count = min(count, m_input.length() - m_index);
@ -201,6 +209,10 @@ public:
protected: protected:
StringView m_input; StringView m_input;
size_t m_index { 0 }; size_t m_index { 0 };
private:
// Helpers for consume_escaped_code_point(); both are called after the escape prefix has been consumed.
// Decodes the braced form; the next character must be '{'.
Result<u32, UnicodeEscapeError> decode_code_point();
// Decodes the four-hex-digit form, optionally combining a high/low surrogate pair into one code point.
Result<u32, UnicodeEscapeError> decode_single_or_paired_surrogate(bool combine_surrogate_pairs);
}; };
constexpr auto is_any_of(const StringView& values) constexpr auto is_any_of(const StringView& values)

View file

@ -156,3 +156,51 @@ TEST_CASE(should_constexpr_ignore_until_pred)
}(); }();
static_assert(sut.peek() == 'c'); static_assert(sut.peek() == 'c');
} }
TEST_CASE(consume_escaped_code_point)
{
    // Lexes `input` from its start and checks that the outcome matches `expected`, which is either a
    // decoded code point or a specific UnicodeEscapeError.
    auto test = [](StringView input, Result<u32, GenericLexer::UnicodeEscapeError> expected, bool combine_surrogate_pairs = true) {
        GenericLexer lexer(input);

        auto actual = lexer.consume_escaped_code_point(combine_surrogate_pairs);
        EXPECT_EQ(actual.is_error(), expected.is_error());

        // Only compare payloads when both results are of the same kind. A success/error mismatch has
        // already been reported above, and in that case one side has no value to read.
        if (actual.is_error() && expected.is_error())
            EXPECT_EQ(actual.error(), expected.error());
        else if (!actual.is_error() && !expected.is_error())
            EXPECT_EQ(actual.value(), expected.value());
    };

    // Braced form: malformed input.
    test("\\u"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
    test("\\u{"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
    test("\\u{1"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
    test("\\u{}"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
    test("\\u{x}"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);

    // Braced form: value out of range.
    test("\\u{110000}"sv, GenericLexer::UnicodeEscapeError::UnicodeEscapeOverflow);
    test("\\u{f00000000}"sv, GenericLexer::UnicodeEscapeError::UnicodeEscapeOverflow);

    // Braced form: valid values.
    test("\\u{0}"sv, 0);
    test("\\u{41}"sv, 0x41);
    test("\\u{ffff}"sv, 0xffff);
    test("\\u{10ffff}"sv, 0x10ffff);

    // Four-digit form: too few digits or a non-hex-digit.
    test("\\u1"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
    test("\\u11"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
    test("\\u111"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
    test("\\u111x"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);

    // High surrogate followed by a malformed second escape.
    test("\\ud800\\u"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
    test("\\ud800\\u1"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
    test("\\ud800\\u11"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
    test("\\ud800\\u111"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);
    test("\\ud800\\u111x"sv, GenericLexer::UnicodeEscapeError::MalformedUnicodeEscape);

    // Four-digit form: valid values, with and without surrogate pair combination.
    test("\\u0000"sv, 0x0);
    test("\\u0041"sv, 0x41);
    test("\\uffff"sv, 0xffff);
    test("\\ud83d"sv, 0xd83d);
    test("\\ud83d\\u1111"sv, 0xd83d);
    test("\\ud83d\\ude00"sv, 0x1f600);
    test("\\ud83d\\ude00"sv, 0xd83d, false);
}