
LibWeb: Use StateTransaction for UnicodeRange parsing

...and change how the two parsing steps fit together.

The two steps were previously quite muddled. Both worked with the
TokenStream directly, and both were responsible for rewinding that
stream if there was an error. This was confusing, and it also made it
impossible to replace the rewinding with StateTransactions.

This commit more clearly divides the work between the two functions: One
parses ComponentValues and produces a string, and the other parses that
string to produce the UnicodeRange. It also replaces manual rewinding
in the former with StateTransactions.
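
For reference, below is a minimal sketch of the transaction pattern this
commit adopts. It is an illustration only, not the actual TokenStream
implementation: the assumption is that begin_transaction() records the
current stream position, and that the transaction rewinds the stream on
destruction unless commit() was called.

    // Illustrative sketch only; the names mirror the diff below, but the
    // internals are assumed rather than copied from LibWeb.
    template<typename T>
    class TokenStream {
    public:
        class StateTransaction {
        public:
            explicit StateTransaction(TokenStream& stream)
                : m_stream(stream)
                , m_saved_position(stream.m_position)
            {
            }

            // Rewind automatically unless the caller committed.
            ~StateTransaction()
            {
                if (!m_committed)
                    m_stream.m_position = m_saved_position;
            }

            void commit() { m_committed = true; }

        private:
            TokenStream& m_stream;
            size_t m_saved_position { 0 };
            bool m_committed { false };
        };

        StateTransaction begin_transaction() { return StateTransaction { *this }; }

        // Bounds checking omitted for brevity.
        T const& next_token() { return m_tokens[m_position++]; }

    private:
        Vector<T> m_tokens;
        size_t m_position { 0 };
    };

With such a pattern, the manual error() lambdas that called
rewind_to_position() become unnecessary: an early `return {}` simply lets
the uncommitted transaction rewind the stream, while the success paths
call commit() on both the local and the outer transaction.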
This commit is contained in:
Sam Atkins 2022-04-27 19:56:05 +01:00 committed by Andreas Kling
parent dc00657c4f
commit 2b3185955e
2 changed files with 66 additions and 66 deletions


@@ -2705,13 +2705,8 @@ Optional<Ratio> Parser::parse_ratio(TokenStream<ComponentValue>& tokens)
 // https://www.w3.org/TR/css-syntax-3/#urange-syntax
 Optional<UnicodeRange> Parser::parse_unicode_range(TokenStream<ComponentValue>& tokens)
 {
+    auto transaction = tokens.begin_transaction();
     tokens.skip_whitespace();
-    auto position = tokens.position();
-    auto error = [&]() -> Optional<UnicodeRange> {
-        tokens.rewind_to_position(position);
-        return {};
-    };
 
     // <urange> =
     // u '+' <ident-token> '?'* |
@@ -2723,9 +2718,9 @@ Optional<UnicodeRange> Parser::parse_unicode_range(TokenStream<ComponentValue>&
     // (All with no whitespace in between tokens.)
 
     // NOTE: Parsing this is different from usual. We take these steps:
-    // 1. Match the grammar above against the tokens.
-    // 2. Convert the matching tokens back into a string using their original representation.
-    // 3. Then, parse that string according to the spec algorithm.
+    // 1. Match the grammar above against the tokens, concatenating them into a string using their original representation.
+    // 2. Then, parse that string according to the spec algorithm.
+    //    Step 2 is performed by calling the other parse_unicode_range() overload.
 
     auto is_question_mark = [](ComponentValue const& component_value) {
         return component_value.is(Token::Type::Delim) && component_value.token().delim() == '?';
@@ -2738,78 +2733,102 @@ Optional<UnicodeRange> Parser::parse_unicode_range(TokenStream<ComponentValue>&
             || component_value.is(Token::Type::Whitespace);
     };
 
+    auto representation_of = [](ComponentValue const& component_value) {
+        // FIXME: This should use the "representation", that is, the original text that produced the token.
+        //        See: https://www.w3.org/TR/css-syntax-3/#representation
+        //        We don't have a way to get that, so instead, we're relying on Token::to_string(), and
+        //        handling specific cases where that's not enough.
+
+        // Integers like `+34` get serialized as `34`, so manually include the `+` sign.
+        if (component_value.is(Token::Type::Number) && component_value.token().number().is_integer_with_explicit_sign()) {
+            auto int_value = component_value.token().number().integer_value();
+            return String::formatted("{:+}", int_value);
+        }
+
+        return component_value.to_string();
+    };
+
+    auto create_unicode_range = [&](StringView text, auto& local_transaction) -> Optional<UnicodeRange> {
+        auto maybe_unicode_range = parse_unicode_range(text);
+        if (maybe_unicode_range.has_value()) {
+            local_transaction.commit();
+            transaction.commit();
+        }
+        return maybe_unicode_range;
+    };
+
     // All options start with 'u'/'U'.
     auto& u = tokens.next_token();
     if (!(u.is(Token::Type::Ident) && u.token().ident().equals_ignoring_case("u"))) {
         dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> does not start with 'u'");
-        return error();
+        return {};
     }
 
     auto& second_token = tokens.next_token();
-    auto after_second_token = tokens.position();
 
     // u '+' <ident-token> '?'* |
     // u '+' '?'+
     if (second_token.is(Token::Type::Delim) && second_token.token().delim() == '+') {
+        auto local_transaction = tokens.begin_transaction();
+        StringBuilder string_builder;
+        string_builder.append(representation_of(second_token));
+
         auto& third_token = tokens.next_token();
         if (third_token.is(Token::Type::Ident) || is_question_mark(third_token)) {
+            string_builder.append(representation_of(third_token));
             while (is_question_mark(tokens.peek_token()))
-                tokens.next_token();
+                string_builder.append(representation_of(tokens.next_token()));
             if (is_ending_token(tokens.peek_token()))
-                return create_unicode_range_from_tokens(tokens, position, tokens.position());
+                return create_unicode_range(string_builder.string_view(), local_transaction);
         }
-
-        tokens.rewind_to_position(after_second_token);
     }
 
     // u <dimension-token> '?'*
     if (second_token.is(Token::Type::Dimension)) {
+        auto local_transaction = tokens.begin_transaction();
+        StringBuilder string_builder;
+        string_builder.append(representation_of(second_token));
+
         while (is_question_mark(tokens.peek_token()))
-            tokens.next_token();
+            string_builder.append(representation_of(tokens.next_token()));
         if (is_ending_token(tokens.peek_token()))
-            return create_unicode_range_from_tokens(tokens, position, tokens.position());
-
-        tokens.rewind_to_position(after_second_token);
+            return create_unicode_range(string_builder.string_view(), local_transaction);
     }
 
     // u <number-token> '?'* |
     // u <number-token> <dimension-token> |
     // u <number-token> <number-token>
     if (second_token.is(Token::Type::Number)) {
+        auto local_transaction = tokens.begin_transaction();
+        StringBuilder string_builder;
+        string_builder.append(representation_of(second_token));
+
         if (is_ending_token(tokens.peek_token()))
-            return create_unicode_range_from_tokens(tokens, position, tokens.position());
+            return create_unicode_range(string_builder.string_view(), local_transaction);
 
         auto& third_token = tokens.next_token();
+        string_builder.append(representation_of(third_token));
         if (is_question_mark(third_token)) {
             while (is_question_mark(tokens.peek_token()))
-                tokens.next_token();
+                string_builder.append(representation_of(tokens.next_token()));
             if (is_ending_token(tokens.peek_token()))
-                return create_unicode_range_from_tokens(tokens, position, tokens.position());
+                return create_unicode_range(string_builder.string_view(), local_transaction);
         } else if (third_token.is(Token::Type::Dimension)) {
             if (is_ending_token(tokens.peek_token()))
-                return create_unicode_range_from_tokens(tokens, position, tokens.position());
+                return create_unicode_range(string_builder.string_view(), local_transaction);
         } else if (third_token.is(Token::Type::Number)) {
             if (is_ending_token(tokens.peek_token()))
-                return create_unicode_range_from_tokens(tokens, position, tokens.position());
+                return create_unicode_range(string_builder.string_view(), local_transaction);
         }
-
-        tokens.rewind_to_position(after_second_token);
     }
 
     if constexpr (CSS_PARSER_DEBUG) {
         dbgln("CSSParser: Tokens did not match <urange> grammar.");
         tokens.dump_all_tokens();
     }
-    return error();
+    return {};
 }
 
-Optional<UnicodeRange> Parser::create_unicode_range_from_tokens(TokenStream<ComponentValue>& tokens, int start_position, int end_position)
+Optional<UnicodeRange> Parser::parse_unicode_range(StringView text)
 {
-    auto error = [&]() -> Optional<UnicodeRange> {
-        tokens.rewind_to_position(start_position);
-        return {};
-    };
-
     auto make_valid_unicode_range = [&](u32 start_value, u32 end_value) -> Optional<UnicodeRange> {
         // https://www.w3.org/TR/css-syntax-3/#maximum-allowed-code-point
         constexpr u32 maximum_allowed_code_point = 0x10FFFF;
@@ -2819,13 +2838,13 @@ Optional<UnicodeRange> Parser::create_unicode_range_from_tokens(TokenStream<Comp
         // the <urange> is invalid and a syntax error.
         if (end_value > maximum_allowed_code_point) {
             dbgln_if(CSS_PARSER_DEBUG, "CSSParser: Invalid <urange>: end_value ({}) > maximum ({})", end_value, maximum_allowed_code_point);
-            return error();
+            return {};
         }
 
         // 2. If start value is greater than end value, the <urange> is invalid and a syntax error.
         if (start_value > end_value) {
             dbgln_if(CSS_PARSER_DEBUG, "CSSParser: Invalid <urange>: start_value ({}) > end_value ({})", start_value, end_value);
-            return error();
+            return {};
         }
 
         // 3. Otherwise, the <urange> represents a contiguous range of codepoints from start value to end value, inclusive.
@@ -2834,26 +2853,7 @@ Optional<UnicodeRange> Parser::create_unicode_range_from_tokens(TokenStream<Comp
     // 1. Skipping the first u token, concatenate the representations of all the tokens in the production together.
     //    Let this be text.
-    StringBuilder text_builder;
-    tokens.rewind_to_position(start_position);
-    (void)tokens.next_token(); // Skip the 'u'
-    while (tokens.position() != end_position) {
-        // FIXME: This should use the "representation", that is, the original text that produced the token.
-        //        See: https://www.w3.org/TR/css-syntax-3/#representation
-        //        We don't have a way to get that, so instead, we're relying on Token::to_string(), and
-        //        handling specific cases where that's not enough.
-        auto& token = tokens.next_token();
-
-        // Integers like `+34` get serialized as `34`, so manually include the `+` sign.
-        if (token.is(Token::Type::Number) && token.token().number().is_integer_with_explicit_sign()) {
-            auto int_value = token.token().number().integer_value();
-            if (int_value >= 0)
-                text_builder.append('+');
-            text_builder.append(String::number(int_value));
-        } else {
-            text_builder.append(token.to_string());
-        }
-    }
-    auto text = text_builder.string_view();
+    // NOTE: The concatenation is already done by the caller.
 
     GenericLexer lexer { text };
 
     // 2. If the first character of text is U+002B PLUS SIGN, consume it.
@@ -2862,7 +2862,7 @@ Optional<UnicodeRange> Parser::create_unicode_range_from_tokens(TokenStream<Comp
         lexer.consume();
     } else {
         dbgln_if(CSS_PARSER_DEBUG, "CSSParser: Second character of <urange> was not '+'; got: '{}'", lexer.consume());
-        return error();
+        return {};
     }
 
     // 3. Consume as many hex digits from text as possible.
@@ -2874,7 +2874,7 @@ Optional<UnicodeRange> Parser::create_unicode_range_from_tokens(TokenStream<Comp
     size_t consumed_code_points = hex_digits.length() + question_marks.length();
     if (consumed_code_points == 0 || consumed_code_points > 6) {
         dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> start value had {} digits/?s, expected between 1 and 6.", consumed_code_points);
-        return error();
+        return {};
     }
     StringView start_value_code_points { hex_digits.characters_without_null_termination(), consumed_code_points };
@@ -2884,7 +2884,7 @@ Optional<UnicodeRange> Parser::create_unicode_range_from_tokens(TokenStream<Comp
         // and this algorithm must exit.
         if (lexer.tell_remaining() != 0) {
             dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> invalid; had {} code points left over.", lexer.tell_remaining());
-            return error();
+            return {};
         }
 
         // 2. Interpret the consumed code points as a hexadecimal number,
@@ -2894,7 +2894,7 @@ Optional<UnicodeRange> Parser::create_unicode_range_from_tokens(TokenStream<Comp
         auto maybe_start_value = AK::StringUtils::convert_to_uint_from_hex<u32>(start_value_string);
         if (!maybe_start_value.has_value()) {
             dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> ?-converted start value did not parse as hex number.");
-            return error();
+            return {};
         }
         u32 start_value = maybe_start_value.release_value();
@@ -2905,7 +2905,7 @@ Optional<UnicodeRange> Parser::create_unicode_range_from_tokens(TokenStream<Comp
         auto maybe_end_value = AK::StringUtils::convert_to_uint_from_hex<u32>(end_value_string);
         if (!maybe_end_value.has_value()) {
             dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> ?-converted end value did not parse as hex number.");
-            return error();
+            return {};
         }
         u32 end_value = maybe_end_value.release_value();
@@ -2916,7 +2916,7 @@ Optional<UnicodeRange> Parser::create_unicode_range_from_tokens(TokenStream<Comp
     auto maybe_start_value = AK::StringUtils::convert_to_uint_from_hex<u32>(start_value_code_points);
     if (!maybe_start_value.has_value()) {
         dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> start value did not parse as hex number.");
-        return error();
+        return {};
     }
     u32 start_value = maybe_start_value.release_value();
@@ -2932,7 +2932,7 @@ Optional<UnicodeRange> Parser::create_unicode_range_from_tokens(TokenStream<Comp
     // Otherwise, this is an invalid <urange>, and this algorithm must exit.
     else {
         dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> start and end values not separated by '-'.");
-        return error();
+        return {};
     }
 
     // 6. Consume as many hex digits as possible from text.
@@ -2942,20 +2942,20 @@ Optional<UnicodeRange> Parser::create_unicode_range_from_tokens(TokenStream<Comp
     // this is an invalid <urange>, and this algorithm must exit.
     if (end_hex_digits.length() == 0 || end_hex_digits.length() > 6) {
         dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> end value had {} digits, expected between 1 and 6.", end_hex_digits.length());
-        return error();
+        return {};
    }
 
     // If there are any code points left in text, this is an invalid <urange>, and this algorithm must exit.
     if (lexer.tell_remaining() != 0) {
         dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> invalid; had {} code points left over.", lexer.tell_remaining());
-        return error();
+        return {};
     }
 
     // 7. Interpret the consumed code points as a hexadecimal number. This is the end value.
     auto maybe_end_value = AK::StringUtils::convert_to_uint_from_hex<u32>(end_hex_digits);
     if (!maybe_end_value.has_value()) {
         dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> end value did not parse as hex number.");
-        return error();
+        return {};
     }
     u32 end_value = maybe_end_value.release_value();
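
As an aside, the "?-converted start value" mentioned in the hunks above
refers to the css-syntax-3 rule that each '?' wildcard is replaced by '0'
to form the start value and by 'F' to form the end value, so e.g. "4??"
covers U+400 through U+4FF. A hypothetical helper (not part of this
patch, and assuming UnicodeRange can be constructed from a start and end
code point) illustrating that step:

    // Hypothetical illustration only: expand a wildcard like "4??" into the
    // start/end pair the spec algorithm produces (U+400 .. U+4FF).
    static Optional<UnicodeRange> range_from_wildcards(StringView digits_and_question_marks)
    {
        StringBuilder start_builder;
        StringBuilder end_builder;
        for (auto ch : digits_and_question_marks) {
            // '?' becomes '0' in the start value and 'F' in the end value.
            start_builder.append(ch == '?' ? '0' : ch);
            end_builder.append(ch == '?' ? 'F' : ch);
        }

        auto start_value = AK::StringUtils::convert_to_uint_from_hex<u32>(start_builder.string_view());
        auto end_value = AK::StringUtils::convert_to_uint_from_hex<u32>(end_builder.string_view());
        if (!start_value.has_value() || !end_value.has_value())
            return {};
        return UnicodeRange { start_value.value(), end_value.value() };
    }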


@@ -310,7 +310,7 @@ private:
     Optional<Length> parse_length(ComponentValue const&);
     Optional<Ratio> parse_ratio(TokenStream<ComponentValue>&);
     Optional<UnicodeRange> parse_unicode_range(TokenStream<ComponentValue>&);
-    Optional<UnicodeRange> create_unicode_range_from_tokens(TokenStream<ComponentValue>&, int start_position, int end_position);
+    Optional<UnicodeRange> parse_unicode_range(StringView);
 
     enum class AllowedDataUrlType {
         None,