
LibWeb: Parse <urange> as CSS::UnicodeRange

Like An+B, this is an old construct that does not fit well with modern
CSS syntax, so things get a bit hairy! We have to determine which
tokens match the grammar for `<urange>`, then turn those back into a
string, and then parse the string differently from normal. Thankfully
the spec describes in detail how to do that. :^)

This is not 100% correct, since we are not using the original source
text (referred to in the spec as the "representation") of the tokens,
but just converting them to strings in a manual, ad-hoc way.
Re-engineering the Tokenizer to keep that original text was too much of
a tangent for today. In any case, we do parse `U+4???`, `U+0-100`,
`U+1234`, and similar, so good enough for now!
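
As a rough illustration of the string-parsing step, here is a minimal standalone sketch (plain C++, not LibWeb/AK code; the helper name `interpret` is made up for the sketch) of how the spec interprets the text after `U+` for the three example forms above:

#include <cstdint>
#include <cstdio>
#include <string>

// Interpret the text after "U+" roughly the way css-syntax-3 describes:
// "?" wildcards become 0 for the start value and F for the end value,
// "start-end" gives an explicit range, and a bare value is a single code point.
static void interpret(std::string text)
{
    uint32_t start = 0, end = 0;
    auto dash = text.find('-');
    if (dash != std::string::npos) {
        start = std::stoul(text.substr(0, dash), nullptr, 16);
        end = std::stoul(text.substr(dash + 1), nullptr, 16);
    } else {
        std::string start_text = text, end_text = text;
        for (auto& c : start_text)
            if (c == '?') c = '0';
        for (auto& c : end_text)
            if (c == '?') c = 'F';
        start = std::stoul(start_text, nullptr, 16);
        end = std::stoul(end_text, nullptr, 16);
    }
    std::printf("U+%s => U+%X-%X\n", text.c_str(), (unsigned)start, (unsigned)end);
}

int main()
{
    interpret("4???");  // => U+4000-4FFF
    interpret("0-100"); // => U+0-100
    interpret("1234");  // => U+1234-1234
}

The real parser below does the same "?" substitution with StringView::replace and AK::StringUtils::convert_to_uint_from_hex.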
Sam Atkins 2022-04-07 17:41:54 +01:00 committed by Andreas Kling
parent 1f7bf46061
commit ef7d80ced2
2 changed files with 265 additions and 1 deletion

@@ -9,6 +9,7 @@
#include <AK/CharacterTypes.h>
#include <AK/Debug.h>
#include <AK/GenericLexer.h>
#include <AK/NonnullRefPtrVector.h>
#include <AK/SourceLocation.h>
#include <LibWeb/CSS/CSSFontFaceRule.h>
@@ -2707,6 +2708,266 @@ Optional<Ratio> Parser::parse_ratio(TokenStream<ComponentValue>& tokens)
    return Ratio { static_cast<float>(first_number.token().number_value()) };
}

// https://www.w3.org/TR/css-syntax-3/#urange-syntax
Optional<UnicodeRange> Parser::parse_unicode_range(TokenStream<ComponentValue>& tokens)
{
    tokens.skip_whitespace();

    auto position = tokens.position();
    auto error = [&]() -> Optional<UnicodeRange> {
        tokens.rewind_to_position(position);
        return {};
    };
    // <urange> =
    //  u '+' <ident-token> '?'* |
    //  u <dimension-token> '?'* |
    //  u <number-token> '?'* |
    //  u <number-token> <dimension-token> |
    //  u <number-token> <number-token> |
    //  u '+' '?'+
    // (All with no whitespace in between tokens.)
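    // (Illustrative note: these alternatives exist because of how CSS tokenization splits the input.
    // For example, `U+ABCD` produces `u`, a `+` delim, and the ident `ABCD`, while `U+0-100`
    // produces `u`, the number `+0`, and the number `-100`.)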
    // NOTE: Parsing this is different from usual. We take these steps:
    // 1. Match the grammar above against the tokens.
    // 2. Convert the matching tokens back into a string using their original representation.
    // 3. Then, parse that string according to the spec algorithm.
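    // Worked example (illustrative): `U+4???` tokenizes as the ident `U`, the number `+4`, and three
    // `?` delims. Step 1 matches the `u <number-token> '?'*` alternative, step 2 reassembles the
    // text "+4???", and step 3 interprets that as the range U+4000-4FFF.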
    auto is_question_mark = [](ComponentValue const& component_value) {
        return component_value.is(Token::Type::Delim) && component_value.token().delim() == '?';
    };

    auto is_ending_token = [](ComponentValue const& component_value) {
        return component_value.is(Token::Type::EndOfFile)
            || component_value.is(Token::Type::Comma)
            || component_value.is(Token::Type::Semicolon)
            || component_value.is(Token::Type::Whitespace);
    };

    // All options start with 'u'/'U'.
    auto& u = tokens.next_token();
    if (!(u.is(Token::Type::Ident) && u.token().ident().equals_ignoring_case("u"))) {
        dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> does not start with 'u'");
        return error();
    }

    auto& second_token = tokens.next_token();
    auto after_second_token = tokens.position();

    //  u '+' <ident-token> '?'* |
    //  u '+' '?'+
    if (second_token.is(Token::Type::Delim) && second_token.token().delim() == '+') {
        auto& third_token = tokens.next_token();
        if (third_token.is(Token::Type::Ident) || is_question_mark(third_token)) {
            while (is_question_mark(tokens.peek_token()))
                tokens.next_token();
            if (is_ending_token(tokens.peek_token()))
                return create_unicode_range_from_tokens(tokens, position, tokens.position());
        }
        tokens.rewind_to_position(after_second_token);
    }

    //  u <dimension-token> '?'*
    if (second_token.is(Token::Type::Dimension)) {
        while (is_question_mark(tokens.peek_token()))
            tokens.next_token();
        if (is_ending_token(tokens.peek_token()))
            return create_unicode_range_from_tokens(tokens, position, tokens.position());
        tokens.rewind_to_position(after_second_token);
    }

    //  u <number-token> '?'* |
    //  u <number-token> <dimension-token> |
    //  u <number-token> <number-token>
    if (second_token.is(Token::Type::Number)) {
        if (is_ending_token(tokens.peek_token()))
            return create_unicode_range_from_tokens(tokens, position, tokens.position());

        auto& third_token = tokens.next_token();
        if (is_question_mark(third_token)) {
            while (is_question_mark(tokens.peek_token()))
                tokens.next_token();
            if (is_ending_token(tokens.peek_token()))
                return create_unicode_range_from_tokens(tokens, position, tokens.position());
        } else if (third_token.is(Token::Type::Dimension)) {
            if (is_ending_token(tokens.peek_token()))
                return create_unicode_range_from_tokens(tokens, position, tokens.position());
        } else if (third_token.is(Token::Type::Number)) {
            if (is_ending_token(tokens.peek_token()))
                return create_unicode_range_from_tokens(tokens, position, tokens.position());
        }
        tokens.rewind_to_position(after_second_token);
    }

    if constexpr (CSS_PARSER_DEBUG) {
        dbgln("CSSParser: Tokens did not match <urange> grammar.");
        tokens.dump_all_tokens();
    }
    return error();
}

Optional<UnicodeRange> Parser::create_unicode_range_from_tokens(TokenStream<ComponentValue>& tokens, int start_position, int end_position)
{
    auto error = [&]() -> Optional<UnicodeRange> {
        tokens.rewind_to_position(start_position);
        return {};
    };

    auto make_valid_unicode_range = [&](u32 start_value, u32 end_value) -> Optional<UnicodeRange> {
        // https://www.w3.org/TR/css-syntax-3/#maximum-allowed-code-point
        constexpr u32 maximum_allowed_code_point = 0x10FFFF;

        // To determine what codepoints the <urange> represents:
        // 1. If end value is greater than the maximum allowed code point,
        //    the <urange> is invalid and a syntax error.
        if (end_value > maximum_allowed_code_point) {
            dbgln_if(CSS_PARSER_DEBUG, "CSSParser: Invalid <urange>: end_value ({}) > maximum ({})", end_value, maximum_allowed_code_point);
            return error();
        }

        // 2. If start value is greater than end value, the <urange> is invalid and a syntax error.
        if (start_value > end_value) {
            dbgln_if(CSS_PARSER_DEBUG, "CSSParser: Invalid <urange>: start_value ({}) > end_value ({})", start_value, end_value);
            return error();
        }
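        // (e.g. `U+110000` is rejected by check 1 above, and `U+5-2` by check 2.)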
        // 3. Otherwise, the <urange> represents a contiguous range of codepoints from start value to end value, inclusive.
        return UnicodeRange { start_value, end_value };
    };
    // 1. Skipping the first u token, concatenate the representations of all the tokens in the production together.
    //    Let this be text.
    StringBuilder text_builder;
    tokens.rewind_to_position(start_position);
    (void)tokens.next_token(); // Skip the 'u'
    while (tokens.position() != end_position) {
        // FIXME: This should use the "representation", that is, the original text that produced the token.
        //        See: https://www.w3.org/TR/css-syntax-3/#representation
        //        We don't have a way to get that, so instead, we're relying on Token::to_string(), and
        //        handling specific cases where that's not enough.
        auto& token = tokens.next_token();
        // Integers like `+34` get serialized as `34`, so manually include the `+` sign.
        if (token.is(Token::Type::Number) && token.token().number().is_integer_with_explicit_sign()) {
            auto int_value = token.token().number().integer_value();
            if (int_value >= 0)
                text_builder.append('+');
            text_builder.append(String::number(int_value));
        } else {
            text_builder.append(token.to_string());
        }
    }
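    // (e.g. for `U+0-100`, the numbers `+0` and `-100` reconstruct the text "+0-100".)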
    auto text = text_builder.string_view();
    GenericLexer lexer { text };

    // 2. If the first character of text is U+002B PLUS SIGN, consume it.
    //    Otherwise, this is an invalid <urange>, and this algorithm must exit.
    if (lexer.next_is('+')) {
        lexer.consume();
    } else {
        dbgln_if(CSS_PARSER_DEBUG, "CSSParser: Second character of <urange> was not '+'; got: '{}'", lexer.consume());
        return error();
    }

    // 3. Consume as many hex digits from text as possible.
    //    then consume as many U+003F QUESTION MARK (?) code points as possible.
    auto hex_digits = lexer.consume_while(is_ascii_hex_digit);
    auto question_marks = lexer.consume_while([](auto it) { return it == '?'; });

    // If zero code points were consumed, or more than six code points were consumed,
    // this is an invalid <urange>, and this algorithm must exit.
    size_t consumed_code_points = hex_digits.length() + question_marks.length();
    if (consumed_code_points == 0 || consumed_code_points > 6) {
        dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> start value had {} digits/?s, expected between 1 and 6.", consumed_code_points);
        return error();
    }
    StringView start_value_code_points { hex_digits.characters_without_null_termination(), consumed_code_points };
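    // NOTE: The hex digits and any '?'s are contiguous in `text`, so a view of `consumed_code_points`
    //       characters starting at the hex digits covers both, e.g. "4???" for the text "+4???".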
    // If any U+003F QUESTION MARK (?) code points were consumed, then:
    if (question_marks.length() > 0) {
        // 1. If there are any code points left in text, this is an invalid <urange>,
        //    and this algorithm must exit.
        if (lexer.tell_remaining() != 0) {
            dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> invalid; had {} code points left over.", lexer.tell_remaining());
            return error();
        }
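        // Worked example (illustrative): for the code points "3??", the start value below is 0x300
        // and the end value is 0x3FF.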
        // 2. Interpret the consumed code points as a hexadecimal number,
        //    with the U+003F QUESTION MARK (?) code points replaced by U+0030 DIGIT ZERO (0) code points.
        //    This is the start value.
        auto start_value_string = start_value_code_points.replace("?", "0", true);
        auto maybe_start_value = AK::StringUtils::convert_to_uint_from_hex<u32>(start_value_string);
        if (!maybe_start_value.has_value()) {
            dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> ?-converted start value did not parse as hex number.");
            return error();
        }
        u32 start_value = maybe_start_value.release_value();

        // 3. Interpret the consumed code points as a hexadecimal number again,
        //    with the U+003F QUESTION MARK (?) code points replaced by U+0046 LATIN CAPITAL LETTER F (F) code points.
        //    This is the end value.
        auto end_value_string = start_value_code_points.replace("?", "F", true);
        auto maybe_end_value = AK::StringUtils::convert_to_uint_from_hex<u32>(end_value_string);
        if (!maybe_end_value.has_value()) {
            dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> ?-converted end value did not parse as hex number.");
            return error();
        }
        u32 end_value = maybe_end_value.release_value();

        // 4. Exit this algorithm.
        return make_valid_unicode_range(start_value, end_value);
    }

    // Otherwise, interpret the consumed code points as a hexadecimal number. This is the start value.
    auto maybe_start_value = AK::StringUtils::convert_to_uint_from_hex<u32>(start_value_code_points);
    if (!maybe_start_value.has_value()) {
        dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> start value did not parse as hex number.");
        return error();
    }
    u32 start_value = maybe_start_value.release_value();

    // 4. If there are no code points left in text, The end value is the same as the start value.
    //    Exit this algorithm.
    if (lexer.tell_remaining() == 0)
        return make_valid_unicode_range(start_value, start_value);

    // 5. If the next code point in text is U+002D HYPHEN-MINUS (-), consume it.
    if (lexer.next_is('-')) {
        lexer.consume();
    }
    // Otherwise, this is an invalid <urange>, and this algorithm must exit.
    else {
        dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> start and end values not separated by '-'.");
        return error();
    }
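    // (e.g. for the text "+0-100", we have consumed the start digits "0" and the '-', leaving "100" below.)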
    // 6. Consume as many hex digits as possible from text.
    auto end_hex_digits = lexer.consume_while(is_ascii_hex_digit);

    // If zero hex digits were consumed, or more than 6 hex digits were consumed,
    // this is an invalid <urange>, and this algorithm must exit.
    if (end_hex_digits.length() == 0 || end_hex_digits.length() > 6) {
        dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> end value had {} digits, expected between 1 and 6.", end_hex_digits.length());
        return error();
    }

    // If there are any code points left in text, this is an invalid <urange>, and this algorithm must exit.
    if (lexer.tell_remaining() != 0) {
        dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> invalid; had {} code points left over.", lexer.tell_remaining());
        return error();
    }

    // 7. Interpret the consumed code points as a hexadecimal number. This is the end value.
    auto maybe_end_value = AK::StringUtils::convert_to_uint_from_hex<u32>(end_hex_digits);
    if (!maybe_end_value.has_value()) {
        dbgln_if(CSS_PARSER_DEBUG, "CSSParser: <urange> end value did not parse as hex number.");
        return error();
    }
    u32 end_value = maybe_end_value.release_value();

    return make_valid_unicode_range(start_value, end_value);
}

RefPtr<StyleValue> Parser::parse_dimension_value(ComponentValue const& component_value)
{
    // Numbers with no units can be lengths, in two situations: