
LibWeb: Propagate errors from CSS Tokenizer construction

Instead of constructing a Tokenizer and then calling parse() on it, we
now call `Tokenizer::tokenize(...)` directly. (Renamed from `parse()`
because this is a Tokenizer, not a Parser.)
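
At the call sites, this turns a two-step construct-then-parse sequence into a single fallible call. A quick sketch of the change from a caller's point of view (`input` here is a placeholder StringView, not a name from the commit):

    // Before: the constructor did fallible decoding work but had no way to report failure.
    Tokenizer tokenizer { input, "utf-8"sv };
    Vector<Token> tokens = tokenizer.parse();

    // After: one static entry point that returns ErrorOr<Vector<Token>>, so callers
    // can either TRY() it or explicitly opt out, as the diffs below do:
    auto tokens = Tokenizer::tokenize(input, "utf-8"sv).release_value_but_fixme_should_propagate_errors();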
Sam Atkins 2023-03-06 14:19:39 +00:00 committed by Andreas Kling
parent 98ee2fcd1b
commit 17618989a3
5 changed files with 47 additions and 44 deletions

Userland/Libraries/LibWeb/CSS/Parser/Parser.cpp

@@ -84,8 +84,7 @@ AK::URL ParsingContext::complete_url(StringView relative_url) const
 Parser::Parser(ParsingContext const& context, StringView input, StringView encoding)
     : m_context(context)
-    , m_tokenizer(input, encoding)
-    , m_tokens(m_tokenizer.parse())
+    , m_tokens(Tokenizer::tokenize(input, encoding).release_value_but_fixme_should_propagate_errors())
     , m_token_stream(TokenStream(m_tokens))
 {
 }
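
A note on the idiom above: a C++ constructor cannot return an error, so the member initializer asserts success with AK's release_value_but_fixme_should_propagate_errors(). Conceptually (a sketch of the idiom, not AK's exact source, with tokens_or_error standing in for the ErrorOr returned by Tokenizer::tokenize()) it is equivalent to:

    // Crash loudly on error instead of silently dropping it, and leave a
    // greppable marker that this call site should eventually propagate.
    VERIFY(!tokens_or_error.is_error());
    auto tokens = tokens_or_error.release_value();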

Userland/Libraries/LibWeb/CSS/Parser/Parser.h

@@ -360,7 +360,6 @@ private:
     ParsingContext m_context;
-    Tokenizer m_tokenizer;
     Vector<Token> m_tokens;
     TokenStream<Token> m_token_stream;
 };

Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp

@@ -195,7 +195,7 @@ static inline bool is_E(u32 code_point)
     return code_point == 0x45;
 }
 
-Tokenizer::Tokenizer(StringView input, StringView encoding)
+ErrorOr<Vector<Token>> Tokenizer::tokenize(StringView input, StringView encoding)
 {
     // https://www.w3.org/TR/css-syntax-3/#css-filter-code-points
     auto filter_code_points = [](StringView input, auto encoding) -> ErrorOr<String> {
@@ -206,48 +206,53 @@ Tokenizer::Tokenizer(StringView input, StringView encoding)
         bool last_was_carriage_return = false;
 
         // To filter code points from a stream of (unfiltered) code points input:
-        decoder->process(input, [&builder, &last_was_carriage_return](u32 code_point) -> ErrorOr<void> {
-            // Replace any U+000D CARRIAGE RETURN (CR) code points,
-            // U+000C FORM FEED (FF) code points,
-            // or pairs of U+000D CARRIAGE RETURN (CR) followed by U+000A LINE FEED (LF)
-            // in input by a single U+000A LINE FEED (LF) code point.
-            if (code_point == '\r') {
-                if (last_was_carriage_return) {
-                    TRY(builder.try_append('\n'));
-                } else {
-                    last_was_carriage_return = true;
-                }
-            } else {
-                if (last_was_carriage_return)
-                    TRY(builder.try_append('\n'));
-
-                if (code_point == '\n') {
-                    if (!last_was_carriage_return)
-                        TRY(builder.try_append('\n'));
-                } else if (code_point == '\f') {
-                    TRY(builder.try_append('\n'));
-                // Replace any U+0000 NULL or surrogate code points in input with U+FFFD REPLACEMENT CHARACTER (�).
-                } else if (code_point == 0x00 || (code_point >= 0xD800 && code_point <= 0xDFFF)) {
-                    TRY(builder.try_append_code_point(REPLACEMENT_CHARACTER));
-                } else {
-                    TRY(builder.try_append_code_point(code_point));
-                }
-
-                last_was_carriage_return = false;
-            }
-            return {};
-        })
-            .release_value_but_fixme_should_propagate_errors();
+        TRY(decoder->process(input, [&builder, &last_was_carriage_return](u32 code_point) -> ErrorOr<void> {
+            // Replace any U+000D CARRIAGE RETURN (CR) code points,
+            // U+000C FORM FEED (FF) code points,
+            // or pairs of U+000D CARRIAGE RETURN (CR) followed by U+000A LINE FEED (LF)
+            // in input by a single U+000A LINE FEED (LF) code point.
+            if (code_point == '\r') {
+                if (last_was_carriage_return) {
+                    TRY(builder.try_append('\n'));
+                } else {
+                    last_was_carriage_return = true;
+                }
+            } else {
+                if (last_was_carriage_return)
+                    TRY(builder.try_append('\n'));
+
+                if (code_point == '\n') {
+                    if (!last_was_carriage_return)
+                        TRY(builder.try_append('\n'));
+                } else if (code_point == '\f') {
+                    TRY(builder.try_append('\n'));
+                // Replace any U+0000 NULL or surrogate code points in input with U+FFFD REPLACEMENT CHARACTER (�).
+                } else if (code_point == 0x00 || (code_point >= 0xD800 && code_point <= 0xDFFF)) {
+                    TRY(builder.try_append_code_point(REPLACEMENT_CHARACTER));
+                } else {
+                    TRY(builder.try_append_code_point(code_point));
+                }
+
+                last_was_carriage_return = false;
+            }
+            return {};
+        }));
 
         return builder.to_string();
     };
 
-    m_decoded_input = filter_code_points(input, encoding).release_value_but_fixme_should_propagate_errors();
-    m_utf8_view = Utf8View(m_decoded_input);
-    m_utf8_iterator = m_utf8_view.begin();
+    Tokenizer tokenizer { TRY(filter_code_points(input, encoding)) };
+    return tokenizer.tokenize();
 }
 
-Vector<Token> Tokenizer::parse()
+Tokenizer::Tokenizer(String decoded_input)
+    : m_decoded_input(move(decoded_input))
+    , m_utf8_view(m_decoded_input)
+    , m_utf8_iterator(m_utf8_view.begin())
+{
+}
+
+Vector<Token> Tokenizer::tokenize()
 {
     Vector<Token> tokens;
 
     for (;;) {
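
The shape that lands here is a common Serenity pattern: a fallible static factory performs all the work that can fail (decoding, code-point filtering), then hands the validated result to a private constructor that cannot fail. A minimal, self-contained sketch of that pattern, with hypothetical names (Document, from_raw), not code from this commit:

    #include <AK/Error.h>
    #include <AK/String.h>
    #include <AK/Try.h>

    class Document {
    public:
        // All fallible work happens in the factory, where ErrorOr can carry failure out.
        static ErrorOr<Document> from_raw(StringView raw)
        {
            auto text = TRY(String::from_utf8(raw)); // fails on invalid UTF-8
            return Document { move(text) };
        }

    private:
        // Infallible: only adopts data the factory already validated.
        explicit Document(String text)
            : m_text(move(text))
        {
        }

        String m_text;
    };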

Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.h

@@ -58,15 +58,16 @@ public:
 };
 
 class Tokenizer {
 public:
-    explicit Tokenizer(StringView input, StringView encoding);
-
-    [[nodiscard]] Vector<Token> parse();
+    static ErrorOr<Vector<Token>> tokenize(StringView input, StringView encoding);
 
     [[nodiscard]] static Token create_eof_token();
 
 private:
+    explicit Tokenizer(String decoded_input);
+
+    [[nodiscard]] Vector<Token> tokenize();
+
     [[nodiscard]] u32 next_code_point();
     [[nodiscard]] u32 peek_code_point(size_t offset = 0) const;
     [[nodiscard]] U32Twin peek_twin() const;

Userland/Libraries/LibWeb/CSS/SyntaxHighlighter/SyntaxHighlighter.cpp

@@ -45,8 +45,7 @@ void SyntaxHighlighter::rehighlight(Palette const& palette)
             false);
     };
 
-    CSS::Parser::Tokenizer tokenizer { text, "utf-8"sv };
-    auto tokens = tokenizer.parse();
+    auto tokens = CSS::Parser::Tokenizer::tokenize(text, "utf-8"sv).release_value_but_fixme_should_propagate_errors();
 
     for (auto const& token : tokens) {
         if (token.is(Parser::Token::Type::EndOfFile))
             break;
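
rehighlight() returns void, so the highlighter also opts out of propagation for now. A caller that itself returns ErrorOr could forward the failure instead; a hedged sketch (highlight_css is hypothetical, not part of this commit):

    ErrorOr<void> highlight_css(StringView text)
    {
        // TRY() returns early with the error if tokenize() fails.
        auto tokens = TRY(CSS::Parser::Tokenizer::tokenize(text, "utf-8"sv));
        for (auto const& token : tokens) {
            if (token.is(CSS::Parser::Token::Type::EndOfFile))
                break;
            // ... build highlight spans from each token ...
        }
        return {};
    }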