diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
index 48190ad95f..8c0c24d37e 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
@@ -187,9 +187,25 @@ Optional HTMLTokenizer::next_code_point()
 {
     if (m_utf8_iterator == m_utf8_view.end())
         return {};
-    skip(1);
-    dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", (char)*m_prev_utf8_iterator);
-    return *m_prev_utf8_iterator;
+
+    u32 code_point;
+    // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
+    // https://infra.spec.whatwg.org/#normalize-newlines
+    if (peek_code_point(0).value_or(0) == '\r' && peek_code_point(1).value_or(0) == '\n') {
+        // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
+        skip(2);
+        code_point = '\n';
+    } else if (peek_code_point(0).value_or(0) == '\r') {
+        // replace every remaining U+000D CR code point with a U+000A LF code point.
+        skip(1);
+        code_point = '\n';
+    } else {
+        skip(1);
+        code_point = *m_prev_utf8_iterator;
+    }
+
+    dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", code_point);
+    return code_point;
 }
 
 void HTMLTokenizer::skip(size_t count)