From 6ffcd53479e9ff8c7126fb212cd10407c191509a Mon Sep 17 00:00:00 2001 From: Luke Date: Tue, 29 Dec 2020 14:42:47 +0000 Subject: [PATCH] LibWeb: Fix character references losing characters in certain situations This fixes 4 issues: - RECONSUME_IN_RETURN_STATE was functionally equivalent to SWITCH_TO_RETURN_STATE, which caused us to lose characters. For example, &test= would lose the = - & characters by themselves would be lost. For example, 1 & 2 would become 1 2. This is because we forgot to flush characters in the the ANYTHING_ELSE path in CharacterReference - Named character references didn't work at all in attributes. This is because there was a path that was checking the entity code points instead of the entity itself. Plus, the path that was checking the entity itself wasn't quite spec compliant. - If we fail to match a named character reference, the first character is lost. For example &test would become &est. However, this relies on a little hack since I can't wrap my head around on how to change the code to do as the spec says. The hack is to reconsume in AmbigiousAmpersand instead of just switching to it. Fixes #3957 --- .../LibWeb/HTML/Parser/HTMLTokenizer.cpp | 29 +++++++++---------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp index 70bf9920de..1bf1dab3c3 100644 --- a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp +++ b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp @@ -71,11 +71,13 @@ namespace Web::HTML { goto _StartOfFunction; \ } while (0) -#define RECONSUME_IN_RETURN_STATE \ - do { \ - will_reconsume_in(m_return_state); \ - m_state = m_return_state; \ - goto _StartOfFunction; \ +#define RECONSUME_IN_RETURN_STATE \ + do { \ + will_reconsume_in(m_return_state); \ + m_state = m_return_state; \ + if (current_input_character.has_value()) \ + m_utf8_iterator = m_prev_utf8_iterator; \ + goto _StartOfFunction; \ } while (0) #define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \ @@ -1525,6 +1527,7 @@ _StartOfFunction: } ANYTHING_ELSE { + FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE; RECONSUME_IN_RETURN_STATE; } } @@ -1544,17 +1547,9 @@ _StartOfFunction: for (auto ch : match.value().entity) m_temporary_buffer.append(ch); - if (consumed_as_part_of_an_attribute() && match.value().code_points.last() != ';') { - auto next = peek_code_point(0); - if (next.has_value() && (next.value() == '=' || isalnum(next.value()))) { - FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE; - SWITCH_TO_RETURN_STATE; - } - } - - if (consumed_as_part_of_an_attribute() && match.value().entity.ends_with(';')) { + if (consumed_as_part_of_an_attribute() && !match.value().entity.ends_with(';')) { auto next_code_point = peek_code_point(0); - if (next_code_point.has_value() && next_code_point.value() == '=') { + if (next_code_point.has_value() && (next_code_point.value() == '=' || isalnum(next_code_point.value()))) { FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE; SWITCH_TO_RETURN_STATE; } @@ -1571,7 +1566,9 @@ _StartOfFunction: SWITCH_TO_RETURN_STATE; } else { FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE; - SWITCH_TO(AmbiguousAmpersand); + // FIXME: This should be SWITCH_TO, but we always lose the first character on this path, so just reconsume it. + // I can't wrap my head around how to do it as the spec says. + RECONSUME_IN(AmbiguousAmpersand); } } END_STATE