From 5a2bf7fdd1efd228910557fd86b82f8aad0f2ef1 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Thu, 24 Aug 2023 17:43:05 -0400 Subject: [PATCH] LibWeb: Set the correct end position of HTML attribute names We were previously setting the end position of attribute names in self- closing HTML tags to the end of the attribute value. To illustrate the previous behavior, consider this tag and its attribute's start and end positions (shown inclusively below): ^ name start ^ value start ^ value end ^ name end Rather than setting the end position of the attribute name when we parse the closing slash, ensure the end position is already set while we are in the AttributeName state. We now have: ^ name start ^ name end ^ value start ^ value end The tokenizer unit test has been extended to test these positions. --- Tests/LibWeb/TestHTMLTokenizer.cpp | 48 +++++++++++++------ .../Libraries/LibWeb/HTML/Parser/HTMLToken.h | 9 +++- .../LibWeb/HTML/Parser/HTMLTokenizer.cpp | 8 ++-- 3 files changed, 46 insertions(+), 19 deletions(-) diff --git a/Tests/LibWeb/TestHTMLTokenizer.cpp b/Tests/LibWeb/TestHTMLTokenizer.cpp index 6d51c315ea..39f4fe4dec 100644 --- a/Tests/LibWeb/TestHTMLTokenizer.cpp +++ b/Tests/LibWeb/TestHTMLTokenizer.cpp @@ -59,9 +59,15 @@ using Token = Web::HTML::HTMLToken; EXPECT_EQ(current_token->type(), Token::Type::DOCTYPE); \ NEXT_TOKEN(); -#define EXPECT_TAG_TOKEN_ATTRIBUTE(name, value) \ - VERIFY(last_token); \ - EXPECT_EQ(last_token->attribute(#name), value); +#define EXPECT_TAG_TOKEN_ATTRIBUTE(name, attribute_value, name_start_column, name_end_column, value_start_column, value_end_column) \ + VERIFY(last_token); \ + auto name##_attr = last_token->raw_attribute(#name); \ + VERIFY(name##_attr.has_value()); \ + EXPECT_EQ(name##_attr->value, attribute_value); \ + EXPECT_EQ(name##_attr->name_start_position.column, name_start_column); \ + EXPECT_EQ(name##_attr->name_end_position.column, name_end_column); \ + EXPECT_EQ(name##_attr->value_start_position.column, value_start_column); \ + EXPECT_EQ(name##_attr->value_end_position.column, value_end_column); #define EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(count) \ VERIFY(last_token); \ @@ -128,7 +134,7 @@ TEST_CASE(unquoted_attributes) BEGIN_ENUMERATION(tokens); EXPECT_START_TAG_TOKEN(p, 1u, 10u); EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1); - EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar"); + EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar", 3u, 6u, 7u, 10u); EXPECT_END_OF_FILE_TOKEN(); END_ENUMERATION(); } @@ -139,7 +145,7 @@ TEST_CASE(single_quoted_attributes) BEGIN_ENUMERATION(tokens); EXPECT_START_TAG_TOKEN(p, 1u, 12u); EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1); - EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar"); + EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar", 3u, 6u, 7u, 12u); EXPECT_END_OF_FILE_TOKEN(); END_ENUMERATION(); } @@ -150,20 +156,32 @@ TEST_CASE(double_quoted_attributes) BEGIN_ENUMERATION(tokens); EXPECT_START_TAG_TOKEN(p, 1u, 12u); EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1); - EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar"); + EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar", 3u, 6u, 7u, 12u); + EXPECT_END_OF_FILE_TOKEN(); + END_ENUMERATION(); +} + +TEST_CASE(valueless_attribute) +{ + auto tokens = run_tokenizer("

"sv); + BEGIN_ENUMERATION(tokens); + EXPECT_START_TAG_TOKEN(p, 1u, 6u); + EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1); + EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "", 3u, 6u, 0u, 0u); EXPECT_END_OF_FILE_TOKEN(); END_ENUMERATION(); } TEST_CASE(multiple_attributes) { - auto tokens = run_tokenizer("

"sv); + auto tokens = run_tokenizer("

"sv); BEGIN_ENUMERATION(tokens); - EXPECT_START_TAG_TOKEN(p, 1u, 35u); - EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(3); - EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar"); - EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "foobar"); - EXPECT_TAG_TOKEN_ATTRIBUTE(foo2, "bar2"); + EXPECT_START_TAG_TOKEN(p, 1u, 39u); + EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(4); + EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar", 3u, 6u, 7u, 12u); + EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "foobar", 13u, 16u, 17u, 23u); + EXPECT_TAG_TOKEN_ATTRIBUTE(biz, "", 24u, 27u, 0u, 0u); + EXPECT_TAG_TOKEN_ATTRIBUTE(foo2, "bar2", 28u, 32u, 33u, 39u); EXPECT_END_OF_FILE_TOKEN(); END_ENUMERATION(); } @@ -174,9 +192,9 @@ TEST_CASE(character_reference_in_attribute) BEGIN_ENUMERATION(tokens); EXPECT_START_TAG_TOKEN(p, 1u, 43u); EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(3); - EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "a&b"); - EXPECT_TAG_TOKEN_ATTRIBUTE(bar, "a&b"); - EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "a&b"); + EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "a&b", 3u, 6u, 7u, 14u); + EXPECT_TAG_TOKEN_ATTRIBUTE(bar, "a&b", 15u, 18u, 19u, 28u); + EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "a&b", 29u, 32u, 33u, 43u); EXPECT_END_OF_FILE_TOKEN(); END_ENUMERATION(); } diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h index 07c1cb7511..548e3193a3 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h @@ -246,6 +246,13 @@ public: } StringView attribute(DeprecatedFlyString const& attribute_name) const + { + if (auto result = raw_attribute(attribute_name); result.has_value()) + return result->value; + return {}; + } + + Optional raw_attribute(DeprecatedFlyString const& attribute_name) const { VERIFY(is_start_tag() || is_end_tag()); @@ -254,7 +261,7 @@ public: return {}; for (auto& attribute : *ptr) { if (attribute_name == attribute.local_name) - return attribute.value; + return attribute; } return {}; } diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp index d99534de0f..c38a78102e 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp @@ -1051,8 +1051,6 @@ _StartOfFunction: } ON('/') { - if (m_current_token.has_attributes()) - m_current_token.last_attribute().name_end_position = nth_last_position(1); RECONSUME_IN(AfterAttributeName); } ON('>') @@ -1108,21 +1106,25 @@ _StartOfFunction: { ON_WHITESPACE { + m_current_token.last_attribute().name_end_position = nth_last_position(1); m_current_token.last_attribute().local_name = consume_current_builder(); RECONSUME_IN(AfterAttributeName); } ON('/') { + m_current_token.last_attribute().name_end_position = nth_last_position(1); m_current_token.last_attribute().local_name = consume_current_builder(); RECONSUME_IN(AfterAttributeName); } ON('>') { + m_current_token.last_attribute().name_end_position = nth_last_position(1); m_current_token.last_attribute().local_name = consume_current_builder(); RECONSUME_IN(AfterAttributeName); } ON_EOF { + m_current_token.last_attribute().name_end_position = nth_last_position(1); m_current_token.last_attribute().local_name = consume_current_builder(); RECONSUME_IN(AfterAttributeName); } @@ -1196,7 +1198,7 @@ _StartOfFunction: { m_current_token.add_attribute({}); if (!m_source_positions.is_empty()) - m_current_token.last_attribute().name_start_position = m_source_positions.last(); + m_current_token.last_attribute().name_start_position = nth_last_position(1); RECONSUME_IN(AttributeName); } }