From f5f1a5228ea9e9ad45b1c1da2bddc51f24fd71fc Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Sun, 12 Mar 2023 22:05:03 -0400 Subject: [PATCH] LibWeb: Escape HTML text fragments with multi-byte code point awareness The UTF-8 encoding of U+00A0 (NBSP) is the bytes 0xc2 0xa0. By looping over the string to escape byte-by-byte, we replace the second byte with "&nbsp;", but leave the first byte in the resulting text. This creates an invalid UTF-8 string, with a lone leading byte. --- .../Libraries/LibWeb/HTML/Parser/HTMLParser.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp index 37bf2a116a..3648116461 100644 --- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp +++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp @@ -3586,23 +3586,23 @@ DeprecatedString HTMLParser::serialize_html_fragment(DOM::Node const& node) auto escape_string = [](StringView string, AttributeMode attribute_mode) -> DeprecatedString { // https://html.spec.whatwg.org/multipage/parsing.html#escapingString StringBuilder builder; - for (auto& ch : string) { + for (auto code_point : Utf8View { string }) { // 1. Replace any occurrence of the "&" character by the string "&amp;". - if (ch == '&') + if (code_point == '&') builder.append("&amp;"sv); // 2. Replace any occurrences of the U+00A0 NO-BREAK SPACE character by the string "&nbsp;". - else if (ch == '\xA0') + else if (code_point == 0xA0) builder.append("&nbsp;"sv); // 3. If the algorithm was invoked in the attribute mode, replace any occurrences of the """ character by the string "&quot;". - else if (ch == '"' && attribute_mode == AttributeMode::Yes) + else if (code_point == '"' && attribute_mode == AttributeMode::Yes) builder.append("&quot;"sv); // 4.
If the algorithm was not invoked in the attribute mode, replace any occurrences of the "<" character by the string "&lt;", and any occurrences of the ">" character by the string "&gt;". - else if (ch == '<' && attribute_mode == AttributeMode::No) + else if (code_point == '<' && attribute_mode == AttributeMode::No) builder.append("&lt;"sv); - else if (ch == '>' && attribute_mode == AttributeMode::No) + else if (code_point == '>' && attribute_mode == AttributeMode::No) builder.append("&gt;"sv); else - builder.append(ch); + builder.append_code_point(code_point); } return builder.to_deprecated_string(); };