From bb1cc99750acc5254ee8d42d81897cc096d0c5e7 Mon Sep 17 00:00:00 2001
From: Sam Atkins <atkinssj@serenityos.org>
Date: Fri, 17 Sep 2021 16:49:45 +0100
Subject: [PATCH] LibWeb: Stop treating EOF as a valid part of an identifier

This was specifically causing the string "0" to be parsed as an invalid
Dimension token with no units, instead of as a Number. That then caused
our generated `property_initial_value()` function to fail for those
values.
---
 Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
index ecb3c57446..6d2df75432 100644
--- a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
+++ b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
@@ -11,7 +11,7 @@
 #include <LibTextCodec/Decoder.h>
 #include <LibWeb/CSS/Parser/Tokenizer.h>
 
-//U+FFFD REPLACEMENT CHARACTER (�)
+// U+FFFD REPLACEMENT CHARACTER (�)
 #define REPLACEMENT_CHARACTER 0xFFFD
 static const u32 TOKENIZER_EOF = 0xFFFFFFFF;
 
@@ -42,7 +42,10 @@ static inline bool is_low_line(u32 code_point)
 
 static inline bool is_name_start_code_point(u32 code_point)
 {
-    return is_ascii_alpha(code_point) || !is_ascii(code_point) || is_low_line(code_point);
+    // FIXME: We use !is_ascii() for "non-ASCII code point" in the spec, but it's not quite right -
+    //        it treats EOF as valid! The spec also lacks a definition of code point. For now, the
+    //        !is_eof() check is a hack, but it should work.
+    return !is_eof(code_point) && (is_ascii_alpha(code_point) || !is_ascii(code_point) || is_low_line(code_point));
 }
 
 static inline bool is_hyphen_minus(u32 code_point)
@@ -585,6 +588,7 @@ Token Tokenizer::consume_a_numeric_token()
         token.m_number_type = number.type;
 
         auto unit = consume_a_name();
+        VERIFY(!unit.is_empty() && !unit.is_whitespace());
         token.m_unit.append(unit);
 
         return token;