
Similar to POSIX read, the basic read and write functions of AK::Stream do not have a lower limit on how much data they read or write (apart from "none at all"). Rename the functions to "read some [data]" and "write some [data]" (with "data" being omitted, since everything here reads and writes data) to make them sufficiently distinct from the functions that ensure the entire buffer is used (which should be the go-to functions for most usages). No functional changes, just a lot of new FIXMEs.
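To illustrate the distinction, here is a minimal sketch of the caller-side loop this rename makes explicit, assuming the post-rename AK::Stream API (read_some() returns the slice of the buffer it actually filled, and a short read is not an error). The helper name read_all_into() is hypothetical; AK's own whole-buffer read function does the equivalent:

#include <AK/Error.h>
#include <AK/Stream.h>

// Hypothetical helper for illustration: keep calling read_some() until the
// whole buffer is filled, since a single call may return fewer bytes than
// requested (just like POSIX read(2)).
static ErrorOr<void> read_all_into(AK::Stream& stream, Bytes buffer)
{
    size_t nread = 0;
    while (nread < buffer.size()) {
        auto chunk = TRY(stream.read_some(buffer.slice(nread)));
        // An empty chunk means the stream ended before the buffer was full.
        if (chunk.is_empty())
            return Error::from_string_literal("Unexpected end of stream");
        nread += chunk.size();
    }
    return {};
}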
/*
 * Copyright (c) 2021, Max Wipfli <max.wipfli@serenityos.org>
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include <LibTest/TestCase.h>

#include <LibCore/File.h>
#include <LibWeb/HTML/Parser/HTMLTokenizer.h>

using Tokenizer = Web::HTML::HTMLTokenizer;
using Token = Web::HTML::HTMLToken;

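// The macros below step through the tokenizer output: BEGIN_ENUMERATION()
// sets up a cursor into the token vector, the EXPECT_..._TOKEN() macros check
// the current token and advance via NEXT_TOKEN(), the EXPECT_TAG_TOKEN_ATTRIBUTE*
// macros inspect the most recently consumed token (last_token), and
// END_ENUMERATION() asserts that all tokens have been consumed.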
|
#define BEGIN_ENUMERATION(tokens) \
|
|
auto current_token = (tokens).begin(); \
|
|
[[maybe_unused]] Token* last_token;
|
|
|
|
#define END_ENUMERATION() \
|
|
EXPECT(current_token.is_end());
|
|
|
|
#define NEXT_TOKEN() \
|
|
last_token = &*current_token; \
|
|
++current_token;
|
|
|
|
#define EXPECT_START_TAG_TOKEN(_tag_name) \
|
|
EXPECT_EQ(current_token->type(), Token::Type::StartTag); \
|
|
EXPECT_EQ(current_token->tag_name(), #_tag_name); \
|
|
NEXT_TOKEN();
|
|
|
|
#define EXPECT_END_TAG_TOKEN(_tag_name) \
|
|
EXPECT_EQ(current_token->type(), Token::Type::EndTag); \
|
|
EXPECT_EQ(current_token->tag_name(), #_tag_name); \
|
|
NEXT_TOKEN();
|
|
|
|
#define EXPECT_END_OF_FILE_TOKEN() \
|
|
EXPECT_EQ(current_token->type(), Token::Type::EndOfFile); \
|
|
NEXT_TOKEN();
|
|
|
|
#define EXPECT_CHARACTER_TOKEN(character) \
|
|
EXPECT_EQ(current_token->type(), Token::Type::Character); \
|
|
EXPECT_EQ(current_token->code_point(), (u32)(character)); \
|
|
NEXT_TOKEN();
|
|
|
|
#define EXPECT_CHARACTER_TOKENS(string) \
|
|
for (auto c : #string##sv) { \
|
|
EXPECT_CHARACTER_TOKEN(c); \
|
|
}
|
|
|
|
#define EXPECT_COMMENT_TOKEN() \
|
|
EXPECT_EQ(current_token->type(), Token::Type::Comment); \
|
|
NEXT_TOKEN();
|
|
|
|
#define EXPECT_DOCTYPE_TOKEN() \
|
|
EXPECT_EQ(current_token->type(), Token::Type::DOCTYPE); \
|
|
NEXT_TOKEN();
|
|
|
|
#define EXPECT_TAG_TOKEN_ATTRIBUTE(name, value) \
|
|
VERIFY(last_token); \
|
|
EXPECT_EQ(last_token->attribute(#name), value);
|
|
|
|
#define EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(count) \
|
|
VERIFY(last_token); \
|
|
EXPECT_EQ(last_token->attribute_count(), (size_t)(count));
|
|
|
|
// Runs the tokenizer over the given input and collects every token it emits
// (including the trailing EndOfFile token) into a vector.
static Vector<Token> run_tokenizer(StringView input)
{
    Vector<Token> tokens;
    Tokenizer tokenizer { input, "UTF-8"sv };
    while (true) {
        auto maybe_token = tokenizer.next_token();
        if (!maybe_token.has_value())
            break;
        tokens.append(maybe_token.release_value());
    }
    return tokens;
}

// FIXME: It's not very nice to rely on the format of HTMLToken::to_deprecated_string() to stay the same.
static u32 hash_tokens(Vector<Token> const& tokens)
{
    StringBuilder builder;
    for (auto& token : tokens)
        builder.append(token.to_deprecated_string());
    return (u32)builder.string_view().hash();
}

TEST_CASE(empty)
{
    auto tokens = run_tokenizer(""sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(basic)
{
    auto tokens = run_tokenizer("<html><head></head><body></body></html>"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(html);
    EXPECT_START_TAG_TOKEN(head);
    EXPECT_END_TAG_TOKEN(head);
    EXPECT_START_TAG_TOKEN(body);
    EXPECT_END_TAG_TOKEN(body);
    EXPECT_END_TAG_TOKEN(html);
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(basic_with_text)
{
    auto tokens = run_tokenizer("<p>This is some text.</p>"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(p);
    EXPECT_CHARACTER_TOKENS(This is some text.);
    EXPECT_END_TAG_TOKEN(p);
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(unquoted_attributes)
{
    auto tokens = run_tokenizer("<p foo=bar>"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(p);
    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(single_quoted_attributes)
{
    auto tokens = run_tokenizer("<p foo='bar'>"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(p);
    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(double_quoted_attributes)
{
    auto tokens = run_tokenizer("<p foo=\"bar\">"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(p);
    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(1);
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(multiple_attributes)
{
    auto tokens = run_tokenizer("<p foo=\"bar\" baz=foobar foo2=\"bar2\">"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(p);
    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(3);
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "bar");
    EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "foobar");
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo2, "bar2");
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(character_reference_in_attribute)
{
    auto tokens = run_tokenizer("<p foo=a&b bar='a&b' baz=\"a&b\">"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(p);
    EXPECT_TAG_TOKEN_ATTRIBUTE_COUNT(3);
    EXPECT_TAG_TOKEN_ATTRIBUTE(foo, "a&b");
    EXPECT_TAG_TOKEN_ATTRIBUTE(bar, "a&b");
    EXPECT_TAG_TOKEN_ATTRIBUTE(baz, "a&b");
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(comment)
{
    auto tokens = run_tokenizer("<p><!-- This is a comment --></p>"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_START_TAG_TOKEN(p);
    EXPECT_COMMENT_TOKEN();
    EXPECT_END_TAG_TOKEN(p);
    EXPECT_END_OF_FILE_TOKEN();
    END_ENUMERATION();
}

TEST_CASE(doctype)
{
    auto tokens = run_tokenizer("<!DOCTYPE html><html></html>"sv);
    BEGIN_ENUMERATION(tokens);
    EXPECT_DOCTYPE_TOKEN();
    EXPECT_START_TAG_TOKEN(html);
    EXPECT_END_TAG_TOKEN(html);
}

// NOTE: This relies on the format of HTMLToken::to_deprecated_string() staying the same.
// If that changes, or something is added to the test HTML, the hash needs to be adjusted.
TEST_CASE(regression)
{
    // This makes sure that the tests will run both on target and in Lagom.
#ifdef AK_OS_SERENITY
    StringView path = "/usr/Tests/LibWeb/tokenizer-test.html"sv;
#else
    StringView path = "tokenizer-test.html"sv;
#endif

    auto file = MUST(Core::File::open(path, Core::File::OpenMode::Read));
    auto file_size = MUST(file->size());
    auto content = MUST(ByteBuffer::create_uninitialized(file_size));
    // FIXME: This should read the entire span.
    MUST(file->read_some(content.bytes()));
    DeprecatedString file_contents { content.bytes() };
    auto tokens = run_tokenizer(file_contents);
    u32 hash = hash_tokens(tokens);
    EXPECT_EQ(hash, 710375345u);
}