From ac2c3a73b136787d5dad0e00a43c1fcbed8f1ff3 Mon Sep 17 00:00:00 2001 From: davidot Date: Sun, 3 Oct 2021 13:10:35 +0200 Subject: [PATCH] LibJS: Add a specific test for invalid unicode characters in the lexer Also fixes that it tried to make substrings past the end of the source if we overran the source length. --- Meta/Lagom/CMakeLists.txt | 3 + Tests/LibJS/CMakeLists.txt | 3 + Tests/LibJS/test-invalid-unicode-js.cpp | 76 +++++++++++++++++++++++++ Userland/Libraries/LibJS/Lexer.cpp | 2 +- 4 files changed, 83 insertions(+), 1 deletion(-) create mode 100644 Tests/LibJS/test-invalid-unicode-js.cpp diff --git a/Meta/Lagom/CMakeLists.txt b/Meta/Lagom/CMakeLists.txt index 847a7282cd..c0688f96df 100644 --- a/Meta/Lagom/CMakeLists.txt +++ b/Meta/Lagom/CMakeLists.txt @@ -528,6 +528,9 @@ if (BUILD_LAGOM) ) set_tests_properties(JS PROPERTIES ENVIRONMENT SERENITY_SOURCE_DIR=${SERENITY_PROJECT_ROOT}) + # test-invalid-unicode-js + lagom_test(../../Tests/LibJS/test-invalid-unicode-js.cpp LIBS LagomJS) + # Markdown include(commonmark_spec) file(GLOB LIBMARKDOWN_TEST_SOURCES CONFIGURE_DEPENDS "../../Tests/LibMarkdown/*.cpp") diff --git a/Tests/LibJS/CMakeLists.txt b/Tests/LibJS/CMakeLists.txt index 7bd2772c9f..b800d6ca21 100644 --- a/Tests/LibJS/CMakeLists.txt +++ b/Tests/LibJS/CMakeLists.txt @@ -1,2 +1,5 @@ serenity_testjs_test(test-js.cpp test-js) + install(TARGETS test-js RUNTIME DESTINATION bin OPTIONAL) + +serenity_test(test-invalid-unicode-js.cpp LibJS LIBS LibJS) diff --git a/Tests/LibJS/test-invalid-unicode-js.cpp b/Tests/LibJS/test-invalid-unicode-js.cpp new file mode 100644 index 0000000000..9e209f29d5 --- /dev/null +++ b/Tests/LibJS/test-invalid-unicode-js.cpp @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2021, David Tuin + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include <LibJS/Lexer.h> +#include <LibTest/TestCase.h> + +TEST_CASE(invalid_unicode_only) +{ + char const* code = "\xEA\xFD"; + auto lexer = JS::Lexer(code); + auto token = lexer.next(); + EXPECT_EQ(token.type(), JS::TokenType::Invalid); + 
+ // After this we can get as many eof tokens as we like. + for (auto i = 0; i < 10; i++) { + auto eof_token = lexer.next(); + EXPECT_EQ(eof_token.type(), JS::TokenType::Eof); + } +} + +TEST_CASE(long_invalid_unicode) +{ + char const* code = "\xF7"; + auto lexer = JS::Lexer(code); + auto token = lexer.next(); + EXPECT_EQ(token.type(), JS::TokenType::Invalid); + + // After this we can get as many eof tokens as we like. + for (auto i = 0; i < 10; i++) { + auto eof_token = lexer.next(); + EXPECT_EQ(eof_token.type(), JS::TokenType::Eof); + } +} + +TEST_CASE(invalid_unicode_and_valid_code) +{ + char const* code = "\xEA\xFDthrow 1;"; + auto lexer = JS::Lexer(code); + auto invalid_token = lexer.next(); + EXPECT_EQ(invalid_token.type(), JS::TokenType::Invalid); + // 0xEA is the start of a three character unicode code point thus it consumes the 't'. + auto token_after = lexer.next(); + EXPECT_EQ(token_after.value(), "hrow"); +} + +TEST_CASE(long_invalid_unicode_and_valid_code) +{ + char const* code = "\xF7throw 1;"; + auto lexer = JS::Lexer(code); + auto invalid_token = lexer.next(); + EXPECT_EQ(invalid_token.type(), JS::TokenType::Invalid); + // 0xF7 is the start of a four character unicode code point thus it consumes 'thr'. + auto token_after = lexer.next(); + EXPECT_EQ(token_after.value(), "ow"); +} + +TEST_CASE(invalid_unicode_after_valid_code_and_before_eof) +{ + char const* code = "let \xEA\xFD;"; + auto lexer = JS::Lexer(code); + auto let_token = lexer.next(); + EXPECT_EQ(let_token.type(), JS::TokenType::Let); + auto invalid_token = lexer.next(); + EXPECT_EQ(invalid_token.type(), JS::TokenType::Invalid); + // It should still get the valid trivia in front. + EXPECT_EQ(invalid_token.trivia(), " "); + + // After this we can get as many eof tokens as we like. 
+ for (auto i = 0; i < 10; i++) { + auto eof_token = lexer.next(); + EXPECT_EQ(eof_token.type(), JS::TokenType::Eof); + } +} diff --git a/Userland/Libraries/LibJS/Lexer.cpp b/Userland/Libraries/LibJS/Lexer.cpp index 5a02ac560b..0977954605 100644 --- a/Userland/Libraries/LibJS/Lexer.cpp +++ b/Userland/Libraries/LibJS/Lexer.cpp @@ -145,7 +145,7 @@ void Lexer::consume() return false; m_eof = true; m_current_char = '\0'; - m_position++; + m_position = m_source.length() + 1; m_line_column++; return true; };