mirror of
https://github.com/RGBCube/serenity
synced 2025-07-26 02:07:35 +00:00
LibJS: Correctly handle Unicode characters in JS source text
Also recognize additional white space characters.
This commit is contained in:
parent
4d6502de42
commit
47bc72bcf6
6 changed files with 100 additions and 16 deletions
|
@ -9,6 +9,8 @@
|
||||||
#include <AK/CharacterTypes.h>
|
#include <AK/CharacterTypes.h>
|
||||||
#include <AK/Debug.h>
|
#include <AK/Debug.h>
|
||||||
#include <AK/HashMap.h>
|
#include <AK/HashMap.h>
|
||||||
|
#include <AK/Utf8View.h>
|
||||||
|
#include <LibUnicode/CharacterTypes.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
|
|
||||||
namespace JS {
|
namespace JS {
|
||||||
|
@ -186,6 +188,26 @@ void Lexer::consume()
|
||||||
} else {
|
} else {
|
||||||
dbgln_if(LEXER_DEBUG, "Previous was CR, this is LF - not incrementing line number again.");
|
dbgln_if(LEXER_DEBUG, "Previous was CR, this is LF - not incrementing line number again.");
|
||||||
}
|
}
|
||||||
|
} else if (is_unicode_character()) {
|
||||||
|
size_t char_size = 1;
|
||||||
|
if ((m_current_char & 64) == 0) {
|
||||||
|
// invalid char
|
||||||
|
} else if ((m_current_char & 32) == 0) {
|
||||||
|
char_size = 2;
|
||||||
|
} else if ((m_current_char & 16) == 0) {
|
||||||
|
char_size = 3;
|
||||||
|
} else if ((m_current_char & 8) == 0) {
|
||||||
|
char_size = 4;
|
||||||
|
}
|
||||||
|
|
||||||
|
VERIFY(char_size > 1);
|
||||||
|
--char_size;
|
||||||
|
|
||||||
|
m_position += char_size;
|
||||||
|
if (did_reach_eof())
|
||||||
|
return;
|
||||||
|
|
||||||
|
m_line_column++;
|
||||||
} else {
|
} else {
|
||||||
m_line_column++;
|
m_line_column++;
|
||||||
}
|
}
|
||||||
|
@ -310,21 +332,67 @@ bool Lexer::is_line_terminator() const
|
||||||
{
|
{
|
||||||
if (m_current_char == '\n' || m_current_char == '\r')
|
if (m_current_char == '\n' || m_current_char == '\r')
|
||||||
return true;
|
return true;
|
||||||
if (m_position > 0 && m_position + 1 < m_source.length()) {
|
if (!is_unicode_character())
|
||||||
auto three_chars_view = m_source.substring_view(m_position - 1, 3);
|
return false;
|
||||||
return (three_chars_view == LINE_SEPARATOR) || (three_chars_view == PARAGRAPH_SEPARATOR);
|
|
||||||
}
|
auto code_point = current_code_point();
|
||||||
|
return code_point == LINE_SEPARATOR || code_point == PARAGRAPH_SEPARATOR;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reports whether the current byte has its high bit set, i.e. lies outside 7-bit ASCII.
// NOTE(review): callers treat this as "part of a multi-byte UTF-8 sequence"; it does not validate the sequence.
bool Lexer::is_unicode_character() const
|
||||||
|
{
|
||||||
|
return (m_current_char & 128) != 0; // 128 == 0x80, the top bit of the byte
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decodes and returns the code point at the start of the source text beginning at index m_position - 1.
// Returns U+FFFD (REPLACEMENT_CHARACTER) when m_position is 0.
u32 Lexer::current_code_point() const
|
||||||
|
{
|
||||||
|
static constexpr const u32 REPLACEMENT_CHARACTER = 0xFFFD;
|
||||||
|
if (m_position == 0) // guard: m_position - 1 below would underflow
|
||||||
|
return REPLACEMENT_CHARACTER;
|
||||||
|
Utf8View utf_8_view { m_source.substring_view(m_position - 1) }; // NOTE(review): m_position - 1 is presumably the index of m_current_char - confirm against consume()
|
||||||
|
return *utf_8_view.begin(); // first code point of the view
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reports whether the lexer is currently positioned on a white space character.
// ASCII space characters are accepted directly; for bytes with the high bit set the current
// code point is decoded and accepted if it is U+00A0 or has the "Space_Separator" general category.
bool Lexer::is_whitespace() const
|
||||||
|
{
|
||||||
|
if (is_ascii_space(m_current_char))
|
||||||
|
return true;
|
||||||
|
if (!is_unicode_character()) // any other byte without the high bit is not white space
|
||||||
|
return false;
|
||||||
|
auto code_point = current_code_point();
|
||||||
|
if (code_point == NO_BREAK_SPACE) // checked before the category lookup, so it does not depend on that lookup succeeding
|
||||||
|
return true;
|
||||||
|
|
||||||
|
static auto space_separator_category = Unicode::general_category_from_string("Space_Separator"sv); // looked up once, then reused
|
||||||
|
if (space_separator_category.has_value())
|
||||||
|
return Unicode::code_point_has_general_category(code_point, *space_separator_category);
|
||||||
return false;
|
return false; // category lookup yielded no value
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Lexer::is_identifier_start() const
|
// Reports whether the current character may begin an identifier.
// New version: the ASCII-only single return shown alongside is the line being replaced.
bool Lexer::is_identifier_start() const
|
||||||
{
|
{
|
||||||
return is_ascii_alpha(m_current_char) || m_current_char == '_' || m_current_char == '$';
|
if (!is_unicode_character()) // byte without the high bit: keep the previous ASCII rule
|
||||||
|
return is_ascii_alpha(m_current_char) || m_current_char == '_' || m_current_char == '$';
|
||||||
|
auto code_point = current_code_point();
|
||||||
|
|
||||||
|
static auto id_start_category = Unicode::property_from_string("ID_Start"sv); // looked up once, then reused
|
||||||
|
if (id_start_category.has_value())
|
||||||
|
return Unicode::code_point_has_property(code_point, *id_start_category);
|
||||||
|
return false; // property lookup yielded no value
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Lexer::is_identifier_middle() const
|
// Reports whether the current character may continue an identifier.
// New version: the ASCII-only single return shown alongside is the line being replaced.
bool Lexer::is_identifier_middle() const
|
||||||
{
|
{
|
||||||
return is_identifier_start() || is_ascii_digit(m_current_char);
|
if (!is_unicode_character()) // byte without the high bit: keep the previous ASCII rule
|
||||||
|
return is_identifier_start() || is_ascii_digit(m_current_char);
|
||||||
|
auto code_point = current_code_point();
|
||||||
|
if (code_point == ZERO_WIDTH_NON_JOINER || code_point == ZERO_WIDTH_JOINER) // U+200C / U+200D are accepted explicitly
|
||||||
|
return true;
|
||||||
|
|
||||||
|
static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv); // looked up once, then reused
|
||||||
|
if (id_continue_category.has_value())
|
||||||
|
return Unicode::code_point_has_property(code_point, *id_continue_category);
|
||||||
|
return false; // property lookup yielded no value
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Lexer::is_line_comment_start(bool line_has_token_yet) const
|
bool Lexer::is_line_comment_start(bool line_has_token_yet) const
|
||||||
|
@ -390,10 +458,10 @@ Token Lexer::next()
|
||||||
do {
|
do {
|
||||||
consume();
|
consume();
|
||||||
} while (is_line_terminator());
|
} while (is_line_terminator());
|
||||||
} else if (is_ascii_space(m_current_char)) {
|
} else if (is_whitespace()) {
|
||||||
do {
|
do {
|
||||||
consume();
|
consume();
|
||||||
} while (is_ascii_space(m_current_char));
|
} while (is_whitespace());
|
||||||
} else if (is_line_comment_start(line_has_token_yet)) {
|
} else if (is_line_comment_start(line_has_token_yet)) {
|
||||||
consume();
|
consume();
|
||||||
do {
|
do {
|
||||||
|
|
|
@ -34,8 +34,13 @@ private:
|
||||||
bool consume_hexadecimal_number();
|
bool consume_hexadecimal_number();
|
||||||
bool consume_binary_number();
|
bool consume_binary_number();
|
||||||
bool consume_decimal_number();
|
bool consume_decimal_number();
|
||||||
|
|
||||||
|
bool is_unicode_character() const;
|
||||||
|
u32 current_code_point() const;
|
||||||
|
|
||||||
bool is_eof() const;
|
bool is_eof() const;
|
||||||
bool is_line_terminator() const;
|
bool is_line_terminator() const;
|
||||||
|
bool is_whitespace() const;
|
||||||
bool is_identifier_start() const;
|
bool is_identifier_start() const;
|
||||||
bool is_identifier_middle() const;
|
bool is_identifier_middle() const;
|
||||||
bool is_line_comment_start(bool line_has_token_yet) const;
|
bool is_line_comment_start(bool line_has_token_yet) const;
|
||||||
|
|
|
@ -120,8 +120,8 @@ public:
|
||||||
String source_string { source };
|
String source_string { source };
|
||||||
source_string.replace("\r\n", "\n");
|
source_string.replace("\r\n", "\n");
|
||||||
source_string.replace("\r", "\n");
|
source_string.replace("\r", "\n");
|
||||||
source_string.replace(LINE_SEPARATOR, "\n");
|
source_string.replace(LINE_SEPARATOR_STRING, "\n");
|
||||||
source_string.replace(PARAGRAPH_SEPARATOR, "\n");
|
source_string.replace(PARAGRAPH_SEPARATOR_STRING, "\n");
|
||||||
StringBuilder builder;
|
StringBuilder builder;
|
||||||
builder.append(source_string.split_view('\n', true)[position.value().line - 1]);
|
builder.append(source_string.split_view('\n', true)[position.value().line - 1]);
|
||||||
builder.append('\n');
|
builder.append('\n');
|
||||||
|
|
|
@ -85,8 +85,8 @@ static String escape_regexp_pattern(const RegExpObject& regexp_object)
|
||||||
// FIXME: Check u flag and escape accordingly
|
// FIXME: Check u flag and escape accordingly
|
||||||
pattern.replace("\n", "\\n", true);
|
pattern.replace("\n", "\\n", true);
|
||||||
pattern.replace("\r", "\\r", true);
|
pattern.replace("\r", "\\r", true);
|
||||||
pattern.replace(LINE_SEPARATOR, "\\u2028", true);
|
pattern.replace(LINE_SEPARATOR_STRING, "\\u2028", true);
|
||||||
pattern.replace(PARAGRAPH_SEPARATOR, "\\u2029", true);
|
pattern.replace(PARAGRAPH_SEPARATOR_STRING, "\\u2029", true);
|
||||||
pattern.replace("/", "\\/", true);
|
pattern.replace("/", "\\/", true);
|
||||||
return pattern;
|
return pattern;
|
||||||
}
|
}
|
||||||
|
|
|
@ -130,7 +130,7 @@ String Token::string_value(StringValueStatus& status) const
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// Line continuation
|
// Line continuation
|
||||||
if (lexer.next_is(LINE_SEPARATOR) || lexer.next_is(PARAGRAPH_SEPARATOR)) {
|
if (lexer.next_is(LINE_SEPARATOR_STRING) || lexer.next_is(PARAGRAPH_SEPARATOR_STRING)) {
|
||||||
lexer.ignore(3);
|
lexer.ignore(3);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
@ -281,7 +281,7 @@ bool Token::is_identifier_name() const
|
||||||
|
|
||||||
bool Token::trivia_contains_line_terminator() const
|
bool Token::trivia_contains_line_terminator() const
|
||||||
{
|
{
|
||||||
return m_trivia.contains('\n') || m_trivia.contains('\r') || m_trivia.contains(LINE_SEPARATOR) || m_trivia.contains(PARAGRAPH_SEPARATOR);
|
return m_trivia.contains('\n') || m_trivia.contains('\r') || m_trivia.contains(LINE_SEPARATOR_STRING) || m_trivia.contains(PARAGRAPH_SEPARATOR_STRING);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -13,11 +13,22 @@ namespace JS {
|
||||||
|
|
||||||
// U+2028 LINE SEPARATOR
|
// U+2028 LINE SEPARATOR
|
||||||
constexpr const char line_separator_chars[] { (char)0xe2, (char)0x80, (char)0xa8, 0 };
|
constexpr const char line_separator_chars[] { (char)0xe2, (char)0x80, (char)0xa8, 0 };
|
||||||
constexpr const StringView LINE_SEPARATOR { line_separator_chars };
|
constexpr const StringView LINE_SEPARATOR_STRING { line_separator_chars };
|
||||||
|
constexpr const u32 LINE_SEPARATOR { 0x2028 };
|
||||||
|
|
||||||
// U+2029 PARAGRAPH SEPARATOR
|
// U+2029 PARAGRAPH SEPARATOR
|
||||||
constexpr const char paragraph_separator_chars[] { (char)0xe2, (char)0x80, (char)0xa9, 0 };
|
constexpr const char paragraph_separator_chars[] { (char)0xe2, (char)0x80, (char)0xa9, 0 };
|
||||||
constexpr const StringView PARAGRAPH_SEPARATOR { paragraph_separator_chars };
|
constexpr const StringView PARAGRAPH_SEPARATOR_STRING { paragraph_separator_chars };
|
||||||
|
constexpr const u32 PARAGRAPH_SEPARATOR { 0x2029 };
|
||||||
|
|
||||||
|
// U+00A0 NO BREAK SPACE
|
||||||
|
constexpr const u32 NO_BREAK_SPACE { 0x00A0 };
|
||||||
|
|
||||||
|
// U+200C ZERO WIDTH NON-JOINER
|
||||||
|
constexpr const u32 ZERO_WIDTH_NON_JOINER { 0x200C };
|
||||||
|
|
||||||
|
// U+200D ZERO WIDTH JOINER
|
||||||
|
constexpr const u32 ZERO_WIDTH_JOINER { 0x200D };
|
||||||
|
|
||||||
#define ENUMERATE_JS_TOKENS \
|
#define ENUMERATE_JS_TOKENS \
|
||||||
__ENUMERATE_JS_TOKEN(Ampersand, Operator) \
|
__ENUMERATE_JS_TOKEN(Ampersand, Operator) \
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue