mirror of
https://github.com/RGBCube/serenity
synced 2025-07-25 16:37:35 +00:00
LibJS: Detect invalid unicode and stop lexing at that point
Previously we might swallow an invalid Unicode code point, which would skip valid ASCII characters. This could be dangerous, as we might skip a '"' and thus not close a string where we should. This might have been exploitable, as it would not have been clear which code gets executed when looking at a script. Another approach would be to simply replace all invalid characters with the replacement character (this is what V8 does), but our lexer and parser are currently not set up for such a change.
This commit is contained in:
parent
b1e022908d
commit
56c425eec1
3 changed files with 145 additions and 50 deletions
|
@ -194,7 +194,7 @@ void Lexer::consume()
|
|||
} else if (is_unicode_character()) {
|
||||
size_t char_size = 1;
|
||||
if ((m_current_char & 64) == 0) {
|
||||
// invalid char
|
||||
m_hit_invalid_unicode = m_position;
|
||||
} else if ((m_current_char & 32) == 0) {
|
||||
char_size = 2;
|
||||
} else if ((m_current_char & 16) == 0) {
|
||||
|
@ -206,7 +206,18 @@ void Lexer::consume()
|
|||
VERIFY(char_size >= 1);
|
||||
--char_size;
|
||||
|
||||
m_position += char_size;
|
||||
for (size_t i = m_position; i < m_position + char_size; i++) {
|
||||
if (i >= m_source.length() || (m_source[i] & 0b11000000) != 0b10000000) {
|
||||
m_hit_invalid_unicode = m_position;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (m_hit_invalid_unicode.has_value())
|
||||
m_position = m_source.length();
|
||||
else
|
||||
m_position += char_size;
|
||||
|
||||
if (did_reach_eof())
|
||||
return;
|
||||
|
||||
|
@ -813,15 +824,29 @@ Token Lexer::next()
|
|||
}
|
||||
}
|
||||
|
||||
m_current_token = Token(
|
||||
token_type,
|
||||
token_message,
|
||||
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
|
||||
m_source.substring_view(value_start - 1, m_position - value_start),
|
||||
m_filename,
|
||||
value_start_line_number,
|
||||
value_start_column_number,
|
||||
m_position);
|
||||
if (m_hit_invalid_unicode.has_value()) {
|
||||
value_start = m_hit_invalid_unicode.value() - 1;
|
||||
m_current_token = Token(TokenType::Invalid, "Invalid unicode codepoint in source",
|
||||
"", // Since the invalid unicode can occur anywhere in the current token the trivia is not correct
|
||||
m_source.substring_view(value_start + 1, min(4u, m_source.length() - value_start - 2)),
|
||||
m_filename,
|
||||
m_line_number,
|
||||
m_line_column - 1,
|
||||
m_position);
|
||||
m_hit_invalid_unicode.clear();
|
||||
// Do not produce any further tokens.
|
||||
VERIFY(is_eof());
|
||||
} else {
|
||||
m_current_token = Token(
|
||||
token_type,
|
||||
token_message,
|
||||
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
|
||||
m_source.substring_view(value_start - 1, m_position - value_start),
|
||||
m_filename,
|
||||
value_start_line_number,
|
||||
value_start_column_number,
|
||||
m_position);
|
||||
}
|
||||
|
||||
if (identifier.has_value())
|
||||
m_current_token.set_identifier_value(identifier.release_value());
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue