1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 16:37:35 +00:00

LibJS: Detect invalid unicode and stop lexing at that point

Previously we might swallow an invalid Unicode code point, which would
skip valid ASCII characters. This could be dangerous as we might skip a
'"', thus not closing a string where we should.
This might have been exploitable as it would not have been clear what
code gets executed when looking at a script.

Another approach would be to simply replace all invalid characters with
the Unicode replacement character U+FFFD (this is what V8 does), but our
lexer and parser are currently not set up for such a change.
This commit is contained in:
davidot 2021-12-29 11:55:06 +01:00 committed by Linus Groh
parent b1e022908d
commit 56c425eec1
3 changed files with 145 additions and 50 deletions

View file

@ -194,7 +194,7 @@ void Lexer::consume()
} else if (is_unicode_character()) {
size_t char_size = 1;
if ((m_current_char & 64) == 0) {
// invalid char
m_hit_invalid_unicode = m_position;
} else if ((m_current_char & 32) == 0) {
char_size = 2;
} else if ((m_current_char & 16) == 0) {
@ -206,7 +206,18 @@ void Lexer::consume()
VERIFY(char_size >= 1);
--char_size;
m_position += char_size;
for (size_t i = m_position; i < m_position + char_size; i++) {
if (i >= m_source.length() || (m_source[i] & 0b11000000) != 0b10000000) {
m_hit_invalid_unicode = m_position;
break;
}
}
if (m_hit_invalid_unicode.has_value())
m_position = m_source.length();
else
m_position += char_size;
if (did_reach_eof())
return;
@ -813,15 +824,29 @@ Token Lexer::next()
}
}
m_current_token = Token(
token_type,
token_message,
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
m_source.substring_view(value_start - 1, m_position - value_start),
m_filename,
value_start_line_number,
value_start_column_number,
m_position);
if (m_hit_invalid_unicode.has_value()) {
value_start = m_hit_invalid_unicode.value() - 1;
m_current_token = Token(TokenType::Invalid, "Invalid unicode codepoint in source",
"", // Since the invalid unicode can occur anywhere in the current token the trivia is not correct
m_source.substring_view(value_start + 1, min(4u, m_source.length() - value_start - 2)),
m_filename,
m_line_number,
m_line_column - 1,
m_position);
m_hit_invalid_unicode.clear();
// Do not produce any further tokens.
VERIFY(is_eof());
} else {
m_current_token = Token(
token_type,
token_message,
m_source.substring_view(trivia_start - 1, value_start - trivia_start),
m_source.substring_view(value_start - 1, m_position - value_start),
m_filename,
value_start_line_number,
value_start_column_number,
m_position);
}
if (identifier.has_value())
m_current_token.set_identifier_value(identifier.release_value());