1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 04:27:43 +00:00

LibRegex: Support ECMA-262 Unicode escapes of the form "\u{code_point}"

When the Unicode flag is set, regular expressions may escape code points
by surrounding the hexadecimal code point with curly braces, e.g. \u{41}
is the character "A".

When the Unicode flag is not set, the braces should instead be parsed as a
repetition quantifier applied to the literal "u" - \u{41} is the character
"u" repeated 41 times. Handling that case is left as a TODO for now.
This commit is contained in:
Timothy Flynn 2021-07-22 09:25:58 -04:00 committed by Linus Groh
parent 0e6375558d
commit 345ef6abba
2 changed files with 27 additions and 0 deletions

View file

@ -7,6 +7,7 @@
#include "RegexParser.h"
#include "RegexDebug.h"
#include <AK/CharacterTypes.h>
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/StringUtils.h>
@ -1440,6 +1441,26 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
}
if (try_skip("u")) {
if (match(TokenType::LeftCurly)) {
consume();
if (!unicode) {
// FIXME: In non-Unicode mode, this should be parsed as a repetition symbol (repeating the 'u').
TODO();
}
auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 6);
if (code_point.has_value() && is_unicode(*code_point) && match(TokenType::RightCurly)) {
consume();
match_length_minimum += 1;
stack.insert_bytecode_compare_values({ { CharacterCompareType::Char, (ByteCodeValueType)code_point.value() } });
return true;
}
set_error(Error::InvalidPattern);
return false;
}
if (auto code_point = read_digits(ReadDigitsInitialZeroState::Allow, true, 4); code_point.has_value()) {
// In Unicode mode, we need to combine surrogate pairs into a single code point. But we also need to be
// rather forgiving if the surrogate pairs are invalid. So if a second code unit follows this code unit,