mirror of
https://github.com/RGBCube/serenity
synced 2025-07-23 09:37:34 +00:00
LibJS: Implement RegExpCreate/RegExpInitialize closer to the spec
RegExpInitialize specifies how the pattern string should be created before passing it to [[RegExpMatcher]]. Rather than passing it as-is, the string should be converted to code points and back to a "List" (if the Unicode flag is present), or to a "List" of UTF-16 code units (otherwise). Further, the spec requires that we keep both the original pattern string and this parsed string in the RegExp object. The caveat is that the LibRegex parser further requires any multi-byte code units to be escaped (as "\unnnn"). Otherwise, the code unit is recognized as individual UTF-8 bytes.
This commit is contained in:
parent
345ef6abba
commit
a0c19deb80
3 changed files with 55 additions and 20 deletions
|
@ -175,7 +175,7 @@ void NewRegExp::execute_impl(Bytecode::Interpreter& interpreter) const
|
|||
auto source = interpreter.current_executable().get_string(m_source_index);
|
||||
auto flags = interpreter.current_executable().get_string(m_flags_index);
|
||||
|
||||
interpreter.accumulator() = RegExpObject::create(interpreter.global_object(), source, flags);
|
||||
interpreter.accumulator() = regexp_create(interpreter.global_object(), js_string(interpreter.vm(), source), js_string(interpreter.vm(), flags));
|
||||
}
|
||||
|
||||
void CopyObjectExcludingProperties::execute_impl(Bytecode::Interpreter& interpreter) const
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
#include <LibJS/Runtime/GlobalObject.h>
|
||||
#include <LibJS/Runtime/PrimitiveString.h>
|
||||
#include <LibJS/Runtime/RegExpObject.h>
|
||||
#include <LibJS/Runtime/StringPrototype.h>
|
||||
#include <LibJS/Runtime/Value.h>
|
||||
|
||||
namespace JS {
|
||||
|
@ -88,17 +89,18 @@ static Flags options_from(GlobalObject& global_object, const String& flags)
|
|||
return options;
|
||||
}
|
||||
|
||||
RegExpObject* RegExpObject::create(GlobalObject& global_object, String pattern, String flags)
|
||||
RegExpObject* RegExpObject::create(GlobalObject& global_object, String original_pattern, String parsed_pattern, String flags)
|
||||
{
|
||||
return global_object.heap().allocate<RegExpObject>(global_object, pattern, flags, *global_object.regexp_prototype());
|
||||
return global_object.heap().allocate<RegExpObject>(global_object, move(original_pattern), move(parsed_pattern), move(flags), *global_object.regexp_prototype());
|
||||
}
|
||||
|
||||
RegExpObject::RegExpObject(String pattern, String flags, Object& prototype)
|
||||
RegExpObject::RegExpObject(String original_pattern, String parsed_pattern, String flags, Object& prototype)
|
||||
: Object(prototype)
|
||||
, m_pattern(pattern)
|
||||
, m_flags(flags)
|
||||
, m_original_pattern(move(original_pattern))
|
||||
, m_parsed_pattern(move(parsed_pattern))
|
||||
, m_flags(move(flags))
|
||||
, m_active_flags(options_from(global_object(), m_flags))
|
||||
, m_regex(pattern, m_active_flags.effective_flags)
|
||||
, m_regex(m_parsed_pattern, m_active_flags.effective_flags)
|
||||
{
|
||||
if (m_regex.parser_result.error != regex::Error::NoError) {
|
||||
vm().throw_exception<SyntaxError>(global_object(), ErrorType::RegExpCompileError, m_regex.error_string());
|
||||
|
@ -120,14 +122,7 @@ void RegExpObject::initialize(GlobalObject& global_object)
|
|||
RegExpObject* regexp_create(GlobalObject& global_object, Value pattern, Value flags)
|
||||
{
|
||||
auto& vm = global_object.vm();
|
||||
String p;
|
||||
if (pattern.is_undefined()) {
|
||||
p = String::empty();
|
||||
} else {
|
||||
p = pattern.to_string(global_object);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
}
|
||||
|
||||
String f;
|
||||
if (flags.is_undefined()) {
|
||||
f = String::empty();
|
||||
|
@ -136,7 +131,46 @@ RegExpObject* regexp_create(GlobalObject& global_object, Value pattern, Value fl
|
|||
if (vm.exception())
|
||||
return {};
|
||||
}
|
||||
auto* object = RegExpObject::create(global_object, move(p), move(f));
|
||||
|
||||
String original_pattern;
|
||||
String parsed_pattern;
|
||||
|
||||
if (pattern.is_undefined()) {
|
||||
original_pattern = String::empty();
|
||||
parsed_pattern = String::empty();
|
||||
} else {
|
||||
auto utf16_pattern = pattern.to_utf16_string(global_object);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
|
||||
Utf16View utf16_pattern_view { utf16_pattern };
|
||||
bool unicode = f.find('u').has_value();
|
||||
StringBuilder builder;
|
||||
|
||||
// If the Unicode flag is set, append each code point to the pattern. Otherwise, append each
|
||||
// code unit. But unlike the spec, multi-byte code units must be escaped for LibRegex to parse.
|
||||
for (size_t i = 0; i < utf16_pattern_view.length_in_code_units();) {
|
||||
if (unicode) {
|
||||
auto code_point = code_point_at(utf16_pattern_view, i);
|
||||
builder.append_code_point(code_point.code_point);
|
||||
i += code_point.code_unit_count;
|
||||
continue;
|
||||
}
|
||||
|
||||
u16 code_unit = utf16_pattern_view.code_unit_at(i);
|
||||
++i;
|
||||
|
||||
if (code_unit > 0x7f)
|
||||
builder.appendff("\\u{:04x}", code_unit);
|
||||
else
|
||||
builder.append_code_point(code_unit);
|
||||
}
|
||||
|
||||
original_pattern = utf16_pattern_view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes);
|
||||
parsed_pattern = builder.build();
|
||||
}
|
||||
|
||||
auto* object = RegExpObject::create(global_object, move(original_pattern), move(parsed_pattern), move(f));
|
||||
object->set(vm.names.lastIndex, Value(0), Object::ShouldThrowExceptions::Yes);
|
||||
if (vm.exception())
|
||||
return {};
|
||||
|
|
|
@ -23,20 +23,21 @@ class RegExpObject : public Object {
|
|||
JS_OBJECT(RegExpObject, Object);
|
||||
|
||||
public:
|
||||
static RegExpObject* create(GlobalObject&, String pattern, String flags);
|
||||
static RegExpObject* create(GlobalObject&, String original_pattern, String parsed_pattern, String flags);
|
||||
|
||||
RegExpObject(String pattern, String flags, Object& prototype);
|
||||
RegExpObject(String original_pattern, String parsed_pattern, String flags, Object& prototype);
|
||||
virtual void initialize(GlobalObject&) override;
|
||||
virtual ~RegExpObject() override;
|
||||
|
||||
const String& pattern() const { return m_pattern; }
|
||||
const String& pattern() const { return m_original_pattern; }
|
||||
const String& flags() const { return m_flags; }
|
||||
const regex::RegexOptions<ECMAScriptFlags>& declared_options() { return m_active_flags.declared_flags; }
|
||||
const Regex<ECMA262>& regex() { return m_regex; }
|
||||
const Regex<ECMA262>& regex() const { return m_regex; }
|
||||
|
||||
private:
|
||||
String m_pattern;
|
||||
String m_original_pattern;
|
||||
String m_parsed_pattern;
|
||||
String m_flags;
|
||||
Flags m_active_flags;
|
||||
Regex<ECMA262> m_regex;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue