mirror of
https://github.com/RGBCube/serenity
synced 2025-07-25 19:47:44 +00:00
LibJS: Decode UTF-16 surrogate pairs during string literal construction
Rather than deferring this decoding to PrimitiveString, we can decode surrogate pairs when parsing the string. This prevents a string copy when constructing the PrimitiveString.
This commit is contained in:
parent
27d555bab0
commit
4c2cc419f9
2 changed files with 28 additions and 14 deletions
|
@ -63,17 +63,6 @@ PrimitiveString* js_string(Heap& heap, String string)
|
||||||
return &heap.vm().single_ascii_character_string(ch);
|
return &heap.vm().single_ascii_character_string(ch);
|
||||||
}
|
}
|
||||||
|
|
||||||
// UTF-8 strings must first be transcoded to UTF-16, even though they are stored as String objects
|
|
||||||
// internally, to parse encoded surrogate pairs. As an optimization to reduce string copying, only
|
|
||||||
// perform that transcoding if there are non-ASCII codepoints in the string.
|
|
||||||
for (auto it : string) {
|
|
||||||
auto ch = static_cast<u8>(it);
|
|
||||||
if (!is_ascii(ch)) {
|
|
||||||
auto utf16_string = AK::utf8_to_utf16(string);
|
|
||||||
return js_string(heap, Utf16View { utf16_string });
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return heap.allocate_without_global_object<PrimitiveString>(move(string));
|
return heap.allocate_without_global_object<PrimitiveString>(move(string));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -10,6 +10,7 @@
|
||||||
#include <AK/CharacterTypes.h>
|
#include <AK/CharacterTypes.h>
|
||||||
#include <AK/GenericLexer.h>
|
#include <AK/GenericLexer.h>
|
||||||
#include <AK/StringBuilder.h>
|
#include <AK/StringBuilder.h>
|
||||||
|
#include <AK/Utf16View.h>
|
||||||
|
|
||||||
namespace JS {
|
namespace JS {
|
||||||
|
|
||||||
|
@ -102,6 +103,16 @@ String Token::string_value(StringValueStatus& status) const
|
||||||
return {};
|
return {};
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Reads four hexadecimal digits from `lexer` (captured by reference) and packs them,
// most-significant digit first, into a single 16-bit code unit.
// Returns an empty Optional as soon as the next character is not an ASCII hex digit;
// this lambda does not rewind the lexer, so digits consumed before that point stay consumed.
auto decode_surrogate = [&lexer]() -> Optional<u16> {
|
||||||
|
u16 surrogate = 0;
|
||||||
|
for (int j = 0; j < 4; ++j) {
|
||||||
|
if (!lexer.next_is(is_ascii_hex_digit))
|
||||||
|
return {};
|
||||||
|
surrogate = (surrogate << 4u) | hex2int(lexer.consume());
|
||||||
|
}
|
||||||
|
return surrogate;
|
||||||
|
};
|
||||||
|
|
||||||
StringBuilder builder;
|
StringBuilder builder;
|
||||||
while (!lexer.is_eof()) {
|
while (!lexer.is_eof()) {
|
||||||
// No escape, consume one char and continue
|
// No escape, consume one char and continue
|
||||||
|
@ -157,10 +168,24 @@ String Token::string_value(StringValueStatus& status) const
|
||||||
}
|
}
|
||||||
lexer.ignore();
|
lexer.ignore();
|
||||||
} else {
|
} else {
|
||||||
for (int j = 0; j < 4; ++j) {
|
auto high_surrogate = decode_surrogate();
|
||||||
if (!lexer.next_is(is_ascii_hex_digit))
|
if (!high_surrogate.has_value())
|
||||||
|
return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
|
||||||
|
|
||||||
|
if (Utf16View::is_high_surrogate(*high_surrogate) && lexer.consume_specific("\\u"sv)) {
|
||||||
|
auto low_surrogate = decode_surrogate();
|
||||||
|
if (!low_surrogate.has_value())
|
||||||
return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
|
return encoding_failure(StringValueStatus::MalformedUnicodeEscape);
|
||||||
code_point = (code_point << 4u) | hex2int(lexer.consume());
|
|
||||||
|
if (Utf16View::is_low_surrogate(*low_surrogate)) {
|
||||||
|
code_point = Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate);
|
||||||
|
} else {
|
||||||
|
builder.append_code_point(*high_surrogate);
|
||||||
|
code_point = *low_surrogate;
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
code_point = *high_surrogate;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
builder.append_code_point(code_point);
|
builder.append_code_point(code_point);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue