AK: Bring JsonParser's string consumption closer to the ECMA 404 spec
I added some spec comments and implementation notes; this should not
change behavior in a significant way.
The previous code was quite unwieldy and repetitive.
The long `if (next_is('X'))` chain is now a smaller `switch`.
I also reinstated the fast path for long sequences of literal
characters, which was broken in 0aad21fff2.
parent c990db0913
commit e02a4f5181
1 changed file with 85 additions and 75 deletions
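Illustration (not part of the commit): the refactor combines two ideas, batching runs of plain literal bytes into a single append and routing escape characters through one switch. The sketch below shows the same shape against plain std::string_view/std::string instead of SerenityOS's GenericLexer and StringBuilder; the name unescape_json_string and the simplified error handling (std::nullopt, no \uXXXX support) are made up for this example and are not code from this commit.

#include <optional>
#include <string>
#include <string_view>

// Hedged sketch of the approach only; not the AK::JsonParser implementation.
std::optional<std::string> unescape_json_string(std::string_view input, size_t& index)
{
    if (index >= input.size() || input[index] != '"')
        return std::nullopt;
    ++index; // opening quote

    std::string result;
    for (;;) {
        // Fast path: count how many plain literal bytes follow, then append them in one go.
        size_t literal_characters = 0;
        for (;;) {
            if (index + literal_characters >= input.size())
                return std::nullopt; // EOF inside the string
            char ch = input[index + literal_characters];
            if (static_cast<unsigned char>(ch) < 0x20)
                return std::nullopt; // unescaped control character
            if (ch == '"' || ch == '\\')
                break;
            ++literal_characters;
        }
        result.append(input.substr(index, literal_characters));
        index += literal_characters;

        if (input[index] == '"') {
            ++index; // closing quote
            return result;
        }

        ++index; // skip the '\'
        if (index >= input.size())
            return std::nullopt;
        // One switch instead of a chain of if (next_is('X')) checks.
        switch (input[index]) {
        case '"': case '\\': case '/': result += input[index]; break;
        case 'b': result += '\b'; break;
        case 'f': result += '\f'; break;
        case 'n': result += '\n'; break;
        case 'r': result += '\r'; break;
        case 't': result += '\t'; break;
        default: return std::nullopt; // "\uXXXX" handling omitted; see the diff below
        }
        ++index;
    }
}

With size_t i = 0;, unescape_json_string(R"("a\tb")", i) would return the three characters a, TAB, b.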
@@ -18,6 +18,19 @@ constexpr bool is_space(int ch)
     return ch == '\t' || ch == '\n' || ch == '\r' || ch == ' ';
 }
 
+// ECMA-404 9 String
+// Boils down to
+// STRING = "\"" *("[^\"\\]" | "\\" ("[\"\\bfnrt]" | "u[0-9A-Za-z]{4}")) "\""
+// │├── " ──╮───────────────────────────────────────────────╭── " ──┤│
+// │        │                                               │        │
+// │        │  ╭───────────────────<─────────────────────╮  │        │
+// │        │  │                                         │  │        │
+//          ╰──╰──╮───────────── [^"\\] ──────────────╭──╯──╯
+//                │                                   │
+//                ╰── \ ───╮──── ["\\bfnrt] ───────╭──╯
+//                         │                       │
+//                         ╰─── u[0-9A-Za-z]{4} ──╯
+//
 ErrorOr<DeprecatedString> JsonParser::consume_and_unescape_string()
 {
     if (!consume_specific('"'))
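As a rough illustration of the grammar in the comment above (not something this commit adds): an ECMA-404 string is a pair of quotation marks around any mix of unescaped, non-control code points and short escapes, where \u must be followed by four hexadecimal digits. The same shape can be written as a quick std::regex self-check; the pattern and the sample strings here are made up for illustration only.

#include <cassert>
#include <regex>

int main()
{
    // Rough ECMA-404 string shape; a standalone sanity check, not used by JsonParser.
    std::regex json_string(R"re("(?:[^"\\\x00-\x1f]|\\(?:["\\/bfnrt]|u[0-9A-Fa-f]{4}))*")re");

    assert(std::regex_match(R"("plain text")", json_string));
    assert(std::regex_match(R"("tab:\t quote:\" clef:\uD834\uDD1E")", json_string));
    assert(!std::regex_match(R"("broken \x escape")", json_string));
    return 0;
}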
@@ -25,98 +38,95 @@ ErrorOr<DeprecatedString> JsonParser::consume_and_unescape_string()
     StringBuilder final_sb;
 
     for (;;) {
-        size_t peek_index = m_index;
-        char ch = 0;
+        // OPTIMIZATION: We try to append as many literal characters as possible at a time
+        // This also pre-checks some error conditions
+        // Note: All utf8 characters are either plain ascii, or have their most significant bit set,
+        // which puts them above plain ascii in value, so they will always consist
+        // of a set of "legal" non-special bytes,
+        // hence we don't need to bother with a code-point iterator,
+        // as a simple byte iterator suffices, which GenericLexer provides by default
+        size_t literal_characters = 0;
         for (;;) {
-            if (peek_index == m_input.length())
-                break;
-            ch = m_input[peek_index];
+            char ch = peek(literal_characters);
+            // Note: We get a 0 byte when we hit EOF
+            if (ch == 0)
+                return Error::from_string_literal("JsonParser: EOF while parsing String");
+            // Spec: All code points may be placed within the quotation marks except
+            // for the code points that must be escaped: quotation mark (U+0022),
+            // reverse solidus (U+005C), and the control characters U+0000 to U+001F.
+            // There are two-character escape sequence representations of some characters.
+            if (is_ascii_c0_control(ch))
+                return Error::from_string_literal("JsonParser: ASCII control sequence encountered");
             if (ch == '"' || ch == '\\')
                 break;
-            if (is_ascii_c0_control(ch))
-                return Error::from_string_literal("JsonParser: Error while parsing string");
-            ++peek_index;
-        }
-
-        while (peek_index != m_index) {
-            final_sb.append(m_input[m_index]);
-            m_index++;
+            ++literal_characters;
         }
+        final_sb.append(consume(literal_characters));
 
-        if (m_index == m_input.length())
-            break;
-        if (ch == '"')
-            break;
-        if (ch != '\\') {
-            final_sb.append(consume());
-            continue;
-        }
-        ignore();
-        if (next_is('"')) {
-            ignore();
-            final_sb.append('"');
-            continue;
-        }
-
-        if (next_is('\\')) {
-            ignore();
-            final_sb.append('\\');
-            continue;
-        }
+        // We have checked all cases except end-of-string and escaped characters in the loop above,
+        // so we now only have to handle those two cases
+        char ch = peek();
 
-        if (next_is('/')) {
-            ignore();
-            final_sb.append('/');
-            continue;
+        if (ch == '"') {
+            consume();
+            break;
         }
 
-        if (next_is('n')) {
-            ignore();
-            final_sb.append('\n');
-            continue;
-        }
-
-        if (next_is('r')) {
-            ignore();
-            final_sb.append('\r');
-            continue;
-        }
-
-        if (next_is('t')) {
-            ignore();
-            final_sb.append('\t');
-            continue;
-        }
-
-        if (next_is('b')) {
+        ignore(); // '\'
+        switch (peek()) {
+        case '\0':
+            return Error::from_string_literal("JsonParser: EOF while parsing String");
+        case '"':
+        case '\\':
+        case '/':
+            final_sb.append(consume());
+            break;
+        case 'b':
             ignore();
             final_sb.append('\b');
-            continue;
-        }
-
-        if (next_is('f')) {
+            break;
+        case 'f':
             ignore();
             final_sb.append('\f');
-            continue;
-        }
-
-        if (next_is('u')) {
+            break;
+        case 'n':
             ignore();
+            final_sb.append('\n');
+            break;
+        case 'r':
+            ignore();
+            final_sb.append('\r');
+            break;
+        case 't':
+            ignore();
+            final_sb.append('\t');
+            break;
+        case 'u': {
+            ignore(); // 'u'
+
             if (tell_remaining() < 4)
                 return Error::from_string_literal("JsonParser: EOF while parsing Unicode escape");
-            auto code_point = AK::StringUtils::convert_to_uint_from_hex(consume(4));
-            if (code_point.has_value()) {
-                final_sb.append_code_point(code_point.value());
-                continue;
+            auto escaped_string = consume(4);
+            auto code_point = AK::StringUtils::convert_to_uint_from_hex(escaped_string);
+            if (!code_point.has_value()) {
+                dbgln("JsonParser: Error while parsing Unicode escape {}", escaped_string);
+                return Error::from_string_literal("JsonParser: Error while parsing Unicode escape");
             }
-            return Error::from_string_literal("JsonParser: Error while parsing Unicode escape");
+            // Note/FIXME: "To escape a code point that is not in the Basic Multilingual Plane, the character may be represented as a
+            // twelve-character sequence, encoding the UTF-16 surrogate pair corresponding to the code point. So for
+            // example, a string containing only the G clef character (U+1D11E) may be represented as "\uD834\uDD1E".
+            // However, whether a processor of JSON texts interprets such a surrogate pair as a single code point or as an
+            // explicit surrogate pair is a semantic decision that is determined by the specific processor."
+            // ~ECMA-404, 2nd Edition Dec. 2017, page 5
+            final_sb.append_code_point(code_point.value());
+            break;
+        }
+        default:
+            dbgln("JsonParser: Invalid escaped character '{}' ({:#x}) ", peek(), peek());
+            return Error::from_string_literal("JsonParser: Invalid escaped character");
         }
-
-        return Error::from_string_literal("JsonParser: Error while parsing string");
     }
-    if (!consume_specific('"'))
-        return Error::from_string_literal("JsonParser: Expected '\"'");
 
     return final_sb.to_deprecated_string();
 }
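The Note/FIXME quoted from ECMA-404 in the 'u' case concerns surrogate pairs: this commit still appends every \uXXXX value as its own code point, so an escaped pair such as \uD834\uDD1E is not recombined into U+1D11E. For reference, the usual UTF-16 recombination is a small calculation; the helper below is a hypothetical standalone sketch, not something this commit adds to JsonParser.

#include <cstdint>
#include <optional>

// Hypothetical helper: combine a UTF-16 surrogate pair (e.g. from "\uD834\uDD1E")
// into the code point it encodes (U+1D11E for that example).
constexpr std::optional<uint32_t> combine_surrogates(uint16_t high, uint16_t low)
{
    bool is_high_surrogate = high >= 0xD800 && high <= 0xDBFF;
    bool is_low_surrogate = low >= 0xDC00 && low <= 0xDFFF;
    if (!is_high_surrogate || !is_low_surrogate)
        return std::nullopt;
    return 0x10000u + ((uint32_t(high) - 0xD800u) << 10) + (uint32_t(low) - 0xDC00u);
}

static_assert(combine_surrogates(0xD834, 0xDD1E) == 0x1D11Eu);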