mirror of
https://github.com/RGBCube/serenity
synced 2025-05-31 05:18:12 +00:00
LibWeb: HTML Parser, handle html escaped characters
Convert HTML escaped (&#XXX;) characters to string.
This commit is contained in:
parent
738235574f
commit
241df7206e
1 changed files with 56 additions and 2 deletions
|
@ -27,6 +27,7 @@
|
||||||
#include <AK/Function.h>
|
#include <AK/Function.h>
|
||||||
#include <AK/NonnullRefPtrVector.h>
|
#include <AK/NonnullRefPtrVector.h>
|
||||||
#include <AK/StringBuilder.h>
|
#include <AK/StringBuilder.h>
|
||||||
|
#include <AK/StringUtils.h>
|
||||||
#include <LibTextCodec/Decoder.h>
|
#include <LibTextCodec/Decoder.h>
|
||||||
#include <LibWeb/DOM/Comment.h>
|
#include <LibWeb/DOM/Comment.h>
|
||||||
#include <LibWeb/DOM/DocumentFragment.h>
|
#include <LibWeb/DOM/DocumentFragment.h>
|
||||||
|
@ -64,6 +65,23 @@ static bool is_void_element(const StringView& tag_name)
|
||||||
|| tag_name == "wbr";
|
|| tag_name == "wbr";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Encodes a single Unicode code point as UTF-8.
//
// Returns the encoded bytes (1 to 4 of them) in output order. Returns an empty
// vector when `codepoint` is not a Unicode scalar value, i.e. it lies in the
// UTF-16 surrogate range 0xD800-0xDFFF or is greater than 0x10FFFF. Callers can
// therefore treat an empty result as "nothing could be encoded".
static Vector<char> codepoint_to_bytes(const u32 codepoint)
{
    Vector<char> bytes;

    if (codepoint < 0x80) {
        // 1 byte: 0xxxxxxx
        bytes.append(static_cast<char>(codepoint));
    } else if (codepoint < 0x800) {
        // 2 bytes: 110xxxxx 10xxxxxx
        bytes.append(static_cast<char>(0xC0 | (codepoint >> 6)));
        bytes.append(static_cast<char>(0x80 | (codepoint & 0x3F)));
    } else if (codepoint < 0x10000) {
        // Surrogate halves are not valid scalar values and must not be encoded.
        if (codepoint >= 0xD800 && codepoint <= 0xDFFF)
            return bytes;

        // 3 bytes: 1110xxxx 10xxxxxx 10xxxxxx
        bytes.append(static_cast<char>(0xE0 | (codepoint >> 12)));
        bytes.append(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
        bytes.append(static_cast<char>(0x80 | (codepoint & 0x3F)));
    } else if (codepoint <= 0x10FFFF) {
        // 4 bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        bytes.append(static_cast<char>(0xF0 | (codepoint >> 18)));
        bytes.append(static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F)));
        bytes.append(static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F)));
        bytes.append(static_cast<char>(0x80 | (codepoint & 0x3F)));
    }

    return bytes;
}
|
||||||
|
|
||||||
static bool parse_html_document(const StringView& html, Document& document, ParentNode& root)
|
static bool parse_html_document(const StringView& html, Document& document, ParentNode& root)
|
||||||
{
|
{
|
||||||
NonnullRefPtrVector<ParentNode> node_stack;
|
NonnullRefPtrVector<ParentNode> node_stack;
|
||||||
|
@ -213,6 +231,7 @@ static bool parse_html_document(const StringView& html, Document& document, Pare
|
||||||
};
|
};
|
||||||
auto rest_of_html = html.substring_view(i, html.length() - i);
|
auto rest_of_html = html.substring_view(i, html.length() - i);
|
||||||
bool found = false;
|
bool found = false;
|
||||||
|
|
||||||
for (auto& escape : escapes) {
|
for (auto& escape : escapes) {
|
||||||
if (rest_of_html.starts_with(escape.code)) {
|
if (rest_of_html.starts_with(escape.code)) {
|
||||||
text_buffer.append(escape.value);
|
text_buffer.append(escape.value);
|
||||||
|
@ -221,8 +240,43 @@ static bool parse_html_document(const StringView& html, Document& document, Pare
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!found)
|
|
||||||
dbg() << "Unhandled escape sequence";
|
if (!found) {
|
||||||
|
char num_sign = html[i + 1];
|
||||||
|
if (num_sign && num_sign == '#') {
|
||||||
|
int j = 2; // skip '&#' and search for ';'
|
||||||
|
while (html[i + j] != ';' && j < 7) {
|
||||||
|
j++;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (j < 7) { // We found ; char
|
||||||
|
bool ok;
|
||||||
|
u32 codepoint;
|
||||||
|
String str_code_point = html.substring_view(i + 2, j - 2);
|
||||||
|
if (str_code_point.starts_with('x')) {
|
||||||
|
String str = str_code_point.substring(1, str_code_point.length() - 1);
|
||||||
|
codepoint = AK::StringUtils::convert_to_uint_from_hex(str, ok);
|
||||||
|
} else {
|
||||||
|
codepoint = str_code_point.to_uint(ok);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (ok) {
|
||||||
|
Vector<char> bytes = codepoint_to_bytes(codepoint);
|
||||||
|
if (bytes.size() > 0) {
|
||||||
|
for (size_t i = 0; i < bytes.size(); i++) {
|
||||||
|
text_buffer.append(bytes.at(i));
|
||||||
|
}
|
||||||
|
found = true;
|
||||||
|
i = i + j;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!found) {
|
||||||
|
dbg() << "Unhandled escape sequence:" << html.substring_view(i, min((size_t)5, html.length()));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
break;
|
break;
|
||||||
case State::BeforeTagName:
|
case State::BeforeTagName:
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue