mirror of
https://github.com/RGBCube/serenity
synced 2025-07-25 03:57:43 +00:00
LibWeb: Plumb content encoding into the new HTML parser
We still don't handle non-ASCII input correctly, but at least now we'll convert e.g. ISO-8859-1 to UTF-8 before starting to tokenize. This patch also makes "view source" work with the new parser. :^)
This commit is contained in:
parent
772b51038e
commit
5e53c45113
6 changed files with 18 additions and 9 deletions
|
@ -444,7 +444,7 @@ RefPtr<Document> HtmlView::create_document_from_mime_type(const ByteBuffer& data
|
||||||
return create_gemini_document(data, url);
|
return create_gemini_document(data, url);
|
||||||
if (mime_type == "text/html") {
|
if (mime_type == "text/html") {
|
||||||
if (m_use_new_parser) {
|
if (m_use_new_parser) {
|
||||||
HTMLDocumentParser parser(data);
|
HTMLDocumentParser parser(data, encoding);
|
||||||
parser.run(url);
|
parser.run(url);
|
||||||
return parser.document();
|
return parser.document();
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,7 +24,7 @@
|
||||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#define PARSER_DEBUG
|
//#define PARSER_DEBUG
|
||||||
|
|
||||||
#include <AK/Utf32View.h>
|
#include <AK/Utf32View.h>
|
||||||
#include <LibWeb/DOM/Comment.h>
|
#include <LibWeb/DOM/Comment.h>
|
||||||
|
@ -51,8 +51,8 @@
|
||||||
|
|
||||||
namespace Web {
|
namespace Web {
|
||||||
|
|
||||||
HTMLDocumentParser::HTMLDocumentParser(const StringView& input)
|
HTMLDocumentParser::HTMLDocumentParser(const StringView& input, const String& encoding)
|
||||||
: m_tokenizer(input)
|
: m_tokenizer(input, encoding)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -64,6 +64,7 @@ void HTMLDocumentParser::run(const URL& url)
|
||||||
{
|
{
|
||||||
m_document = adopt(*new Document);
|
m_document = adopt(*new Document);
|
||||||
m_document->set_url(url);
|
m_document->set_url(url);
|
||||||
|
m_document->set_source(m_tokenizer.source());
|
||||||
|
|
||||||
for (;;) {
|
for (;;) {
|
||||||
auto optional_token = m_tokenizer.next_token();
|
auto optional_token = m_tokenizer.next_token();
|
||||||
|
|
|
@ -61,7 +61,7 @@ namespace Web {
|
||||||
|
|
||||||
class HTMLDocumentParser {
|
class HTMLDocumentParser {
|
||||||
public:
|
public:
|
||||||
explicit HTMLDocumentParser(const StringView& input);
|
HTMLDocumentParser(const StringView& input, const String& encoding);
|
||||||
~HTMLDocumentParser();
|
~HTMLDocumentParser();
|
||||||
|
|
||||||
void run(const URL&);
|
void run(const URL&);
|
||||||
|
|
|
@ -24,6 +24,7 @@
|
||||||
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <LibTextCodec/Decoder.h>
|
||||||
#include <LibWeb/Parser/Entities.h>
|
#include <LibWeb/Parser/Entities.h>
|
||||||
#include <LibWeb/Parser/HTMLToken.h>
|
#include <LibWeb/Parser/HTMLToken.h>
|
||||||
#include <LibWeb/Parser/HTMLTokenizer.h>
|
#include <LibWeb/Parser/HTMLTokenizer.h>
|
||||||
|
@ -1711,9 +1712,12 @@ void HTMLTokenizer::create_new_token(HTMLToken::Type type)
|
||||||
m_current_token.m_type = type;
|
m_current_token.m_type = type;
|
||||||
}
|
}
|
||||||
|
|
||||||
HTMLTokenizer::HTMLTokenizer(const StringView& input)
|
HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding)
|
||||||
: m_input(input)
|
|
||||||
{
|
{
|
||||||
|
auto* decoder = TextCodec::decoder_for(encoding);
|
||||||
|
ASSERT(decoder);
|
||||||
|
m_decoded_input = decoder->to_utf8(input);
|
||||||
|
m_input = m_decoded_input;
|
||||||
}
|
}
|
||||||
|
|
||||||
void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
|
void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)
|
||||||
|
|
|
@ -118,7 +118,7 @@ namespace Web {
|
||||||
|
|
||||||
class HTMLTokenizer {
|
class HTMLTokenizer {
|
||||||
public:
|
public:
|
||||||
explicit HTMLTokenizer(const StringView& input);
|
explicit HTMLTokenizer(const StringView& input, const String& encoding);
|
||||||
|
|
||||||
enum class State {
|
enum class State {
|
||||||
#define __ENUMERATE_TOKENIZER_STATE(state) state,
|
#define __ENUMERATE_TOKENIZER_STATE(state) state,
|
||||||
|
@ -133,6 +133,8 @@ public:
|
||||||
void set_blocked(bool b) { m_blocked = b; }
|
void set_blocked(bool b) { m_blocked = b; }
|
||||||
bool is_blocked() const { return m_blocked; }
|
bool is_blocked() const { return m_blocked; }
|
||||||
|
|
||||||
|
String source() const { return m_decoded_input; }
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Optional<u32> next_codepoint();
|
Optional<u32> next_codepoint();
|
||||||
Optional<u32> peek_codepoint(size_t offset) const;
|
Optional<u32> peek_codepoint(size_t offset) const;
|
||||||
|
@ -163,6 +165,8 @@ private:
|
||||||
|
|
||||||
Vector<u32> m_temporary_buffer;
|
Vector<u32> m_temporary_buffer;
|
||||||
|
|
||||||
|
String m_decoded_input;
|
||||||
|
|
||||||
StringView m_input;
|
StringView m_input;
|
||||||
size_t m_cursor { 0 };
|
size_t m_cursor { 0 };
|
||||||
|
|
||||||
|
|
|
@ -47,7 +47,7 @@ int main(int argc, char** argv)
|
||||||
return 1;
|
return 1;
|
||||||
auto contents = file_or_error.value()->read_all();
|
auto contents = file_or_error.value()->read_all();
|
||||||
|
|
||||||
Web::HTMLDocumentParser parser(contents);
|
Web::HTMLDocumentParser parser(contents, "utf-8");
|
||||||
parser.run(URL::create_with_file_protocol(input_path));
|
parser.run(URL::create_with_file_protocol(input_path));
|
||||||
|
|
||||||
auto& document = parser.document();
|
auto& document = parser.document();
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue