From ae6a84c2613ea977e5439cb543ef899f96f90c21 Mon Sep 17 00:00:00 2001
From: Sam Atkins
Date: Tue, 10 Jan 2023 22:57:32 +0000
Subject: [PATCH] LibGUI: Lex INI files as Utf8

Iterating byte by byte meant that the column positions assigned to INI
tokens would be off if there were any multi-byte codepoints. Using a
Utf8View means these positions refer to whole codepoints instead, and
the column positions match what GUI::TextEditor expects. :^)

Fixes #12706.
---
 Userland/Libraries/LibGUI/INILexer.cpp | 20 +++++++++-----------
 Userland/Libraries/LibGUI/INILexer.h   | 10 +++++-----
 2 files changed, 14 insertions(+), 16 deletions(-)

diff --git a/Userland/Libraries/LibGUI/INILexer.cpp b/Userland/Libraries/LibGUI/INILexer.cpp
index 6fdf3576b6..e3e9300003 100644
--- a/Userland/Libraries/LibGUI/INILexer.cpp
+++ b/Userland/Libraries/LibGUI/INILexer.cpp
@@ -1,5 +1,6 @@
 /*
  * Copyright (c) 2020, Hüseyin Aslıtürk
+ * Copyright (c) 2023, Sam Atkins
  *
  * SPDX-License-Identifier: BSD-2-Clause
  */
@@ -12,20 +13,20 @@ namespace GUI {
 
 IniLexer::IniLexer(StringView input)
     : m_input(input)
+    , m_iterator(m_input.begin())
 {
 }
 
-char IniLexer::peek(size_t offset) const
+u32 IniLexer::peek(size_t offset) const
 {
-    if ((m_index + offset) >= m_input.length())
-        return 0;
-    return m_input[m_index + offset];
+    return m_iterator.peek(offset).value_or(0);
 }
 
-char IniLexer::consume()
+u32 IniLexer::consume()
 {
-    VERIFY(m_index < m_input.length());
-    char ch = m_input[m_index++];
+    VERIFY(m_iterator != m_input.end());
+    u32 ch = *m_iterator;
+    ++m_iterator;
     if (ch == '\n') {
         m_position.line++;
         m_position.column = 0;
@@ -38,8 +39,6 @@ char IniLexer::consume()
 Vector IniLexer::lex()
 {
     Vector tokens;
-
-    size_t token_start_index = 0;
     IniPosition token_start_position;
 
     auto emit_token = [&](auto type) {
@@ -52,7 +51,6 @@ Vector IniLexer::lex()
     };
 
     auto begin_token = [&] {
-        token_start_index = m_index;
         token_start_position = m_position;
     };
 
@@ -64,7 +62,7 @@ Vector IniLexer::lex()
         tokens.append(token);
     };
 
-    while (m_index < m_input.length()) {
+    while (m_iterator != m_input.end()) {
         auto ch = peek();
 
         if (is_ascii_space(ch)) {
diff --git a/Userland/Libraries/LibGUI/INILexer.h b/Userland/Libraries/LibGUI/INILexer.h
index 4509044fdd..c32f4629be 100644
--- a/Userland/Libraries/LibGUI/INILexer.h
+++ b/Userland/Libraries/LibGUI/INILexer.h
@@ -6,7 +6,7 @@
 
 #pragma once
 
-#include
+#include
 
 namespace GUI {
 
@@ -57,11 +57,11 @@ public:
     Vector lex();
 
 private:
-    char peek(size_t offset = 0) const;
-    char consume();
+    u32 peek(size_t offset = 0) const;
+    u32 consume();
 
-    StringView m_input;
-    size_t m_index { 0 };
+    Utf8View m_input;
+    Utf8CodePointIterator m_iterator;
     IniPosition m_position { 0, 0 };
 };
 