Everywhere: Replace ctype.h to avoid narrowing conversions

This replaces ctype.h with CharacterTypes.h everywhere I could find issues with narrowing conversions. Although using CharacterTypes.h will probably make sense almost everywhere in the future, this change should at least address the most critical places.
2025-09-18 04:16:17 +00:00 · 2021-06-01 21:18:08 +02:00 · 2021-06-01 21:18:08 +02:00 · bc8d16ad28
commit bc8d16ad28
parent 1c9d87c455
16 changed files with 153 additions and 266 deletions
--- a/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
+++ b/Userland/Libraries/LibWeb/CSS/Parser/Tokenizer.cpp
@ -4,11 +4,11 @@
 * SPDX-License-Identifier: BSD-2-Clause
 */

+#include <AK/CharacterTypes.h>
 #include <AK/SourceLocation.h>
 #include <AK/Vector.h>
 #include <LibTextCodec/Decoder.h>
 #include <LibWeb/CSS/Parser/Tokenizer.h>
-#include <ctype.h>

 #define CSS_TOKENIZER_TRACE 0

@ -20,11 +20,6 @@ static inline void log_parse_error(const SourceLocation& location = SourceLocati
    dbgln_if(CSS_TOKENIZER_TRACE, "Parse error (css tokenization) {} ", location);
 }

-static inline bool is_surrogate(u32 code_point)
-{
-    return (code_point & 0xfffff800) == 0xd800;
-}
-
 static inline bool is_quotation_mark(u32 code_point)
 {
    return code_point == 0x22;
@ -35,24 +30,14 @@ static inline bool is_greater_than_maximum_allowed_code_point(u32 code_point)
    return code_point > 0x10FFFF;
 }

-static inline bool is_hex_digit(u32 code_point)
-{
-    return isxdigit(code_point);
-}
-
 static inline bool is_low_line(u32 code_point)
 {
    return code_point == 0x5F;
 }

-static inline bool is_non_ascii(u32 code_point)
-{
-    return code_point >= 0x80;
-}
-
 static inline bool is_name_start_code_point(u32 code_point)
 {
-    return isalpha(code_point) || is_non_ascii(code_point) || is_low_line(code_point);
+    return is_ascii_alpha(code_point) || !is_ascii(code_point) || is_low_line(code_point);
 }

 static inline bool is_hyphen_minus(u32 code_point)
@ -62,7 +47,7 @@ static inline bool is_hyphen_minus(u32 code_point)

 static inline bool is_name_code_point(u32 code_point)
 {
-    return is_name_start_code_point(code_point) || isdigit(code_point) || is_hyphen_minus(code_point);
+    return is_name_start_code_point(code_point) || is_ascii_digit(code_point) || is_hyphen_minus(code_point);
 }

 static inline bool is_non_printable(u32 code_point)
@ -303,12 +288,12 @@ u32 Tokenizer::consume_escaped_code_point()

    auto input = code_point.value();

-    if (is_hex_digit(input)) {
+    if (is_ascii_hex_digit(input)) {
        StringBuilder builder;
        builder.append_code_point(input);

        size_t counter = 0;
-        while (is_hex_digit(peek_code_point().value()) && counter++ < 5) {
+        while (is_ascii_hex_digit(peek_code_point().value()) && counter++ < 5) {
            builder.append_code_point(next_code_point().value());
        }

@ -317,7 +302,7 @@ u32 Tokenizer::consume_escaped_code_point()
        }

        auto unhexed = strtoul(builder.to_string().characters(), nullptr, 16);
-        if (unhexed == 0 || is_surrogate(unhexed) || is_greater_than_maximum_allowed_code_point(unhexed)) {
+        if (unhexed == 0 || is_unicode_surrogate(unhexed) || is_greater_than_maximum_allowed_code_point(unhexed)) {
            return REPLACEMENT_CHARACTER;
        }

@ -378,14 +363,14 @@ CSSNumber Tokenizer::consume_a_number()

    for (;;) {
        auto digits = peek_code_point().value();
-        if (!isdigit(digits))
+        if (!is_ascii_digit(digits))
            break;

        repr.append_code_point(next_code_point().value());
    }

    auto maybe_number = peek_twin().value();
-    if (is_full_stop(maybe_number.first) && isdigit(maybe_number.second)) {
+    if (is_full_stop(maybe_number.first) && is_ascii_digit(maybe_number.second)) {
        repr.append_code_point(next_code_point().value());
        repr.append_code_point(next_code_point().value());

@ -393,7 +378,7 @@ CSSNumber Tokenizer::consume_a_number()

        for (;;) {
            auto digits = peek_code_point();
-            if (digits.has_value() && !isdigit(digits.value()))
+            if (digits.has_value() && !is_ascii_digit(digits.value()))
                break;

            repr.append_code_point(next_code_point().value());
@ -403,12 +388,12 @@ CSSNumber Tokenizer::consume_a_number()
    auto maybe_exp = peek_triplet().value();
    if (is_E(maybe_exp.first) || is_e(maybe_exp.first)) {
        if (is_plus_sign(maybe_exp.second) || is_hyphen_minus(maybe_exp.second)) {
-            if (isdigit(maybe_exp.third)) {
+            if (is_ascii_digit(maybe_exp.third)) {
                repr.append_code_point(next_code_point().value());
                repr.append_code_point(next_code_point().value());
                repr.append_code_point(next_code_point().value());
            }
-        } else if (isdigit(maybe_exp.second)) {
+        } else if (is_ascii_digit(maybe_exp.second)) {
            repr.append_code_point(next_code_point().value());
            repr.append_code_point(next_code_point().value());
        }
@ -417,7 +402,7 @@ CSSNumber Tokenizer::consume_a_number()

        for (;;) {
            auto digits = peek_code_point().value();
-            if (!isdigit(digits))
+            if (!is_ascii_digit(digits))
                break;

            repr.append_code_point(next_code_point().value());
@ -588,19 +573,19 @@ bool Tokenizer::starts_with_a_number() const
 bool Tokenizer::starts_with_a_number(U32Triplet values)
 {
    if (is_plus_sign(values.first) || is_hyphen_minus(values.first)) {
-        if (isdigit(values.second))
+        if (is_ascii_digit(values.second))
            return true;

-        if (is_full_stop(values.second) && isdigit(values.third))
+        if (is_full_stop(values.second) && is_ascii_digit(values.third))
            return true;

        return false;
    }

    if (is_full_stop(values.first))
-        return isdigit(values.second);
+        return is_ascii_digit(values.second);

-    if (isdigit(values.first))
+    if (is_ascii_digit(values.first))
        return true;

    return false;
@ -902,7 +887,7 @@ Token Tokenizer::consume_a_token()
        return create_new_token(Token::TokenType::CloseCurly);
    }

-    if (isdigit(input)) {
+    if (is_ascii_digit(input)) {
        dbgln_if(CSS_TOKENIZER_TRACE, "is digit");
        reconsume_current_input_code_point();
        return consume_a_numeric_token();
--- a/Userland/Libraries/LibWeb/DOM/Document.cpp
+++ b/Userland/Libraries/LibWeb/DOM/Document.cpp
@ -6,6 +6,7 @@
 * SPDX-License-Identifier: BSD-2-Clause
 */

+#include <AK/CharacterTypes.h>
 #include <AK/StringBuilder.h>
 #include <AK/Utf8View.h>
 #include <LibCore/Timer.h>
@ -53,7 +54,6 @@
 #include <LibWeb/Page/BrowsingContext.h>
 #include <LibWeb/SVG/TagNames.h>
 #include <LibWeb/UIEvents/MouseEvent.h>
-#include <ctype.h>

 namespace Web::DOM {

@ -253,7 +253,7 @@ String Document::title() const
    StringBuilder builder;
    bool last_was_space = false;
    for (auto code_point : Utf8View(raw_title)) {
-        if (isspace(code_point)) {
+        if (is_ascii_space(code_point)) {
            last_was_space = true;
        } else {
            if (last_was_space && !builder.is_empty())
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
@ -4,13 +4,13 @@
 * SPDX-License-Identifier: BSD-2-Clause
 */

+#include <AK/CharacterTypes.h>
 #include <AK/Debug.h>
 #include <AK/SourceLocation.h>
 #include <LibTextCodec/Decoder.h>
 #include <LibWeb/HTML/Parser/Entities.h>
 #include <LibWeb/HTML/Parser/HTMLToken.h>
 #include <LibWeb/HTML/Parser/HTMLTokenizer.h>
-#include <ctype.h>
 #include <string.h>

 namespace Web::HTML {
@ -93,25 +93,25 @@ namespace Web::HTML {
    if (!current_input_character.has_value())

 #define ON_ASCII_ALPHA \
-    if (current_input_character.has_value() && isalpha(current_input_character.value()))
+    if (current_input_character.has_value() && is_ascii_alpha(current_input_character.value()))

 #define ON_ASCII_ALPHANUMERIC \
-    if (current_input_character.has_value() && isalnum(current_input_character.value()))
+    if (current_input_character.has_value() && is_ascii_alphanumeric(current_input_character.value()))

 #define ON_ASCII_UPPER_ALPHA \
-    if (current_input_character.has_value() && current_input_character.value() >= 'A' && current_input_character.value() <= 'Z')
+    if (current_input_character.has_value() && is_ascii_upper_alpha(current_input_character.value()))

 #define ON_ASCII_LOWER_ALPHA \
-    if (current_input_character.has_value() && current_input_character.value() >= 'a' && current_input_character.value() <= 'z')
+    if (current_input_character.has_value() && is_ascii_lower_alpha(current_input_character.value()))

 #define ON_ASCII_DIGIT \
-    if (current_input_character.has_value() && isdigit(current_input_character.value()))
+    if (current_input_character.has_value() && is_ascii_digit(current_input_character.value()))

 #define ON_ASCII_HEX_DIGIT \
-    if (current_input_character.has_value() && isxdigit(current_input_character.value()))
+    if (current_input_character.has_value() && is_ascii_hex_digit(current_input_character.value()))

 #define ON_WHITESPACE \
-    if (current_input_character.has_value() && strchr("\t\n\f ", current_input_character.value()))
+    if (current_input_character.has_value() && is_ascii(current_input_character.value()) && "\t\n\f "sv.contains(current_input_character.value()))

 #define ANYTHING_ELSE if (1)

@ -172,26 +172,6 @@ static inline void log_parse_error(const SourceLocation& location = SourceLocati
    dbgln_if(TOKENIZER_TRACE_DEBUG, "Parse error (tokenization) {}", location);
 }

-static inline bool is_surrogate(u32 code_point)
-{
-    return (code_point & 0xfffff800) == 0xd800;
-}
-
-static inline bool is_noncharacter(u32 code_point)
-{
-    return code_point >= 0xfdd0 && (code_point <= 0xfdef || (code_point & 0xfffe) == 0xfffe) && code_point <= 0x10ffff;
-}
-
-static inline bool is_c0_control(u32 code_point)
-{
-    return code_point <= 0x1f;
-}
-
-static inline bool is_control(u32 code_point)
-{
-    return is_c0_control(code_point) || (code_point >= 0x7f && code_point <= 0x9f);
-}
-
 Optional<u32> HTMLTokenizer::next_code_point()
 {
    if (m_utf8_iterator == m_utf8_view.end())
@ -322,7 +302,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
+                    m_current_token.m_tag.tag_name.append(to_ascii_lowercase(current_input_character.value()));
                    m_current_token.m_end_position = nth_last_position(0);
                    continue;
                }
@ -458,7 +438,7 @@ _StartOfFunction:
                ON_ASCII_UPPER_ALPHA
                {
                    create_new_token(HTMLToken::Type::DOCTYPE);
-                    m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
+                    m_current_token.m_doctype.name.append(to_ascii_lowercase(current_input_character.value()));
                    m_current_token.m_doctype.missing_name = false;
                    SWITCH_TO(DOCTYPEName);
                }
@ -507,7 +487,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_current_token.m_doctype.name.append(tolower(current_input_character.value()));
+                    m_current_token.m_doctype.name.append(to_ascii_lowercase(current_input_character.value()));
                    continue;
                }
                ON(0)
@ -550,10 +530,10 @@ _StartOfFunction:
                }
                ANYTHING_ELSE
                {
-                    if (toupper(current_input_character.value()) == 'P' && consume_next_if_match("UBLIC", CaseSensitivity::CaseInsensitive)) {
+                    if (to_ascii_uppercase(current_input_character.value()) == 'P' && consume_next_if_match("UBLIC", CaseSensitivity::CaseInsensitive)) {
                        SWITCH_TO(AfterDOCTYPEPublicKeyword);
                    }
-                    if (toupper(current_input_character.value()) == 'S' && consume_next_if_match("YSTEM", CaseSensitivity::CaseInsensitive)) {
+                    if (to_ascii_uppercase(current_input_character.value()) == 'S' && consume_next_if_match("YSTEM", CaseSensitivity::CaseInsensitive)) {
                        SWITCH_TO(AfterDOCTYPESystemKeyword);
                    }
                    log_parse_error();
@ -1068,7 +1048,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_current_token.m_tag.attributes.last().local_name_builder.append_code_point(tolower(current_input_character.value()));
+                    m_current_token.m_tag.attributes.last().local_name_builder.append_code_point(to_ascii_lowercase(current_input_character.value()));
                    continue;
                }
                ON(0)
@ -1558,7 +1538,7 @@ _StartOfFunction:

                    if (consumed_as_part_of_an_attribute() && !match.value().entity.ends_with(';')) {
                        auto next_code_point = peek_code_point(0);
-                        if (next_code_point.has_value() && (next_code_point.value() == '=' || isalnum(next_code_point.value()))) {
+                        if (next_code_point.has_value() && (next_code_point.value() == '=' || is_ascii_alphanumeric(next_code_point.value()))) {
                            FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
                            SWITCH_TO_RETURN_STATE;
                        }
@ -1720,14 +1700,14 @@ _StartOfFunction:
                    log_parse_error();
                    m_character_reference_code = 0xFFFD;
                }
-                if (is_surrogate(m_character_reference_code)) {
+                if (is_unicode_surrogate(m_character_reference_code)) {
                    log_parse_error();
                    m_character_reference_code = 0xFFFD;
                }
-                if (is_noncharacter(m_character_reference_code)) {
+                if (is_unicode_noncharacter(m_character_reference_code)) {
                    log_parse_error();
                }
-                if (m_character_reference_code == 0xd || (is_control(m_character_reference_code) && !isspace(m_character_reference_code))) {
+                if (m_character_reference_code == 0xd || (is_unicode_control(m_character_reference_code) && !is_ascii_space(m_character_reference_code))) {
                    log_parse_error();
                    constexpr struct {
                        u32 number;
@ -1870,7 +1850,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
+                    m_current_token.m_tag.tag_name.append(to_ascii_lowercase(current_input_character.value()));
                    m_temporary_buffer.append(current_input_character.value());
                    continue;
                }
@ -1980,7 +1960,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
+                    m_current_token.m_tag.tag_name.append(to_ascii_lowercase(current_input_character.value()));
                    m_temporary_buffer.append(current_input_character.value());
                    continue;
                }
@ -2193,7 +2173,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
+                    m_current_token.m_tag.tag_name.append(to_ascii_lowercase(current_input_character.value()));
                    m_temporary_buffer.append(current_input_character.value());
                    continue;
                }
@ -2247,7 +2227,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_temporary_buffer.append(tolower(current_input_character.value()));
+                    m_temporary_buffer.append(to_ascii_lowercase(current_input_character.value()));
                    EMIT_CURRENT_CHARACTER;
                }
                ON_ASCII_LOWER_ALPHA
@ -2393,7 +2373,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_temporary_buffer.append(tolower(current_input_character.value()));
+                    m_temporary_buffer.append(to_ascii_lowercase(current_input_character.value()));
                    EMIT_CURRENT_CHARACTER;
                }
                ON_ASCII_LOWER_ALPHA
@ -2512,7 +2492,7 @@ _StartOfFunction:
                }
                ON_ASCII_UPPER_ALPHA
                {
-                    m_current_token.m_tag.tag_name.append(tolower(current_input_character.value()));
+                    m_current_token.m_tag.tag_name.append(to_ascii_lowercase(current_input_character.value()));
                    m_temporary_buffer.append(current_input_character.value());
                    continue;
                }
@ -2598,7 +2578,7 @@ bool HTMLTokenizer::consume_next_if_match(const StringView& string, CaseSensitiv
        // FIXME: This should be more Unicode-aware.
        if (case_sensitivity == CaseSensitivity::CaseInsensitive) {
            if (code_point.value() < 0x80) {
-                if (tolower(code_point.value()) != tolower(string[i]))
+                if (to_ascii_lowercase(code_point.value()) != to_ascii_lowercase(string[i]))
                    return false;
                continue;
            }
--- a/Userland/Libraries/LibWeb/Layout/TextNode.cpp
+++ b/Userland/Libraries/LibWeb/Layout/TextNode.cpp
@ -4,6 +4,7 @@
 * SPDX-License-Identifier: BSD-2-Clause
 */

+#include <AK/CharacterTypes.h>
 #include <AK/ScopeGuard.h>
 #include <AK/StringBuilder.h>
 #include <LibGfx/Painter.h>
@ -13,7 +14,6 @@
 #include <LibWeb/Layout/Label.h>
 #include <LibWeb/Layout/TextNode.h>
 #include <LibWeb/Page/BrowsingContext.h>
-#include <ctype.h>

 namespace Web::Layout {

@ -30,7 +30,7 @@ TextNode::~TextNode()
 static bool is_all_whitespace(const StringView& string)
 {
    for (size_t i = 0; i < string.length(); ++i) {
-        if (!isspace(string[i]))
+        if (!is_ascii_space(string[i]))
            return false;
    }
    return true;
@ -116,7 +116,7 @@ void TextNode::compute_text_for_rendering(bool collapse, bool previous_is_empty_
    auto it = utf8_view.begin();
    auto skip_over_whitespace = [&] {
        auto prev = it;
-        while (it != utf8_view.end() && isspace(*it)) {
+        while (it != utf8_view.end() && is_ascii_space(*it)) {
            prev = it;
            ++it;
        }
@ -125,7 +125,7 @@ void TextNode::compute_text_for_rendering(bool collapse, bool previous_is_empty_
    if (previous_is_empty_or_ends_in_whitespace)
        skip_over_whitespace();
    for (; it != utf8_view.end(); ++it) {
-        if (!isspace(*it)) {
+        if (!is_ascii_space(*it)) {
            builder.append(utf8_view.as_string().characters_without_null_termination() + utf8_view.byte_offset_of(it), it.code_point_length_in_bytes());
        } else {
            builder.append(' ');
@ -160,7 +160,7 @@ void TextNode::split_into_lines_by_rules(InlineFormattingContext& context, Layou

        float chunk_width;
        if (do_wrap_lines) {
-            if (do_collapse && isspace(*chunk.view.begin()) && line_boxes.last().is_empty_or_ends_in_whitespace()) {
+            if (do_collapse && is_ascii_space(*chunk.view.begin()) && line_boxes.last().is_empty_or_ends_in_whitespace()) {
                // This is a non-empty chunk that starts with collapsible whitespace.
                // We are at either at the start of a new line, or after something that ended in whitespace,
                // so we don't need to contribute our own whitespace to the line. Skip over it instead!
@ -264,7 +264,7 @@ TextNode::ChunkIterator::ChunkIterator(StringView const& text, LayoutMode layout
    , m_start_of_chunk(m_utf8_view.begin())
    , m_iterator(m_utf8_view.begin())
 {
-    m_last_was_space = !text.is_empty() && isspace(*m_utf8_view.begin());
+    m_last_was_space = !text.is_empty() && is_ascii_space(*m_utf8_view.begin());
 }

 Optional<TextNode::Chunk> TextNode::ChunkIterator::next()
@ -286,7 +286,7 @@ Optional<TextNode::Chunk> TextNode::ChunkIterator::next()
                return result.release_value();
        }
        if (m_wrap_lines) {
-            bool is_space = isspace(*m_iterator);
+            bool is_space = is_ascii_space(*m_iterator);
            if (is_space != m_last_was_space) {
                m_last_was_space = is_space;
                if (auto result = try_commit_chunk(m_iterator, false); result.has_value())