AK: Track byte length, rather than code point length, in Utf8View::trim

Utf8View::trim returns its result via Utf8View::substring_view, which takes a byte offset and a byte length rather than a code point offset and length, so trim must track bytes instead of code points.
2025-09-14 01:17:34 +00:00 · 2021-07-16 12:40:46 -04:00 · 2021-07-16 12:40:46 -04:00 · 87848cdf7d
commit 87848cdf7d
parent 660a8982e7
3 changed files with 49 additions and 8 deletions
--- a/AK/Utf8View.cpp
+++ b/AK/Utf8View.cpp
@@ -185,24 +185,24 @@ bool Utf8View::contains(u32 needle) const
 Utf8View Utf8View::trim(const Utf8View& characters, TrimMode mode) const
 {
    size_t substring_start = 0;
-    size_t substring_length = length();
+    size_t substring_length = byte_length();

    if (mode == TrimMode::Left || mode == TrimMode::Both) {
-        for (auto code_point : *this) {
+        for (auto code_point = begin(); code_point != end(); ++code_point) {
            if (substring_length == 0)
                return {};
-            if (!characters.contains(code_point))
+            if (!characters.contains(*code_point))
                break;
-            ++substring_start;
-            --substring_length;
+            substring_start += code_point.underlying_code_point_length_in_bytes();
+            substring_length -= code_point.underlying_code_point_length_in_bytes();
        }
    }

    if (mode == TrimMode::Right || mode == TrimMode::Both) {
        size_t seen_whitespace_length = 0;
-        for (auto code_point : *this) {
-            if (characters.contains(code_point))
-                seen_whitespace_length++;
+        for (auto code_point = begin(); code_point != end(); ++code_point) {
+            if (characters.contains(*code_point))
+                seen_whitespace_length += code_point.underlying_code_point_length_in_bytes();
            else
                seen_whitespace_length = 0;
        }
--- a/Tests/AK/TestUtf8.cpp
+++ b/Tests/AK/TestUtf8.cpp
@@ -182,3 +182,38 @@ TEST_CASE(decode_invalid_ut8)
        VERIFY(i == expected_size);
    }
 }
+
+TEST_CASE(trim)
+{
+    Utf8View whitespace { " " };
+    {
+        Utf8View view { "word" };
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word");
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "word");
+    }
+    {
+        Utf8View view { "   word" };
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word");
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "   word");
+    }
+    {
+        Utf8View view { "word   " };
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word   ");
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "word");
+    }
+    {
+        Utf8View view { "   word   " };
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word   ");
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "   word");
+    }
+    {
+        Utf8View view { "\u180E" };
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "\u180E");
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "\u180E");
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "\u180E");
+    }
+}
--- a/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.trim.js
+++ b/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.trim.js
@@ -56,3 +56,9 @@ test("trimEnd", () => {
    expect("\r\nhello friends".trimEnd()).toBe("\r\nhello friends");
    expect("\rhello friends\r\n".trimEnd()).toBe("\rhello friends");
 });
+
+test("multi-byte code point", () => {
+    expect("_\u180E".trim()).toBe("_\u180E");
+    expect("\u180E".trim()).toBe("\u180E");
+    expect("\u180E_".trim()).toBe("\u180E_");
+});