1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-06-01 06:18:12 +00:00

AK: Track byte length, rather than code point length, in Utf8View::trim

Utf8View::trim uses Utf8View::substring_view to return its result, and
substring_view expects a byte offset and a byte length rather than a code
point offset and length.
This commit is contained in:
Timothy Flynn 2021-07-16 12:40:46 -04:00 committed by Linus Groh
parent 660a8982e7
commit 87848cdf7d
3 changed files with 49 additions and 8 deletions

View file

@ -185,24 +185,24 @@ bool Utf8View::contains(u32 needle) const
Utf8View Utf8View::trim(const Utf8View& characters, TrimMode mode) const
{
size_t substring_start = 0;
size_t substring_length = length();
size_t substring_length = byte_length();
if (mode == TrimMode::Left || mode == TrimMode::Both) {
for (auto code_point : *this) {
for (auto code_point = begin(); code_point != end(); ++code_point) {
if (substring_length == 0)
return {};
if (!characters.contains(code_point))
if (!characters.contains(*code_point))
break;
++substring_start;
--substring_length;
substring_start += code_point.underlying_code_point_length_in_bytes();
substring_length -= code_point.underlying_code_point_length_in_bytes();
}
}
if (mode == TrimMode::Right || mode == TrimMode::Both) {
size_t seen_whitespace_length = 0;
for (auto code_point : *this) {
if (characters.contains(code_point))
seen_whitespace_length++;
for (auto code_point = begin(); code_point != end(); ++code_point) {
if (characters.contains(*code_point))
seen_whitespace_length += code_point.underlying_code_point_length_in_bytes();
else
seen_whitespace_length = 0;
}

View file

@ -182,3 +182,38 @@ TEST_CASE(decode_invalid_ut8)
VERIFY(i == expected_size);
}
}
// Exercises Utf8View::trim() in every TrimMode. Results are compared through as_string(), so a
// trimmed view whose start or length was counted in code points instead of bytes shows up as a
// shifted or truncated string.
TEST_CASE(trim)
{
    Utf8View whitespace { " " };
    {
        // Nothing to trim on either side.
        Utf8View view { "word" };
        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word");
        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "word");
    }
    {
        // Leading whitespace only.
        Utf8View view { " word" };
        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word");
        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), " word");
    }
    {
        // Trailing whitespace only.
        Utf8View view { "word " };
        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word ");
        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "word");
    }
    {
        // Whitespace on both sides.
        Utf8View view { " word " };
        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "word");
        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "word ");
        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), " word");
    }
    {
        // U+180E occupies three bytes in UTF-8 and is not in the trim set, so it must come back whole.
        Utf8View view { "\u180E" };
        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "\u180E");
        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "\u180E");
        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), "\u180E");
    }
    {
        // Multi-byte content surrounded by single-byte characters that do get trimmed: the
        // remaining length must still cover all three bytes of U+180E.
        Utf8View view { " \u180E " };
        EXPECT_EQ(view.trim(whitespace, TrimMode::Both).as_string(), "\u180E");
        EXPECT_EQ(view.trim(whitespace, TrimMode::Left).as_string(), "\u180E ");
        EXPECT_EQ(view.trim(whitespace, TrimMode::Right).as_string(), " \u180E");
    }
    {
        // The trimmed code point itself is multi-byte, so the start offset and the length must
        // each move by three bytes per trimmed code point rather than by one.
        Utf8View separator { "\u180E" };
        Utf8View view { "\u180Eword\u180E" };
        EXPECT_EQ(view.trim(separator, TrimMode::Both).as_string(), "word");
        EXPECT_EQ(view.trim(separator, TrimMode::Left).as_string(), "word\u180E");
        EXPECT_EQ(view.trim(separator, TrimMode::Right).as_string(), "\u180Eword");
    }
}

View file

@ -56,3 +56,9 @@ test("trimEnd", () => {
expect("\r\nhello friends".trimEnd()).toBe("\r\nhello friends");
expect("\rhello friends\r\n".trimEnd()).toBe("\rhello friends");
});
// U+180E is not stripped by trim() here, so each of these strings must come back unchanged
// regardless of where the multi-byte code point sits.
test("multi-byte code point", () => {
    ["_\u180E", "\u180E", "\u180E_"].forEach(text => {
        expect(text.trim()).toBe(text);
    });
});