From e9dfa615888d2623435449c009c07f1525962b14 Mon Sep 17 00:00:00 2001
From: Shannon Booth <shannon@serenityos.org>
Date: Thu, 4 Jan 2024 10:27:25 +1300
Subject: [PATCH] LibWeb: Use UTF-16 code unit offsets in Range::to_string
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Similar to another problem we had in CharacterData, we were assuming
that the offsets were raw UTF-8 byte offsets into the data, instead of
UTF-16 code unit offsets. Fix this by using the substring helpers in
CharacterData to get the text data from the Range.

There are more instances of this issue around the place that we will
need to track down and add tests for, but this fixes one of them :^)

For the test included in this commit, we were previously returning:

llo💨😮

Instead of the expected:

llo💨😮 Wo
---
 .../Text/expected/DOM/Range-to-string.txt      |  2 ++
 .../LibWeb/Text/input/DOM/Range-to-string.html | 14 ++++++++++++++
 Userland/Libraries/LibWeb/DOM/Range.cpp        | 18 ++++++++++++------
 3 files changed, 28 insertions(+), 6 deletions(-)
 create mode 100644 Tests/LibWeb/Text/expected/DOM/Range-to-string.txt
 create mode 100644 Tests/LibWeb/Text/input/DOM/Range-to-string.html
diff --git a/Tests/LibWeb/Text/expected/DOM/Range-to-string.txt b/Tests/LibWeb/Text/expected/DOM/Range-to-string.txt
new file mode 100644
index 0000000000..eb90804337
--- /dev/null
+++ b/Tests/LibWeb/Text/expected/DOM/Range-to-string.txt
@@ -0,0 +1,2 @@
+Hello💨😮 World   
+llo💨😮 Wo
diff --git a/Tests/LibWeb/Text/input/DOM/Range-to-string.html b/Tests/LibWeb/Text/input/DOM/Range-to-string.html
new file mode 100644
index 0000000000..5583df7c10
--- /dev/null
+++ b/Tests/LibWeb/Text/input/DOM/Range-to-string.html
@@ -0,0 +1,14 @@
+<body><p id="p1"><b>Hello💨</b>😮 World</p>
+<script src="../include.js"></script>
+<script>
+    test(() => {
+        const p1 = document.getElementById("p1");
+        const hello = p1.firstChild.firstChild;
+        const world = p1.lastChild;
+        const range = document.createRange();
+        range.setStart(hello, 2);
+        range.setEnd(world, 5);
+        println('');
+        println(range.toString());
+    });
+</script>
diff --git a/Userland/Libraries/LibWeb/DOM/Range.cpp b/Userland/Libraries/LibWeb/DOM/Range.cpp
index ff1778e36f..191cfa6f03 100644
--- a/Userland/Libraries/LibWeb/DOM/Range.cpp
+++ b/Userland/Libraries/LibWeb/DOM/Range.cpp
@@ -560,12 +560,16 @@ String Range::to_string() const
 
     // 2. If this’s start node is this’s end node and it is a Text node,
     //    then return the substring of that Text node’s data beginning at this’s start offset and ending at this’s end offset.
-    if (start_container() == end_container() && is<Text>(*start_container()))
-        return MUST(static_cast<Text const&>(*start_container()).data().substring_from_byte_offset(start_offset(), end_offset() - start_offset()));
+    if (start_container() == end_container() && is<Text>(*start_container())) {
+        auto const& text = static_cast<Text const&>(*start_container());
+        return MUST(text.substring_data(start_offset(), end_offset() - start_offset()));
+    }
 
     // 3. If this’s start node is a Text node, then append the substring of that node’s data from this’s start offset until the end to s.
-    if (is<Text>(*start_container()))
-        builder.append(static_cast<Text const&>(*start_container()).data().bytes_as_string_view().substring_view(start_offset()));
+    if (is<Text>(*start_container())) {
+        auto const& text = static_cast<Text const&>(*start_container());
+        builder.append(MUST(text.substring_data(start_offset(), text.length_in_utf16_code_units() - start_offset())));
+    }
 
     // 4. Append the concatenation of the data of all Text nodes that are contained in this, in tree order, to s.
     for (Node const* node = start_container(); node != end_container()->next_sibling(); node = node->next_in_pre_order()) {
@@ -574,8 +578,10 @@ String Range::to_string() const
     }
 
     // 5. If this’s end node is a Text node, then append the substring of that node’s data from its start until this’s end offset to s.
-    if (is<Text>(*end_container()))
-        builder.append(static_cast<Text const&>(*end_container()).data().bytes_as_string_view().substring_view(0, end_offset()));
+    if (is<Text>(*end_container())) {
+        auto const& text = static_cast<Text const&>(*end_container());
+        builder.append(MUST(text.substring_data(0, end_offset())));
+    }
 
     // 6. Return s.
     return MUST(builder.to_string());