1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-25 13:57:35 +00:00

LibWeb: Use UTF-16 code unit offsets and lengths in CharacterData

We were previously assuming that the input offsets and lengths were all
raw byte offsets into a UTF-8 string. While our String representation
may internally be UTF-8, from the external world it is seen as UTF-16:
offsets and counts are passed in as UTF-16 code units, and the returned
length is a UTF-16 code unit count as well.

Previously, the test included in this commit would crash Ladybird (and
otherwise return wrong values).

The implementation here is very inefficient; I am sure there is a
much smarter way to write it so that we would not need a conversion
from UTF-8 to a UTF-16 string (and then back again).

Fixes: #20971
This commit is contained in:
Shannon Booth 2023-12-22 20:41:34 +13:00 committed by Andreas Kling
parent d51f84501a
commit d8759d9656
6 changed files with 54 additions and 24 deletions

View file

@ -0,0 +1,6 @@
text.data = '🙃', length = 2
text.data = '🙃🙃', length = 4
text.data = '🙃hi🙃🙃', length = 8
text.data = '🙃i🙃🙃', length = 7
text.data = '🙃replaced!', length = 11
repla

View file

@ -0,0 +1,20 @@
<script src="../include.js"></script>
<script>
test(() => {
    // Prints a node's data together with the length the DOM reports for it.
    const dumpText = (node) => {
        println(`text.data = '${node.data}', length = ${node.length}`);
    };

    // Each '🙃' is reported as length 2, so the offsets and counts below
    // are chosen to land on the boundaries between emoji and ASCII text.
    const text = new Text('🙃');
    dumpText(text);

    // Append a second emoji to the end of the data.
    text.appendData('🙃');
    dumpText(text);

    // Insert directly after the first emoji (offset 2).
    text.insertData(2, 'hi🙃');
    dumpText(text);

    // Remove a single unit at offset 2, i.e. the 'h'.
    text.deleteData(2, 1);
    dumpText(text);

    // Replace five units starting right after the first emoji.
    text.replaceData(2, 5, 'replaced!');
    dumpText(text);

    // Read back five units starting after the first emoji.
    println(text.substringData(2, 5));
});
</script>

View file

@ -35,37 +35,39 @@ void CharacterData::set_data(String const& data)
// NOTE: Since the offset is 0, it can never be above data's length, so this can never throw.
// NOTE: Setting the data to the same value as the current data still causes a mutation observer callback.
// FIXME: Figure out a way to make this a no-op again if the passed in data is the same as the current data.
MUST(replace_data(0, this->length(), data));
MUST(replace_data(0, this->length_in_utf16_code_units(), data));
}
// https://dom.spec.whatwg.org/#concept-cd-substring
WebIDL::ExceptionOr<String> CharacterData::substring_data(size_t offset, size_t count) const
{
// 1. Let length be nodes length.
auto length = this->length();
// FIXME: This is very inefficient!
auto utf16_data = MUST(AK::utf8_to_utf16(m_data));
Utf16View utf16_view { utf16_data };
auto length = utf16_view.length_in_code_units();
// 2. If offset is greater than length, then throw an "IndexSizeError" DOMException.
if (offset > length)
return WebIDL::IndexSizeError::create(realm(), "Substring offset out of range."_fly_string);
// FIXME: The offset and count we are given here is in UTF-16 code units, but we are incorrectly assuming it is a byte offset.
// 3. If offset plus count is greater than length, return a string whose value is the code units from the offsetth code unit
// to the end of nodes data, and then return.
if (offset + count > length)
return MUST(m_data.substring_from_byte_offset(offset));
return MUST(utf16_view.substring_view(offset).to_utf8());
// 4. Return a string whose value is the code units from the offsetth code unit to the offset+countth code unit in nodes data.
return MUST(m_data.substring_from_byte_offset(offset, count));
return MUST(utf16_view.substring_view(offset, count).to_utf8());
}
// https://dom.spec.whatwg.org/#concept-cd-replace
WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t count, String const& data)
{
// FIXME: The offset and count we are given here is in UTF-16 code units, but we are incorrectly assuming it is a byte offset.
// 1. Let length be nodes length.
auto length = this->length();
// FIXME: This is very inefficient!
auto utf16_data = MUST(AK::utf8_to_utf16(m_data));
Utf16View utf16_view { utf16_data };
auto length = utf16_view.length_in_code_units();
// 2. If offset is greater than length, then throw an "IndexSizeError" DOMException.
if (offset > length)
@ -82,9 +84,9 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
// 6. Let delete offset be offset + datas length.
// 7. Starting from delete offset code units, remove count code units from nodes data.
StringBuilder builder;
builder.append(this->data().bytes_as_string_view().substring_view(0, offset));
builder.append(MUST(utf16_view.substring_view(0, offset).to_utf8()));
builder.append(data);
builder.append(this->data().bytes_as_string_view().substring_view(offset + count));
builder.append(MUST(utf16_view.substring_view(offset + count).to_utf8()));
m_data = MUST(builder.to_string());
// 8. For each live range whose start node is node and start offset is greater than offset but less than or equal to offset plus count, set its start offset to offset.
@ -130,7 +132,7 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
WebIDL::ExceptionOr<void> CharacterData::append_data(String const& data)
{
// The appendData(data) method steps are to replace data with node this, offset thiss length, count 0, and data data.
return replace_data(this->length(), 0, data);
return replace_data(this->length_in_utf16_code_units(), 0, data);
}
// https://dom.spec.whatwg.org/#dom-characterdata-insertdata

View file

@ -13,6 +13,7 @@
namespace Web::DOM {
// https://dom.spec.whatwg.org/#characterdata
class CharacterData
: public Node
, public ChildNode<CharacterData>
@ -26,14 +27,18 @@ public:
String const& data() const { return m_data; }
void set_data(String const&);
// FIXME: This should be in UTF-16 code units, not byte size.
unsigned length() const { return m_data.bytes().size(); }
unsigned length_in_utf16_code_units() const
{
// FIXME: This is inefficient!
auto utf16_data = MUST(AK::utf8_to_utf16(m_data));
return Utf16View { utf16_data }.length_in_code_units();
}
WebIDL::ExceptionOr<String> substring_data(size_t offset, size_t count) const;
WebIDL::ExceptionOr<String> substring_data(size_t offset_in_utf16_code_units, size_t count_in_utf16_code_units) const;
WebIDL::ExceptionOr<void> append_data(String const&);
WebIDL::ExceptionOr<void> insert_data(size_t offset, String const&);
WebIDL::ExceptionOr<void> delete_data(size_t offset, size_t count);
WebIDL::ExceptionOr<void> replace_data(size_t offset, size_t count, String const&);
WebIDL::ExceptionOr<void> insert_data(size_t offset_in_utf16_code_units, String const&);
WebIDL::ExceptionOr<void> delete_data(size_t offset_in_utf16_code_units, size_t count_in_utf16_code_units);
WebIDL::ExceptionOr<void> replace_data(size_t offset_in_utf16_code_units, size_t count_in_utf16_code_units, String const&);
protected:
CharacterData(Document&, NodeType, String const&);

View file

@ -6,7 +6,7 @@
[Exposed=Window]
interface CharacterData : Node {
[LegacyNullToEmptyString] attribute DOMString data;
readonly attribute unsigned long length;
[ImplementedAs=length_in_utf16_code_units] readonly attribute unsigned long length;
DOMString substringData(unsigned long offset, unsigned long count);
undefined appendData(DOMString data);

View file

@ -1492,11 +1492,8 @@ size_t Node::length() const
return 0;
// 2. If node is a CharacterData node, then return nodes datas length.
if (is_character_data()) {
auto* character_data_node = verify_cast<CharacterData>(this);
// FIXME: This should be in UTF-16 code units, not byte size.
return character_data_node->data().bytes().size();
}
if (is_character_data())
return verify_cast<CharacterData>(*this).length_in_utf16_code_units();
// 3. Return the number of nodes children.
return child_count();