From a05ce330b8a6c418e36c6f9257fd6c985ac8d145 Mon Sep 17 00:00:00 2001
From: Timothy Flynn
Date: Mon, 19 Jul 2021 11:34:29 -0400
Subject: [PATCH] LibJS: Implement String.prototype.codePointAt with UTF-16 code units

This also implements the CodePointAt abstract operation. This is needed
to handle invalid code units specific to the JavaScript spec, rather
than e.g. inserting replacement code units. This abstraction is public
because RegExp.prototype will also need it.
---
 .../LibJS/Runtime/StringPrototype.cpp         | 39 +++++++++++++++----
 .../Libraries/LibJS/Runtime/StringPrototype.h |  8 ++++
 .../String/String.prototype.codePointAt.js    | 29 ++++++++++++++
 3 files changed, 68 insertions(+), 8 deletions(-)
 create mode 100644 Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.codePointAt.js

diff --git a/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp b/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp
index 6bc35c61df..7cf00654e7 100644
--- a/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp
+++ b/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp
@@ -51,6 +51,29 @@ static Optional<size_t> split_match(const String& haystack, size_t start, const
     return start + r;
 }
 
+// 11.1.4 CodePointAt ( string, position ), https://tc39.es/ecma262/#sec-codepointat
+CodePoint code_point_at(Utf16View const& string, size_t position)
+{
+    VERIFY(position < string.length_in_code_units());
+
+    auto first = string.code_unit_at(position);
+    auto code_point = static_cast<u32>(first);
+
+    if (!Utf16View::is_high_surrogate(first) && !Utf16View::is_low_surrogate(first))
+        return { code_point, 1, false };
+
+    if (Utf16View::is_low_surrogate(first) || (position + 1 == string.length_in_code_units()))
+        return { code_point, 1, true };
+
+    auto second = string.code_unit_at(position + 1);
+
+    if (!Utf16View::is_low_surrogate(second))
+        return { code_point, 1, true };
+
+    code_point = Utf16View::decode_surrogate_pair(first, second);
+    return { code_point, 2, false };
+}
+
StringPrototype::StringPrototype(GlobalObject& global_object) : StringObject(*js_string(global_object.heap(), String::empty()), *global_object.object_prototype()) { @@ -162,19 +185,19 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::char_code_at) // 22.1.3.3 String.prototype.codePointAt ( pos ), https://tc39.es/ecma262/#sec-string.prototype.codepointat JS_DEFINE_NATIVE_FUNCTION(StringPrototype::code_point_at) { - auto string = ak_string_from(vm, global_object); - if (!string.has_value()) + auto string = utf16_string_from(vm, global_object); + if (vm.exception()) return {}; auto position = vm.argument(0).to_integer_or_infinity(global_object); if (vm.exception()) return {}; - auto view = Utf8View(*string); - if (position < 0 || position >= view.length()) + + Utf16View utf16_string_view { string }; + if (position < 0 || position >= utf16_string_view.length_in_code_units()) return js_undefined(); - auto it = view.begin(); - for (auto i = 0; i < position; ++i) - ++it; - return Value(*it); + + auto code_point = JS::code_point_at(utf16_string_view, position); + return Value(code_point.code_point); } // 22.1.3.16 String.prototype.repeat ( count ), https://tc39.es/ecma262/#sec-string.prototype.repeat diff --git a/Userland/Libraries/LibJS/Runtime/StringPrototype.h b/Userland/Libraries/LibJS/Runtime/StringPrototype.h index bea193bd2e..92c6dc00a0 100644 --- a/Userland/Libraries/LibJS/Runtime/StringPrototype.h +++ b/Userland/Libraries/LibJS/Runtime/StringPrototype.h @@ -10,6 +10,14 @@ namespace JS { +struct CodePoint { + u32 code_point { 0 }; + size_t code_unit_count { 0 }; + bool is_unpaired_surrogate { false }; +}; + +CodePoint code_point_at(Utf16View const& string, size_t position); + class StringPrototype final : public StringObject { JS_OBJECT(StringPrototype, StringObject); diff --git a/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.codePointAt.js b/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.codePointAt.js new file mode 100644 index 
0000000000..1111ec4391 --- /dev/null +++ b/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.codePointAt.js @@ -0,0 +1,29 @@ +test("basic functionality", () => { + expect(String.prototype.charAt).toHaveLength(1); + + var s = "Foobar"; + expect(typeof s).toBe("string"); + expect(s).toHaveLength(6); + + expect(s.codePointAt(0)).toBe(70); + expect(s.codePointAt(1)).toBe(111); + expect(s.codePointAt(2)).toBe(111); + expect(s.codePointAt(3)).toBe(98); + expect(s.codePointAt(4)).toBe(97); + expect(s.codePointAt(5)).toBe(114); + expect(s.codePointAt(6)).toBe(undefined); + expect(s.codePointAt(-1)).toBe(undefined); + + expect(s.codePointAt()).toBe(70); + expect(s.codePointAt(NaN)).toBe(70); + expect(s.codePointAt("foo")).toBe(70); + expect(s.codePointAt(undefined)).toBe(70); +}); + +test("UTF-16", () => { + var s = "😀"; + expect(s).toHaveLength(2); + expect(s.codePointAt(0)).toBe(0x1f600); + expect(s.codePointAt(1)).toBe(0xde00); + expect(s.codePointAt(2)).toBe(undefined); +});