From 7cb956d17b96938175c23552266ce9f76d5b91d4 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 15 Feb 2023 10:03:14 -0500 Subject: [PATCH] LibJS: Use iterative text segmentation algorithms for Intl.Segmenter This uses the find-next and find-previous APIs instead of storing all indices as a vector. --- .../Runtime/Intl/SegmentIteratorPrototype.cpp | 2 +- .../LibJS/Runtime/Intl/Segmenter.cpp | 67 ++++++++++--------- .../Libraries/LibJS/Runtime/Intl/Segmenter.h | 4 +- .../Libraries/LibJS/Runtime/Intl/Segments.h | 4 -- .../LibJS/Runtime/Intl/SegmentsPrototype.cpp | 4 +- 5 files changed, 43 insertions(+), 38 deletions(-) diff --git a/Userland/Libraries/LibJS/Runtime/Intl/SegmentIteratorPrototype.cpp b/Userland/Libraries/LibJS/Runtime/Intl/SegmentIteratorPrototype.cpp index db3fdcf214..dd4e957296 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/SegmentIteratorPrototype.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/SegmentIteratorPrototype.cpp @@ -50,7 +50,7 @@ JS_DEFINE_NATIVE_FUNCTION(SegmentIteratorPrototype::next) auto start_index = iterator->iterated_string_next_segment_code_unit_index(); // 6. Let endIndex be ! FindBoundary(segmenter, string, startIndex, after). - auto end_index = find_boundary(segmenter, string, start_index, Direction::After, iterator->segments().boundaries_cache()); + auto end_index = find_boundary(segmenter, string, start_index, Direction::After); // 7. If endIndex is not finite, then if (!Value(end_index).is_finite_number()) { diff --git a/Userland/Libraries/LibJS/Runtime/Intl/Segmenter.cpp b/Userland/Libraries/LibJS/Runtime/Intl/Segmenter.cpp index 6b3a52ab1b..fc234b4e2f 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/Segmenter.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/Segmenter.cpp @@ -1,10 +1,10 @@ /* * Copyright (c) 2022, Idan Horowitz + * Copyright (c) 2023, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ -#include #include #include #include @@ -92,11 +92,39 @@ ThrowCompletionOr> create_segment_data_object(VM& vm, Segme return result; } +static Optional find_previous_boundary_index(Utf16View const& string, size_t index, Segmenter::SegmenterGranularity granularity) +{ + switch (granularity) { + case Segmenter::SegmenterGranularity::Grapheme: + return Unicode::previous_grapheme_segmentation_boundary(string, index); + case Segmenter::SegmenterGranularity::Word: + return Unicode::previous_word_segmentation_boundary(string, index); + case Segmenter::SegmenterGranularity::Sentence: + return Unicode::previous_sentence_segmentation_boundary(string, index); + } + + VERIFY_NOT_REACHED(); +} + +static Optional find_next_boundary_index(Utf16View const& string, size_t index, Segmenter::SegmenterGranularity granularity) +{ + switch (granularity) { + case Segmenter::SegmenterGranularity::Grapheme: + return Unicode::next_grapheme_segmentation_boundary(string, index); + case Segmenter::SegmenterGranularity::Word: + return Unicode::next_word_segmentation_boundary(string, index); + case Segmenter::SegmenterGranularity::Sentence: + return Unicode::next_sentence_segmentation_boundary(string, index); + } + + VERIFY_NOT_REACHED(); +} + // 18.8.1 FindBoundary ( segmenter, string, startIndex, direction ), https://tc39.es/ecma402/#sec-findboundary -double find_boundary(Segmenter const& segmenter, Utf16View const& string, double start_index, Direction direction, Optional>& boundaries_cache) +double find_boundary(Segmenter const& segmenter, Utf16View const& string, double start_index, Direction direction) { // 1. Let locale be segmenter.[[Locale]]. - auto const& locale = segmenter.locale(); + // FIXME: Support locale-sensitive boundaries // 2. Let granularity be segmenter.[[SegmenterGranularity]]. auto granularity = segmenter.segmenter_granularity(); @@ -104,24 +132,6 @@ double find_boundary(Segmenter const& segmenter, Utf16View const& string, double // 3. Let len be the length of string. auto length = string.length_in_code_units(); - // Non-standard, populate boundaries cache - if (!boundaries_cache.has_value()) { - switch (granularity) { - case Segmenter::SegmenterGranularity::Grapheme: - boundaries_cache = Unicode::find_grapheme_segmentation_boundaries(string); - break; - case Segmenter::SegmenterGranularity::Word: - boundaries_cache = Unicode::find_word_segmentation_boundaries(string); - break; - case Segmenter::SegmenterGranularity::Sentence: - boundaries_cache = Unicode::find_sentence_segmentation_boundaries(string); - break; - default: - VERIFY_NOT_REACHED(); - } - } - (void)locale; // TODO: Support locale-sensitive boundaries - // 4. If direction is before, then if (direction == Direction::Before) { // a. Assert: startIndex ≥ 0. @@ -130,12 +140,11 @@ double find_boundary(Segmenter const& segmenter, Utf16View const& string, double VERIFY(start_index < length); // c. Search string for the last segmentation boundary that is preceded by at most startIndex code units from the beginning, using locale locale and text element granularity granularity. - size_t boundary_index; - binary_search(*boundaries_cache, start_index, &boundary_index); + auto boundary_index = find_previous_boundary_index(string, static_cast(start_index) + 1, granularity); // d. If a boundary is found, return the count of code units in string preceding it. - if (boundary_index < boundaries_cache->size()) - return boundaries_cache->at(boundary_index); + if (boundary_index.has_value()) + return static_cast(*boundary_index); // e. Return 0. return 0; @@ -149,13 +158,11 @@ double find_boundary(Segmenter const& segmenter, Utf16View const& string, double return INFINITY; // 7. Search string for the first segmentation boundary that follows the code unit at index startIndex, using locale locale and text element granularity granularity. - size_t boundary_index; - binary_search(*boundaries_cache, start_index, &boundary_index); - ++boundary_index; + auto boundary_index = find_next_boundary_index(string, static_cast(start_index), granularity); // 8. If a boundary is found, return the count of code units in string preceding it. - if (boundary_index < boundaries_cache->size()) - return boundaries_cache->at(boundary_index); + if (boundary_index.has_value()) + return static_cast(*boundary_index); // 9. Return len. return length; diff --git a/Userland/Libraries/LibJS/Runtime/Intl/Segmenter.h b/Userland/Libraries/LibJS/Runtime/Intl/Segmenter.h index e9fc8f1619..5cd541cf99 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/Segmenter.h +++ b/Userland/Libraries/LibJS/Runtime/Intl/Segmenter.h @@ -1,5 +1,6 @@ /* * Copyright (c) 2022, Idan Horowitz + * Copyright (c) 2023, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -38,10 +39,11 @@ private: }; ThrowCompletionOr> create_segment_data_object(VM&, Segmenter const&, Utf16View const&, double start_index, double end_index); + enum class Direction { Before, After, }; -double find_boundary(Segmenter const&, Utf16View const&, double start_index, Direction, Optional>& boundaries_cache); +double find_boundary(Segmenter const&, Utf16View const&, double start_index, Direction); } diff --git a/Userland/Libraries/LibJS/Runtime/Intl/Segments.h b/Userland/Libraries/LibJS/Runtime/Intl/Segments.h index 07b6ba27d5..56137f0d27 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/Segments.h +++ b/Userland/Libraries/LibJS/Runtime/Intl/Segments.h @@ -24,8 +24,6 @@ public: Utf16View segments_string() const { return m_segments_string.view(); } - Optional>& boundaries_cache() const { return m_boundaries_cache; } - private: Segments(Realm&, Segmenter&, Utf16String); @@ -33,8 +31,6 @@ private: Segmenter& m_segments_segmenter; // [[SegmentsSegmenter]] Utf16String m_segments_string; // [[SegmentsString]] - - mutable Optional> m_boundaries_cache; }; } diff --git a/Userland/Libraries/LibJS/Runtime/Intl/SegmentsPrototype.cpp b/Userland/Libraries/LibJS/Runtime/Intl/SegmentsPrototype.cpp index 8758fd8df1..f3a4fe5a61 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/SegmentsPrototype.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/SegmentsPrototype.cpp @@ -54,10 +54,10 @@ JS_DEFINE_NATIVE_FUNCTION(SegmentsPrototype::containing) return js_undefined(); // 8. Let startIndex be ! FindBoundary(segmenter, string, n, before). - auto start_index = find_boundary(segmenter, string, n, Direction::Before, segments->boundaries_cache()); + auto start_index = find_boundary(segmenter, string, n, Direction::Before); // 9. Let endIndex be ! FindBoundary(segmenter, string, n, after). - auto end_index = find_boundary(segmenter, string, n, Direction::After, segments->boundaries_cache()); + auto end_index = find_boundary(segmenter, string, n, Direction::After); // 10. Return ! CreateSegmentDataObject(segmenter, string, startIndex, endIndex). return TRY(create_segment_data_object(vm, segmenter, string, start_index, end_index));