mirror of
https://github.com/RGBCube/serenity
synced 2025-07-27 12:47:35 +00:00
LibJS: Use iterative text segmentation algorithms for Intl.Segmenter
This uses the find-next and find-previous APIs instead of storing all indices as a vector.
This commit is contained in:
parent
9a7b3c145f
commit
7cb956d17b
5 changed files with 43 additions and 38 deletions
|
@ -50,7 +50,7 @@ JS_DEFINE_NATIVE_FUNCTION(SegmentIteratorPrototype::next)
|
||||||
auto start_index = iterator->iterated_string_next_segment_code_unit_index();
|
auto start_index = iterator->iterated_string_next_segment_code_unit_index();
|
||||||
|
|
||||||
// 6. Let endIndex be ! FindBoundary(segmenter, string, startIndex, after).
|
// 6. Let endIndex be ! FindBoundary(segmenter, string, startIndex, after).
|
||||||
auto end_index = find_boundary(segmenter, string, start_index, Direction::After, iterator->segments().boundaries_cache());
|
auto end_index = find_boundary(segmenter, string, start_index, Direction::After);
|
||||||
|
|
||||||
// 7. If endIndex is not finite, then
|
// 7. If endIndex is not finite, then
|
||||||
if (!Value(end_index).is_finite_number()) {
|
if (!Value(end_index).is_finite_number()) {
|
||||||
|
|
|
@ -1,10 +1,10 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
|
* Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
|
||||||
|
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: BSD-2-Clause
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <AK/BinarySearch.h>
|
|
||||||
#include <AK/Utf16View.h>
|
#include <AK/Utf16View.h>
|
||||||
#include <LibJS/Runtime/GlobalObject.h>
|
#include <LibJS/Runtime/GlobalObject.h>
|
||||||
#include <LibJS/Runtime/Intl/Segmenter.h>
|
#include <LibJS/Runtime/Intl/Segmenter.h>
|
||||||
|
@ -92,11 +92,39 @@ ThrowCompletionOr<NonnullGCPtr<Object>> create_segment_data_object(VM& vm, Segme
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Dispatches on `granularity` to the matching Unicode::previous_*_segmentation_boundary
// function, forwarding `string` and `index` and returning that function's result unchanged.
// find_boundary() calls this with startIndex + 1 for the "before" direction and treats an
// empty Optional as "no boundary found" (it then returns 0).
// NOTE(review): whether `index` itself may be returned as a boundary is decided by the
// Unicode helpers, which are not visible here - confirm against their implementation.
static Optional<size_t> find_previous_boundary_index(Utf16View const& string, size_t index, Segmenter::SegmenterGranularity granularity)
|
||||||
|
{
|
||||||
|
switch (granularity) {
|
||||||
|
case Segmenter::SegmenterGranularity::Grapheme:
|
||||||
|
return Unicode::previous_grapheme_segmentation_boundary(string, index);
|
||||||
|
case Segmenter::SegmenterGranularity::Word:
|
||||||
|
return Unicode::previous_word_segmentation_boundary(string, index);
|
||||||
|
case Segmenter::SegmenterGranularity::Sentence:
|
||||||
|
return Unicode::previous_sentence_segmentation_boundary(string, index);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Every case above returns, so this is reached only if `granularity` holds a value
// that matches none of the listed enumerators.
VERIFY_NOT_REACHED();
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dispatches on `granularity` to the matching Unicode::next_*_segmentation_boundary
// function, forwarding `string` and `index` and returning that function's result unchanged.
// find_boundary() calls this with startIndex for the "after" direction and treats an
// empty Optional as "no boundary found" (it then returns the string length).
// NOTE(review): presumably the helpers return a boundary strictly after `index`; that
// contract lives in the Unicode helpers, which are not visible here - confirm.
static Optional<size_t> find_next_boundary_index(Utf16View const& string, size_t index, Segmenter::SegmenterGranularity granularity)
|
||||||
|
{
|
||||||
|
switch (granularity) {
|
||||||
|
case Segmenter::SegmenterGranularity::Grapheme:
|
||||||
|
return Unicode::next_grapheme_segmentation_boundary(string, index);
|
||||||
|
case Segmenter::SegmenterGranularity::Word:
|
||||||
|
return Unicode::next_word_segmentation_boundary(string, index);
|
||||||
|
case Segmenter::SegmenterGranularity::Sentence:
|
||||||
|
return Unicode::next_sentence_segmentation_boundary(string, index);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Every case above returns, so this is reached only if `granularity` holds a value
// that matches none of the listed enumerators.
VERIFY_NOT_REACHED();
|
||||||
|
}
|
||||||
|
|
||||||
// 18.8.1 FindBoundary ( segmenter, string, startIndex, direction ), https://tc39.es/ecma402/#sec-findboundary
|
// 18.8.1 FindBoundary ( segmenter, string, startIndex, direction ), https://tc39.es/ecma402/#sec-findboundary
|
||||||
double find_boundary(Segmenter const& segmenter, Utf16View const& string, double start_index, Direction direction, Optional<Vector<size_t>>& boundaries_cache)
|
double find_boundary(Segmenter const& segmenter, Utf16View const& string, double start_index, Direction direction)
|
||||||
{
|
{
|
||||||
// 1. Let locale be segmenter.[[Locale]].
|
// 1. Let locale be segmenter.[[Locale]].
|
||||||
auto const& locale = segmenter.locale();
|
// FIXME: Support locale-sensitive boundaries
|
||||||
|
|
||||||
// 2. Let granularity be segmenter.[[SegmenterGranularity]].
|
// 2. Let granularity be segmenter.[[SegmenterGranularity]].
|
||||||
auto granularity = segmenter.segmenter_granularity();
|
auto granularity = segmenter.segmenter_granularity();
|
||||||
|
@ -104,24 +132,6 @@ double find_boundary(Segmenter const& segmenter, Utf16View const& string, double
|
||||||
// 3. Let len be the length of string.
|
// 3. Let len be the length of string.
|
||||||
auto length = string.length_in_code_units();
|
auto length = string.length_in_code_units();
|
||||||
|
|
||||||
// Non-standard, populate boundaries cache
|
|
||||||
if (!boundaries_cache.has_value()) {
|
|
||||||
switch (granularity) {
|
|
||||||
case Segmenter::SegmenterGranularity::Grapheme:
|
|
||||||
boundaries_cache = Unicode::find_grapheme_segmentation_boundaries(string);
|
|
||||||
break;
|
|
||||||
case Segmenter::SegmenterGranularity::Word:
|
|
||||||
boundaries_cache = Unicode::find_word_segmentation_boundaries(string);
|
|
||||||
break;
|
|
||||||
case Segmenter::SegmenterGranularity::Sentence:
|
|
||||||
boundaries_cache = Unicode::find_sentence_segmentation_boundaries(string);
|
|
||||||
break;
|
|
||||||
default:
|
|
||||||
VERIFY_NOT_REACHED();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
(void)locale; // TODO: Support locale-sensitive boundaries
|
|
||||||
|
|
||||||
// 4. If direction is before, then
|
// 4. If direction is before, then
|
||||||
if (direction == Direction::Before) {
|
if (direction == Direction::Before) {
|
||||||
// a. Assert: startIndex ≥ 0.
|
// a. Assert: startIndex ≥ 0.
|
||||||
|
@ -130,12 +140,11 @@ double find_boundary(Segmenter const& segmenter, Utf16View const& string, double
|
||||||
VERIFY(start_index < length);
|
VERIFY(start_index < length);
|
||||||
|
|
||||||
// c. Search string for the last segmentation boundary that is preceded by at most startIndex code units from the beginning, using locale locale and text element granularity granularity.
|
// c. Search string for the last segmentation boundary that is preceded by at most startIndex code units from the beginning, using locale locale and text element granularity granularity.
|
||||||
size_t boundary_index;
|
auto boundary_index = find_previous_boundary_index(string, static_cast<size_t>(start_index) + 1, granularity);
|
||||||
binary_search(*boundaries_cache, start_index, &boundary_index);
|
|
||||||
|
|
||||||
// d. If a boundary is found, return the count of code units in string preceding it.
|
// d. If a boundary is found, return the count of code units in string preceding it.
|
||||||
if (boundary_index < boundaries_cache->size())
|
if (boundary_index.has_value())
|
||||||
return boundaries_cache->at(boundary_index);
|
return static_cast<double>(*boundary_index);
|
||||||
|
|
||||||
// e. Return 0.
|
// e. Return 0.
|
||||||
return 0;
|
return 0;
|
||||||
|
@ -149,13 +158,11 @@ double find_boundary(Segmenter const& segmenter, Utf16View const& string, double
|
||||||
return INFINITY;
|
return INFINITY;
|
||||||
|
|
||||||
// 7. Search string for the first segmentation boundary that follows the code unit at index startIndex, using locale locale and text element granularity granularity.
|
// 7. Search string for the first segmentation boundary that follows the code unit at index startIndex, using locale locale and text element granularity granularity.
|
||||||
size_t boundary_index;
|
auto boundary_index = find_next_boundary_index(string, static_cast<size_t>(start_index), granularity);
|
||||||
binary_search(*boundaries_cache, start_index, &boundary_index);
|
|
||||||
++boundary_index;
|
|
||||||
|
|
||||||
// 8. If a boundary is found, return the count of code units in string preceding it.
|
// 8. If a boundary is found, return the count of code units in string preceding it.
|
||||||
if (boundary_index < boundaries_cache->size())
|
if (boundary_index.has_value())
|
||||||
return boundaries_cache->at(boundary_index);
|
return static_cast<double>(*boundary_index);
|
||||||
|
|
||||||
// 9. Return len.
|
// 9. Return len.
|
||||||
return length;
|
return length;
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
|
* Copyright (c) 2022, Idan Horowitz <idan.horowitz@serenityos.org>
|
||||||
|
* Copyright (c) 2023, Tim Flynn <trflynn89@serenityos.org>
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: BSD-2-Clause
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
*/
|
*/
|
||||||
|
@ -38,10 +39,11 @@ private:
|
||||||
};
|
};
|
||||||
|
|
||||||
ThrowCompletionOr<NonnullGCPtr<Object>> create_segment_data_object(VM&, Segmenter const&, Utf16View const&, double start_index, double end_index);
|
ThrowCompletionOr<NonnullGCPtr<Object>> create_segment_data_object(VM&, Segmenter const&, Utf16View const&, double start_index, double end_index);
|
||||||
|
|
||||||
enum class Direction {
|
enum class Direction {
|
||||||
Before,
|
Before,
|
||||||
After,
|
After,
|
||||||
};
|
};
|
||||||
double find_boundary(Segmenter const&, Utf16View const&, double start_index, Direction, Optional<Vector<size_t>>& boundaries_cache);
|
double find_boundary(Segmenter const&, Utf16View const&, double start_index, Direction);
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -24,8 +24,6 @@ public:
|
||||||
|
|
||||||
Utf16View segments_string() const { return m_segments_string.view(); }
|
Utf16View segments_string() const { return m_segments_string.view(); }
|
||||||
|
|
||||||
Optional<Vector<size_t>>& boundaries_cache() const { return m_boundaries_cache; }
|
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Segments(Realm&, Segmenter&, Utf16String);
|
Segments(Realm&, Segmenter&, Utf16String);
|
||||||
|
|
||||||
|
@ -33,8 +31,6 @@ private:
|
||||||
|
|
||||||
Segmenter& m_segments_segmenter; // [[SegmentsSegmenter]]
|
Segmenter& m_segments_segmenter; // [[SegmentsSegmenter]]
|
||||||
Utf16String m_segments_string; // [[SegmentsString]]
|
Utf16String m_segments_string; // [[SegmentsString]]
|
||||||
|
|
||||||
mutable Optional<Vector<size_t>> m_boundaries_cache;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -54,10 +54,10 @@ JS_DEFINE_NATIVE_FUNCTION(SegmentsPrototype::containing)
|
||||||
return js_undefined();
|
return js_undefined();
|
||||||
|
|
||||||
// 8. Let startIndex be ! FindBoundary(segmenter, string, n, before).
|
// 8. Let startIndex be ! FindBoundary(segmenter, string, n, before).
|
||||||
auto start_index = find_boundary(segmenter, string, n, Direction::Before, segments->boundaries_cache());
|
auto start_index = find_boundary(segmenter, string, n, Direction::Before);
|
||||||
|
|
||||||
// 9. Let endIndex be ! FindBoundary(segmenter, string, n, after).
|
// 9. Let endIndex be ! FindBoundary(segmenter, string, n, after).
|
||||||
auto end_index = find_boundary(segmenter, string, n, Direction::After, segments->boundaries_cache());
|
auto end_index = find_boundary(segmenter, string, n, Direction::After);
|
||||||
|
|
||||||
// 10. Return ! CreateSegmentDataObject(segmenter, string, startIndex, endIndex).
|
// 10. Return ! CreateSegmentDataObject(segmenter, string, startIndex, endIndex).
|
||||||
return TRY(create_segment_data_object(vm, segmenter, string, start_index, end_index));
|
return TRY(create_segment_data_object(vm, segmenter, string, start_index, end_index));
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue