mirror of https://github.com/RGBCube/serenity (synced 2025-07-25 07:17:35 +00:00)
AK: Use hashing to accelerate searching a CircularBuffer
This commit is contained in:
parent 4a10cf1506
commit 60ac254df6

4 changed files with 269 additions and 101 deletions
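
In short: the seekback buffer now maintains a hash index over every HASH_CHUNK_SIZE-byte window of its contents, so find_copy_in_seekback() can jump straight to candidate offsets instead of scanning the whole seekback area with memmem(). The index is two maps: m_hash_location_map takes a chunk hash to the most recent buffer offset that produced it, and m_location_chain_map chains each offset to the next-older offset with the same hash. Below is a minimal sketch of that structure, with std::unordered_map standing in for AK::HashMap and an FNV-1a loop standing in for StringView::hash() (both are illustrative substitutions, not the commit's code; HASH_CHUNK_SIZE = 4 is an assumption):

#include <cstddef>
#include <cstdint>
#include <unordered_map>

constexpr size_t HASH_CHUNK_SIZE = 4;

static uint32_t chunk_hash(uint8_t const* data)
{
    // FNV-1a over one chunk; any deterministic hash works for the sketch.
    uint32_t hash = 2166136261u;
    for (size_t i = 0; i < HASH_CHUNK_SIZE; ++i)
        hash = (hash ^ data[i]) * 16777619u;
    return hash;
}

struct ChunkIndex {
    // Chunk hash -> raw buffer offset of the most recent chunk with that hash.
    std::unordered_map<uint32_t, size_t> hash_location_map;
    // Raw offset -> raw offset of the next-older chunk with the same hash.
    std::unordered_map<size_t, size_t> location_chain_map;

    void insert(uint8_t const* chunk, size_t raw_offset)
    {
        auto hash = chunk_hash(chunk);
        // Drop any stale chain entry for this offset so a reused offset cannot form a loop.
        location_chain_map.erase(raw_offset);
        // If this hash already has a head entry, the old head becomes the next link in the chain.
        if (auto it = hash_location_map.find(hash); it != hash_location_map.end())
            location_chain_map[raw_offset] = it->second;
        hash_location_map[hash] = raw_offset;
    }
};

Lookup then starts at hash_location_map[hash] and follows location_chain_map until a key is missing, which is exactly what the new search code in the second hunk does.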
@@ -296,10 +296,34 @@ ErrorOr<SearchableCircularBuffer> SearchableCircularBuffer::create_initialized(B
     circular_buffer.m_used_space = circular_buffer.m_buffer.size();
 
+    for (size_t i = 0; i + HASH_CHUNK_SIZE <= circular_buffer.m_buffer.size(); i++)
+        TRY(circular_buffer.insert_location_hash(circular_buffer.m_buffer.span().slice(i, HASH_CHUNK_SIZE), i));
+
     return circular_buffer;
 }
 
-ErrorOr<Vector<SearchableCircularBuffer::Match>> SearchableCircularBuffer::find_copy_in_seekback(size_t maximum_length, size_t minimum_length) const
+ErrorOr<Bytes> SearchableCircularBuffer::read(Bytes bytes)
+{
+    auto read_bytes_span = CircularBuffer::read(bytes);
+    TRY(hash_last_bytes(read_bytes_span.size()));
+    return read_bytes_span;
+}
+
+ErrorOr<void> SearchableCircularBuffer::discard(size_t discarded_bytes)
+{
+    TRY(CircularBuffer::discard(discarded_bytes));
+    TRY(hash_last_bytes(discarded_bytes));
+    return {};
+}
+
+ErrorOr<size_t> SearchableCircularBuffer::flush_to_stream(Stream& stream)
+{
+    auto flushed_byte_count = TRY(CircularBuffer::flush_to_stream(stream));
+    TRY(hash_last_bytes(flushed_byte_count));
+    return flushed_byte_count;
+}
+
+Optional<SearchableCircularBuffer::Match> SearchableCircularBuffer::find_copy_in_seekback(size_t maximum_length, size_t minimum_length)
 {
     VERIFY(minimum_length > 0);
 
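This first hunk does two things: create_initialized() pre-hashes every complete chunk of a pre-filled buffer, and the new read(), discard(), and flush_to_stream() overrides each call hash_last_bytes(), because each of them advances the read head, which is what turns bytes into searchable seekback history. Every byte that crosses the head completes up to one new chunk window, and those windows must enter the index. A linear-buffer sketch of that bookkeeping, reusing ChunkIndex from the sketch above (the real buffer is circular; its wrap-around handling is in the last hunk):

#include <vector>

struct IndexedHistory {
    std::vector<uint8_t> history; // bytes that have already crossed the read head
    ChunkIndex index;

    // Counterpart of read()/discard()/flush_to_stream() each calling hash_last_bytes(count):
    // hash every chunk window that ends inside the newly consumed bytes.
    void append_and_hash(uint8_t const* data, size_t count)
    {
        size_t old_size = history.size();
        history.insert(history.end(), data, data + count);
        if (history.size() < HASH_CHUNK_SIZE)
            return; // still not enough data for a single full chunk

        // First window whose last byte lies in the new data.
        size_t first = old_size + 1 >= HASH_CHUNK_SIZE ? old_size + 1 - HASH_CHUNK_SIZE : 0;
        for (size_t i = first; i + HASH_CHUNK_SIZE <= history.size(); ++i)
            index.insert(history.data() + i, i);
    }
};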
@@ -308,71 +332,137 @@ ErrorOr<Vector<SearchableCircularBuffer::Match>> SearchableCircularBuffer::find_copy_in_seekback
         maximum_length = m_used_space;
 
     if (maximum_length < minimum_length)
-        return Vector<Match> {};
+        return {};
 
-    Vector<Match> matches;
+    Optional<Match> best_match;
 
-    // Use memmem to find the initial matches.
-    size_t haystack_offset_from_start = 0;
-    Vector<ReadonlyBytes, 2> haystack;
-    haystack.append(next_search_span(search_limit()));
-    if (haystack[0].size() < search_limit())
-        haystack.append(next_search_span(search_limit() - haystack[0].size()));
-
-    auto needle = next_read_span().trim(minimum_length);
-
-    auto memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
-    while (memmem_match.has_value()) {
-        auto match_offset = memmem_match.release_value();
-
-        // Add the match to the list of matches to work with.
-        TRY(matches.try_empend(m_seekback_limit - used_space() - haystack_offset_from_start - match_offset, minimum_length));
-
-        auto size_to_discard = match_offset + 1;
-
-        // Trim away the already processed bytes from the haystack.
-        haystack_offset_from_start += size_to_discard;
-        while (size_to_discard > 0) {
-            if (haystack[0].size() < size_to_discard) {
-                size_to_discard -= haystack[0].size();
-                haystack.remove(0);
-            } else {
-                haystack[0] = haystack[0].slice(size_to_discard);
-                break;
-            }
-        }
-
-        if (haystack.size() == 0)
-            break;
-
-        // Try and find the next match.
-        memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
-    }
-
-    // From now on, all matches that we have stored have at least a length of `minimum_length` and they all refer to the same value.
-    // For the remaining part, we will keep checking the next byte incrementally and keep eliminating matches until we eliminated all of them.
-    Vector<Match> next_matches;
-
-    for (size_t offset = minimum_length; offset < maximum_length; offset++) {
-        auto needle_data = m_buffer[(capacity() + m_reading_head + offset) % capacity()];
-
-        for (auto const& match : matches) {
-            auto haystack_data = m_buffer[(capacity() + m_reading_head - match.distance + offset) % capacity()];
-
-            if (haystack_data != needle_data)
-                continue;
-
-            TRY(next_matches.try_empend(match.distance, match.length + 1));
-        }
-
-        if (next_matches.size() == 0)
-            return matches;
-
-        swap(matches, next_matches);
-        next_matches.clear_with_capacity();
-    }
-
-    return matches;
+    Array<u8, HASH_CHUNK_SIZE> needle_storage;
+    auto needle = needle_storage.span().trim(min(HASH_CHUNK_SIZE, maximum_length));
+    {
+        auto needle_read_bytes = MUST(read_with_seekback(needle, used_space()));
+        VERIFY(needle_read_bytes.size() == needle.size());
+    }
+
+    // Try an efficient hash-based search first.
+    if (needle.size() >= HASH_CHUNK_SIZE) {
+        auto needle_hash = StringView { needle }.hash();
+
+        auto maybe_starting_offset = m_hash_location_map.get(needle_hash);
+
+        if (maybe_starting_offset.has_value()) {
+            Optional<size_t> previous_buffer_offset;
+            auto current_buffer_offset = maybe_starting_offset.value();
+
+            while (true) {
+                auto current_search_offset = (capacity() + m_reading_head - current_buffer_offset) % capacity();
+
+                // Validate the hash. In case it is invalid, we can discard the rest of the chain, as the data (and everything older) got updated.
+                Array<u8, HASH_CHUNK_SIZE> hash_chunk_at_offset;
+                auto hash_chunk_at_offset_span = MUST(read_with_seekback(hash_chunk_at_offset, current_search_offset + used_space()));
+                VERIFY(hash_chunk_at_offset_span.size() == HASH_CHUNK_SIZE);
+                auto found_chunk_hash = StringView { hash_chunk_at_offset }.hash();
+                if (needle_hash != found_chunk_hash) {
+                    if (!previous_buffer_offset.has_value())
+                        m_hash_location_map.remove(needle_hash);
+                    else
+                        m_location_chain_map.remove(*previous_buffer_offset);
+                    break;
+                }
+
+                // Validate the match through the set-distance-based implementation.
+                auto maybe_new_match = find_copy_in_seekback(Array { current_search_offset }, maximum_length, HASH_CHUNK_SIZE);
+
+                // If we found a match, record it.
+                // If we haven't found a match, we simply got a hash collision, so skip.
+                if (maybe_new_match.has_value()) {
+                    auto new_match = maybe_new_match.release_value();
+
+                    if (!best_match.has_value() || best_match->length < new_match.length) {
+                        best_match = new_match;
+
+                        // If we already found a result with the best possible length, then stop searching.
+                        if (best_match->length >= maximum_length)
+                            break;
+                    }
+                }
+
+                // Get the next location with the same hash from the location chain.
+                auto maybe_next_buffer_offset = m_location_chain_map.get(current_buffer_offset);
+
+                // End of the chain, nothing more to check.
+                if (!maybe_next_buffer_offset.has_value())
+                    break;
+
+                previous_buffer_offset = current_buffer_offset;
+                current_buffer_offset = maybe_next_buffer_offset.release_value();
+            }
+
+            // If we found a match, return it now.
+            if (best_match.has_value())
+                return best_match;
+        }
+    }
+
+    // Try a plain memory search for smaller values.
+    // Note: This overlaps with the hash search for chunks of size HASH_CHUNK_SIZE for the purpose of validation.
+    if (minimum_length <= HASH_CHUNK_SIZE) {
+        size_t haystack_offset_from_start = 0;
+        Vector<ReadonlyBytes, 2> haystack;
+        haystack.append(next_search_span(search_limit()));
+        if (haystack[0].size() < search_limit())
+            haystack.append(next_search_span(search_limit() - haystack[0].size()));
+
+        // TODO: `memmem` searches the memory in "natural" order, which means that it finds matches with a greater distance first.
+        //       Hash-based searching finds the shortest distances first, which is most likely better for encoding and memory efficiency.
+        //       Look into creating a `memmem_reverse`, which starts searching from the end.
+        auto memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
+        while (memmem_match.has_value()) {
+            auto match_offset = memmem_match.release_value();
+            auto corrected_match_distance = search_limit() - haystack_offset_from_start - match_offset;
+
+            // Validate the match through the set-distance-based implementation and extend it to the largest size possible.
+            auto maybe_new_match = find_copy_in_seekback(Array { corrected_match_distance }, min(maximum_length, HASH_CHUNK_SIZE), minimum_length);
+
+            // If we weren't able to validate the match at all, either our memmem search returned garbage or our validation function is incorrect. Investigate.
+            VERIFY(maybe_new_match.has_value());
+
+            auto new_match = maybe_new_match.release_value();
+
+            if (!best_match.has_value() || best_match->length < new_match.length) {
+                best_match = new_match;
+
+                // If we already found a result with the best possible length, then stop searching.
+                if (best_match->length >= maximum_length)
+                    break;
+            }
+
+            auto size_to_discard = match_offset + 1;
+
+            // Trim away the already processed bytes from the haystack.
+            haystack_offset_from_start += size_to_discard;
+            while (size_to_discard > 0) {
+                if (haystack[0].size() < size_to_discard) {
+                    size_to_discard -= haystack[0].size();
+                    haystack.remove(0);
+                } else {
+                    haystack[0] = haystack[0].slice(size_to_discard);
+                    break;
+                }
+            }
+
+            if (haystack.size() == 0)
+                break;
+
+            // Try and find the next match.
+            memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
+        }
+
+        // If we found a match of size HASH_CHUNK_SIZE, we should have already found that using the hash search. Investigate.
+        VERIFY(!best_match.has_value() || best_match->length < HASH_CHUNK_SIZE);
+    }
+
+    return best_match;
 }
 
 Optional<SearchableCircularBuffer::Match> SearchableCircularBuffer::find_copy_in_seekback(ReadonlySpan<size_t> distances, size_t maximum_length, size_t minimum_length) const
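The rewritten find_copy_in_seekback() above replaces the old "collect all memmem matches, then extend them byte by byte" approach: it hashes the first HASH_CHUNK_SIZE bytes of the needle, looks up the chain head, and walks the chain, re-verifying the actual bytes at every stop, since an equal hash may be a collision, and a chunk may have been overwritten since it was indexed (which is when the commit prunes the stale chain entry). A simplified linear model of that walk, reusing ChunkIndex and IndexedHistory from the sketches above; the pruning is omitted here because a linear history is never overwritten:

#include <optional>

struct SketchMatch {
    size_t offset; // where in the history the copy starts
    size_t length; // how many bytes match
};

inline std::optional<SketchMatch> find_best_match(IndexedHistory const& h, uint8_t const* needle, size_t needle_length)
{
    if (needle_length < HASH_CHUNK_SIZE)
        return std::nullopt; // the commit falls back to a memmem() scan for short needles

    auto hash = chunk_hash(needle);
    auto head = h.index.hash_location_map.find(hash);
    if (head == h.index.hash_location_map.end())
        return std::nullopt;

    std::optional<SketchMatch> best;
    for (size_t offset = head->second;;) {
        // Verify the candidate byte-for-byte and extend it as far as possible.
        size_t length = 0;
        while (length < needle_length && offset + length < h.history.size()
            && h.history[offset + length] == needle[length])
            ++length;

        // Only a full chunk's worth of verified bytes counts as a real match.
        if (length >= HASH_CHUNK_SIZE && (!best || length > best->length))
            best = SketchMatch { offset, length };

        auto next = h.index.location_chain_map.find(offset);
        if (next == h.index.location_chain_map.end())
            break; // end of the chain
        offset = next->second;
    }
    return best;
}

Because newer chunks are pushed onto the head of the chain, the walk visits the most recent (shortest-distance) candidates first, which is why the TODO in the diff calls hash-based ordering preferable to memmem's.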
@@ -422,4 +512,75 @@ Optional<SearchableCircularBuffer::Match> SearchableCircularBuffer::find_copy_in_seekback
     return best_match;
 }
 
+ErrorOr<void> SearchableCircularBuffer::insert_location_hash(ReadonlyBytes value, size_t raw_offset)
+{
+    VERIFY(value.size() == HASH_CHUNK_SIZE);
+
+    auto value_hash = StringView { value }.hash();
+
+    // Discard any old entries for this offset first. This should eliminate accidental loops by breaking the chain.
+    // The actual cleanup is done on access, since we can only remove invalid references when actually walking the chain.
+    m_location_chain_map.remove(raw_offset);
+
+    // Check if we have any existing entries for this hash.
+    // If so, we need to add it to the location chain map instead, as we will soon replace the entry in the hash location map.
+    auto existing_entry = m_hash_location_map.get(value_hash);
+
+    if (existing_entry.has_value())
+        TRY(m_location_chain_map.try_set(raw_offset, existing_entry.value()));
+
+    TRY(m_hash_location_map.try_set(value_hash, raw_offset));
+
+    return {};
+}
+
+ErrorOr<void> SearchableCircularBuffer::hash_last_bytes(size_t count)
+{
+    // Stop early if we don't have enough data overall to hash a full chunk.
+    if (search_limit() < HASH_CHUNK_SIZE)
+        return {};
+
+    auto remaining_recalculations = count;
+    while (remaining_recalculations > 0) {
+        // Note: We offset everything by HASH_CHUNK_SIZE because we have up to HASH_CHUNK_SIZE - 1 bytes that we couldn't hash before (as we had missing data).
+        //       The number of recalculations stays the same, since we now have up to HASH_CHUNK_SIZE - 1 bytes that we can't hash now.
+        auto recalculation_span = next_search_span(min(remaining_recalculations + HASH_CHUNK_SIZE - 1, search_limit()));
+
+        // If the span is smaller than a hash chunk, we need to manually craft some consecutive data to do the hashing.
+        if (recalculation_span.size() < HASH_CHUNK_SIZE) {
+            auto auxiliary_span = next_seekback_span(remaining_recalculations);
+
+            // Ensure that our math is correct and that both spans are "adjacent".
+            VERIFY(recalculation_span.data() + recalculation_span.size() == m_buffer.data() + m_buffer.size());
+            VERIFY(auxiliary_span.data() == m_buffer.data());
+
+            while (recalculation_span.size() > 0 && recalculation_span.size() + auxiliary_span.size() >= HASH_CHUNK_SIZE) {
+                Array<u8, HASH_CHUNK_SIZE> temporary_hash_chunk;
+
+                auto copied_from_recalculation_span = recalculation_span.copy_to(temporary_hash_chunk);
+                VERIFY(copied_from_recalculation_span == recalculation_span.size());
+
+                auto copied_from_auxiliary_span = auxiliary_span.copy_to(temporary_hash_chunk.span().slice(copied_from_recalculation_span));
+                VERIFY(copied_from_recalculation_span + copied_from_auxiliary_span == HASH_CHUNK_SIZE);
+
+                TRY(insert_location_hash(temporary_hash_chunk, recalculation_span.data() - m_buffer.data()));
+
+                recalculation_span = recalculation_span.slice(1);
+                remaining_recalculations--;
+            }
+
+            continue;
+        }
+
+        for (size_t i = 0; i + HASH_CHUNK_SIZE <= recalculation_span.size(); i++) {
+            auto value = recalculation_span.slice(i, HASH_CHUNK_SIZE);
+            auto raw_offset = value.data() - m_buffer.data();
+            TRY(insert_location_hash(value, raw_offset));
+            remaining_recalculations--;
+        }
+    }
+
+    return {};
+}
+
 }
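The subtle part of hash_last_bytes() is the branch where recalculation_span is shorter than a chunk: a window then straddles the physical end of the circular storage, so the tail piece and the wrapped head piece are copied into temporary_hash_chunk and hashed as one contiguous chunk. A sketch of just that wrap-around copy, again reusing ChunkIndex from above (the vector stands in for the circular storage m_buffer; this is an illustration, not the commit's code):

#include <algorithm>
#include <array>
#include <vector>

// Hash the HASH_CHUNK_SIZE-byte window that starts at `start` in circular storage.
inline void hash_window(std::vector<uint8_t> const& storage, size_t start, ChunkIndex& index)
{
    size_t contiguous = storage.size() - start; // bytes available before wrapping
    if (contiguous >= HASH_CHUNK_SIZE) {
        index.insert(storage.data() + start, start); // no wrap: hash in place
        return;
    }

    // The window wraps: stitch the tail and the head together, mirroring temporary_hash_chunk.
    std::array<uint8_t, HASH_CHUNK_SIZE> temporary_chunk {};
    std::copy_n(storage.data() + start, contiguous, temporary_chunk.data());
    std::copy_n(storage.data(), HASH_CHUNK_SIZE - contiguous, temporary_chunk.data() + contiguous);
    index.insert(temporary_chunk.data(), start);
}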