mirror of
https://github.com/RGBCube/serenity
synced 2025-07-26 04:27:44 +00:00
AK: Add CircularBuffer::find_copy_in_seekback()
This is useful for compressors, which quite frequently need to find a matching span of data within the seekback.
This commit is contained in:
parent
d194011570
commit
221b91ff61
3 changed files with 223 additions and 0 deletions
|
@ -249,4 +249,107 @@ ErrorOr<size_t> CircularBuffer::copy_from_seekback(size_t distance, size_t lengt
|
||||||
return length - remaining_length;
|
return length - remaining_length;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Finds places in the seekback (the already-consumed data behind the read head) that
/// start with the same bytes as the unread data at the read head.
///
/// `maximum_length` is clipped to the amount of unread data. If it then falls below
/// `minimum_length`, an empty list is returned. `minimum_length` must be greater than zero.
///
/// If `distance_hints` is given, only those distances (measured backwards from the read
/// head) are checked. Hints that are shorter than `minimum_length` (overlapping copies are
/// not supported yet) or that reach beyond the stored seekback are ignored.
/// Without hints, the whole seekback is searched, from the largest distance to the smallest.
///
/// All returned matches share the same length: the longest one between `minimum_length`
/// and `maximum_length` for which at least one candidate still matches.
/// Any error from growing the result vectors is propagated to the caller.
ErrorOr<Vector<CircularBuffer::Match>> CircularBuffer::find_copy_in_seekback(size_t maximum_length, size_t minimum_length, Optional<Vector<size_t> const&> distance_hints) const
{
    VERIFY(minimum_length > 0);

    // Clip the maximum length to the amount of data that we actually store.
    if (maximum_length > m_used_space)
        maximum_length = m_used_space;

    if (maximum_length < minimum_length)
        return Vector<Match> {};

    Vector<Match> matches;

    if (distance_hints.has_value()) {
        // The largest distance that still refers to seekback data. This is the same bound
        // that the unhinted search below uses for the oldest possible match.
        auto const maximum_distance = m_seekback_limit - used_space();

        // If we have any hints, verify and use those.
        for (auto const& distance : distance_hints.value()) {
            // TODO: This does not yet support looping repetitions.
            if (distance < minimum_length)
                continue;

            // Hints beyond the stored seekback would alias unread or stale bytes through the
            // modulo below (and could underflow the offset calculation), so discard them.
            if (distance > maximum_distance)
                continue;

            auto needle_offset = (capacity() + m_reading_head) % capacity();
            auto haystack_offset = (capacity() + m_reading_head - distance) % capacity();

            // Compare the first `minimum_length` bytes, wrapping around the end of the storage.
            bool matches_minimum_length = true;
            for (size_t i = 0; i < minimum_length; i++) {
                if (m_buffer[needle_offset] != m_buffer[haystack_offset]) {
                    matches_minimum_length = false;
                    break;
                }

                needle_offset = (needle_offset + 1) % capacity();
                haystack_offset = (haystack_offset + 1) % capacity();
            }

            if (matches_minimum_length)
                TRY(matches.try_empend(distance, minimum_length));
        }
    } else {
        // Otherwise, use memmem to find the initial matches.
        // Note: We have the read head as our reference point, but `next_read_span_with_seekback` isn't aware of that and continues to use the write head.
        //       Therefore, we need to make sure to slice off the extraneous bytes from the end of the span and shift the returned distances by the correct amount.
        size_t haystack_offset_from_start = 0;
        Vector<ReadonlyBytes, 2> haystack;
        haystack.append(next_read_span_with_seekback(m_seekback_limit));
        if (haystack[0].size() < m_seekback_limit - used_space())
            haystack.append(next_read_span_with_seekback(m_seekback_limit - haystack[0].size()));

        // NOTE(review): This assumes that the last span contains all of the unread bytes. If the
        //               unread data can wrap around the end of the storage, the subtraction may
        //               underflow or trim too much - confirm against `next_read_span_with_seekback`.
        haystack.last() = haystack.last().trim(haystack.last().size() - used_space());

        auto needle = next_read_span().trim(minimum_length);

        auto memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
        while (memmem_match.has_value()) {
            auto match_offset = memmem_match.release_value();

            // Add the match to the list of matches to work with.
            // Offsets count forward from the oldest seekback byte, distances count backwards from the read head.
            TRY(matches.try_empend(m_seekback_limit - used_space() - haystack_offset_from_start - match_offset, minimum_length));

            // Skip just past the start of this match so that overlapping matches are found as well.
            auto size_to_discard = match_offset + 1;

            // Trim away the already processed bytes from the haystack.
            haystack_offset_from_start += size_to_discard;
            while (size_to_discard > 0) {
                if (haystack[0].size() < size_to_discard) {
                    size_to_discard -= haystack[0].size();
                    haystack.remove(0);
                } else {
                    haystack[0] = haystack[0].slice(size_to_discard);
                    break;
                }
            }

            if (haystack.size() == 0)
                break;

            // Try and find the next match.
            memmem_match = AK::memmem(haystack.begin(), haystack.end(), needle);
        }
    }

    // From now on, all matches that we have stored have at least a length of `minimum_length` and they all refer to the same value.
    // For the remaining part, we will keep checking the next byte incrementally and keep eliminating matches until we eliminated all of them.
    Vector<Match> next_matches;

    for (size_t offset = minimum_length; offset < maximum_length; offset++) {
        auto needle_data = m_buffer[(capacity() + m_reading_head + offset) % capacity()];

        for (auto const& match : matches) {
            auto haystack_data = m_buffer[(capacity() + m_reading_head - match.distance + offset) % capacity()];

            if (haystack_data != needle_data)
                continue;

            TRY(next_matches.try_empend(match.distance, match.length + 1));
        }

        // Nothing could be extended any further, so the previous round holds the longest matches.
        if (next_matches.size() == 0)
            return matches;

        swap(matches, next_matches);
        next_matches.clear_with_capacity();
    }

    return matches;
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -9,6 +9,7 @@
|
||||||
#include <AK/ByteBuffer.h>
|
#include <AK/ByteBuffer.h>
|
||||||
#include <AK/Error.h>
|
#include <AK/Error.h>
|
||||||
#include <AK/Noncopyable.h>
|
#include <AK/Noncopyable.h>
|
||||||
|
#include <AK/Vector.h>
|
||||||
|
|
||||||
namespace AK {
|
namespace AK {
|
||||||
|
|
||||||
|
@ -36,6 +37,15 @@ public:
|
||||||
|
|
||||||
ErrorOr<size_t> copy_from_seekback(size_t distance, size_t length);
|
ErrorOr<size_t> copy_from_seekback(size_t distance, size_t length);
|
||||||
|
|
||||||
|
/// A single candidate found by `find_copy_in_seekback()`.
struct Match {
|
||||||
|
// How far behind the read head the matching data starts.
size_t distance;
|
||||||
|
// Number of bytes that match the unread data at the read head.
size_t length;
|
||||||
|
};
|
||||||
|
/// This searches the seekback buffer (between read head and limit) for occurrences that match the next bytes from the read buffer, and returns the longest matches whose length lies between `minimum_length` and `maximum_length`.
|
||||||
|
/// Supplying any hints will only consider those distances, in case existing offsets need to be validated.
|
||||||
|
/// Note that, since we only start searching at the read head, the length between read head and write head is excluded from the distance.
|
||||||
|
ErrorOr<Vector<Match>> find_copy_in_seekback(size_t maximum_length, size_t minimum_length = 2, Optional<Vector<size_t> const&> distance_hints = {}) const;
|
||||||
|
|
||||||
[[nodiscard]] size_t empty_space() const;
|
[[nodiscard]] size_t empty_space() const;
|
||||||
[[nodiscard]] size_t used_space() const;
|
[[nodiscard]] size_t used_space() const;
|
||||||
[[nodiscard]] size_t capacity() const;
|
[[nodiscard]] size_t capacity() const;
|
||||||
|
|
|
@ -329,6 +329,116 @@ TEST_CASE(offset_of_with_until_and_after_wrapping_around)
|
||||||
EXPECT_EQ(result.value_or(42), 14ul);
|
EXPECT_EQ(result.value_or(42), 14ul);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
TEST_CASE(find_copy_in_seekback)
{
    auto seekback_data = "ABABCABCDAB"sv.bytes();
    auto unread_data = "ABCD"sv.bytes();

    // Prepare the buffer: write and discard the first part so that it ends up in the
    // seekback, then write the second part so that it stays unread at the read head.
    auto circular_buffer = MUST(CircularBuffer::create_empty(seekback_data.size() + unread_data.size()));
    auto seekback_bytes_written = circular_buffer.write(seekback_data);
    VERIFY(seekback_bytes_written == seekback_data.size());
    MUST(circular_buffer.discard(seekback_data.size()));
    auto unread_bytes_written = circular_buffer.write(unread_data);
    VERIFY(unread_bytes_written == unread_data.size());

    {
        // Lengths 1 to 1: every "A" in the seekback is reported.
        auto found = MUST(circular_buffer.find_copy_in_seekback(1, 1));
        EXPECT_EQ(found.size(), 4ul);
        EXPECT_EQ(found[0].distance, 11ul);
        EXPECT_EQ(found[0].length, 1ul);
        EXPECT_EQ(found[1].distance, 9ul);
        EXPECT_EQ(found[1].length, 1ul);
        EXPECT_EQ(found[2].distance, 6ul);
        EXPECT_EQ(found[2].length, 1ul);
        EXPECT_EQ(found[3].distance, 2ul);
        EXPECT_EQ(found[3].length, 1ul);
    }

    {
        // Lengths 1 to 2: every "AB" is reported, shorter matches are dropped.
        auto found = MUST(circular_buffer.find_copy_in_seekback(2, 1));
        EXPECT_EQ(found.size(), 4ul);
        EXPECT_EQ(found[0].distance, 11ul);
        EXPECT_EQ(found[0].length, 2ul);
        EXPECT_EQ(found[1].distance, 9ul);
        EXPECT_EQ(found[1].length, 2ul);
        EXPECT_EQ(found[2].distance, 6ul);
        EXPECT_EQ(found[2].length, 2ul);
        EXPECT_EQ(found[3].distance, 2ul);
        EXPECT_EQ(found[3].length, 2ul);
    }

    {
        // Lengths 1 to 3: every "ABC" is reported, shorter matches are dropped.
        auto found = MUST(circular_buffer.find_copy_in_seekback(3, 1));
        EXPECT_EQ(found.size(), 2ul);
        EXPECT_EQ(found[0].distance, 9ul);
        EXPECT_EQ(found[0].length, 3ul);
        EXPECT_EQ(found[1].distance, 6ul);
        EXPECT_EQ(found[1].length, 3ul);
    }

    {
        // Lengths 1 to 4: only the single "ABCD" remains.
        auto found = MUST(circular_buffer.find_copy_in_seekback(4, 1));
        EXPECT_EQ(found.size(), 1ul);
        EXPECT_EQ(found[0].distance, 6ul);
        EXPECT_EQ(found[0].length, 4ul);
    }

    {
        // Lengths 1 to 5: still "ABCD", since nothing longer exists.
        auto found = MUST(circular_buffer.find_copy_in_seekback(5, 1));
        EXPECT_EQ(found.size(), 1ul);
        EXPECT_EQ(found[0].distance, 6ul);
        EXPECT_EQ(found[0].length, 4ul);
    }

    {
        // Lengths 4 to 5: "ABCD" is found directly, shorter candidates are never considered.
        auto found = MUST(circular_buffer.find_copy_in_seekback(5, 4));
        EXPECT_EQ(found.size(), 1ul);
        EXPECT_EQ(found[0].distance, 6ul);
        EXPECT_EQ(found[0].length, 4ul);
    }

    {
        // Lengths 5 to 5: there is no match of that size.
        auto found = MUST(circular_buffer.find_copy_in_seekback(5, 5));
        EXPECT_EQ(found.size(), 0ul);
    }

    {
        // Lengths 1 to 2 with hints: only the hinted "AB" occurrences are reported, in hint order.
        auto found = MUST(circular_buffer.find_copy_in_seekback(2, 1, Vector<size_t> { 6ul, 9ul }));
        EXPECT_EQ(found.size(), 2ul);
        EXPECT_EQ(found[0].distance, 6ul);
        EXPECT_EQ(found[0].length, 2ul);
        EXPECT_EQ(found[1].distance, 9ul);
        EXPECT_EQ(found[1].length, 2ul);
    }

    {
        // A hint in front of the valid range yields nothing.
        auto found = MUST(circular_buffer.find_copy_in_seekback(2, 1, Vector<size_t> { 0ul }));
        EXPECT_EQ(found.size(), 0ul);
    }

    {
        // A hint behind the valid range yields nothing.
        auto found = MUST(circular_buffer.find_copy_in_seekback(2, 1, Vector<size_t> { 12ul }));
        EXPECT_EQ(found.size(), 0ul);
    }

    {
        // A minimum length larger than the whole buffer yields nothing.
        auto found = MUST(circular_buffer.find_copy_in_seekback(12, 13));
        EXPECT_EQ(found.size(), 0ul);
    }
}
|
||||||
|
|
||||||
BENCHMARK_CASE(looping_copy_from_seekback)
|
BENCHMARK_CASE(looping_copy_from_seekback)
|
||||||
{
|
{
|
||||||
auto circular_buffer = MUST(CircularBuffer::create_empty(16 * MiB));
|
auto circular_buffer = MUST(CircularBuffer::create_empty(16 * MiB));
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue