From 440d8f908ff1cd6eb315e6d14274ac500ccaa38d Mon Sep 17 00:00:00 2001 From: Tim Schumacher Date: Mon, 1 May 2023 13:01:06 +0200 Subject: [PATCH] LibCompress: Extract the LZMA state to a separate class We will also need this in the compressor, as it needs to do the exact same calculations in reverse. --- Userland/Libraries/LibCompress/Lzma.cpp | 107 +++++++++--------- Userland/Libraries/LibCompress/Lzma.h | 137 +++++++++++++----------- 2 files changed, 134 insertions(+), 110 deletions(-) diff --git a/Userland/Libraries/LibCompress/Lzma.cpp b/Userland/Libraries/LibCompress/Lzma.cpp index e2a160df88..a95fe3c96b 100644 --- a/Userland/Libraries/LibCompress/Lzma.cpp +++ b/Userland/Libraries/LibCompress/Lzma.cpp @@ -85,7 +85,7 @@ ErrorOr LzmaHeader::as_decompressor_options() const }; } -void LzmaDecompressor::initialize_to_default_probability(Span span) +void LzmaState::initialize_to_default_probability(Span span) { for (auto& entry : span) entry = default_probability; @@ -117,11 +117,8 @@ ErrorOr> LzmaDecompressor::create_from_raw_strea return decompressor; } -LzmaDecompressor::LzmaDecompressor(MaybeOwned stream, LzmaDecompressorOptions options, MaybeOwned dictionary, FixedArray literal_probabilities) - : m_stream(move(stream)) - , m_options(move(options)) - , m_dictionary(move(dictionary)) - , m_literal_probabilities(move(literal_probabilities)) +LzmaState::LzmaState(FixedArray literal_probabilities) + : m_literal_probabilities(move(literal_probabilities)) { initialize_to_default_probability(m_literal_probabilities.span()); @@ -141,6 +138,14 @@ LzmaDecompressor::LzmaDecompressor(MaybeOwned stream, LzmaDecompressorOp initialize_to_default_probability(m_is_rep0_long_probabilities); } +LzmaDecompressor::LzmaDecompressor(MaybeOwned stream, LzmaDecompressorOptions options, MaybeOwned dictionary, FixedArray literal_probabilities) + : LzmaState(move(literal_probabilities)) + , m_stream(move(stream)) + , m_options(move(options)) + , m_dictionary(move(dictionary)) +{ +} + bool 
LzmaDecompressor::is_range_decoder_in_clean_state() const { return m_range_decoder_code == 0; @@ -151,7 +156,7 @@ bool LzmaDecompressor::has_reached_expected_data_size() const if (!m_options.uncompressed_size.has_value()) return false; - return m_total_decoded_bytes >= m_options.uncompressed_size.value(); + return m_total_processed_bytes >= m_options.uncompressed_size.value(); } ErrorOr LzmaDecompressor::initialize_range_decoder() @@ -303,7 +308,7 @@ ErrorOr LzmaDecompressor::decode_literal_to_output_buffer() // "To select the table for decoding it uses the context that consists of // (lc) high bits from previous literal and (lp) low bits from value that // represents current position in outputStream." - u16 literal_state_bits_from_position = m_total_decoded_bytes & ((1 << m_options.literal_position_bits) - 1); + u16 literal_state_bits_from_position = m_total_processed_bytes & ((1 << m_options.literal_position_bits) - 1); u16 literal_state_bits_from_output = previous_byte >> (8 - m_options.literal_context_bits); u16 literal_state = literal_state_bits_from_position << m_options.literal_context_bits | literal_state_bits_from_output; @@ -343,12 +348,12 @@ ErrorOr LzmaDecompressor::decode_literal_to_output_buffer() size_t written_bytes = m_dictionary->write({ &actual_result, sizeof(actual_result) }); VERIFY(written_bytes == sizeof(actual_result)); - m_total_decoded_bytes += sizeof(actual_result); + m_total_processed_bytes += sizeof(actual_result); return {}; } -LzmaDecompressor::LzmaLengthDecoderState::LzmaLengthDecoderState() +LzmaState::LzmaLengthCoderState::LzmaLengthCoderState() { for (auto& array : m_low_length_probabilities) initialize_to_default_probability(array); @@ -359,11 +364,11 @@ LzmaDecompressor::LzmaLengthDecoderState::LzmaLengthDecoderState() initialize_to_default_probability(m_high_length_probabilities); } -ErrorOr LzmaDecompressor::decode_normalized_match_length(LzmaLengthDecoderState& length_decoder_state) +ErrorOr 
LzmaDecompressor::decode_normalized_match_length(LzmaLengthCoderState& length_decoder_state) { // "LZMA uses "posState" value as context to select the binary tree // from LowCoder and MidCoder binary tree arrays:" - u16 position_state = m_total_decoded_bytes & ((1 << m_options.position_bits) - 1); + u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1); // "The following scheme is used for the match length encoding: // @@ -455,7 +460,7 @@ ErrorOr LzmaDecompressor::decode_normalized_match_distance(u16 normalized_m return (distance_prefix << number_of_alignment_bits) | TRY(decode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities)); } -u32 LzmaDecompressor::current_repetition_offset() const +u32 LzmaState::current_repetition_offset() const { // LZMA never needs to read at offset 0 (i.e. the actual read head of the buffer). // Instead, the values are remapped so that the rep-value n starts reading n + 1 bytes back. @@ -465,6 +470,40 @@ u32 LzmaDecompressor::current_repetition_offset() const return m_rep0 + 1; } +void LzmaState::update_state_after_literal() +{ + if (m_state < 4) + m_state = 0; + else if (m_state < 10) + m_state -= 3; + else + m_state -= 6; +} + +void LzmaState::update_state_after_match() +{ + if (m_state < 7) + m_state = 7; + else + m_state = 10; +} + +void LzmaState::update_state_after_rep() +{ + if (m_state < 7) + m_state = 8; + else + m_state = 11; +} + +void LzmaState::update_state_after_short_rep() +{ + if (m_state < 7) + m_state = 9; + else + m_state = 11; +} + ErrorOr LzmaDecompressor::read_some(Bytes bytes) { while (m_dictionary->used_space() < bytes.size() && m_dictionary->empty_space() != 0) { @@ -481,48 +520,18 @@ ErrorOr LzmaDecompressor::read_some(Bytes bytes) // "The decoder calculates "state2" variable value to select exact variable from // "IsMatch" and "IsRep0Long" arrays."
- u16 position_state = m_total_decoded_bytes & ((1 << m_options.position_bits) - 1); + u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1); u16 state2 = (m_state << maximum_number_of_position_bits) + position_state; - auto update_state_after_literal = [&] { - if (m_state < 4) - m_state = 0; - else if (m_state < 10) - m_state -= 3; - else - m_state -= 6; - }; - - auto update_state_after_match = [&] { - if (m_state < 7) - m_state = 7; - else - m_state = 10; - }; - - auto update_state_after_rep = [&] { - if (m_state < 7) - m_state = 8; - else - m_state = 11; - }; - - auto update_state_after_short_rep = [&] { - if (m_state < 7) - m_state = 9; - else - m_state = 11; - }; - auto copy_match_to_buffer = [&](u16 real_length) -> ErrorOr { VERIFY(!m_leftover_match_length.has_value()); - if (m_options.uncompressed_size.has_value() && m_options.uncompressed_size.value() < m_total_decoded_bytes + real_length) + if (m_options.uncompressed_size.has_value() && m_options.uncompressed_size.value() < m_total_processed_bytes + real_length) return Error::from_string_literal("Tried to copy match beyond expected uncompressed file size"); auto copied_length = TRY(m_dictionary->copy_from_seekback(current_repetition_offset(), real_length)); - m_total_decoded_bytes += copied_length; + m_total_processed_bytes += copied_length; real_length -= copied_length; if (real_length > 0) @@ -569,7 +578,7 @@ ErrorOr LzmaDecompressor::read_some(Bytes bytes) m_rep1 = m_rep0; // "The zero-based length is decoded with "LenDecoder"." - u16 normalized_length = TRY(decode_normalized_match_length(m_length_decoder)); + u16 normalized_length = TRY(decode_normalized_match_length(m_length_coder)); // "The state is update with UpdateState_Match function." update_state_after_match(); @@ -667,7 +676,7 @@ ErrorOr LzmaDecompressor::read_some(Bytes bytes) // "In other cases (Rep Match 0/1/2/3), it decodes the zero-based // length of match with "RepLenDecoder" decoder." 
- u16 normalized_length = TRY(decode_normalized_match_length(m_rep_length_decoder)); + u16 normalized_length = TRY(decode_normalized_match_length(m_rep_length_coder)); // "Then it updates the state." update_state_after_rep(); @@ -678,7 +687,7 @@ ErrorOr LzmaDecompressor::read_some(Bytes bytes) } if (m_found_end_of_stream_marker || has_reached_expected_data_size()) { - if (m_options.uncompressed_size.has_value() && m_total_decoded_bytes < m_options.uncompressed_size.value()) + if (m_options.uncompressed_size.has_value() && m_total_processed_bytes < m_options.uncompressed_size.value()) return Error::from_string_literal("Found end-of-stream marker earlier than expected"); if (!is_range_decoder_in_clean_state()) diff --git a/Userland/Libraries/LibCompress/Lzma.h b/Userland/Libraries/LibCompress/Lzma.h index af9426a38c..7f71040005 100644 --- a/Userland/Libraries/LibCompress/Lzma.h +++ b/Userland/Libraries/LibCompress/Lzma.h @@ -48,7 +48,81 @@ private: }; static_assert(sizeof(LzmaHeader) == 13); -class LzmaDecompressor : public Stream { +class LzmaState { +protected: + // LZMA uses 11-bit probability counters, but they are usually stored in 16-bit variables. + // Therefore, we can model probabilities with a resolution of up to 1 / 2^11 (which is equal to 1 / 2048). + // The default probability for most counters is 0.5. 
+ using Probability = u16; + static constexpr size_t probability_bit_count = 11; + static constexpr Probability default_probability = (1 << probability_bit_count) / 2; + static void initialize_to_default_probability(Span); + + LzmaState(FixedArray literal_probabilities); + + u64 m_total_processed_bytes { 0 }; + + static constexpr size_t literal_probability_table_size = 0x300; + FixedArray m_literal_probabilities; + + struct LzmaLengthCoderState { + public: + LzmaLengthCoderState(); + + Probability m_first_choice_probability { default_probability }; + Probability m_second_choice_probability { default_probability }; + + static constexpr size_t maximum_number_of_position_bits = 4; + Array, (1 << maximum_number_of_position_bits)> m_low_length_probabilities; + Array, (1 << maximum_number_of_position_bits)> m_medium_length_probabilities; + Array m_high_length_probabilities; + }; + + LzmaLengthCoderState m_length_coder; + LzmaLengthCoderState m_rep_length_coder; + + static constexpr u16 normalized_to_real_match_length_offset = 2; + static constexpr u32 normalized_to_real_match_distance_offset = 1; + + static constexpr size_t number_of_length_to_position_states = 4; + Array, number_of_length_to_position_states> m_length_to_position_states; + + static constexpr size_t first_position_slot_with_binary_tree_bits = 4; + static constexpr size_t first_position_slot_with_direct_encoded_bits = 14; + + // This is a bit wasteful on memory and not in the specification, but it makes the math easier. + static constexpr size_t number_of_binary_tree_distance_slots = first_position_slot_with_direct_encoded_bits - first_position_slot_with_binary_tree_bits; + static constexpr size_t largest_number_of_binary_tree_distance_bits = 5; + Array, number_of_binary_tree_distance_slots> m_binary_tree_distance_probabilities; + + static constexpr size_t number_of_alignment_bits = 4; + Array m_alignment_bit_probabilities; + + // LZ state tracking. 
+ u16 m_state { 0 }; + u32 m_rep0 { 0 }; + u32 m_rep1 { 0 }; + u32 m_rep2 { 0 }; + u32 m_rep3 { 0 }; + u32 current_repetition_offset() const; + + void update_state_after_literal(); + void update_state_after_match(); + void update_state_after_rep(); + void update_state_after_short_rep(); + + static constexpr size_t maximum_number_of_position_bits = 4; + static constexpr size_t number_of_states = 12; + Array m_is_match_probabilities; + Array m_is_rep_probabilities; + Array m_is_rep_g0_probabilities; + Array m_is_rep_g1_probabilities; + Array m_is_rep_g2_probabilities; + Array m_is_rep0_long_probabilities; +}; + +class LzmaDecompressor : public Stream + , LzmaState { public: /// Creates a decompressor from a standalone LZMA container (.lzma file extension, occasionally known as an LZMA 'archive'). static ErrorOr> create_from_container(MaybeOwned, Optional> dictionary = {}); @@ -65,14 +139,6 @@ public: virtual void close() override; private: - // LZMA uses 11-bit probability counters, but they are usually stored in 16-bit variables. - // Therefore, we can model probabilities with a resolution of up to 1 / 2^11 (which is equal to 1 / 2048). - // The default probability for most counters is 0.5. - using Probability = u16; - static constexpr size_t probability_bit_count = 11; - static constexpr Probability default_probability = (1 << probability_bit_count) / 2; - static void initialize_to_default_probability(Span); - LzmaDecompressor(MaybeOwned, LzmaDecompressorOptions, MaybeOwned, FixedArray literal_probabilities); MaybeOwned m_stream; @@ -80,7 +146,6 @@ private: // This doubles as an output buffer, since we have to write all of our results into this anyways. 
MaybeOwned m_dictionary; - u64 m_total_decoded_bytes { 0 }; bool m_found_end_of_stream_marker { false }; bool is_range_decoder_in_clean_state() const; bool has_reached_expected_data_size() const; @@ -101,62 +166,12 @@ private: ErrorOr decode_symbol_using_reverse_bit_tree(size_t bit_count, Span probability_tree); ErrorOr decode_literal_to_output_buffer(); - static constexpr size_t literal_probability_table_size = 0x300; - FixedArray m_literal_probabilities; - struct LzmaLengthDecoderState { - public: - LzmaLengthDecoderState(); - - Probability m_first_choice_probability { default_probability }; - Probability m_second_choice_probability { default_probability }; - - static constexpr size_t maximum_number_of_position_bits = 4; - Array, (1 << maximum_number_of_position_bits)> m_low_length_probabilities; - Array, (1 << maximum_number_of_position_bits)> m_medium_length_probabilities; - Array m_high_length_probabilities; - }; - - LzmaLengthDecoderState m_length_decoder; - LzmaLengthDecoderState m_rep_length_decoder; - static constexpr u16 normalized_to_real_match_length_offset = 2; - ErrorOr decode_normalized_match_length(LzmaLengthDecoderState&); - - static constexpr size_t number_of_length_to_position_states = 4; - Array, number_of_length_to_position_states> m_length_to_position_states; - - static constexpr size_t first_position_slot_with_binary_tree_bits = 4; - static constexpr size_t first_position_slot_with_direct_encoded_bits = 14; - - // This is a bit wasteful on memory and not in the specification, but it makes the math easier. 
- static constexpr size_t number_of_binary_tree_distance_slots = first_position_slot_with_direct_encoded_bits - first_position_slot_with_binary_tree_bits; - static constexpr size_t largest_number_of_binary_tree_distance_bits = 5; - Array, number_of_binary_tree_distance_slots> m_binary_tree_distance_probabilities; - - static constexpr size_t number_of_alignment_bits = 4; - Array m_alignment_bit_probabilities; + ErrorOr decode_normalized_match_length(LzmaLengthCoderState&); // This deviates from the specification, which states that "unsigned" is at least 16-bit. // However, the match distance needs to be at least 32-bit, at the very least to hold the 0xFFFFFFFF end marker value. - static constexpr u32 normalized_to_real_match_distance_offset = 1; ErrorOr decode_normalized_match_distance(u16 normalized_match_length); - - // LZ state tracking. - u16 m_state { 0 }; - u32 m_rep0 { 0 }; - u32 m_rep1 { 0 }; - u32 m_rep2 { 0 }; - u32 m_rep3 { 0 }; - u32 current_repetition_offset() const; - - static constexpr size_t maximum_number_of_position_bits = 4; - static constexpr size_t number_of_states = 12; - Array m_is_match_probabilities; - Array m_is_rep_probabilities; - Array m_is_rep_g0_probabilities; - Array m_is_rep_g1_probabilities; - Array m_is_rep_g2_probabilities; - Array m_is_rep0_long_probabilities; }; }