LibCompress: Extract the LZMA state to a separate class

We will also need this in the compressor, as it needs to do the exact same calculations in reverse.
2025-09-13 08:07:58 +00:00 · 2023-05-01 13:01:06 +02:00 · 2023-05-01 13:01:06 +02:00 · 440d8f908f
commit 440d8f908f
parent 221b91ff61
2 changed files with 134 additions and 110 deletions
--- a/Userland/Libraries/LibCompress/Lzma.cpp
+++ b/Userland/Libraries/LibCompress/Lzma.cpp
@ -85,7 +85,7 @@ ErrorOr<LzmaDecompressorOptions> LzmaHeader::as_decompressor_options() const
    };
 }

-void LzmaDecompressor::initialize_to_default_probability(Span<Probability> span)
+void LzmaState::initialize_to_default_probability(Span<Probability> span)
 {
    for (auto& entry : span)
        entry = default_probability;
@ -117,11 +117,8 @@ ErrorOr<NonnullOwnPtr<LzmaDecompressor>> LzmaDecompressor::create_from_raw_strea
    return decompressor;
 }

-LzmaDecompressor::LzmaDecompressor(MaybeOwned<Stream> stream, LzmaDecompressorOptions options, MaybeOwned<CircularBuffer> dictionary, FixedArray<Probability> literal_probabilities)
-    : m_stream(move(stream))
-    , m_options(move(options))
-    , m_dictionary(move(dictionary))
-    , m_literal_probabilities(move(literal_probabilities))
+LzmaState::LzmaState(FixedArray<Probability> literal_probabilities)
+    : m_literal_probabilities(move(literal_probabilities))
 {
    initialize_to_default_probability(m_literal_probabilities.span());

@ -141,6 +138,14 @@ LzmaDecompressor::LzmaDecompressor(MaybeOwned<Stream> stream, LzmaDecompressorOp
    initialize_to_default_probability(m_is_rep0_long_probabilities);
 }

+LzmaDecompressor::LzmaDecompressor(MaybeOwned<Stream> stream, LzmaDecompressorOptions options, MaybeOwned<CircularBuffer> dictionary, FixedArray<Probability> literal_probabilities)
+    : LzmaState(move(literal_probabilities))
+    , m_stream(move(stream))
+    , m_options(move(options))
+    , m_dictionary(move(dictionary))
+{
+}
+
 bool LzmaDecompressor::is_range_decoder_in_clean_state() const
 {
    return m_range_decoder_code == 0;
@ -151,7 +156,7 @@ bool LzmaDecompressor::has_reached_expected_data_size() const
    if (!m_options.uncompressed_size.has_value())
        return false;

-    return m_total_decoded_bytes >= m_options.uncompressed_size.value();
+    return m_total_processed_bytes >= m_options.uncompressed_size.value();
 }

 ErrorOr<void> LzmaDecompressor::initialize_range_decoder()
@ -303,7 +308,7 @@ ErrorOr<void> LzmaDecompressor::decode_literal_to_output_buffer()
    // "To select the table for decoding it uses the context that consists of
    //  (lc) high bits from previous literal and (lp) low bits from value that
    //  represents current position in outputStream."
-    u16 literal_state_bits_from_position = m_total_decoded_bytes & ((1 << m_options.literal_position_bits) - 1);
+    u16 literal_state_bits_from_position = m_total_processed_bytes & ((1 << m_options.literal_position_bits) - 1);
    u16 literal_state_bits_from_output = previous_byte >> (8 - m_options.literal_context_bits);
    u16 literal_state = literal_state_bits_from_position << m_options.literal_context_bits | literal_state_bits_from_output;

@ -343,12 +348,12 @@ ErrorOr<void> LzmaDecompressor::decode_literal_to_output_buffer()

    size_t written_bytes = m_dictionary->write({ &actual_result, sizeof(actual_result) });
    VERIFY(written_bytes == sizeof(actual_result));
-    m_total_decoded_bytes += sizeof(actual_result);
+    m_total_processed_bytes += sizeof(actual_result);

    return {};
 }

-LzmaDecompressor::LzmaLengthDecoderState::LzmaLengthDecoderState()
+LzmaState::LzmaLengthCoderState::LzmaLengthCoderState()
 {
    for (auto& array : m_low_length_probabilities)
        initialize_to_default_probability(array);
@ -359,11 +364,11 @@ LzmaDecompressor::LzmaLengthDecoderState::LzmaLengthDecoderState()
    initialize_to_default_probability(m_high_length_probabilities);
 }

-ErrorOr<u16> LzmaDecompressor::decode_normalized_match_length(LzmaLengthDecoderState& length_decoder_state)
+ErrorOr<u16> LzmaDecompressor::decode_normalized_match_length(LzmaLengthCoderState& length_decoder_state)
 {
    // "LZMA uses "posState" value as context to select the binary tree
    //  from LowCoder and MidCoder binary tree arrays:"
-    u16 position_state = m_total_decoded_bytes & ((1 << m_options.position_bits) - 1);
+    u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);

    // "The following scheme is used for the match length encoding:
    //
@ -455,7 +460,7 @@ ErrorOr<u32> LzmaDecompressor::decode_normalized_match_distance(u16 normalized_m
    return (distance_prefix << number_of_alignment_bits) | TRY(decode_symbol_using_reverse_bit_tree(number_of_alignment_bits, m_alignment_bit_probabilities));
 }

-u32 LzmaDecompressor::current_repetition_offset() const
+u32 LzmaState::current_repetition_offset() const
 {
    // LZMA never needs to read at offset 0 (i.e. the actual read head of the buffer).
    // Instead, the values are remapped so that the rep-value n starts reading n + 1 bytes back.
@ -465,6 +470,40 @@ u32 LzmaDecompressor::current_repetition_offset() const
    return m_rep0 + 1;
 }

+void LzmaState::update_state_after_literal()
+{
+    if (m_state < 4)
+        m_state = 0;
+    else if (m_state < 10)
+        m_state -= 3;
+    else
+        m_state -= 6;
+}
+
+void LzmaState::update_state_after_match()
+{
+    if (m_state < 7)
+        m_state = 7;
+    else
+        m_state = 10;
+};
+
+void LzmaState::update_state_after_rep()
+{
+    if (m_state < 7)
+        m_state = 8;
+    else
+        m_state = 11;
+}
+
+void LzmaState::update_state_after_short_rep()
+{
+    if (m_state < 7)
+        m_state = 9;
+    else
+        m_state = 11;
+}
+
 ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
 {
    while (m_dictionary->used_space() < bytes.size() && m_dictionary->empty_space() != 0) {
@ -481,48 +520,18 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)

        // "The decoder calculates "state2" variable value to select exact variable from
        //  "IsMatch" and "IsRep0Long" arrays."
-        u16 position_state = m_total_decoded_bytes & ((1 << m_options.position_bits) - 1);
+        u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
        u16 state2 = (m_state << maximum_number_of_position_bits) + position_state;

-        auto update_state_after_literal = [&] {
-            if (m_state < 4)
-                m_state = 0;
-            else if (m_state < 10)
-                m_state -= 3;
-            else
-                m_state -= 6;
-        };
-
-        auto update_state_after_match = [&] {
-            if (m_state < 7)
-                m_state = 7;
-            else
-                m_state = 10;
-        };
-
-        auto update_state_after_rep = [&] {
-            if (m_state < 7)
-                m_state = 8;
-            else
-                m_state = 11;
-        };
-
-        auto update_state_after_short_rep = [&] {
-            if (m_state < 7)
-                m_state = 9;
-            else
-                m_state = 11;
-        };
-
        auto copy_match_to_buffer = [&](u16 real_length) -> ErrorOr<void> {
            VERIFY(!m_leftover_match_length.has_value());

-            if (m_options.uncompressed_size.has_value() && m_options.uncompressed_size.value() < m_total_decoded_bytes + real_length)
+            if (m_options.uncompressed_size.has_value() && m_options.uncompressed_size.value() < m_total_processed_bytes + real_length)
                return Error::from_string_literal("Tried to copy match beyond expected uncompressed file size");

            auto copied_length = TRY(m_dictionary->copy_from_seekback(current_repetition_offset(), real_length));

-            m_total_decoded_bytes += copied_length;
+            m_total_processed_bytes += copied_length;
            real_length -= copied_length;

            if (real_length > 0)
@ -569,7 +578,7 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
            m_rep1 = m_rep0;

            // "The zero-based length is decoded with "LenDecoder"."
-            u16 normalized_length = TRY(decode_normalized_match_length(m_length_decoder));
+            u16 normalized_length = TRY(decode_normalized_match_length(m_length_coder));

            // "The state is update with UpdateState_Match function."
            update_state_after_match();
@ -667,7 +676,7 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)

        // "In other cases (Rep Match 0/1/2/3), it decodes the zero-based
        //  length of match with "RepLenDecoder" decoder."
-        u16 normalized_length = TRY(decode_normalized_match_length(m_rep_length_decoder));
+        u16 normalized_length = TRY(decode_normalized_match_length(m_rep_length_coder));

        // "Then it updates the state."
        update_state_after_rep();
@ -678,7 +687,7 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
    }

    if (m_found_end_of_stream_marker || has_reached_expected_data_size()) {
-        if (m_options.uncompressed_size.has_value() && m_total_decoded_bytes < m_options.uncompressed_size.value())
+        if (m_options.uncompressed_size.has_value() && m_total_processed_bytes < m_options.uncompressed_size.value())
            return Error::from_string_literal("Found end-of-stream marker earlier than expected");

        if (!is_range_decoder_in_clean_state())
--- a/Userland/Libraries/LibCompress/Lzma.h
+++ b/Userland/Libraries/LibCompress/Lzma.h
@ -48,7 +48,81 @@ private:
 };
 static_assert(sizeof(LzmaHeader) == 13);

-class LzmaDecompressor : public Stream {
+class LzmaState {
+protected:
+    // LZMA uses 11-bit probability counters, but they are usually stored in 16-bit variables.
+    // Therefore, we can model probabilities with a resolution of up to 1 / 2^11 (which is equal to 1 / 2048).
+    // The default probability for most counters is 0.5.
+    using Probability = u16;
+    static constexpr size_t probability_bit_count = 11;
+    static constexpr Probability default_probability = (1 << probability_bit_count) / 2;
+    static void initialize_to_default_probability(Span<Probability>);
+
+    LzmaState(FixedArray<Probability> literal_probabilities);
+
+    u64 m_total_processed_bytes { 0 };
+
+    static constexpr size_t literal_probability_table_size = 0x300;
+    FixedArray<Probability> m_literal_probabilities;
+
+    struct LzmaLengthCoderState {
+    public:
+        LzmaLengthCoderState();
+
+        Probability m_first_choice_probability { default_probability };
+        Probability m_second_choice_probability { default_probability };
+
+        static constexpr size_t maximum_number_of_position_bits = 4;
+        Array<Array<Probability, (1 << 3)>, (1 << maximum_number_of_position_bits)> m_low_length_probabilities;
+        Array<Array<Probability, (1 << 3)>, (1 << maximum_number_of_position_bits)> m_medium_length_probabilities;
+        Array<Probability, (1 << 8)> m_high_length_probabilities;
+    };
+
+    LzmaLengthCoderState m_length_coder;
+    LzmaLengthCoderState m_rep_length_coder;
+
+    static constexpr u16 normalized_to_real_match_length_offset = 2;
+    static constexpr u32 normalized_to_real_match_distance_offset = 1;
+
+    static constexpr size_t number_of_length_to_position_states = 4;
+    Array<Array<Probability, (1 << 6)>, number_of_length_to_position_states> m_length_to_position_states;
+
+    static constexpr size_t first_position_slot_with_binary_tree_bits = 4;
+    static constexpr size_t first_position_slot_with_direct_encoded_bits = 14;
+
+    // This is a bit wasteful on memory and not in the specification, but it makes the math easier.
+    static constexpr size_t number_of_binary_tree_distance_slots = first_position_slot_with_direct_encoded_bits - first_position_slot_with_binary_tree_bits;
+    static constexpr size_t largest_number_of_binary_tree_distance_bits = 5;
+    Array<Array<Probability, (1 << largest_number_of_binary_tree_distance_bits)>, number_of_binary_tree_distance_slots> m_binary_tree_distance_probabilities;
+
+    static constexpr size_t number_of_alignment_bits = 4;
+    Array<Probability, (1 << number_of_alignment_bits)> m_alignment_bit_probabilities;
+
+    // LZ state tracking.
+    u16 m_state { 0 };
+    u32 m_rep0 { 0 };
+    u32 m_rep1 { 0 };
+    u32 m_rep2 { 0 };
+    u32 m_rep3 { 0 };
+    u32 current_repetition_offset() const;
+
+    void update_state_after_literal();
+    void update_state_after_match();
+    void update_state_after_rep();
+    void update_state_after_short_rep();
+
+    static constexpr size_t maximum_number_of_position_bits = 4;
+    static constexpr size_t number_of_states = 12;
+    Array<Probability, (number_of_states << maximum_number_of_position_bits)> m_is_match_probabilities;
+    Array<Probability, number_of_states> m_is_rep_probabilities;
+    Array<Probability, number_of_states> m_is_rep_g0_probabilities;
+    Array<Probability, number_of_states> m_is_rep_g1_probabilities;
+    Array<Probability, number_of_states> m_is_rep_g2_probabilities;
+    Array<Probability, (number_of_states << maximum_number_of_position_bits)> m_is_rep0_long_probabilities;
+};
+
+class LzmaDecompressor : public Stream
+    , LzmaState {
 public:
    /// Creates a decompressor from a standalone LZMA container (.lzma file extension, occasionally known as an LZMA 'archive').
    static ErrorOr<NonnullOwnPtr<LzmaDecompressor>> create_from_container(MaybeOwned<Stream>, Optional<MaybeOwned<CircularBuffer>> dictionary = {});
@ -65,14 +139,6 @@ public:
    virtual void close() override;

 private:
-    // LZMA uses 11-bit probability counters, but they are usually stored in 16-bit variables.
-    // Therefore, we can model probabilities with a resolution of up to 1 / 2^11 (which is equal to 1 / 2048).
-    // The default probability for most counters is 0.5.
-    using Probability = u16;
-    static constexpr size_t probability_bit_count = 11;
-    static constexpr Probability default_probability = (1 << probability_bit_count) / 2;
-    static void initialize_to_default_probability(Span<Probability>);
-
    LzmaDecompressor(MaybeOwned<Stream>, LzmaDecompressorOptions, MaybeOwned<CircularBuffer>, FixedArray<Probability> literal_probabilities);

    MaybeOwned<Stream> m_stream;
@ -80,7 +146,6 @@ private:

    // This doubles as an output buffer, since we have to write all of our results into this anyways.
    MaybeOwned<CircularBuffer> m_dictionary;
-    u64 m_total_decoded_bytes { 0 };
    bool m_found_end_of_stream_marker { false };
    bool is_range_decoder_in_clean_state() const;
    bool has_reached_expected_data_size() const;
@ -101,62 +166,12 @@ private:
    ErrorOr<u16> decode_symbol_using_reverse_bit_tree(size_t bit_count, Span<Probability> probability_tree);

    ErrorOr<void> decode_literal_to_output_buffer();
-    static constexpr size_t literal_probability_table_size = 0x300;
-    FixedArray<Probability> m_literal_probabilities;

-    struct LzmaLengthDecoderState {
-    public:
-        LzmaLengthDecoderState();
-
-        Probability m_first_choice_probability { default_probability };
-        Probability m_second_choice_probability { default_probability };
-
-        static constexpr size_t maximum_number_of_position_bits = 4;
-        Array<Array<Probability, (1 << 3)>, (1 << maximum_number_of_position_bits)> m_low_length_probabilities;
-        Array<Array<Probability, (1 << 3)>, (1 << maximum_number_of_position_bits)> m_medium_length_probabilities;
-        Array<Probability, (1 << 8)> m_high_length_probabilities;
-    };
-
-    LzmaLengthDecoderState m_length_decoder;
-    LzmaLengthDecoderState m_rep_length_decoder;
-    static constexpr u16 normalized_to_real_match_length_offset = 2;
-    ErrorOr<u16> decode_normalized_match_length(LzmaLengthDecoderState&);
-
-    static constexpr size_t number_of_length_to_position_states = 4;
-    Array<Array<Probability, (1 << 6)>, number_of_length_to_position_states> m_length_to_position_states;
-
-    static constexpr size_t first_position_slot_with_binary_tree_bits = 4;
-    static constexpr size_t first_position_slot_with_direct_encoded_bits = 14;
-
-    // This is a bit wasteful on memory and not in the specification, but it makes the math easier.
-    static constexpr size_t number_of_binary_tree_distance_slots = first_position_slot_with_direct_encoded_bits - first_position_slot_with_binary_tree_bits;
-    static constexpr size_t largest_number_of_binary_tree_distance_bits = 5;
-    Array<Array<Probability, (1 << largest_number_of_binary_tree_distance_bits)>, number_of_binary_tree_distance_slots> m_binary_tree_distance_probabilities;
-
-    static constexpr size_t number_of_alignment_bits = 4;
-    Array<Probability, (1 << number_of_alignment_bits)> m_alignment_bit_probabilities;
+    ErrorOr<u16> decode_normalized_match_length(LzmaLengthCoderState&);

    // This deviates from the specification, which states that "unsigned" is at least 16-bit.
    // However, the match distance needs to be at least 32-bit, at the very least to hold the 0xFFFFFFFF end marker value.
-    static constexpr u32 normalized_to_real_match_distance_offset = 1;
    ErrorOr<u32> decode_normalized_match_distance(u16 normalized_match_length);
-
-    // LZ state tracking.
-    u16 m_state { 0 };
-    u32 m_rep0 { 0 };
-    u32 m_rep1 { 0 };
-    u32 m_rep2 { 0 };
-    u32 m_rep3 { 0 };
-    u32 current_repetition_offset() const;
-
-    static constexpr size_t maximum_number_of_position_bits = 4;
-    static constexpr size_t number_of_states = 12;
-    Array<Probability, (number_of_states << maximum_number_of_position_bits)> m_is_match_probabilities;
-    Array<Probability, number_of_states> m_is_rep_probabilities;
-    Array<Probability, number_of_states> m_is_rep_g0_probabilities;
-    Array<Probability, number_of_states> m_is_rep_g1_probabilities;
-    Array<Probability, number_of_states> m_is_rep_g2_probabilities;
-    Array<Probability, (number_of_states << maximum_number_of_position_bits)> m_is_rep0_long_probabilities;
 };

 }