LibCompress: Decode the LZMA match type in a separate function

This should keep the `read_some` function a bit flatter and shorter, and make it easier to compare the match type decoding process against the specification.
2025-07-27 06:27:45 +00:00 · 2023-05-01 15:51:04 +02:00 · 2023-05-01 15:51:04 +02:00 · 42514c6961
commit 42514c6961
parent 4a37bac374
2 changed files with 98 additions and 64 deletions
--- a/Userland/Libraries/LibCompress/Lzma.cpp
+++ b/Userland/Libraries/LibCompress/Lzma.cpp
@ -504,6 +504,56 @@ void LzmaState::update_state_after_short_rep()
        m_state = 11;
 }
 ErrorOr<LzmaDecompressor::MatchType> LzmaDecompressor::decode_match_type()
 {
    // Walks the specification's LITERAL/MATCH decision tree one decoded bit at a time and
    // reports which kind of symbol comes next. Bits are consumed in exactly this order,
    // stopping at the first line that yields a result:
    //
    //  IsMatch[state2]   0 -> Literal
    //  IsRep[state]      0 -> Simple Match
    //  IsRepG0[state]    0 -> IsRep0Long[state2]: 0 -> Short Rep Match, 1 -> Rep Match 0
    //  IsRepG1[state]    0 -> Rep Match 1
    //  IsRepG2[state]    0 -> Rep Match 2, 1 -> Rep Match 3

    // "state2" selects the entry of the "IsMatch" and "IsRep0Long" arrays. It combines the
    // current state with the low `position_bits` bits of the processed byte count.
    auto const position_mask = (1 << m_options.position_bits) - 1;
    u16 const low_position_bits = m_total_processed_bytes & position_mask;
    u16 const state2 = (m_state << maximum_number_of_position_bits) + low_position_bits;

    bool const is_match = TRY(decode_bit_with_probability(m_is_match_probabilities[state2])) != 0;
    if (!is_match)
        return MatchType::Literal;

    bool const is_rep = TRY(decode_bit_with_probability(m_is_rep_probabilities[m_state])) != 0;
    if (!is_rep)
        return MatchType::SimpleMatch;

    // From here on we know that this is some kind of rep match.
    bool const distance_is_rep0 = TRY(decode_bit_with_probability(m_is_rep_g0_probabilities[m_state])) == 0;
    if (distance_is_rep0) {
        // Only the rep0 case needs one more bit to tell the short form from the long form.
        bool const is_long = TRY(decode_bit_with_probability(m_is_rep0_long_probabilities[state2])) != 0;
        return is_long ? MatchType::RepMatch0 : MatchType::ShortRepMatch;
    }

    bool const distance_is_rep1 = TRY(decode_bit_with_probability(m_is_rep_g1_probabilities[m_state])) == 0;
    if (distance_is_rep1)
        return MatchType::RepMatch1;

    // The last bit decides between the two remaining history entries.
    bool const distance_is_rep2 = TRY(decode_bit_with_probability(m_is_rep_g2_probabilities[m_state])) == 0;
    return distance_is_rep2 ? MatchType::RepMatch2 : MatchType::RepMatch3;
 }
 ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
 {
    while (m_dictionary->used_space() < bytes.size() && m_dictionary->empty_space() != 0) {
@ -518,11 +568,6 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
            // Otherwise, we give it one last try to find the end marker in the remaining data.
        }
        // "The decoder calculates "state2" variable value to select exact variable from
        //  "IsMatch" and "IsRep0Long" arrays."
        u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
        u16 state2 = (m_state << maximum_number_of_position_bits) + position_state;
        auto copy_match_to_buffer = [&](u16 real_length) -> ErrorOr<void> {
            VERIFY(!m_leftover_match_length.has_value());
@ -546,16 +591,13 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
            continue;
        }
-        // "The decoder uses the following code flow scheme to select exact
+        auto const match_type = TRY(decode_match_type());
        //  type of LITERAL or MATCH:
        //
        //  IsMatch[state2] decode
        //   0 - the Literal"
        if (TRY(decode_bit_with_probability(m_is_match_probabilities[state2])) == 0) {
            // If we are already past the expected uncompressed size, we are already in "look for EOS only" mode.
            if (has_reached_expected_data_size())
                return Error::from_string_literal("Found literal after reaching expected uncompressed size");
        // If we are looking for EOS, but find another match type, the stream is also corrupted.
        if (has_reached_expected_data_size() && match_type != MatchType::SimpleMatch)
            return Error::from_string_literal("First match type after the expected uncompressed size is not a simple match");
        if (match_type == MatchType::Literal) {
            // "At first the LZMA decoder must check that it doesn't exceed
            //  specified uncompressed size."
            // This is already checked for at the beginning of the loop.
@ -568,10 +610,7 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
            continue;
        }
-        // " 1 - the Match
+        if (match_type == MatchType::SimpleMatch) {
        //     IsRep[state] decode
        //       0 - Simple Match"
        if (TRY(decode_bit_with_probability(m_is_rep_probabilities[m_state])) == 0) {
            // "The distance history table is updated with the following scheme:"
            m_rep3 = m_rep2;
            m_rep2 = m_rep1;
@ -620,58 +659,41 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
            continue;
        }
-        // If we are looking for EOS, but find another match type, the stream is also corrupted.
+        if (match_type == MatchType::ShortRepMatch) {
        if (has_reached_expected_data_size())
            return Error::from_string_literal("First match type after the expected uncompressed size is not a simple match");
        // "     1 - Rep Match
        //         IsRepG0[state] decode
        //           0 - the distance is rep0"
        if (TRY(decode_bit_with_probability(m_is_rep_g0_probabilities[m_state])) == 0) {
            // "LZMA doesn't update the distance history."
-            // "       IsRep0Long[state2] decode
+            // "If the subtype is "Short Rep Match", the decoder updates the state, puts
-            //           0 - Short Rep Match"
+            //  the one byte from window to current position in window and goes to next
-            if (TRY(decode_bit_with_probability(m_is_rep0_long_probabilities[state2])) == 0) {
+            //  MATCH/LITERAL symbol."
-                // "If the subtype is "Short Rep Match", the decoder updates the state, puts
+            update_state_after_short_rep();
                //  the one byte from window to current position in window and goes to next
                //  MATCH/LITERAL symbol."
                update_state_after_short_rep();
-                TRY(copy_match_to_buffer(1));
+            TRY(copy_match_to_buffer(1));
-                continue;
+            continue;
-            }
+        }
            // "         1 - Rep Match 0"
            // Intentional fallthrough, we just need to make sure to not run the detection for other match types and to not switch around the distance history.
        } else {
            // "     1 -
            //         IsRepG1[state] decode
            //           0 - Rep Match 1"
            if (TRY(decode_bit_with_probability(m_is_rep_g1_probabilities[m_state])) == 0) {
                u32 distance = m_rep1;
                m_rep1 = m_rep0;
                m_rep0 = distance;
            }
-            // "         1 -
+        // Note: We don't need to do anything specific for "Rep Match 0", we just need to make sure to not
-            //             IsRepG2[state] decode
+        //       run the detection for other match types and to not switch around the distance history.
            //               0 - Rep Match 2"
            else if (TRY(decode_bit_with_probability(m_is_rep_g2_probabilities[m_state])) == 0) {
                u32 distance = m_rep2;
                m_rep2 = m_rep1;
                m_rep1 = m_rep0;
                m_rep0 = distance;
            }
-            // "             1 - Rep Match 3"
+        if (match_type == MatchType::RepMatch1) {
-            else {
+            u32 distance = m_rep1;
-                u32 distance = m_rep3;
+            m_rep1 = m_rep0;
-                m_rep3 = m_rep2;
+            m_rep0 = distance;
-                m_rep2 = m_rep1;
+        }
-                m_rep1 = m_rep0;
+
-                m_rep0 = distance;
+        if (match_type == MatchType::RepMatch2) {
-            }
+            u32 distance = m_rep2;
            m_rep2 = m_rep1;
            m_rep1 = m_rep0;
            m_rep0 = distance;
        }
        if (match_type == MatchType::RepMatch3) {
            u32 distance = m_rep3;
            m_rep3 = m_rep2;
            m_rep2 = m_rep1;
            m_rep1 = m_rep0;
            m_rep0 = distance;
        }
        // "In other cases (Rep Match 0/1/2/3), it decodes the zero-based
--- a/Userland/Libraries/LibCompress/Lzma.h
+++ b/Userland/Libraries/LibCompress/Lzma.h
@ -118,6 +118,16 @@ protected:
    Array<Probability, number_of_states> m_is_rep_g1_probabilities;
    Array<Probability, number_of_states> m_is_rep_g2_probabilities;
    Array<Probability, (number_of_states << maximum_number_of_position_bits)> m_is_rep0_long_probabilities;
    // The kinds of symbol that decode_match_type() can report. The per-enumerator notes
    // describe the sequence of decoded bits that leads to each value.
    enum MatchType {
        // IsMatch bit is 0.
        Literal,
        // IsMatch bit is 1, IsRep bit is 0.
        SimpleMatch,
        // IsRep bit is 1, IsRepG0 bit is 0, IsRep0Long bit is 1.
        RepMatch0,
        // IsRep bit is 1, IsRepG0 bit is 0, IsRep0Long bit is 0.
        ShortRepMatch,
        // IsRepG0 bit is 1, IsRepG1 bit is 0.
        RepMatch1,
        // IsRepG0 and IsRepG1 bits are 1, IsRepG2 bit is 0.
        RepMatch2,
        // IsRepG0, IsRepG1 and IsRepG2 bits are all 1.
        RepMatch3,
    };
 };
 class LzmaDecompressor : public Stream
@ -159,6 +169,8 @@ private:
    ErrorOr<u8> decode_direct_bit();
    ErrorOr<u8> decode_bit_with_probability(Probability& probability);
    // Decodes successive probability-modelled bits to determine which MatchType the next symbol is.
    // Propagates any error raised while decoding a bit.
    ErrorOr<MatchType> decode_match_type();
    // Decodes a multi-bit symbol using a given probability tree (either in normal or in reverse order).
    // The specification states that "unsigned" is at least 16 bits in size, our implementation assumes this as the maximum symbol size.
    ErrorOr<u16> decode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree);