From 42514c69611b983ee5b4ac0552a032673907339a Mon Sep 17 00:00:00 2001 From: Tim Schumacher Date: Mon, 1 May 2023 15:51:04 +0200 Subject: [PATCH] LibCompress: Decode the LZMA match type in a separate function This should keep the `read_some` function a bit flatter and shorter, and make it easier to match the match type decoding process with the specification. --- Userland/Libraries/LibCompress/Lzma.cpp | 150 ++++++++++++++---------- Userland/Libraries/LibCompress/Lzma.h | 12 ++ 2 files changed, 98 insertions(+), 64 deletions(-) diff --git a/Userland/Libraries/LibCompress/Lzma.cpp b/Userland/Libraries/LibCompress/Lzma.cpp index 1dd95e89aa..195853fe09 100644 --- a/Userland/Libraries/LibCompress/Lzma.cpp +++ b/Userland/Libraries/LibCompress/Lzma.cpp @@ -504,6 +504,56 @@ void LzmaState::update_state_after_short_rep() m_state = 11; } +ErrorOr LzmaDecompressor::decode_match_type() +{ + // "The decoder calculates "state2" variable value to select exact variable from + // "IsMatch" and "IsRep0Long" arrays." + u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1); + u16 state2 = (m_state << maximum_number_of_position_bits) + position_state; + + // "The decoder uses the following code flow scheme to select exact + // type of LITERAL or MATCH: + // + // IsMatch[state2] decode + // 0 - the Literal" + if (TRY(decode_bit_with_probability(m_is_match_probabilities[state2])) == 0) + return MatchType::Literal; + + // " 1 - the Match + // IsRep[state] decode + // 0 - Simple Match" + if (TRY(decode_bit_with_probability(m_is_rep_probabilities[m_state])) == 0) + return MatchType::SimpleMatch; + + // " 1 - Rep Match + // IsRepG0[state] decode + // 0 - the distance is rep0" + if (TRY(decode_bit_with_probability(m_is_rep_g0_probabilities[m_state])) == 0) { + // " IsRep0Long[state2] decode + // 0 - Short Rep Match" + if (TRY(decode_bit_with_probability(m_is_rep0_long_probabilities[state2])) == 0) + return MatchType::ShortRepMatch; + + // " 1 - Rep Match 0" + return MatchType::RepMatch0; + } + + // " 1 - + // IsRepG1[state] decode + // 0 - Rep Match 1" + if (TRY(decode_bit_with_probability(m_is_rep_g1_probabilities[m_state])) == 0) + return MatchType::RepMatch1; + + // " 1 - + // IsRepG2[state] decode + // 0 - Rep Match 2" + if (TRY(decode_bit_with_probability(m_is_rep_g2_probabilities[m_state])) == 0) + return MatchType::RepMatch2; + + // " 1 - Rep Match 3" + return MatchType::RepMatch3; +} + ErrorOr LzmaDecompressor::read_some(Bytes bytes) { while (m_dictionary->used_space() < bytes.size() && m_dictionary->empty_space() != 0) { @@ -518,11 +568,6 @@ ErrorOr LzmaDecompressor::read_some(Bytes bytes) // Otherwise, we give it one last try to find the end marker in the remaining data. } - // "The decoder calculates "state2" variable value to select exact variable from - // "IsMatch" and "IsRep0Long" arrays." - u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1); - u16 state2 = (m_state << maximum_number_of_position_bits) + position_state; - auto copy_match_to_buffer = [&](u16 real_length) -> ErrorOr { VERIFY(!m_leftover_match_length.has_value()); @@ -546,16 +591,13 @@ ErrorOr LzmaDecompressor::read_some(Bytes bytes) continue; } - // "The decoder uses the following code flow scheme to select exact - // type of LITERAL or MATCH: - // - // IsMatch[state2] decode - // 0 - the Literal" - if (TRY(decode_bit_with_probability(m_is_match_probabilities[state2])) == 0) { - // If we are already past the expected uncompressed size, we are already in "look for EOS only" mode. - if (has_reached_expected_data_size()) - return Error::from_string_literal("Found literal after reaching expected uncompressed size"); + auto const match_type = TRY(decode_match_type()); + // If we are looking for EOS, but find another match type, the stream is also corrupted. + if (has_reached_expected_data_size() && match_type != MatchType::SimpleMatch) + return Error::from_string_literal("First match type after the expected uncompressed size is not a simple match"); + + if (match_type == MatchType::Literal) { // "At first the LZMA decoder must check that it doesn't exceed // specified uncompressed size." // This is already checked for at the beginning of the loop. @@ -568,10 +610,7 @@ ErrorOr LzmaDecompressor::read_some(Bytes bytes) continue; } - // " 1 - the Match - // IsRep[state] decode - // 0 - Simple Match" - if (TRY(decode_bit_with_probability(m_is_rep_probabilities[m_state])) == 0) { + if (match_type == MatchType::SimpleMatch) { // "The distance history table is updated with the following scheme:" m_rep3 = m_rep2; m_rep2 = m_rep1; @@ -620,58 +659,41 @@ ErrorOr LzmaDecompressor::read_some(Bytes bytes) continue; } - // If we are looking for EOS, but find another match type, the stream is also corrupted. - if (has_reached_expected_data_size()) - return Error::from_string_literal("First match type after the expected uncompressed size is not a simple match"); - - // " 1 - Rep Match - // IsRepG0[state] decode - // 0 - the distance is rep0" - if (TRY(decode_bit_with_probability(m_is_rep_g0_probabilities[m_state])) == 0) { + if (match_type == MatchType::ShortRepMatch) { // "LZMA doesn't update the distance history." - // " IsRep0Long[state2] decode - // 0 - Short Rep Match" - if (TRY(decode_bit_with_probability(m_is_rep0_long_probabilities[state2])) == 0) { - // "If the subtype is "Short Rep Match", the decoder updates the state, puts - // the one byte from window to current position in window and goes to next - // MATCH/LITERAL symbol." - update_state_after_short_rep(); + // "If the subtype is "Short Rep Match", the decoder updates the state, puts + // the one byte from window to current position in window and goes to next + // MATCH/LITERAL symbol." + update_state_after_short_rep(); - TRY(copy_match_to_buffer(1)); + TRY(copy_match_to_buffer(1)); - continue; - } - // " 1 - Rep Match 0" - // Intentional fallthrough, we just need to make sure to not run the detection for other match types and to not switch around the distance history. - } else { - // " 1 - - // IsRepG1[state] decode - // 0 - Rep Match 1" - if (TRY(decode_bit_with_probability(m_is_rep_g1_probabilities[m_state])) == 0) { - u32 distance = m_rep1; - m_rep1 = m_rep0; - m_rep0 = distance; - } + continue; + } - // " 1 - - // IsRepG2[state] decode - // 0 - Rep Match 2" - else if (TRY(decode_bit_with_probability(m_is_rep_g2_probabilities[m_state])) == 0) { - u32 distance = m_rep2; - m_rep2 = m_rep1; - m_rep1 = m_rep0; - m_rep0 = distance; - } + // Note: We don't need to do anything specific for "Rep Match 0", we just need to make sure to not + // run the detection for other match types and to not switch around the distance history. - // " 1 - Rep Match 3" - else { - u32 distance = m_rep3; - m_rep3 = m_rep2; - m_rep2 = m_rep1; - m_rep1 = m_rep0; - m_rep0 = distance; - } + if (match_type == MatchType::RepMatch1) { + u32 distance = m_rep1; + m_rep1 = m_rep0; + m_rep0 = distance; + } + + if (match_type == MatchType::RepMatch2) { + u32 distance = m_rep2; + m_rep2 = m_rep1; + m_rep1 = m_rep0; + m_rep0 = distance; + } + + if (match_type == MatchType::RepMatch3) { + u32 distance = m_rep3; + m_rep3 = m_rep2; + m_rep2 = m_rep1; + m_rep1 = m_rep0; + m_rep0 = distance; } // "In other cases (Rep Match 0/1/2/3), it decodes the zero-based diff --git a/Userland/Libraries/LibCompress/Lzma.h b/Userland/Libraries/LibCompress/Lzma.h index 4197a5edaf..9250f0e06d 100644 --- a/Userland/Libraries/LibCompress/Lzma.h +++ b/Userland/Libraries/LibCompress/Lzma.h @@ -118,6 +118,16 @@ protected: Array m_is_rep_g1_probabilities; Array m_is_rep_g2_probabilities; Array m_is_rep0_long_probabilities; + + enum MatchType { + Literal, + SimpleMatch, + RepMatch0, + ShortRepMatch, + RepMatch1, + RepMatch2, + RepMatch3, + }; }; class LzmaDecompressor : public Stream @@ -159,6 +169,8 @@ private: ErrorOr decode_direct_bit(); ErrorOr decode_bit_with_probability(Probability& probability); + ErrorOr decode_match_type(); + // Decodes a multi-bit symbol using a given probability tree (either in normal or in reverse order). // The specification states that "unsigned" is at least 16 bits in size, our implementation assumes this as the maximum symbol size. ErrorOr decode_symbol_using_bit_tree(size_t bit_count, Span probability_tree);