1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-27 06:27:45 +00:00

LibCompress: Decode the LZMA match type in a separate function

This should keep the `read_some` function a bit flatter and shorter, and
make it easier to compare the match type decoding process against the
specification.
This commit is contained in:
Tim Schumacher 2023-05-01 15:51:04 +02:00 committed by Andreas Kling
parent 4a37bac374
commit 42514c6961
2 changed files with 98 additions and 64 deletions

View file

@ -504,6 +504,56 @@ void LzmaState::update_state_after_short_rep()
m_state = 11; m_state = 11;
} }
ErrorOr<LzmaDecompressor::MatchType> LzmaDecompressor::decode_match_type()
{
    // Classifies the next symbol of the stream as a literal or as one of the match kinds.
    // Every step below decodes a single bit through decode_bit_with_probability(), following
    // the "code flow scheme" of the specification top to bottom; the first zero bit (or the
    // final one bit) determines the result. Decoding errors are propagated to the caller.

    // "The decoder calculates "state2" variable value to select exact variable from
    //  "IsMatch" and "IsRep0Long" arrays."
    // state2 combines the current state with the low `position_bits` bits of the number of
    // bytes processed so far.
    auto const position_mask = (1 << m_options.position_bits) - 1;
    u16 const position_state = m_total_processed_bytes & position_mask;
    u16 const state2 = (m_state << maximum_number_of_position_bits) + position_state;

    // IsMatch[state2]: 0 selects the Literal, 1 selects some kind of Match.
    auto const is_match_bit = TRY(decode_bit_with_probability(m_is_match_probabilities[state2]));
    if (is_match_bit == 0)
        return MatchType::Literal;

    // IsRep[state]: 0 selects the Simple Match, 1 selects some kind of Rep Match.
    auto const is_rep_bit = TRY(decode_bit_with_probability(m_is_rep_probabilities[m_state]));
    if (is_rep_bit == 0)
        return MatchType::SimpleMatch;

    // IsRepG0[state]: 0 means "the distance is rep0", which needs one more bit to be told apart.
    auto const is_rep_g0_bit = TRY(decode_bit_with_probability(m_is_rep_g0_probabilities[m_state]));
    if (is_rep_g0_bit == 0) {
        // IsRep0Long[state2]: 0 selects the Short Rep Match, 1 selects Rep Match 0.
        auto const is_rep0_long_bit = TRY(decode_bit_with_probability(m_is_rep0_long_probabilities[state2]));
        return is_rep0_long_bit == 0 ? MatchType::ShortRepMatch : MatchType::RepMatch0;
    }

    // IsRepG1[state]: 0 selects Rep Match 1.
    auto const is_rep_g1_bit = TRY(decode_bit_with_probability(m_is_rep_g1_probabilities[m_state]));
    if (is_rep_g1_bit == 0)
        return MatchType::RepMatch1;

    // IsRepG2[state]: 0 selects Rep Match 2, 1 selects Rep Match 3.
    auto const is_rep_g2_bit = TRY(decode_bit_with_probability(m_is_rep_g2_probabilities[m_state]));
    return is_rep_g2_bit == 0 ? MatchType::RepMatch2 : MatchType::RepMatch3;
}
ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes) ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
{ {
while (m_dictionary->used_space() < bytes.size() && m_dictionary->empty_space() != 0) { while (m_dictionary->used_space() < bytes.size() && m_dictionary->empty_space() != 0) {
@ -518,11 +568,6 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
// Otherwise, we give it one last try to find the end marker in the remaining data. // Otherwise, we give it one last try to find the end marker in the remaining data.
} }
// "The decoder calculates "state2" variable value to select exact variable from
// "IsMatch" and "IsRep0Long" arrays."
u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
u16 state2 = (m_state << maximum_number_of_position_bits) + position_state;
auto copy_match_to_buffer = [&](u16 real_length) -> ErrorOr<void> { auto copy_match_to_buffer = [&](u16 real_length) -> ErrorOr<void> {
VERIFY(!m_leftover_match_length.has_value()); VERIFY(!m_leftover_match_length.has_value());
@ -546,16 +591,13 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
continue; continue;
} }
// "The decoder uses the following code flow scheme to select exact auto const match_type = TRY(decode_match_type());
// type of LITERAL or MATCH:
//
// IsMatch[state2] decode
// 0 - the Literal"
if (TRY(decode_bit_with_probability(m_is_match_probabilities[state2])) == 0) {
// If we are already past the expected uncompressed size, we are already in "look for EOS only" mode.
if (has_reached_expected_data_size())
return Error::from_string_literal("Found literal after reaching expected uncompressed size");
// If we are looking for EOS, but find another match type, the stream is also corrupted.
if (has_reached_expected_data_size() && match_type != MatchType::SimpleMatch)
return Error::from_string_literal("First match type after the expected uncompressed size is not a simple match");
if (match_type == MatchType::Literal) {
// "At first the LZMA decoder must check that it doesn't exceed // "At first the LZMA decoder must check that it doesn't exceed
// specified uncompressed size." // specified uncompressed size."
// This is already checked for at the beginning of the loop. // This is already checked for at the beginning of the loop.
@ -568,10 +610,7 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
continue; continue;
} }
// " 1 - the Match if (match_type == MatchType::SimpleMatch) {
// IsRep[state] decode
// 0 - Simple Match"
if (TRY(decode_bit_with_probability(m_is_rep_probabilities[m_state])) == 0) {
// "The distance history table is updated with the following scheme:" // "The distance history table is updated with the following scheme:"
m_rep3 = m_rep2; m_rep3 = m_rep2;
m_rep2 = m_rep1; m_rep2 = m_rep1;
@ -620,58 +659,41 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
continue; continue;
} }
// If we are looking for EOS, but find another match type, the stream is also corrupted. if (match_type == MatchType::ShortRepMatch) {
if (has_reached_expected_data_size())
return Error::from_string_literal("First match type after the expected uncompressed size is not a simple match");
// " 1 - Rep Match
// IsRepG0[state] decode
// 0 - the distance is rep0"
if (TRY(decode_bit_with_probability(m_is_rep_g0_probabilities[m_state])) == 0) {
// "LZMA doesn't update the distance history." // "LZMA doesn't update the distance history."
// " IsRep0Long[state2] decode // "If the subtype is "Short Rep Match", the decoder updates the state, puts
// 0 - Short Rep Match" // the one byte from window to current position in window and goes to next
if (TRY(decode_bit_with_probability(m_is_rep0_long_probabilities[state2])) == 0) { // MATCH/LITERAL symbol."
// "If the subtype is "Short Rep Match", the decoder updates the state, puts update_state_after_short_rep();
// the one byte from window to current position in window and goes to next
// MATCH/LITERAL symbol."
update_state_after_short_rep();
TRY(copy_match_to_buffer(1)); TRY(copy_match_to_buffer(1));
continue; continue;
} }
// " 1 - Rep Match 0"
// Intentional fallthrough, we just need to make sure to not run the detection for other match types and to not switch around the distance history.
} else {
// " 1 -
// IsRepG1[state] decode
// 0 - Rep Match 1"
if (TRY(decode_bit_with_probability(m_is_rep_g1_probabilities[m_state])) == 0) {
u32 distance = m_rep1;
m_rep1 = m_rep0;
m_rep0 = distance;
}
// " 1 - // Note: We don't need to do anything specific for "Rep Match 0", we just need to make sure to not
// IsRepG2[state] decode // run the detection for other match types and to not switch around the distance history.
// 0 - Rep Match 2"
else if (TRY(decode_bit_with_probability(m_is_rep_g2_probabilities[m_state])) == 0) {
u32 distance = m_rep2;
m_rep2 = m_rep1;
m_rep1 = m_rep0;
m_rep0 = distance;
}
// " 1 - Rep Match 3" if (match_type == MatchType::RepMatch1) {
else { u32 distance = m_rep1;
u32 distance = m_rep3; m_rep1 = m_rep0;
m_rep3 = m_rep2; m_rep0 = distance;
m_rep2 = m_rep1; }
m_rep1 = m_rep0;
m_rep0 = distance; if (match_type == MatchType::RepMatch2) {
} u32 distance = m_rep2;
m_rep2 = m_rep1;
m_rep1 = m_rep0;
m_rep0 = distance;
}
if (match_type == MatchType::RepMatch3) {
u32 distance = m_rep3;
m_rep3 = m_rep2;
m_rep2 = m_rep1;
m_rep1 = m_rep0;
m_rep0 = distance;
} }
// "In other cases (Rep Match 0/1/2/3), it decodes the zero-based // "In other cases (Rep Match 0/1/2/3), it decodes the zero-based

View file

@ -118,6 +118,16 @@ protected:
Array<Probability, number_of_states> m_is_rep_g1_probabilities; Array<Probability, number_of_states> m_is_rep_g1_probabilities;
Array<Probability, number_of_states> m_is_rep_g2_probabilities; Array<Probability, number_of_states> m_is_rep_g2_probabilities;
Array<Probability, (number_of_states << maximum_number_of_position_bits)> m_is_rep0_long_probabilities; Array<Probability, (number_of_states << maximum_number_of_position_bits)> m_is_rep0_long_probabilities;
// The kind of symbol found next in the LZMA stream, as returned by decode_match_type().
// The enumerators mirror the leaves of the specification's "code flow scheme"; the bit
// patterns named below are the ones decode_match_type() checks for.
enum MatchType {
// IsMatch bit is 0: a single literal byte follows.
Literal,
// IsMatch bit is 1, IsRep bit is 0: a match that is not a repetition of a previous one.
// This is the only type that read_some accepts once the expected uncompressed size has been reached.
SimpleMatch,
// IsRep bit is 1, IsRepG0 bit is 0, IsRep0Long bit is 1: read_some leaves the distance history untouched.
RepMatch0,
// IsRep bit is 1, IsRepG0 bit is 0, IsRep0Long bit is 0: read_some copies a match of length 1.
ShortRepMatch,
// IsRepG0 bit is 1, IsRepG1 bit is 0: read_some moves rep1 to the front of the distance history.
RepMatch1,
// IsRepG1 bit is 1, IsRepG2 bit is 0: read_some moves rep2 to the front of the distance history.
RepMatch2,
// IsRepG2 bit is 1: read_some moves rep3 to the front of the distance history.
RepMatch3,
};
}; };
class LzmaDecompressor : public Stream class LzmaDecompressor : public Stream
@ -159,6 +169,8 @@ private:
ErrorOr<u8> decode_direct_bit(); ErrorOr<u8> decode_direct_bit();
ErrorOr<u8> decode_bit_with_probability(Probability& probability); ErrorOr<u8> decode_bit_with_probability(Probability& probability);
ErrorOr<MatchType> decode_match_type();
// Decodes a multi-bit symbol using a given probability tree (either in normal or in reverse order). // Decodes a multi-bit symbol using a given probability tree (either in normal or in reverse order).
// The specification states that "unsigned" is at least 16 bits in size, our implementation assumes this as the maximum symbol size. // The specification states that "unsigned" is at least 16 bits in size, our implementation assumes this as the maximum symbol size.
ErrorOr<u16> decode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree); ErrorOr<u16> decode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree);