mirror of
https://github.com/RGBCube/serenity
synced 2025-07-27 08:07:34 +00:00
LibCompress: Decode the LZMA match type in a separate function
This should keep the `read_some` function a bit flatter and shorter, and make it easier to match the match type decoding process with the specification.
This commit is contained in:
parent
4a37bac374
commit
42514c6961
2 changed files with 98 additions and 64 deletions
|
@ -504,6 +504,56 @@ void LzmaState::update_state_after_short_rep()
|
|||
m_state = 11;
|
||||
}
|
||||
|
||||
ErrorOr<LzmaDecompressor::MatchType> LzmaDecompressor::decode_match_type()
|
||||
{
|
||||
// "The decoder calculates "state2" variable value to select exact variable from
|
||||
// "IsMatch" and "IsRep0Long" arrays."
|
||||
u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
|
||||
u16 state2 = (m_state << maximum_number_of_position_bits) + position_state;
|
||||
|
||||
// "The decoder uses the following code flow scheme to select exact
|
||||
// type of LITERAL or MATCH:
|
||||
//
|
||||
// IsMatch[state2] decode
|
||||
// 0 - the Literal"
|
||||
if (TRY(decode_bit_with_probability(m_is_match_probabilities[state2])) == 0)
|
||||
return MatchType::Literal;
|
||||
|
||||
// " 1 - the Match
|
||||
// IsRep[state] decode
|
||||
// 0 - Simple Match"
|
||||
if (TRY(decode_bit_with_probability(m_is_rep_probabilities[m_state])) == 0)
|
||||
return MatchType::SimpleMatch;
|
||||
|
||||
// " 1 - Rep Match
|
||||
// IsRepG0[state] decode
|
||||
// 0 - the distance is rep0"
|
||||
if (TRY(decode_bit_with_probability(m_is_rep_g0_probabilities[m_state])) == 0) {
|
||||
// " IsRep0Long[state2] decode
|
||||
// 0 - Short Rep Match"
|
||||
if (TRY(decode_bit_with_probability(m_is_rep0_long_probabilities[state2])) == 0)
|
||||
return MatchType::ShortRepMatch;
|
||||
|
||||
// " 1 - Rep Match 0"
|
||||
return MatchType::RepMatch0;
|
||||
}
|
||||
|
||||
// " 1 -
|
||||
// IsRepG1[state] decode
|
||||
// 0 - Rep Match 1"
|
||||
if (TRY(decode_bit_with_probability(m_is_rep_g1_probabilities[m_state])) == 0)
|
||||
return MatchType::RepMatch1;
|
||||
|
||||
// " 1 -
|
||||
// IsRepG2[state] decode
|
||||
// 0 - Rep Match 2"
|
||||
if (TRY(decode_bit_with_probability(m_is_rep_g2_probabilities[m_state])) == 0)
|
||||
return MatchType::RepMatch2;
|
||||
|
||||
// " 1 - Rep Match 3"
|
||||
return MatchType::RepMatch3;
|
||||
}
|
||||
|
||||
ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
|
||||
{
|
||||
while (m_dictionary->used_space() < bytes.size() && m_dictionary->empty_space() != 0) {
|
||||
|
@ -518,11 +568,6 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
|
|||
// Otherwise, we give it one last try to find the end marker in the remaining data.
|
||||
}
|
||||
|
||||
// "The decoder calculates "state2" variable value to select exact variable from
|
||||
// "IsMatch" and "IsRep0Long" arrays."
|
||||
u16 position_state = m_total_processed_bytes & ((1 << m_options.position_bits) - 1);
|
||||
u16 state2 = (m_state << maximum_number_of_position_bits) + position_state;
|
||||
|
||||
auto copy_match_to_buffer = [&](u16 real_length) -> ErrorOr<void> {
|
||||
VERIFY(!m_leftover_match_length.has_value());
|
||||
|
||||
|
@ -546,16 +591,13 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
|
|||
continue;
|
||||
}
|
||||
|
||||
// "The decoder uses the following code flow scheme to select exact
|
||||
// type of LITERAL or MATCH:
|
||||
//
|
||||
// IsMatch[state2] decode
|
||||
// 0 - the Literal"
|
||||
if (TRY(decode_bit_with_probability(m_is_match_probabilities[state2])) == 0) {
|
||||
// If we are already past the expected uncompressed size, we are already in "look for EOS only" mode.
|
||||
if (has_reached_expected_data_size())
|
||||
return Error::from_string_literal("Found literal after reaching expected uncompressed size");
|
||||
auto const match_type = TRY(decode_match_type());
|
||||
|
||||
// If we are looking for EOS, but find another match type, the stream is also corrupted.
|
||||
if (has_reached_expected_data_size() && match_type != MatchType::SimpleMatch)
|
||||
return Error::from_string_literal("First match type after the expected uncompressed size is not a simple match");
|
||||
|
||||
if (match_type == MatchType::Literal) {
|
||||
// "At first the LZMA decoder must check that it doesn't exceed
|
||||
// specified uncompressed size."
|
||||
// This is already checked for at the beginning of the loop.
|
||||
|
@ -568,10 +610,7 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
|
|||
continue;
|
||||
}
|
||||
|
||||
// " 1 - the Match
|
||||
// IsRep[state] decode
|
||||
// 0 - Simple Match"
|
||||
if (TRY(decode_bit_with_probability(m_is_rep_probabilities[m_state])) == 0) {
|
||||
if (match_type == MatchType::SimpleMatch) {
|
||||
// "The distance history table is updated with the following scheme:"
|
||||
m_rep3 = m_rep2;
|
||||
m_rep2 = m_rep1;
|
||||
|
@ -620,58 +659,41 @@ ErrorOr<Bytes> LzmaDecompressor::read_some(Bytes bytes)
|
|||
continue;
|
||||
}
|
||||
|
||||
// If we are looking for EOS, but find another match type, the stream is also corrupted.
|
||||
if (has_reached_expected_data_size())
|
||||
return Error::from_string_literal("First match type after the expected uncompressed size is not a simple match");
|
||||
|
||||
// " 1 - Rep Match
|
||||
// IsRepG0[state] decode
|
||||
// 0 - the distance is rep0"
|
||||
if (TRY(decode_bit_with_probability(m_is_rep_g0_probabilities[m_state])) == 0) {
|
||||
if (match_type == MatchType::ShortRepMatch) {
|
||||
// "LZMA doesn't update the distance history."
|
||||
|
||||
// " IsRep0Long[state2] decode
|
||||
// 0 - Short Rep Match"
|
||||
if (TRY(decode_bit_with_probability(m_is_rep0_long_probabilities[state2])) == 0) {
|
||||
// "If the subtype is "Short Rep Match", the decoder updates the state, puts
|
||||
// the one byte from window to current position in window and goes to next
|
||||
// MATCH/LITERAL symbol."
|
||||
update_state_after_short_rep();
|
||||
// "If the subtype is "Short Rep Match", the decoder updates the state, puts
|
||||
// the one byte from window to current position in window and goes to next
|
||||
// MATCH/LITERAL symbol."
|
||||
update_state_after_short_rep();
|
||||
|
||||
TRY(copy_match_to_buffer(1));
|
||||
TRY(copy_match_to_buffer(1));
|
||||
|
||||
continue;
|
||||
}
|
||||
// " 1 - Rep Match 0"
|
||||
// Intentional fallthrough, we just need to make sure to not run the detection for other match types and to not switch around the distance history.
|
||||
} else {
|
||||
// " 1 -
|
||||
// IsRepG1[state] decode
|
||||
// 0 - Rep Match 1"
|
||||
if (TRY(decode_bit_with_probability(m_is_rep_g1_probabilities[m_state])) == 0) {
|
||||
u32 distance = m_rep1;
|
||||
m_rep1 = m_rep0;
|
||||
m_rep0 = distance;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
// " 1 -
|
||||
// IsRepG2[state] decode
|
||||
// 0 - Rep Match 2"
|
||||
else if (TRY(decode_bit_with_probability(m_is_rep_g2_probabilities[m_state])) == 0) {
|
||||
u32 distance = m_rep2;
|
||||
m_rep2 = m_rep1;
|
||||
m_rep1 = m_rep0;
|
||||
m_rep0 = distance;
|
||||
}
|
||||
// Note: We don't need to do anything specific for "Rep Match 0", we just need to make sure to not
|
||||
// run the detection for other match types and to not switch around the distance history.
|
||||
|
||||
// " 1 - Rep Match 3"
|
||||
else {
|
||||
u32 distance = m_rep3;
|
||||
m_rep3 = m_rep2;
|
||||
m_rep2 = m_rep1;
|
||||
m_rep1 = m_rep0;
|
||||
m_rep0 = distance;
|
||||
}
|
||||
if (match_type == MatchType::RepMatch1) {
|
||||
u32 distance = m_rep1;
|
||||
m_rep1 = m_rep0;
|
||||
m_rep0 = distance;
|
||||
}
|
||||
|
||||
if (match_type == MatchType::RepMatch2) {
|
||||
u32 distance = m_rep2;
|
||||
m_rep2 = m_rep1;
|
||||
m_rep1 = m_rep0;
|
||||
m_rep0 = distance;
|
||||
}
|
||||
|
||||
if (match_type == MatchType::RepMatch3) {
|
||||
u32 distance = m_rep3;
|
||||
m_rep3 = m_rep2;
|
||||
m_rep2 = m_rep1;
|
||||
m_rep1 = m_rep0;
|
||||
m_rep0 = distance;
|
||||
}
|
||||
|
||||
// "In other cases (Rep Match 0/1/2/3), it decodes the zero-based
|
||||
|
|
|
@ -118,6 +118,16 @@ protected:
|
|||
Array<Probability, number_of_states> m_is_rep_g1_probabilities;
|
||||
Array<Probability, number_of_states> m_is_rep_g2_probabilities;
|
||||
Array<Probability, (number_of_states << maximum_number_of_position_bits)> m_is_rep0_long_probabilities;
|
||||
|
||||
enum MatchType {
|
||||
Literal,
|
||||
SimpleMatch,
|
||||
RepMatch0,
|
||||
ShortRepMatch,
|
||||
RepMatch1,
|
||||
RepMatch2,
|
||||
RepMatch3,
|
||||
};
|
||||
};
|
||||
|
||||
class LzmaDecompressor : public Stream
|
||||
|
@ -159,6 +169,8 @@ private:
|
|||
ErrorOr<u8> decode_direct_bit();
|
||||
ErrorOr<u8> decode_bit_with_probability(Probability& probability);
|
||||
|
||||
ErrorOr<MatchType> decode_match_type();
|
||||
|
||||
// Decodes a multi-bit symbol using a given probability tree (either in normal or in reverse order).
|
||||
// The specification states that "unsigned" is at least 16 bits in size, our implementation assumes this as the maximum symbol size.
|
||||
ErrorOr<u16> decode_symbol_using_bit_tree(size_t bit_count, Span<Probability> probability_tree);
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue