From 9e990f7329294151a2dc232f35311cd5a044c856 Mon Sep 17 00:00:00 2001 From: Tim Schumacher Date: Sat, 11 Mar 2023 14:30:02 +0100 Subject: [PATCH] LibCompress: Add support for XZ --- Userland/Libraries/LibCompress/CMakeLists.txt | 1 + Userland/Libraries/LibCompress/Xz.cpp | 474 ++++++++++++++++++ Userland/Libraries/LibCompress/Xz.h | 136 +++++ 3 files changed, 611 insertions(+) create mode 100644 Userland/Libraries/LibCompress/Xz.cpp create mode 100644 Userland/Libraries/LibCompress/Xz.h diff --git a/Userland/Libraries/LibCompress/CMakeLists.txt b/Userland/Libraries/LibCompress/CMakeLists.txt index 805c60fb24..5c3bfaf10b 100644 --- a/Userland/Libraries/LibCompress/CMakeLists.txt +++ b/Userland/Libraries/LibCompress/CMakeLists.txt @@ -4,6 +4,7 @@ set(SOURCES Deflate.cpp Lzma.cpp Lzma2.cpp + Xz.cpp Zlib.cpp Gzip.cpp ) diff --git a/Userland/Libraries/LibCompress/Xz.cpp b/Userland/Libraries/LibCompress/Xz.cpp new file mode 100644 index 0000000000..7fabb841f3 --- /dev/null +++ b/Userland/Libraries/LibCompress/Xz.cpp @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2023, Tim Schumacher + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include +#include +#include + +namespace Compress { + +ErrorOr XzMultibyteInteger::read_from_stream(Stream& stream) +{ + // 1.2. Multibyte Integers: + // "When smaller values are more likely than bigger values (for + // example file sizes), multibyte integers are encoded in a + // variable-length representation: + // - Numbers in the range [0, 127] are copied as is, and take + // one byte of space. + // - Bigger numbers will occupy two or more bytes. All but the + // last byte of the multibyte representation have the highest + // (eighth) bit set." + + // 9 * 7 bits is 63 bits, which is the largest that will fit into an u64. 
+ constexpr size_t maximum_number_of_bytes = 9; + + u64 result = 0; + + for (size_t i = 0; i < maximum_number_of_bytes; i++) { + u64 next_byte = TRY(stream.read_value()); + result |= (next_byte & 0x7F) << (i * 7); + + // We should reject numbers that are encoded in too many bytes. + if (next_byte == 0x00 && i != 0) + return Error::from_string_literal("XZ multibyte integer has a larger encoding than necessary"); + + if ((next_byte & 0x80) == 0) + break; + } + + return XzMultibyteInteger { result }; +} + +ErrorOr XzStreamHeader::validate() +{ + // 2.1.1.1. Header Magic Bytes: + // "The first six (6) bytes of the Stream are so called Header + // Magic Bytes. They can be used to identify the file type. + // + // Using a C array and ASCII: + // const uint8_t HEADER_MAGIC[6] + // = { 0xFD, '7', 'z', 'X', 'Z', 0x00 }; + // + // In plain hexadecimal: + // FD 37 7A 58 5A 00 + // + // If the Header Magic Bytes don't match, the decoder MUST + // indicate an error." + if (magic[0] != 0xFD || magic[1] != '7' || magic[2] != 'z' || magic[3] != 'X' || magic[4] != 'Z' || magic[5] != 0x00) + return Error::from_string_literal("XZ stream header has an invalid magic"); + + // 2.1.1.2. Stream Flags: + // "If any reserved bit is set, the decoder MUST indicate an error. + // It is possible that there is a new field present which the + // decoder is not aware of, and can thus parse the Stream Header + // incorrectly." + if (flags.reserved != 0 || flags.reserved_bits != 0) + return Error::from_string_literal("XZ stream header has reserved non-null stream flag bits"); + + // 2.1.1.3. CRC32: + // "The CRC32 is calculated from the Stream Flags field. It is + // stored as an unsigned 32-bit little endian integer. If the + // calculated value does not match the stored one, the decoder + // MUST indicate an error." 
+ if (Crypto::Checksum::CRC32({ &flags, sizeof(flags) }).digest() != flags_crc32) + return Error::from_string_literal("XZ stream header has an invalid CRC32 checksum"); + + return {}; +} + +ErrorOr XzStreamFooter::validate() +{ + // 2.1.2.1. CRC32: + // "The CRC32 is calculated from the Backward Size and Stream Flags + // fields. It is stored as an unsigned 32-bit little endian + // integer. If the calculated value does not match the stored one, + // the decoder MUST indicate an error." + Crypto::Checksum::CRC32 calculated_crc32; + calculated_crc32.update({ &encoded_backward_size, sizeof(encoded_backward_size) }); + calculated_crc32.update({ &flags, sizeof(flags) }); + if (calculated_crc32.digest() != size_and_flags_crc32) + return Error::from_string_literal("XZ stream footer has an invalid CRC32 checksum"); + + // 2.1.2.4. Footer Magic Bytes: + // "As the last step of the decoding process, the decoder MUST + // verify the existence of Footer Magic Bytes. If they don't + // match, an error MUST be indicated. + // + // Using a C array and ASCII: + // const uint8_t FOOTER_MAGIC[2] = { 'Y', 'Z' }; + // + // In hexadecimal: + // 59 5A" + if (magic[0] != 'Y' || magic[1] != 'Z') + return Error::from_string_literal("XZ stream footer has an invalid magic"); + + return {}; +} + +u32 XzStreamFooter::backward_size() +{ + // 2.1.2.2. Backward Size: + // "Backward Size is stored as a 32-bit little endian integer, + // which indicates the size of the Index field as multiple of + // four bytes, minimum value being four bytes: + // + // real_backward_size = (stored_backward_size + 1) * 4;" + return (encoded_backward_size + 1) * 4; +} + +u8 XzBlockFlags::number_of_filters() +{ + // 3.1.2. Block Flags: + // "Bit(s) Mask Description + // 0-1 0x03 Number of filters (1-4)" + return encoded_number_of_filters + 1; +} + +ErrorOr XzFilterLzma2Properties::validate() +{ + // 5.3.1. LZMA2: + // "Bits Mask Description + // 6-7 0xC0 Reserved for future use; MUST be zero for now." 
+ if (reserved != 0) + return Error::from_string_literal("XZ LZMA2 filter properties contains non-null reserved bits"); + + // " const uint8_t bits = get_dictionary_flags() & 0x3F; + // if (bits > 40) + // return DICTIONARY_TOO_BIG; // Bigger than 4 GiB" + if (encoded_dictionary_size > 40) + return Error::from_string_literal("XZ LZMA2 filter properties contains larger-than-allowed dictionary size"); + + return {}; +} + +u32 XzFilterLzma2Properties::dictionary_size() +{ + // "Dictionary Size is encoded with one-bit mantissa and five-bit + // exponent. The smallest dictionary size is 4 KiB and the biggest + // is 4 GiB. + // Instead of having a table in the decoder, the dictionary size + // can be decoded using the following C code:" + if (encoded_dictionary_size == 40) + return NumericLimits::max(); + + u32 dictionary_size = 2 | (encoded_dictionary_size & 1); + dictionary_size <<= encoded_dictionary_size / 2 + 11; + return dictionary_size; +} + +ErrorOr> XzDecompressor::create(MaybeOwned stream) +{ + auto counting_stream = TRY(try_make(move(stream))); + + auto stream_header = TRY(counting_stream->read_value()); + TRY(stream_header.validate()); + + auto decompressor = TRY(adopt_nonnull_own_or_enomem(new (nothrow) XzDecompressor(move(counting_stream), stream_header.flags))); + + return decompressor; +} + +XzDecompressor::XzDecompressor(NonnullOwnPtr stream, XzStreamFlags stream_flags) + : m_stream(move(stream)) + , m_stream_flags(stream_flags) +{ +} + +ErrorOr XzDecompressor::read_some(Bytes bytes) +{ + if (m_found_stream_footer) + return bytes.trim(0); + + if (!m_current_block_stream.has_value() || (*m_current_block_stream)->is_eof()) { + if (m_current_block_stream.has_value()) { + // We have already processed a block, so we need to clean up trailing data before the next block starts. + + // 3.3. Block Padding: + // "Block Padding MUST contain 0-3 null bytes to make the size of + // the Block a multiple of four bytes. 
This can be needed when + // the size of Compressed Data is not a multiple of four." + while (m_stream->read_bytes() % 4 != 0) { + auto padding_byte = TRY(m_stream->read_value()); + + // "If any of the bytes in Block Padding are not null bytes, the decoder + // MUST indicate an error." + if (padding_byte != 0) + return Error::from_string_literal("XZ block contains a non-null padding byte"); + } + + // 3.4. Check: + // "The type and size of the Check field depends on which bits + // are set in the Stream Flags field (see Section 2.1.1.2). + // + // The Check, when used, is calculated from the original + // uncompressed data. If the calculated Check does not match the + // stored one, the decoder MUST indicate an error. If the selected + // type of Check is not supported by the decoder, it SHOULD + // indicate a warning or error." + // TODO: Block content checks are currently unimplemented as a whole, independent of the check type. + // For now, we only make sure to remove the correct amount of bytes from the stream. + switch (m_stream_flags.check_type) { + case XzStreamCheckType::None: + break; + case XzStreamCheckType::CRC32: + TRY(m_stream->discard(4)); + break; + case XzStreamCheckType::CRC64: + TRY(m_stream->discard(8)); + break; + case XzStreamCheckType::SHA256: + TRY(m_stream->discard(32)); + break; + default: + return Error::from_string_literal("XZ stream has an unknown check type"); + } + } + + auto start_of_current_block = m_stream->read_bytes(); + + // Ensure that the start of the block is aligned to a multiple of four (in theory, everything in XZ is). + // This allows us to make sure that the block padding is correct without having to store the block start offset explicitly. + VERIFY(start_of_current_block % 4 == 0); + + // The first byte between Block Header (3.1.1. Block Header Size) and Index (4.1. Index Indicator) overlap. 
+ // Block header sizes have valid values in the range of [0x01, 0xFF], the only valid value for an Index Indicator is therefore 0x00. + auto encoded_block_header_size_or_index_indicator = TRY(m_stream->read_value()); + + if (encoded_block_header_size_or_index_indicator == 0x00) { + // This is an Index. + + // 4.2. Number of Records: + // "This field indicates how many Records there are in the List + // of Records field, and thus how many Blocks there are in the + // Stream. The value is stored using the encoding described in + // Section 1.2." + u64 number_of_records = TRY(m_stream->read_value()); + + // 4.3. List of Records: + // "List of Records consists of as many Records as indicated by the + // Number of Records field:" + for (u64 i = 0; i < number_of_records; i++) { + // "Each Record contains information about one Block: + // + // +===============+===================+ + // | Unpadded Size | Uncompressed Size | + // +===============+===================+" + + // 4.3.1. Unpadded Size: + // "This field indicates the size of the Block excluding the Block + // Padding field. That is, Unpadded Size is the size of the Block + // Header, Compressed Data, and Check fields. Unpadded Size is + // stored using the encoding described in Section 1.2." + u64 unpadded_size = TRY(m_stream->read_value()); + + // "The value MUST never be zero; with the current structure of Blocks, the + // actual minimum value for Unpadded Size is five." + if (unpadded_size < 5) + return Error::from_string_literal("XZ index contains a record with an unpadded size of less than five"); + + // 4.3.2. Uncompressed Size: + // "This field indicates the Uncompressed Size of the respective + // Block as bytes. The value is stored using the encoding + // described in Section 1.2." + u64 uncompressed_size = TRY(m_stream->read_value()); + + // 4.3. 
List of Records: + // "If the decoder has decoded all the Blocks of the Stream, it + // MUST verify that the contents of the Records match the real + // Unpadded Size and Uncompressed Size of the respective Blocks." + // TODO: Validation of unpadded and uncompressed size against the actual blocks is currently unimplemented. + (void)unpadded_size; + (void)uncompressed_size; + } + + // 4.4. Index Padding: + // "This field MUST contain 0-3 null bytes to pad the Index to + // a multiple of four bytes. If any of the bytes are not null + // bytes, the decoder MUST indicate an error." + while ((m_stream->read_bytes() - start_of_current_block) % 4 != 0) { + auto padding_byte = TRY(m_stream->read_value()); + + if (padding_byte != 0) + return Error::from_string_literal("XZ index contains a non-null padding byte"); + } + + // 4.5. CRC32: + // "The CRC32 is calculated over everything in the Index field + // except the CRC32 field itself. The CRC32 is stored as an + // unsigned 32-bit little endian integer." + u32 index_crc32 = TRY(m_stream->read_value>()); + + // "If the calculated value does not match the stored one, the decoder MUST indicate + // an error." + // TODO: Validation of the index CRC32 is currently unimplemented. + (void)index_crc32; + + auto size_of_index = m_stream->read_bytes() - start_of_current_block; + + // According to the specification of a stream (2.1. Stream), the index is the last element in a stream, + // followed by the stream footer (2.1.2. Stream Footer). + auto stream_footer = TRY(m_stream->read_value()); + + // This handles verifying the CRC32 (2.1.2.1. CRC32) and the magic bytes (2.1.2.4. Footer Magic Bytes). + TRY(stream_footer.validate()); + + // 2.1.2.2. Backward Size: + // "If the stored value does not match the real size of the Index + // field, the decoder MUST indicate an error." 
+ if (stream_footer.backward_size() != size_of_index) + return Error::from_string_literal("XZ index size does not match the stored size in the stream footer"); + + // 2.1.2.3. Stream Flags: + // "This is a copy of the Stream Flags field from the Stream + // Header. The information stored to Stream Flags is needed + // when parsing the Stream backwards. The decoder MUST compare + // the Stream Flags fields in both Stream Header and Stream + // Footer, and indicate an error if they are not identical." + if (Bytes { &m_stream_flags, sizeof(m_stream_flags) } != Bytes { &stream_footer.flags, sizeof(stream_footer.flags) }) + return Error::from_string_literal("XZ stream header flags don't match the stream footer"); + + m_found_stream_footer = true; + return bytes.trim(0); + } + + // 3.1.1. Block Header Size: + // "This field contains the size of the Block Header field, + // including the Block Header Size field itself. Valid values are + // in the range [0x01, 0xFF], which indicate the size of the Block + // Header as multiples of four bytes, minimum size being eight + // bytes: + // + // real_header_size = (encoded_header_size + 1) * 4;" + u64 block_header_size = (encoded_block_header_size_or_index_indicator + 1) * 4; + + // Read the whole header into a buffer to allow calculating the CRC32 later (3.1.7. CRC32). + auto header = TRY(ByteBuffer::create_uninitialized(block_header_size)); + header[0] = encoded_block_header_size_or_index_indicator; + TRY(m_stream->read_until_filled(header.span().slice(1))); + + FixedMemoryStream header_stream { header.span().slice(1) }; + + // 3.1.2. Block Flags: + // "If any reserved bit is set, the decoder MUST indicate an error. + // It is possible that there is a new field present which the + // decoder is not aware of, and can thus parse the Block Header + // incorrectly." 
+ auto flags = TRY(header_stream.read_value()); + + if (flags.reserved != 0) + return Error::from_string_literal("XZ block header has reserved non-null block flag bits"); + + MaybeOwned new_block_stream { *m_stream }; + + // 3.1.3. Compressed Size: + // "This field is present only if the appropriate bit is set in + // the Block Flags field (see Section 3.1.2)." + if (flags.compressed_size_present) { + // "Compressed Size is stored using the encoding described in Section 1.2." + u64 compressed_size = TRY(header_stream.read_value()); + + // "The Compressed Size field contains the size of the Compressed + // Data field, which MUST be non-zero." + if (compressed_size == 0) + return Error::from_string_literal("XZ block header contains a compressed size of zero"); + + new_block_stream = TRY(try_make(move(new_block_stream), compressed_size)); + } + + // 3.1.4. Uncompressed Size: + // "This field is present only if the appropriate bit is set in + // the Block Flags field (see Section 3.1.2)." + if (flags.uncompressed_size_present) { + // "Uncompressed Size is stored using the encoding described in Section 1.2." + u64 uncompressed_size = TRY(header_stream.read_value()); + + m_current_block_uncompressed_size = uncompressed_size; + } else { + m_current_block_uncompressed_size.clear(); + } + + // 3.1.5. List of Filter Flags: + // "The number of Filter Flags fields is stored in the Block Flags + // field (see Section 3.1.2)." + for (size_t i = 0; i < flags.number_of_filters(); i++) { + // "The format of each Filter Flags field is as follows: + // Both Filter ID and Size of Properties are stored using the + // encoding described in Section 1.2." + u64 filter_id = TRY(header_stream.read_value()); + u64 size_of_properties = TRY(header_stream.read_value()); + + // "Size of Properties indicates the size of the Filter Properties field as bytes." 
+ auto filter_properties = TRY(ByteBuffer::create_uninitialized(size_of_properties)); + TRY(header_stream.read_until_filled(filter_properties)); + + // 5.3.1. LZMA2 + if (filter_id == 0x21) { + if (size_of_properties < sizeof(XzFilterLzma2Properties)) + return Error::from_string_literal("XZ LZMA2 filter has a smaller-than-needed properties size"); + + auto properties = reinterpret_cast(filter_properties.data()); + TRY(properties->validate()); + + new_block_stream = TRY(Lzma2Decompressor::create_from_raw_stream(move(new_block_stream), properties->dictionary_size())); + continue; + } + + return Error::from_string_literal("XZ block header contains unknown filter ID"); + } + + // 3.1.6. Header Padding: + // "This field contains as many null byte as it is needed to make + // the Block Header have the size specified in Block Header Size." + constexpr size_t size_of_block_header_size = 1; + constexpr size_t size_of_crc32 = 4; + while (MUST(header_stream.tell()) < block_header_size - size_of_block_header_size - size_of_crc32) { + auto padding_byte = TRY(header_stream.read_value()); + + // "If any of the bytes are not null bytes, the decoder MUST + // indicate an error." + if (padding_byte != 0) + return Error::from_string_literal("XZ block header padding contains non-null bytes"); + } + + // 3.1.7. CRC32: + // "The CRC32 is calculated over everything in the Block Header + // field except the CRC32 field itself. + Crypto::Checksum::CRC32 calculated_header_crc32 { header.span().trim(block_header_size - size_of_crc32) }; + // It is stored as an unsigned 32-bit little endian integer. + u32 stored_header_crc32 = TRY(header_stream.read_value>()); + // If the calculated value does not match the stored one, the decoder MUST indicate + // an error." 
+ if (calculated_header_crc32.digest() != stored_header_crc32) + return Error::from_string_literal("Stored XZ block header CRC32 does not match the stored CRC32"); + + m_current_block_stream = move(new_block_stream); + } + + return TRY((*m_current_block_stream)->read_some(bytes)); +} + +ErrorOr XzDecompressor::write_some(ReadonlyBytes) +{ + return Error::from_errno(EBADF); +} + +bool XzDecompressor::is_eof() const +{ + return m_found_stream_footer; +} + +bool XzDecompressor::is_open() const +{ + return true; +} + +void XzDecompressor::close() +{ +} + +} diff --git a/Userland/Libraries/LibCompress/Xz.h b/Userland/Libraries/LibCompress/Xz.h new file mode 100644 index 0000000000..1e95a7ea86 --- /dev/null +++ b/Userland/Libraries/LibCompress/Xz.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2023, Tim Schumacher + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Compress { + +// This implementation is based on the "The .xz File Format" specification version 1.1.0: +// https://tukaani.org/xz/xz-file-format-1.1.0.txt + +// 1.2. Multibyte Integers +class [[gnu::packed]] XzMultibyteInteger { +public: + constexpr XzMultibyteInteger() = default; + + constexpr XzMultibyteInteger(u64 value) + : m_value(value) + { + } + + constexpr operator u64() const { return m_value; } + + static ErrorOr read_from_stream(Stream& stream); + +private: + u64 m_value { 0 }; +}; + +// 2.1.1.2. Stream Flags +enum XzStreamCheckType : u8 { + None = 0x00, + CRC32 = 0x01, + CRC64 = 0x04, + SHA256 = 0x0A, +}; + +// 2.1.1.2. Stream Flags +struct [[gnu::packed]] XzStreamFlags { + u8 reserved; + XzStreamCheckType check_type : 4; + u8 reserved_bits : 4; +}; +static_assert(sizeof(XzStreamFlags) == 2); + +// 2.1.1. 
Stream Header +struct [[gnu::packed]] XzStreamHeader { + u8 magic[6]; + XzStreamFlags flags; + LittleEndian flags_crc32; + + ErrorOr validate(); +}; +static_assert(sizeof(XzStreamHeader) == 12); + +// 2.1.2. Stream Footer +struct [[gnu::packed]] XzStreamFooter { + LittleEndian size_and_flags_crc32; + LittleEndian encoded_backward_size; + XzStreamFlags flags; + u8 magic[2]; + + ErrorOr validate(); + u32 backward_size(); +}; +static_assert(sizeof(XzStreamFooter) == 12); + +// 3.1.2. Block Flags +struct [[gnu::packed]] XzBlockFlags { + u8 encoded_number_of_filters : 2; + u8 reserved : 4; + bool compressed_size_present : 1; + bool uncompressed_size_present : 1; + + u8 number_of_filters(); +}; +static_assert(sizeof(XzBlockFlags) == 1); + +// 5.3.1. LZMA2 +struct [[gnu::packed]] XzFilterLzma2Properties { + u8 encoded_dictionary_size : 6; + u8 reserved : 2; + + ErrorOr validate(); + u32 dictionary_size(); +}; +static_assert(sizeof(XzFilterLzma2Properties) == 1); + +class XzDecompressor : public Stream { +public: + static ErrorOr> create(MaybeOwned); + + virtual ErrorOr read_some(Bytes) override; + virtual ErrorOr write_some(ReadonlyBytes) override; + virtual bool is_eof() const override; + virtual bool is_open() const override; + virtual void close() override; + +private: + XzDecompressor(NonnullOwnPtr, XzStreamFlags); + + NonnullOwnPtr m_stream; + XzStreamFlags m_stream_flags; + bool m_found_stream_footer { false }; + + Optional> m_current_block_stream {}; + Optional m_current_block_uncompressed_size {}; +}; + +} + +template<> +struct AK::Traits : public AK::GenericTraits { + static constexpr bool is_trivially_serializable() { return true; } +}; + +template<> +struct AK::Traits : public AK::GenericTraits { + static constexpr bool is_trivially_serializable() { return true; } +}; + +template<> +struct AK::Traits : public AK::GenericTraits { + static constexpr bool is_trivially_serializable() { return true; } +};