serenity/Libraries/LibCompress/Deflate.cpp

/*
 * Copyright (c) 2020, the SerenityOS developers
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice, this
 *    list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <AK/Assertions.h>
#include <AK/LogStream.h>
#include <AK/Span.h>
#include <AK/Types.h>
#include <AK/Vector.h>
#include <LibCompress/Deflate.h>

namespace Compress {

bool DeflateStream::read_next_block() const
{
    if (m_read_last_block)
        return false;

    m_read_last_block = m_reader.read_bits(1);
    auto block_type = m_reader.read_bits(2);

    switch (block_type) {
    case 0:
        decompress_uncompressed_block();
        break;
    case 1:
        decompress_static_block();
        break;
    case 2:
        decompress_dynamic_block();
        break;
    case 3:
        dbg() << "Block contains reserved block type...";
        ASSERT_NOT_REACHED();
        break;
    default:
        dbg() << "Invalid block type was read...";
        ASSERT_NOT_REACHED();
        break;
    }

    return true;
}

void DeflateStream::decompress_uncompressed_block() const
{
    // Align to the next byte boundary.
    while (m_reader.get_bit_byte_offset() != 0) {
        m_reader.read();
    }

    auto length = m_reader.read_bits(16) & 0xFFFF;
    auto negated_length = m_reader.read_bits(16) & 0xFFFF;

    if ((length ^ 0xFFFF) != negated_length) {
        dbg() << "Block length is invalid...";
        ASSERT_NOT_REACHED();
    }

    for (size_t i = 0; i < length; i++) {
        auto byte = m_reader.read_byte();
        if (byte < 0) {
            dbg() << "Ran out of bytes while reading uncompressed block...";
            ASSERT_NOT_REACHED();
        }

        m_intermediate_stream << byte;
    }
}

void DeflateStream::decompress_static_block() const
{
    decompress_huffman_block(m_literal_length_codes, &m_fixed_distance_codes);
}

void DeflateStream::decompress_dynamic_block() const
{
    auto codes = decode_huffman_codes();
    if (codes.size() == 2) {
        decompress_huffman_block(codes[0], &codes[1]);
    } else {
        decompress_huffman_block(codes[0], nullptr);
    }
}

void DeflateStream::decompress_huffman_block(CanonicalCode& length_codes, CanonicalCode* distance_codes) const
{
    for (;;) {
        u32 symbol = length_codes.next_symbol(m_reader);

        // End of block.
        if (symbol == 256) {
            break;
        }

        // literal byte.
        if (symbol < 256) {
            m_intermediate_stream << static_cast<u8>(symbol);
            continue;
        }

        // Length and distance for copying.
        ASSERT(distance_codes);

        auto run = decode_run_length(symbol);
        if (run < 3 || run > 258) {
            dbg() << "Invalid run length";
            ASSERT_NOT_REACHED();
        }

        auto distance_symbol = distance_codes->next_symbol(m_reader);
        auto distance = decode_distance(distance_symbol);
        if (distance < 1 || distance > 32768) {
            dbg() << "Invalid distance";
            ASSERT_NOT_REACHED();
        }

        copy_from_history(distance, run);
    }
}

Vector<CanonicalCode> DeflateStream::decode_huffman_codes() const
{
    // FIXME: This path is not tested.
    Vector<CanonicalCode> result;

    auto length_code_count = m_reader.read_bits(5) + 257;
    auto distance_code_count = m_reader.read_bits(5) + 1;

    size_t length_code_code_length = m_reader.read_bits(4) + 4;

    Vector<u8> code_length_code_length;
    code_length_code_length.resize(19);
    code_length_code_length[16] = m_reader.read_bits(3);
    code_length_code_length[17] = m_reader.read_bits(3);
    code_length_code_length[18] = m_reader.read_bits(3);
    code_length_code_length[0] = m_reader.read_bits(3);
    for (size_t i = 0; i < length_code_code_length; i++) {
        auto index = (i % 2 == 0) ? (8 + (i / 2)) : (7 - (i / 2));
        code_length_code_length[index] = m_reader.read_bits(3);
    }

    auto code_length_code = CanonicalCode(code_length_code_length);

    Vector<u32> code_lens;
    code_lens.resize(length_code_count + distance_code_count);

    for (size_t index = 0; index < code_lens.capacity();) {
        auto symbol = code_length_code.next_symbol(m_reader);

        if (symbol <= 15) {
            code_lens[index] = symbol;
            index++;
            continue;
        }

        u32 run_length;
        u32 run_value = 0;

        if (symbol == 16) {
            if (index == 0) {
                dbg() << "No code length value avaliable";
                ASSERT_NOT_REACHED();
            }

            run_length = m_reader.read_bits(2) + 3;
            run_value = code_lens[index - 1];
        } else if (symbol == 17) {
            run_length = m_reader.read_bits(3) + 3;
        } else if (symbol == 18) {
            run_length = m_reader.read_bits(7) + 11;
        } else {
            dbg() << "Code symbol is out of range!";
            ASSERT_NOT_REACHED();
        }

        u32 end = index + run_length;
        if (end > code_lens.capacity()) {
            dbg() << "Code run is out of range!";
            ASSERT_NOT_REACHED();
        }

        memset(code_lens.data() + index, run_value, run_length);
        index = end;
    }

    Vector<u8> literal_codes;
    literal_codes.resize(length_code_count);
    memcpy(literal_codes.data(), code_lens.data(), literal_codes.capacity());
    result.append(CanonicalCode(literal_codes));

    Vector<u8> distance_codes;
    distance_codes.resize(distance_code_count);
    memcpy(distance_codes.data(), code_lens.data() + length_code_count, distance_codes.capacity());

    if (distance_code_count == 1 && distance_codes[0] == 0) {
        return result;
    }

    u8 one_count = 0;
    u8 other_count = 0;

    for (size_t i = 0; i < distance_codes.capacity(); i++) {
        u8 value = distance_codes.at(i);

        if (value == 1) {
            one_count++;
        } else if (value > 1) {
            other_count++;
        }
    }

    if (one_count == 1 && other_count == 0) {
        distance_codes.resize(32);
        distance_codes[31] = 1;
    }

    result.append(CanonicalCode(distance_codes));
    return result;
}

u32 DeflateStream::decode_run_length(u32 symbol) const
{
    if (symbol <= 264) {
        return symbol - 254;
    }

    if (symbol <= 284) {
        auto extra_bits = (symbol - 261) / 4;
        return ((((symbol - 265) % 4) + 4) << extra_bits) + 3 + m_reader.read_bits(extra_bits);
    }

    if (symbol == 285) {
        return 258;
    }

    dbg() << "Found invalid symbol in run length " << symbol;
    ASSERT_NOT_REACHED();
}

u32 DeflateStream::decode_distance(u32 symbol) const
{
    if (symbol <= 3) {
        return symbol + 1;
    }

    if (symbol <= 29) {
        auto extra_bits = (symbol / 2) - 1;
        return (((symbol % 2) + 2) << extra_bits) + 1 + m_reader.read_bits(extra_bits);
    }

    dbg() << "Found invalid symbol in distance" << symbol;
    ASSERT_NOT_REACHED();
}

void DeflateStream::copy_from_history(u32 distance, u32 run) const
{
    for (size_t i = 0; i < run; i++) {
        u8 byte;

        // FIXME: In many cases we can read more than one byte at a time, this should
        //        be refactored into a while loop. Beware, edge case:
        //
        //            // The first four bytes are on the stream already, the other four
        //            // are written by copy_from_history() itself.
        //            copy_from_history(4, 8);
        m_intermediate_stream.read({ &byte, sizeof(byte) }, m_intermediate_stream.woffset() - distance);
        m_intermediate_stream << byte;
    }
}

i8 BitStreamReader::read()
{
    if (m_current_byte == -1) {
        return -1;
    }

    if (m_remaining_bits == 0) {
        if (m_data_index + 1 > m_data.size())
            return -1;

        m_current_byte = m_data.at(m_data_index++);
        m_remaining_bits = 8;
    }

    m_remaining_bits--;
    return (m_current_byte >> (7 - m_remaining_bits)) & 1;
}

i8 BitStreamReader::read_byte()
{
    m_current_byte = 0;
    m_remaining_bits = 0;

    if (m_data_index + 1 > m_data.size())
        return -1;

    return m_data.at(m_data_index++);
}

u8 BitStreamReader::get_bit_byte_offset()
{
    return (8 - m_remaining_bits) % 8;
}

u32 BitStreamReader::read_bits(u8 count)
{
    ASSERT(count > 0 && count < 32);

    u32 result = 0;
    for (size_t i = 0; i < count; i++) {
        result |= read() << i;
    }
    return result;
}

Vector<u8> DeflateStream::generate_literal_length_codes() const
{
    Vector<u8> ll_codes;
    ll_codes.resize(288);
    memset(ll_codes.data() + 0, 8, 144 - 0);
    memset(ll_codes.data() + 144, 9, 256 - 144);
    memset(ll_codes.data() + 256, 7, 280 - 256);
    memset(ll_codes.data() + 280, 8, 288 - 280);
    return ll_codes;
}

Vector<u8> DeflateStream::generate_fixed_distance_codes() const
{
    Vector<u8> fd_codes;
    fd_codes.resize(32);
    memset(fd_codes.data(), 5, 32);
    return fd_codes;
}

CanonicalCode::CanonicalCode(Vector<u8> codes)
{
    m_symbol_codes.resize(codes.size());
    m_symbol_values.resize(codes.size());

    auto allocated_symbols_count = 0;
    auto next_code = 0;

    for (size_t code_length = 1; code_length <= 15; code_length++) {
        next_code <<= 1;
        auto start_bit = 1 << code_length;

        for (size_t symbol = 0; symbol < codes.size(); symbol++) {
            if (codes.at(symbol) != code_length) {
                continue;
            }

            if (next_code > start_bit) {
                dbg() << "Canonical code overflows the huffman tree";
                ASSERT_NOT_REACHED();
            }

            m_symbol_codes[allocated_symbols_count] = start_bit | next_code;
            m_symbol_values[allocated_symbols_count] = symbol;

            allocated_symbols_count++;
            next_code++;
        }
    }

    if (next_code != (1 << 15)) {
        dbg() << "Canonical code underflows the huffman tree " << next_code;
        ASSERT_NOT_REACHED();
    }
}

static i32 binary_search(Vector<u32>& heystack, u32 needle)
{
    i32 low = 0;
    i32 high = heystack.size();

    while (low <= high) {
        u32 mid = (low + high) >> 1;
        u32 value = heystack.at(mid);

        if (value < needle) {
            low = mid + 1;
        } else if (value > needle) {
            high = mid - 1;
        } else {
            return mid;
        }
    }

    return -1;
}

u32 CanonicalCode::next_symbol(BitStreamReader& reader)
{
    auto code_bits = 1;

    for (;;) {
        code_bits = code_bits << 1 | reader.read();
        i32 index = binary_search(m_symbol_codes, code_bits);
        if (index >= 0) {
            return m_symbol_values.at(index);
        }
    }
}

}