From 7514e49c173df25f0fbaf1b7f2bdfe93961a2c12 Mon Sep 17 00:00:00 2001 From: Zaggy1024 Date: Tue, 15 Nov 2022 03:36:19 -0600 Subject: [PATCH] LibVideo: Make all VP9 block intermediates stack-allocated arrays This has two benefits: - I observed a ~34% decrease in decoding time running TestVP9Decode. - Removing all of these silly Vector fields helps simplify the code relationships between all the functions in Decoder.cpp. It'll also be much easier to make these static with template specializations, if that turns out to be worthy performance improvement. --- Userland/Libraries/LibVideo/VP9/Decoder.cpp | 178 +++++++++----------- Userland/Libraries/LibVideo/VP9/Decoder.h | 65 +++---- 2 files changed, 105 insertions(+), 138 deletions(-) diff --git a/Userland/Libraries/LibVideo/VP9/Decoder.cpp b/Userland/Libraries/LibVideo/VP9/Decoder.cpp index d498910d3c..5a22587d8f 100644 --- a/Userland/Libraries/LibVideo/VP9/Decoder.cpp +++ b/Userland/Libraries/LibVideo/VP9/Decoder.cpp @@ -135,16 +135,9 @@ inline size_t buffer_size(Gfx::Size size) DecoderErrorOr Decoder::allocate_buffers() { - // FIXME: Confirm that we need to clear buffers between frames. - m_buffers = {}; - for (size_t plane = 0; plane < 3; plane++) { auto size = m_parser->get_decoded_size_for_plane(plane); - auto& temp_buffer = get_temp_buffer(plane); - temp_buffer.clear_with_capacity(); - DECODER_TRY_ALLOC(temp_buffer.try_resize_and_keep_capacity(buffer_size(size))); - auto& output_buffer = get_output_buffer(plane); output_buffer.clear_with_capacity(); DECODER_TRY_ALLOC(output_buffer.try_resize_and_keep_capacity(buffer_size(size))); @@ -152,14 +145,9 @@ DecoderErrorOr Decoder::allocate_buffers() return {}; } -Vector& Decoder::get_temp_buffer(u8 plane) -{ - return m_buffers.intermediate[plane]; -} - Vector& Decoder::get_output_buffer(u8 plane) { - return m_buffers.output[plane]; + return m_output_buffers[plane]; } inline CodingIndependentCodePoints Decoder::get_cicp_color_space() @@ -404,15 +392,14 @@ DecoderErrorOr Decoder::predict_intra(u8 plane, u32 x, u32 y, bool have_le // - [1 .. block_size] // - [block_size + 1 .. block_size * 2] // The array indices must be offset by 1 to accommodate index -1. - Vector& above_row = m_buffers.above_row; - DECODER_TRY_ALLOC(above_row.try_resize_and_keep_capacity(block_size * 2 + 1)); + Array above_row; auto above_row_at = [&](i32 index) -> Intermediate& { return above_row[index + 1]; }; // NOTE: This value is pre-calculated since it is reused in spec below. // Use this to replace spec text "(1<<(BitDepth-1))". - auto half_sample_value = (1u << (m_parser->m_bit_depth - 1u)); + Intermediate half_sample_value = (1 << (m_parser->m_bit_depth - 1)); // The array aboveRow[ i ] for i = 0..size-1 is specified by: if (!have_above) { @@ -452,8 +439,7 @@ DecoderErrorOr Decoder::predict_intra(u8 plane, u32 x, u32 y, bool have_le } // The array leftCol[ i ] for i = 0..size-1 is specified by: - Vector& left_column = m_buffers.left_column; - DECODER_TRY_ALLOC(left_column.try_resize_and_keep_capacity(block_size)); + Array left_column; if (have_left) { // − If haveLeft is equal to 1, leftCol[ i ] is set equal to CurrFrame[ plane ][ Min(maxY, y+i) ][ x-1 ]. for (auto i = 0u; i < block_size; i++) @@ -465,8 +451,7 @@ DecoderErrorOr Decoder::predict_intra(u8 plane, u32 x, u32 y, bool have_le } // A 2D array named pred containing the intra predicted samples is constructed as follows: - Vector& predicted_samples = m_buffers.predicted_samples; - DECODER_TRY_ALLOC(predicted_samples.try_resize_and_keep_capacity(block_size * block_size)); + Array predicted_samples; auto const predicted_sample_at = [&](u32 row, u32 column) -> Intermediate& { return predicted_samples[index_from_row_and_column(row, column, block_size)]; }; @@ -777,8 +762,9 @@ MotionVector Decoder::clamp_motion_vector(u8 plane, MotionVector vector) }; } -DecoderErrorOr Decoder::predict_inter_block(u8 plane, u8 ref_list, u32 x, u32 y, u32 width, u32 height, u32 block_index, Vector& block_buffer) +DecoderErrorOr Decoder::predict_inter_block(u8 plane, u8 ref_list, u32 x, u32 y, u32 width, u32 height, u32 block_index, Span block_buffer) { + VERIFY(width <= maximum_block_dimensions && height <= maximum_block_dimensions); // 2. The motion vector selection process in section 8.5.2.1 is invoked with plane, refList, blockIdx as inputs // and the output being the motion vector mv. auto motion_vector = select_motion_vector(plane, ref_list, block_index); @@ -876,7 +862,8 @@ DecoderErrorOr Decoder::predict_inter_block(u8 plane, u8 ref_list, u32 x, // − variables x and y giving the block location in units of 1/16 th of a sample, // − variables xStep and yStep giving the step size in units of 1/16 th of a sample. (These will be at most equal // to 80 due to the restrictions on scaling between reference frames.) - VERIFY(scaled_step_x <= 80 && scaled_step_y <= 80); + static constexpr i32 MAX_SCALED_STEP = 80; + VERIFY(scaled_step_x <= MAX_SCALED_STEP && scaled_step_y <= MAX_SCALED_STEP); // − variables w and h giving the width and height of the block in units of samples // The output from this process is the 2D array named pred containing inter predicted samples. @@ -887,7 +874,6 @@ DecoderErrorOr Decoder::predict_inter_block(u8 plane, u8 ref_list, u32 x, return reference_frame_buffer[row * reference_frame_width + column]; }; - block_buffer.resize_and_keep_capacity(width * height); auto block_buffer_at = [&](u32 row, u32 column) -> u16& { return block_buffer[row * width + column]; }; @@ -900,7 +886,9 @@ DecoderErrorOr Decoder::predict_inter_block(u8 plane, u8 ref_list, u32 x, // The variable intermediateHeight specifying the height required for the intermediate array is set equal to (((h - // 1) * yStep + 15) >> 4) + 8. + static constexpr auto maximum_intermediate_height = (((maximum_block_dimensions - 1) * MAX_SCALED_STEP + 15) >> 4) + 8; auto intermediate_height = (((height - 1) * scaled_step_y + 15) >> 4) + 8; + VERIFY(intermediate_height <= maximum_intermediate_height); // The sub-sample interpolation is effected via two one-dimensional convolutions. First a horizontal filter is used // to build up a temporary array, and then this array is vertically filtered to obtain the final prediction. The // fractional parts of the motion vectors determine the filtering process. If the fractional part is zero, then the @@ -908,9 +896,7 @@ DecoderErrorOr Decoder::predict_inter_block(u8 plane, u8 ref_list, u32 x, // The filtering is applied as follows: // The array intermediate is specified as follows: // Note: Height is specified by `intermediate_height`, width is specified by `width` - Vector& intermediate_buffer = m_buffers.inter_horizontal; - intermediate_buffer.clear_with_capacity(); - intermediate_buffer.resize_and_keep_capacity(intermediate_height * width); + Array intermediate_buffer; auto intermediate_buffer_at = [&](u32 row, u32 column) -> u16& { return intermediate_buffer[row * width + column]; }; @@ -963,9 +949,10 @@ DecoderErrorOr Decoder::predict_inter(u8 plane, u32 x, u32 y, u32 width, u // The prediction arrays are formed by the following ordered steps: // 1. The variable refList is set equal to 0. // 2. through 5. - auto predicted_buffer = m_buffers.inter_predicted; - TRY(predict_inter_block(plane, 0, x, y, width, height, block_index, predicted_buffer)); - auto predicted_buffer_at = [&](Vector& buffer, u32 row, u32 column) -> u16& { + Array predicted_buffer; + auto predicted_span = predicted_buffer.span().trim(width * height); + TRY(predict_inter_block(plane, 0, x, y, width, height, block_index, predicted_span)); + auto predicted_buffer_at = [&](Span buffer, u32 row, u32 column) -> u16& { return buffer[row * width + column]; }; @@ -988,7 +975,7 @@ DecoderErrorOr Decoder::predict_inter(u8 plane, u32 x, u32 y, u32 width, u if (!is_compound) { for (auto i = 0u; i < height_in_frame_buffer; i++) { for (auto j = 0u; j < width_in_frame_buffer; j++) - frame_buffer_at(y + i, x + j) = predicted_buffer_at(predicted_buffer, i, j); + frame_buffer_at(y + i, x + j) = predicted_buffer_at(predicted_span, i, j); } return {}; @@ -996,12 +983,13 @@ DecoderErrorOr Decoder::predict_inter(u8 plane, u32 x, u32 y, u32 width, u // − Otherwise, CurrFrame[ plane ][ y + i ][ x + j ] is set equal to Round2( preds[ 0 ][ i ][ j ] + preds[ 1 ][ i ][ j ], 1 ) // for i = 0..h-1 and j = 0..w-1. - auto second_predicted_buffer = m_buffers.inter_predicted_compound; - TRY(predict_inter_block(plane, 1, x, y, width, height, block_index, second_predicted_buffer)); + Array second_predicted_buffer; + auto second_predicted_span = second_predicted_buffer.span().trim(width * height); + TRY(predict_inter_block(plane, 1, x, y, width, height, block_index, second_predicted_span)); for (auto i = 0u; i < height_in_frame_buffer; i++) { for (auto j = 0u; j < width_in_frame_buffer; j++) - frame_buffer_at(y + i, x + j) = round_2(predicted_buffer_at(predicted_buffer, i, j) + predicted_buffer_at(second_predicted_buffer, i, j), 1); + frame_buffer_at(y + i, x + j) = round_2(predicted_buffer_at(predicted_span, i, j) + predicted_buffer_at(second_predicted_span, i, j), 1); } return {}; @@ -1089,8 +1077,7 @@ DecoderErrorOr Decoder::reconstruct(u8 plane, u32 transform_block_x, u32 t // 1. Dequant[ i ][ j ] is set equal to ( Tokens[ i * n0 + j ] * get_ac_quant( plane ) ) / dqDenom // for i = 0..(n0-1), for j = 0..(n0-1) - Vector& dequantized = m_buffers.dequantized; - DECODER_TRY_ALLOC(dequantized.try_resize_and_keep_capacity(buffer_size(block_size, block_size))); + Array dequantized; Intermediate ac_quant = get_ac_quant(plane); for (auto i = 0u; i < block_size; i++) { for (auto j = 0u; j < block_size; j++) { @@ -1134,7 +1121,7 @@ DecoderErrorOr Decoder::reconstruct(u8 plane, u32 transform_block_x, u32 t return {}; } -inline DecoderErrorOr Decoder::inverse_walsh_hadamard_transform(Vector& data, u8 log2_of_block_size, u8 shift) +inline DecoderErrorOr Decoder::inverse_walsh_hadamard_transform(Span data, u8 log2_of_block_size, u8 shift) { (void)data; (void)shift; @@ -1192,7 +1179,7 @@ inline bool Decoder::check_intermediate_bounds(Intermediate value) } // (8.7.1.1) The function B( a, b, angle, 0 ) performs a butterfly rotation. -inline void Decoder::butterfly_rotation_in_place(Vector& data, size_t index_a, size_t index_b, u8 angle, bool flip) +inline void Decoder::butterfly_rotation_in_place(Span data, size_t index_a, size_t index_b, u8 angle, bool flip) { auto cos = cos64(angle); auto sin = sin64(angle); @@ -1218,7 +1205,7 @@ inline void Decoder::butterfly_rotation_in_place(Vector& data, siz } // (8.7.1.1) The function H( a, b, 0 ) performs a Hadamard rotation. -inline void Decoder::hadamard_rotation_in_place(Vector& data, size_t index_a, size_t index_b, bool flip) +inline void Decoder::hadamard_rotation_in_place(Span data, size_t index_a, size_t index_b, bool flip) { // The function H( a, b, 1 ) performs a Hadamard rotation with flipped indices and is specified as follows: // 1. The function H( b, a, 0 ) is invoked. @@ -1242,7 +1229,7 @@ inline void Decoder::hadamard_rotation_in_place(Vector& data, size VERIFY(check_intermediate_bounds(data[index_b])); } -inline DecoderErrorOr Decoder::inverse_discrete_cosine_transform_array_permutation(Vector& data, u8 log2_of_block_size) +inline DecoderErrorOr Decoder::inverse_discrete_cosine_transform_array_permutation(Span data, u8 log2_of_block_size) { u8 block_size = 1 << log2_of_block_size; @@ -1252,10 +1239,8 @@ inline DecoderErrorOr Decoder::inverse_discrete_cosine_transform_array_per return DecoderError::corrupted("Block size was out of range"sv); // 1.1. A temporary array named copyT is set equal to T. - Vector& data_copy = m_buffers.transform_temp; - data_copy.clear_with_capacity(); - DECODER_TRY_ALLOC(data_copy.try_resize_and_keep_capacity(buffer_size(block_size, block_size))); - data_copy = data; + Array data_copy; + AK::TypedTransfer::copy(data_copy.data(), data.data(), block_size); // 1.2. T[ i ] is set equal to copyT[ brev( n, i ) ] for i = 0..((1< Decoder::inverse_discrete_cosine_transform_array_per return {}; } -inline DecoderErrorOr Decoder::inverse_discrete_cosine_transform(Vector& data, u8 log2_of_block_size) +inline DecoderErrorOr Decoder::inverse_discrete_cosine_transform(Span data, u8 log2_of_block_size) { // 2.1. The variable n0 is set equal to 1< Decoder::inverse_discrete_cosine_transform(Vector& data, Vector& temp, u8 log2_of_block_size) +inline void Decoder::inverse_asymmetric_discrete_sine_transform_input_array_permutation(Span data, u8 log2_of_block_size) { // The variable n0 is set equal to 1< data_copy; + AK::TypedTransfer::copy(data_copy.data(), data.data(), block_size); // The values at even locations T[ 2 * i ] are set equal to copyT[ n0 - 1 - 2 * i ] for i = 0..(n1-1). // The values at odd locations T[ 2 * i + 1 ] are set equal to copyT[ 2 * i ] for i = 0..(n1-1). for (auto i = 0u; i < block_size; i += 2) { - data[i] = temp[block_size - 1 - i]; - data[i + 1] = temp[i]; + data[i] = data_copy[block_size - 1 - i]; + data[i + 1] = data_copy[i]; } } -inline void Decoder::inverse_asymmetric_discrete_sine_transform_output_array_permutation(Vector& data, Vector& temp, u8 log2_of_block_size) +inline void Decoder::inverse_asymmetric_discrete_sine_transform_output_array_permutation(Span data, u8 log2_of_block_size) { + auto block_size = 1u << log2_of_block_size; + // A temporary array named copyT is set equal to T. - temp = data; + Array data_copy; + AK::TypedTransfer::copy(data_copy.data(), data.data(), block_size); // The permutation depends on n as follows: if (log2_of_block_size == 4) { @@ -1394,7 +1383,7 @@ inline void Decoder::inverse_asymmetric_discrete_sine_transform_output_array_per for (auto b = 0u; b < 2; b++) for (auto c = 0u; c < 2; c++) for (auto d = 0u; d < 2; d++) - data[(8 * a) + (4 * b) + (2 * c) + d] = temp[8 * (d ^ c) + 4 * (c ^ b) + 2 * (b ^ a) + a]; + data[(8 * a) + (4 * b) + (2 * c) + d] = data_copy[8 * (d ^ c) + 4 * (c ^ b) + 2 * (b ^ a) + a]; } else { VERIFY(log2_of_block_size == 3); // − Otherwise (n is equal to 3), @@ -1403,11 +1392,11 @@ inline void Decoder::inverse_asymmetric_discrete_sine_transform_output_array_per for (auto a = 0u; a < 2; a++) for (auto b = 0u; b < 2; b++) for (auto c = 0u; c < 2; c++) - data[4 * a + 2 * b + c] = temp[4 * (c ^ b) + 2 * (b ^ a) + a]; + data[4 * a + 2 * b + c] = data_copy[4 * (c ^ b) + 2 * (b ^ a) + a]; } } -inline void Decoder::inverse_asymmetric_discrete_sine_transform_4(Vector& data) +inline void Decoder::inverse_asymmetric_discrete_sine_transform_4(Span data) { VERIFY(data.size() == 4); const i64 sinpi_1_9 = 5283; @@ -1474,7 +1463,7 @@ inline void Decoder::inverse_asymmetric_discrete_sine_transform_4(Vector -inline void Decoder::butterfly_rotation(Vector& source, Vector& destination, size_t index_a, size_t index_b, u8 angle, bool flip) +inline void Decoder::butterfly_rotation(Span source, Span destination, size_t index_a, size_t index_b, u8 angle, bool flip) { // The function SB( a, b, angle, 0 ) performs a butterfly rotation according to the following ordered steps: auto cos = cos64(angle); @@ -1497,7 +1486,7 @@ inline void Decoder::butterfly_rotation(Vector& source, Vector& destinatio // The function SH( a, b ) performs a Hadamard rotation and rounding. // Spec defines the source array as S, and the destination array as T. template -inline void Decoder::hadamard_rotation(Vector& source, Vector& destination, size_t index_a, size_t index_b) +inline void Decoder::hadamard_rotation(Span source, Span destination, size_t index_a, size_t index_b) { // Keep the source buffer's precision until rounding. S a = source[index_a]; @@ -1508,26 +1497,23 @@ inline void Decoder::hadamard_rotation(Vector& source, Vector& destination destination[index_b] = round_2(a - b, 14); } -inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform_8(Vector& data) +inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform_8(Span data) { VERIFY(data.size() == 8); - // This process does an in-place transform of the array T using: // A higher precision array S for intermediate results. - Vector& high_precision_temp = m_buffers.adst_temp; - high_precision_temp.clear_with_capacity(); - DECODER_TRY_ALLOC(high_precision_temp.try_resize_and_keep_capacity(8)); + Array high_precision_temp; // The following ordered steps apply: // 1. Invoke the ADST input array permutation process specified in section 8.7.1.4 with the input variable n set // equal to 3. - inverse_asymmetric_discrete_sine_transform_input_array_permutation(data, m_buffers.transform_temp, 3); + inverse_asymmetric_discrete_sine_transform_input_array_permutation(data, 3); // 2. Invoke SB( 2*i, 1+2*i, 30-8*i, 1 ) for i = 0..3. for (auto i = 0u; i < 4; i++) - butterfly_rotation(data, high_precision_temp, 2 * i, 1 + (2 * i), 30 - (8 * i), true); + butterfly_rotation(data, high_precision_temp.span(), 2 * i, 1 + (2 * i), 30 - (8 * i), true); // (8.7.1.1) NOTE - The values in array S require higher precision to avoid overflow. Using signed integers with // 24 + BitDepth bits of precision is enough to avoid overflow. const u8 bits = 24 + m_parser->m_bit_depth; @@ -1535,17 +1521,17 @@ inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform_ VERIFY(check_bounds(high_precision_temp[i], bits)); // 3. Invoke SH( i, 4+i ) for i = 0..3. for (auto i = 0u; i < 4; i++) - hadamard_rotation(high_precision_temp, data, i, 4 + i); + hadamard_rotation(high_precision_temp.span(), data, i, 4 + i); // 4. Invoke SB( 4+3*i, 5+i, 24-16*i, 1 ) for i = 0..1. for (auto i = 0u; i < 2; i++) - butterfly_rotation(data, high_precision_temp, 4 + (3 * i), 5 + i, 24 - (16 * i), true); + butterfly_rotation(data, high_precision_temp.span(), 4 + (3 * i), 5 + i, 24 - (16 * i), true); // Check again that we haven't exceeded the integer bounds. for (auto i = 0u; i < 8; i++) VERIFY(check_bounds(high_precision_temp[i], bits)); // 5. Invoke SH( 4+i, 6+i ) for i = 0..1. for (auto i = 0u; i < 2; i++) - hadamard_rotation(high_precision_temp, data, 4 + i, 6 + i); + hadamard_rotation(high_precision_temp.span(), data, 4 + i, 6 + i); // 6. Invoke H( i, 2+i, 0 ) for i = 0..1. for (auto i = 0u; i < 2; i++) @@ -1557,7 +1543,7 @@ inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform_ // 8. Invoke the ADST output array permutation process specified in section 8.7.1.5 with the input variable n // set equal to 3. - inverse_asymmetric_discrete_sine_transform_output_array_permutation(data, m_buffers.transform_temp, 3); + inverse_asymmetric_discrete_sine_transform_output_array_permutation(data, 3); // 9. Set T[ 1+2*i ] equal to -T[ 1+2*i ] for i = 0..3. for (auto i = 0u; i < 4; i++) { @@ -1567,25 +1553,23 @@ inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform_ return {}; } -inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform_16(Vector& data) +inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform_16(Span data) { VERIFY(data.size() == 16); // This process does an in-place transform of the array T using: // A higher precision array S for intermediate results. - Vector& high_precision_temp = m_buffers.adst_temp; - high_precision_temp.clear_with_capacity(); - DECODER_TRY_ALLOC(high_precision_temp.try_resize_and_keep_capacity(16)); + Array high_precision_temp; // The following ordered steps apply: // 1. Invoke the ADST input array permutation process specified in section 8.7.1.4 with the input variable n set // equal to 4. - inverse_asymmetric_discrete_sine_transform_input_array_permutation(data, m_buffers.transform_temp, 4); + inverse_asymmetric_discrete_sine_transform_input_array_permutation(data, 4); // 2. Invoke SB( 2*i, 1+2*i, 31-4*i, 1 ) for i = 0..7. for (auto i = 0u; i < 8; i++) - butterfly_rotation(data, high_precision_temp, 2 * i, 1 + (2 * i), 31 - (4 * i), true); + butterfly_rotation(data, high_precision_temp.span(), 2 * i, 1 + (2 * i), 31 - (4 * i), true); // (8.7.1.1) The inverse asymmetric discrete sine transforms also make use of an intermediate array named S. // The values in this array require higher precision to avoid overflow. Using signed integers with 24 + // BitDepth bits of precision is enough to avoid overflow. @@ -1594,17 +1578,17 @@ inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform_ VERIFY(check_bounds(data[i], bits)); // 3. Invoke SH( i, 8+i ) for i = 0..7. for (auto i = 0u; i < 8; i++) - hadamard_rotation(high_precision_temp, data, i, 8 + i); + hadamard_rotation(high_precision_temp.span(), data, i, 8 + i); // 4. Invoke SB( 8+2*i, 9+2*i, 28-16*i, 1 ) for i = 0..3. for (auto i = 0u; i < 4; i++) - butterfly_rotation(data, high_precision_temp, 8 + (2 * i), 9 + (2 * i), 128 + 28 - (16 * i), true); + butterfly_rotation(data, high_precision_temp.span(), 8 + (2 * i), 9 + (2 * i), 128 + 28 - (16 * i), true); // Check again that we haven't exceeded the integer bounds. for (auto i = 0u; i < 16; i++) VERIFY(check_bounds(data[i], bits)); // 5. Invoke SH( 8+i, 12+i ) for i = 0..3. for (auto i = 0u; i < 4; i++) - hadamard_rotation(high_precision_temp, data, 8 + i, 12 + i); + hadamard_rotation(high_precision_temp.span(), data, 8 + i, 12 + i); // 6. Invoke H( i, 4+i, 0 ) for i = 0..3. for (auto i = 0u; i < 4; i++) @@ -1613,14 +1597,14 @@ inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform_ // 7. Invoke SB( 4+8*i+3*j, 5+8*i+j, 24-16*j, 1 ) for i = 0..1, for j = 0..1. for (auto i = 0u; i < 2; i++) for (auto j = 0u; j < 2; j++) - butterfly_rotation(data, high_precision_temp, 4 + (8 * i) + (3 * j), 5 + (8 * i) + j, 24 - (16 * j), true); + butterfly_rotation(data, high_precision_temp.span(), 4 + (8 * i) + (3 * j), 5 + (8 * i) + j, 24 - (16 * j), true); // Check again that we haven't exceeded the integer bounds. for (auto i = 0u; i < 16; i++) VERIFY(check_bounds(data[i], bits)); // 8. Invoke SH( 4+8*j+i, 6+8*j+i ) for i = 0..1, j = 0..1. for (auto i = 0u; i < 2; i++) for (auto j = 0u; j < 2; j++) - hadamard_rotation(high_precision_temp, data, 4 + (8 * j) + i, 6 + (8 * j) + i); + hadamard_rotation(high_precision_temp.span(), data, 4 + (8 * j) + i, 6 + (8 * j) + i); // 9. Invoke H( 8*j+i, 2+8*j+i, 0 ) for i = 0..1, for j = 0..1. for (auto i = 0u; i < 2; i++) @@ -1633,7 +1617,7 @@ inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform_ // 11. Invoke the ADST output array permutation process specified in section 8.7.1.5 with the input variable n // set equal to 4. - inverse_asymmetric_discrete_sine_transform_output_array_permutation(data, m_buffers.transform_temp, 4); + inverse_asymmetric_discrete_sine_transform_output_array_permutation(data, 4); // 12. Set T[ 1+12*j+2*i ] equal to -T[ 1+12*j+2*i ] for i = 0..1, for j = 0..1. for (auto i = 0u; i < 2; i++) { @@ -1645,7 +1629,7 @@ inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform_ return {}; } -inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform(Vector& data, u8 log2_of_block_size) +inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform(Span data, u8 log2_of_block_size) { // 8.7.1.9 Inverse ADST Process @@ -1658,7 +1642,8 @@ inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform( // − If n is equal to 2, invoke the Inverse ADST4 process specified in section 8.7.1.6. inverse_asymmetric_discrete_sine_transform_4(data); return {}; - } else if (log2_of_block_size == 3) { + } + if (log2_of_block_size == 3) { // − Otherwise if n is equal to 3, invoke the Inverse ADST8 process specified in section 8.7.1.7. return inverse_asymmetric_discrete_sine_transform_8(data); } @@ -1666,7 +1651,7 @@ inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform( return inverse_asymmetric_discrete_sine_transform_16(data); } -DecoderErrorOr Decoder::inverse_transform_2d(Vector& dequantized, u8 log2_of_block_size) +DecoderErrorOr Decoder::inverse_transform_2d(Span dequantized, u8 log2_of_block_size) { // This process performs a 2D inverse transform for an array of size 2^n by 2^n stored in the 2D array Dequant. // The input to this process is a variable n (log2_of_block_size) that specifies the base 2 logarithm of the width of the transform. @@ -1674,19 +1659,19 @@ DecoderErrorOr Decoder::inverse_transform_2d(Vector& dequant // 1. Set the variable n0 (block_size) equal to 1 << n. auto block_size = 1u << log2_of_block_size; - Vector& row_or_column = m_buffers.row_or_column; - DECODER_TRY_ALLOC(row_or_column.try_resize_and_keep_capacity(block_size)); + Array row_array; + Span row = row_array.span().trim(block_size); // 2. The row transforms with i = 0..(n0-1) are applied as follows: for (auto i = 0u; i < block_size; i++) { // 1. Set T[ j ] equal to Dequant[ i ][ j ] for j = 0..(n0-1). for (auto j = 0u; j < block_size; j++) - row_or_column[j] = dequantized[index_from_row_and_column(i, j, block_size)]; + row[j] = dequantized[index_from_row_and_column(i, j, block_size)]; // 2. If Lossless is equal to 1, invoke the Inverse WHT process as specified in section 8.7.1.10 with shift equal // to 2. if (m_parser->m_lossless) { - TRY(inverse_walsh_hadamard_transform(row_or_column, log2_of_block_size, 2)); + TRY(inverse_walsh_hadamard_transform(row, log2_of_block_size, 2)); continue; } switch (m_parser->m_tx_type) { @@ -1695,15 +1680,15 @@ DecoderErrorOr Decoder::inverse_transform_2d(Vector& dequant // Otherwise, if TxType is equal to DCT_DCT or TxType is equal to ADST_DCT, apply an inverse DCT as // follows: // 1. Invoke the inverse DCT permutation process as specified in section 8.7.1.2 with the input variable n. - TRY(inverse_discrete_cosine_transform_array_permutation(row_or_column, log2_of_block_size)); + TRY(inverse_discrete_cosine_transform_array_permutation(row, log2_of_block_size)); // 2. Invoke the inverse DCT process as specified in section 8.7.1.3 with the input variable n. - TRY(inverse_discrete_cosine_transform(row_or_column, log2_of_block_size)); + TRY(inverse_discrete_cosine_transform(row, log2_of_block_size)); break; case DCT_ADST: case ADST_ADST: // 4. Otherwise (TxType is equal to DCT_ADST or TxType is equal to ADST_ADST), invoke the inverse ADST // process as specified in section 8.7.1.9 with input variable n. - TRY(inverse_asymmetric_discrete_sine_transform(row_or_column, log2_of_block_size)); + TRY(inverse_asymmetric_discrete_sine_transform(row, log2_of_block_size)); break; default: return DecoderError::corrupted("Unknown tx_type"sv); @@ -1711,19 +1696,22 @@ DecoderErrorOr Decoder::inverse_transform_2d(Vector& dequant // 5. Set Dequant[ i ][ j ] equal to T[ j ] for j = 0..(n0-1). for (auto j = 0u; j < block_size; j++) - dequantized[index_from_row_and_column(i, j, block_size)] = row_or_column[j]; + dequantized[index_from_row_and_column(i, j, block_size)] = row[j]; } + Array column_array; + auto column = column_array.span().trim(block_size); + // 3. The column transforms with j = 0..(n0-1) are applied as follows: for (auto j = 0u; j < block_size; j++) { // 1. Set T[ i ] equal to Dequant[ i ][ j ] for i = 0..(n0-1). for (auto i = 0u; i < block_size; i++) - row_or_column[i] = dequantized[index_from_row_and_column(i, j, block_size)]; + column[i] = dequantized[index_from_row_and_column(i, j, block_size)]; // 2. If Lossless is equal to 1, invoke the Inverse WHT process as specified in section 8.7.1.10 with shift equal // to 0. if (m_parser->m_lossless) { - TRY(inverse_walsh_hadamard_transform(row_or_column, log2_of_block_size, 2)); + TRY(inverse_walsh_hadamard_transform(column, log2_of_block_size, 2)); continue; } switch (m_parser->m_tx_type) { @@ -1732,15 +1720,15 @@ DecoderErrorOr Decoder::inverse_transform_2d(Vector& dequant // Otherwise, if TxType is equal to DCT_DCT or TxType is equal to DCT_ADST, apply an inverse DCT as // follows: // 1. Invoke the inverse DCT permutation process as specified in section 8.7.1.2 with the input variable n. - TRY(inverse_discrete_cosine_transform_array_permutation(row_or_column, log2_of_block_size)); + TRY(inverse_discrete_cosine_transform_array_permutation(column, log2_of_block_size)); // 2. Invoke the inverse DCT process as specified in section 8.7.1.3 with the input variable n. - TRY(inverse_discrete_cosine_transform(row_or_column, log2_of_block_size)); + TRY(inverse_discrete_cosine_transform(column, log2_of_block_size)); break; case ADST_DCT: case ADST_ADST: // 4. Otherwise (TxType is equal to ADST_DCT or TxType is equal to ADST_ADST), invoke the inverse ADST // process as specified in section 8.7.1.9 with input variable n. - TRY(inverse_asymmetric_discrete_sine_transform(row_or_column, log2_of_block_size)); + TRY(inverse_asymmetric_discrete_sine_transform(column, log2_of_block_size)); break; default: VERIFY_NOT_REACHED(); @@ -1748,7 +1736,7 @@ DecoderErrorOr Decoder::inverse_transform_2d(Vector& dequant // 5. If Lossless is equal to 1, set Dequant[ i ][ j ] equal to T[ i ] for i = 0..(n0-1). for (auto i = 0u; i < block_size; i++) - dequantized[index_from_row_and_column(i, j, block_size)] = row_or_column[i]; + dequantized[index_from_row_and_column(i, j, block_size)] = column[i]; // 6. Otherwise (Lossless is equal to 0), set Dequant[ i ][ j ] equal to Round2( T[ i ], Min( 6, n + 2 ) ) // for i = 0..(n0-1). diff --git a/Userland/Libraries/LibVideo/VP9/Decoder.h b/Userland/Libraries/LibVideo/VP9/Decoder.h index 119f524cfd..7985686170 100644 --- a/Userland/Libraries/LibVideo/VP9/Decoder.h +++ b/Userland/Libraries/LibVideo/VP9/Decoder.h @@ -36,6 +36,12 @@ public: private: typedef i32 Intermediate; + // Based on the maximum size resulting from num_4x4_blocks_wide_lookup. + static constexpr size_t maximum_block_dimensions = 64ULL; + static constexpr size_t maximum_block_size = maximum_block_dimensions * maximum_block_dimensions; + // Based on the maximum for TXSize. + static constexpr size_t maximum_transform_size = 32ULL * 32ULL; + DecoderErrorOr decode_frame(ReadonlyBytes); DecoderErrorOr create_video_frame(); @@ -64,7 +70,7 @@ private: // (8.5.2.3) Motion vector scaling process DecoderErrorOr scale_motion_vector(u8 plane, u8 ref_list, u32 x, u32 y, MotionVector vector); // From (8.5.1) Inter prediction process, steps 2-5 - DecoderErrorOr predict_inter_block(u8 plane, u8 ref_list, u32 x, u32 y, u32 width, u32 height, u32 block_index, Vector& buffer); + DecoderErrorOr predict_inter_block(u8 plane, u8 ref_list, u32 x, u32 y, u32 width, u32 height, u32 block_index, Span block_buffer); /* (8.6) Reconstruction and Dequantization */ @@ -82,7 +88,7 @@ private: DecoderErrorOr reconstruct(u8 plane, u32 transform_block_x, u32 transform_block_y, TXSize transform_block_size); // (8.7) Inverse transform process - DecoderErrorOr inverse_transform_2d(Vector& dequantized, u8 log2_of_block_size); + DecoderErrorOr inverse_transform_2d(Span dequantized, u8 log2_of_block_size); // (8.7.1) 1D Transforms // (8.7.1.1) Butterfly functions @@ -90,17 +96,17 @@ private: inline i32 cos64(u8 angle); inline i32 sin64(u8 angle); // The function B( a, b, angle, 0 ) performs a butterfly rotation. - inline void butterfly_rotation_in_place(Vector& data, size_t index_a, size_t index_b, u8 angle, bool flip); + inline void butterfly_rotation_in_place(Span data, size_t index_a, size_t index_b, u8 angle, bool flip); // The function H( a, b, 0 ) performs a Hadamard rotation. - inline void hadamard_rotation_in_place(Vector& data, size_t index_a, size_t index_b, bool flip); + inline void hadamard_rotation_in_place(Span data, size_t index_a, size_t index_b, bool flip); // The function SB( a, b, angle, 0 ) performs a butterfly rotation. // Spec defines the source as array T, and the destination array as S. template - inline void butterfly_rotation(Vector& source, Vector& destination, size_t index_a, size_t index_b, u8 angle, bool flip); + inline void butterfly_rotation(Span source, Span destination, size_t index_a, size_t index_b, u8 angle, bool flip); // The function SH( a, b ) performs a Hadamard rotation and rounding. // Spec defines the source array as S, and the destination array as T. template - inline void hadamard_rotation(Vector& source, Vector& destination, size_t index_a, size_t index_b); + inline void hadamard_rotation(Span source, Span destination, size_t index_a, size_t index_b); template inline i32 round_2(T value, u8 bits); @@ -109,30 +115,30 @@ private: inline bool check_intermediate_bounds(Intermediate value); // (8.7.1.10) This process does an in-place Walsh-Hadamard transform of the array T (of length 4). - inline DecoderErrorOr inverse_walsh_hadamard_transform(Vector& data, u8 log2_of_block_size, u8 shift); + inline DecoderErrorOr inverse_walsh_hadamard_transform(Span data, u8 log2_of_block_size, u8 shift); // (8.7.1.2) Inverse DCT array permutation process - inline DecoderErrorOr inverse_discrete_cosine_transform_array_permutation(Vector& data, u8 log2_of_block_size); + inline DecoderErrorOr inverse_discrete_cosine_transform_array_permutation(Span data, u8 log2_of_block_size); // (8.7.1.3) Inverse DCT process - inline DecoderErrorOr inverse_discrete_cosine_transform(Vector& data, u8 log2_of_block_size); + inline DecoderErrorOr inverse_discrete_cosine_transform(Span data, u8 log2_of_block_size); // (8.7.1.4) This process performs the in-place permutation of the array T of length 2 n which is required as the first step of // the inverse ADST. - inline void inverse_asymmetric_discrete_sine_transform_input_array_permutation(Vector& data, Vector& temp, u8 log2_of_block_size); + inline void inverse_asymmetric_discrete_sine_transform_input_array_permutation(Span data, u8 log2_of_block_size); // (8.7.1.5) This process performs the in-place permutation of the array T of length 2 n which is required before the final // step of the inverse ADST. - inline void inverse_asymmetric_discrete_sine_transform_output_array_permutation(Vector& data, Vector& temp, u8 log2_of_block_size); + inline void inverse_asymmetric_discrete_sine_transform_output_array_permutation(Span data, u8 log2_of_block_size); // (8.7.1.6) This process does an in-place transform of the array T to perform an inverse ADST. - inline void inverse_asymmetric_discrete_sine_transform_4(Vector& data); + inline void inverse_asymmetric_discrete_sine_transform_4(Span data); // (8.7.1.7) This process does an in-place transform of the array T using a higher precision array S for intermediate // results. - inline DecoderErrorOr inverse_asymmetric_discrete_sine_transform_8(Vector& data); + inline DecoderErrorOr inverse_asymmetric_discrete_sine_transform_8(Span data); // (8.7.1.8) This process does an in-place transform of the array T using a higher precision array S for intermediate // results. - inline DecoderErrorOr inverse_asymmetric_discrete_sine_transform_16(Vector& data); + inline DecoderErrorOr inverse_asymmetric_discrete_sine_transform_16(Span data); // (8.7.1.9) This process performs an in-place inverse ADST process on the array T of size 2 n for 2 ≤ n ≤ 4. - inline DecoderErrorOr inverse_asymmetric_discrete_sine_transform(Vector& data, u8 log2_of_block_size); + inline DecoderErrorOr inverse_asymmetric_discrete_sine_transform(Span data, u8 log2_of_block_size); /* (8.10) Reference Frame Update Process */ DecoderErrorOr update_reference_frames(); @@ -141,34 +147,7 @@ private: NonnullOwnPtr m_parser; - struct { - // FIXME: We may be able to consolidate some of these to reduce memory consumption. - - // FIXME: Create a new struct to store these buffers, specifying size and providing - // helper functions to get values at coordinates. All *_at(row, column) - // functions in Decoder.cpp and functions returning row * width + column - // should be replaced if possible. - - Vector dequantized; - Vector row_or_column; - - // predict_intra - Vector above_row; - Vector left_column; - Vector predicted_samples; - - // transforms (dct, adst) - Vector transform_temp; - Vector adst_temp; - - // predict_inter - Vector inter_horizontal; - Vector inter_predicted; - Vector inter_predicted_compound; - - Vector intermediate[3]; - Vector output[3]; - } m_buffers; + Vector m_output_buffers[3]; Queue, 1> m_video_frame_queue; };