From f6764beeaddb54d8b59fa7d5cb666c7ad636be00 Mon Sep 17 00:00:00 2001 From: Zaggy1024 Date: Sun, 16 Apr 2023 11:07:06 -0500 Subject: [PATCH] LibVideo/VP9: Specialize transforms on their block size Previously, the block sizes would be checked at runtime to determine the transform size to apply for residuals. Making the block sizes into constant expressions allows all the loops to be unrolled and reduces branching significantly. This results in about a 26% improvement (~18s -> ~13.2s) in speed in an intra-heavy test video. --- Userland/Libraries/LibVideo/VP9/Decoder.cpp | 117 +++++++++++++------- Userland/Libraries/LibVideo/VP9/Decoder.h | 20 +++- 2 files changed, 90 insertions(+), 47 deletions(-) diff --git a/Userland/Libraries/LibVideo/VP9/Decoder.cpp b/Userland/Libraries/LibVideo/VP9/Decoder.cpp index de83efba05..a58526c0ae 100644 --- a/Userland/Libraries/LibVideo/VP9/Decoder.cpp +++ b/Userland/Libraries/LibVideo/VP9/Decoder.cpp @@ -1220,16 +1220,39 @@ DecoderErrorOr Decoder::reconstruct(u8 plane, BlockContext const& block_co { // 8.6.2 Reconstruct process - // The variable dqDenom is set equal to 2 if txSz is equal to Transform_32X32, otherwise dqDenom is set equal to 1. - Intermediate dq_denominator = transform_block_size == Transform_32x32 ? 2 : 1; // The variable n (specifying the base 2 logarithm of the width of the transform block) is set equal to 2 + txSz. u8 log2_of_block_size = 2u + transform_block_size; + switch (log2_of_block_size) { + case 2: + return reconstruct_templated<2>(plane, block_context, transform_block_x, transform_block_y, transform_set); + break; + case 3: + return reconstruct_templated<3>(plane, block_context, transform_block_x, transform_block_y, transform_set); + break; + case 4: + return reconstruct_templated<4>(plane, block_context, transform_block_x, transform_block_y, transform_set); + break; + case 5: + return reconstruct_templated<5>(plane, block_context, transform_block_x, transform_block_y, transform_set); + break; + default: + VERIFY_NOT_REACHED(); + } +} + +template +DecoderErrorOr Decoder::reconstruct_templated(u8 plane, BlockContext const& block_context, u32 transform_block_x, u32 transform_block_y, TransformSet transform_set) +{ + // 8.6.2 Reconstruct process, continued: + + // The variable dqDenom is set equal to 2 if txSz is equal to Transform_32X32, otherwise dqDenom is set equal to 1. + constexpr Intermediate dq_denominator = log2_of_block_size == 5 ? 2 : 1; // The variable n0 (specifying the width of the transform block) is set equal to 1 << n. - auto block_size = 1u << log2_of_block_size; + constexpr auto block_size = 1u << log2_of_block_size; // 1. Dequant[ i ][ j ] is set equal to ( Tokens[ i * n0 + j ] * get_ac_quant( plane ) ) / dqDenom // for i = 0..(n0-1), for j = 0..(n0-1) - Array dequantized; + Array dequantized; Intermediate ac_quant = get_ac_quantizer(block_context, plane); for (auto i = 0u; i < block_size; i++) { for (auto j = 0u; j < block_size; j++) { @@ -1250,7 +1273,7 @@ DecoderErrorOr Decoder::reconstruct(u8 plane, BlockContext const& block_co // 3. Invoke the 2D inverse transform block process defined in section 8.7.2 with the variable n as input. // The inverse transform outputs are stored back to the Dequant buffer. - TRY(inverse_transform_2d(block_context, dequantized, log2_of_block_size, transform_set)); + TRY(inverse_transform_2d(block_context, dequantized, transform_set)); // 4. CurrFrame[ plane ][ y + i ][ x + j ] is set equal to Clip1( CurrFrame[ plane ][ y + i ][ x + j ] + Dequant[ i ][ j ] ) // for i = 0..(n0-1) and j = 0..(n0-1). @@ -1359,9 +1382,12 @@ inline void Decoder::hadamard_rotation_in_place(Span data, size_t // to allow these bounds to be violated. Therefore, we can avoid the performance cost here. } -inline DecoderErrorOr Decoder::inverse_discrete_cosine_transform_array_permutation(Span data, u8 log2_of_block_size) +template +inline DecoderErrorOr Decoder::inverse_discrete_cosine_transform_array_permutation(Span data) { - u8 block_size = 1 << log2_of_block_size; + static_assert(log2_of_block_size >= 2 && log2_of_block_size <= 5, "Block size out of range."); + + constexpr u8 block_size = 1 << log2_of_block_size; // This process performs an in-place permutation of the array T of length 2^n for 2 ≤ n ≤ 5 which is required before // execution of the inverse DCT process. @@ -1369,7 +1395,7 @@ inline DecoderErrorOr Decoder::inverse_discrete_cosine_transform_array_per return DecoderError::corrupted("Block size was out of range"sv); // 1.1. A temporary array named copyT is set equal to T. - Array data_copy; + Array data_copy; AK::TypedTransfer::copy(data_copy.data(), data.data(), block_size); // 1.2. T[ i ] is set equal to copyT[ brev( n, i ) ] for i = 0..((1< Decoder::inverse_discrete_cosine_transform_array_per return {}; } -inline DecoderErrorOr Decoder::inverse_discrete_cosine_transform(Span data, u8 log2_of_block_size) +template +inline DecoderErrorOr Decoder::inverse_discrete_cosine_transform(Span data) { + static_assert(log2_of_block_size >= 2 && log2_of_block_size <= 5, "Block size out of range."); + // 2.1. The variable n0 is set equal to 1<> 1; + constexpr u8 half_block_size = block_size >> 1; // 2.3 The variable n2 is set equal to 1<<(n-2). - u8 quarter_block_size = half_block_size >> 1; + constexpr u8 quarter_block_size = half_block_size >> 1; // 2.4 The variable n3 is set equal to 1<<(n-3). - u8 eighth_block_size = quarter_block_size >> 1; + constexpr u8 eighth_block_size = quarter_block_size >> 1; // 2.5 If n is equal to 2, invoke B( 0, 1, 16, 1 ), otherwise recursively invoke the inverse DCT defined in this // section with the variable n set equal to n - 1. - if (log2_of_block_size == 2) + if constexpr (log2_of_block_size == 2) butterfly_rotation_in_place(data, 0, 1, 16, true); else - TRY(inverse_discrete_cosine_transform(data, log2_of_block_size - 1)); + TRY(inverse_discrete_cosine_transform(data)); // 2.6 Invoke B( n1+i, n0-1-i, 32-brev( 5, n1+i), 0 ) for i = 0..(n2-1). for (auto i = 0u; i < quarter_block_size; i++) { @@ -1407,7 +1436,7 @@ inline DecoderErrorOr Decoder::inverse_discrete_cosine_transform(Span= 3) { + if constexpr (log2_of_block_size >= 3) { // a. Invoke H( n1+4*i+2*j, n1+1+4*i+2*j, j ) for i = 0..(n3-1), j = 0..1. for (auto i = 0u; i < eighth_block_size; i++) { for (auto j = 0u; j < 2; j++) { @@ -1418,7 +1447,7 @@ inline DecoderErrorOr Decoder::inverse_discrete_cosine_transform(Span Decoder::inverse_discrete_cosine_transform(Span= 4) { + if constexpr (log2_of_block_size >= 4) { // a. Invoke B( n0-n+2-i-n2*j, n1+n-3+i+n2*j, 24+48*j, 1 ) for i = 0..(n==5), j = 0..1. for (auto i = 0u; i <= (log2_of_block_size == 5); i++) { for (auto j = 0u; j < 2; j++) { @@ -1461,7 +1490,7 @@ inline DecoderErrorOr Decoder::inverse_discrete_cosine_transform(Span= 3) { + if constexpr (log2_of_block_size >= 3) { // a. Invoke B( n0-n3-1-i, n1+n3+i, 16, 1 ) for i = 0..(n3-1). for (auto i = 0u; i < eighth_block_size; i++) { auto index_a = block_size - eighth_block_size - 1 - i; @@ -1477,15 +1506,16 @@ inline DecoderErrorOr Decoder::inverse_discrete_cosine_transform(Span data, u8 log2_of_block_size) +template +inline void Decoder::inverse_asymmetric_discrete_sine_transform_input_array_permutation(Span data) { // The variable n0 is set equal to 1< data_copy; + Array data_copy; AK::TypedTransfer::copy(data_copy.data(), data.data(), block_size); // The values at even locations T[ 2 * i ] are set equal to copyT[ n0 - 1 - 2 * i ] for i = 0..(n1-1). @@ -1496,7 +1526,8 @@ inline void Decoder::inverse_asymmetric_discrete_sine_transform_input_array_perm } } -inline void Decoder::inverse_asymmetric_discrete_sine_transform_output_array_permutation(Span data, u8 log2_of_block_size) +template +inline void Decoder::inverse_asymmetric_discrete_sine_transform_output_array_permutation(Span data) { auto block_size = 1u << log2_of_block_size; @@ -1638,7 +1669,7 @@ inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform_ // 1. Invoke the ADST input array permutation process specified in section 8.7.1.4 with the input variable n set // equal to 3. - inverse_asymmetric_discrete_sine_transform_input_array_permutation(data, 3); + inverse_asymmetric_discrete_sine_transform_input_array_permutation<3>(data); // 2. Invoke SB( 2*i, 1+2*i, 30-8*i, 1 ) for i = 0..3. for (auto i = 0u; i < 4; i++) @@ -1665,7 +1696,7 @@ inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform_ // 8. Invoke the ADST output array permutation process specified in section 8.7.1.5 with the input variable n // set equal to 3. - inverse_asymmetric_discrete_sine_transform_output_array_permutation(data, 3); + inverse_asymmetric_discrete_sine_transform_output_array_permutation<3>(data); // 9. Set T[ 1+2*i ] equal to -T[ 1+2*i ] for i = 0..3. for (auto i = 0u; i < 4; i++) { @@ -1690,7 +1721,7 @@ inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform_ // 1. Invoke the ADST input array permutation process specified in section 8.7.1.4 with the input variable n set // equal to 4. - inverse_asymmetric_discrete_sine_transform_input_array_permutation(data, 4); + inverse_asymmetric_discrete_sine_transform_input_array_permutation<4>(data); // 2. Invoke SB( 2*i, 1+2*i, 31-4*i, 1 ) for i = 0..7. for (auto i = 0u; i < 8; i++) @@ -1730,7 +1761,7 @@ inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform_ // 11. Invoke the ADST output array permutation process specified in section 8.7.1.5 with the input variable n // set equal to 4. - inverse_asymmetric_discrete_sine_transform_output_array_permutation(data, 4); + inverse_asymmetric_discrete_sine_transform_output_array_permutation<4>(data); // 12. Set T[ 1+12*j+2*i ] equal to -T[ 1+12*j+2*i ] for i = 0..1, for j = 0..1. for (auto i = 0u; i < 2; i++) { @@ -1742,21 +1773,22 @@ inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform_ return {}; } -inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform(Span data, u8 log2_of_block_size) +template +inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform(Span data) { // 8.7.1.9 Inverse ADST Process // This process performs an in-place inverse ADST process on the array T of size 2^n for 2 ≤ n ≤ 4. - if (log2_of_block_size < 2 || log2_of_block_size > 4) + if constexpr (log2_of_block_size < 2 || log2_of_block_size > 4) return DecoderError::corrupted("Block size was out of range"sv); // The process to invoke depends on n as follows: - if (log2_of_block_size == 2) { + if constexpr (log2_of_block_size == 2) { // − If n is equal to 2, invoke the Inverse ADST4 process specified in section 8.7.1.6. inverse_asymmetric_discrete_sine_transform_4(data); return {}; } - if (log2_of_block_size == 3) { + if constexpr (log2_of_block_size == 3) { // − Otherwise if n is equal to 3, invoke the Inverse ADST8 process specified in section 8.7.1.7. return inverse_asymmetric_discrete_sine_transform_8(data); } @@ -1764,15 +1796,18 @@ inline DecoderErrorOr Decoder::inverse_asymmetric_discrete_sine_transform( return inverse_asymmetric_discrete_sine_transform_16(data); } -DecoderErrorOr Decoder::inverse_transform_2d(BlockContext const& block_context, Span dequantized, u8 log2_of_block_size, TransformSet transform_set) +template +DecoderErrorOr Decoder::inverse_transform_2d(BlockContext const& block_context, Span dequantized, TransformSet transform_set) { + static_assert(log2_of_block_size >= 2 && log2_of_block_size <= 5); + // This process performs a 2D inverse transform for an array of size 2^n by 2^n stored in the 2D array Dequant. // The input to this process is a variable n (log2_of_block_size) that specifies the base 2 logarithm of the width of the transform. // 1. Set the variable n0 (block_size) equal to 1 << n. - auto block_size = 1u << log2_of_block_size; + constexpr auto block_size = 1u << log2_of_block_size; - Array row_array; + Array row_array; Span row = row_array.span().trim(block_size); // 2. The row transforms with i = 0..(n0-1) are applied as follows: @@ -1792,14 +1827,14 @@ DecoderErrorOr Decoder::inverse_transform_2d(BlockContext const& block_con // Otherwise, if TxType is equal to DCT_DCT or TxType is equal to ADST_DCT, apply an inverse DCT as // follows: // 1. Invoke the inverse DCT permutation process as specified in section 8.7.1.2 with the input variable n. - TRY(inverse_discrete_cosine_transform_array_permutation(row, log2_of_block_size)); + TRY(inverse_discrete_cosine_transform_array_permutation(row)); // 2. Invoke the inverse DCT process as specified in section 8.7.1.3 with the input variable n. - TRY(inverse_discrete_cosine_transform(row, log2_of_block_size)); + TRY(inverse_discrete_cosine_transform(row)); break; case TransformType::ADST: // 4. Otherwise (TxType is equal to DCT_ADST or TxType is equal to ADST_ADST), invoke the inverse ADST // process as specified in section 8.7.1.9 with input variable n. - TRY(inverse_asymmetric_discrete_sine_transform(row, log2_of_block_size)); + TRY(inverse_asymmetric_discrete_sine_transform(row)); break; default: return DecoderError::corrupted("Unknown tx_type"sv); @@ -1810,7 +1845,7 @@ DecoderErrorOr Decoder::inverse_transform_2d(BlockContext const& block_con dequantized[i * block_size + j] = row[j]; } - Array column_array; + Array column_array; auto column = column_array.span().trim(block_size); // 3. The column transforms with j = 0..(n0-1) are applied as follows: @@ -1830,14 +1865,14 @@ DecoderErrorOr Decoder::inverse_transform_2d(BlockContext const& block_con // Otherwise, if TxType is equal to DCT_DCT or TxType is equal to DCT_ADST, apply an inverse DCT as // follows: // 1. Invoke the inverse DCT permutation process as specified in section 8.7.1.2 with the input variable n. - TRY(inverse_discrete_cosine_transform_array_permutation(column, log2_of_block_size)); + TRY(inverse_discrete_cosine_transform_array_permutation(column)); // 2. Invoke the inverse DCT process as specified in section 8.7.1.3 with the input variable n. - TRY(inverse_discrete_cosine_transform(column, log2_of_block_size)); + TRY(inverse_discrete_cosine_transform(column)); break; case TransformType::ADST: // 4. Otherwise (TxType is equal to ADST_DCT or TxType is equal to ADST_ADST), invoke the inverse ADST // process as specified in section 8.7.1.9 with input variable n. - TRY(inverse_asymmetric_discrete_sine_transform(column, log2_of_block_size)); + TRY(inverse_asymmetric_discrete_sine_transform(column)); break; default: VERIFY_NOT_REACHED(); diff --git a/Userland/Libraries/LibVideo/VP9/Decoder.h b/Userland/Libraries/LibVideo/VP9/Decoder.h index f861e67468..9246fd504f 100644 --- a/Userland/Libraries/LibVideo/VP9/Decoder.h +++ b/Userland/Libraries/LibVideo/VP9/Decoder.h @@ -81,9 +81,12 @@ private: // (8.6.2) Reconstruct process DecoderErrorOr reconstruct(u8 plane, BlockContext const&, u32 transform_block_x, u32 transform_block_y, TransformSize transform_block_size, TransformSet); + template + DecoderErrorOr reconstruct_templated(u8 plane, BlockContext const&, u32 transform_block_x, u32 transform_block_y, TransformSet); // (8.7) Inverse transform process - DecoderErrorOr inverse_transform_2d(BlockContext const&, Span dequantized, u8 log2_of_block_size, TransformSet); + template + DecoderErrorOr inverse_transform_2d(BlockContext const&, Span dequantized, TransformSet); // (8.7.1) 1D Transforms // (8.7.1.1) Butterfly functions @@ -107,16 +110,20 @@ private: inline DecoderErrorOr inverse_walsh_hadamard_transform(Span data, u8 log2_of_block_size, u8 shift); // (8.7.1.2) Inverse DCT array permutation process - inline DecoderErrorOr inverse_discrete_cosine_transform_array_permutation(Span data, u8 log2_of_block_size); + template + inline DecoderErrorOr inverse_discrete_cosine_transform_array_permutation(Span data); // (8.7.1.3) Inverse DCT process - inline DecoderErrorOr inverse_discrete_cosine_transform(Span data, u8 log2_of_block_size); + template + inline DecoderErrorOr inverse_discrete_cosine_transform(Span data); // (8.7.1.4) This process performs the in-place permutation of the array T of length 2 n which is required as the first step of // the inverse ADST. - inline void inverse_asymmetric_discrete_sine_transform_input_array_permutation(Span data, u8 log2_of_block_size); + template + inline void inverse_asymmetric_discrete_sine_transform_input_array_permutation(Span data); // (8.7.1.5) This process performs the in-place permutation of the array T of length 2 n which is required before the final // step of the inverse ADST. - inline void inverse_asymmetric_discrete_sine_transform_output_array_permutation(Span data, u8 log2_of_block_size); + template + inline void inverse_asymmetric_discrete_sine_transform_output_array_permutation(Span data); // (8.7.1.6) This process does an in-place transform of the array T to perform an inverse ADST. inline void inverse_asymmetric_discrete_sine_transform_4(Span data); @@ -127,7 +134,8 @@ private: // results. inline DecoderErrorOr inverse_asymmetric_discrete_sine_transform_16(Span data); // (8.7.1.9) This process performs an in-place inverse ADST process on the array T of size 2 n for 2 ≤ n ≤ 4. - inline DecoderErrorOr inverse_asymmetric_discrete_sine_transform(Span data, u8 log2_of_block_size); + template + inline DecoderErrorOr inverse_asymmetric_discrete_sine_transform(Span data); /* (8.10) Reference Frame Update Process */ DecoderErrorOr update_reference_frames(FrameContext const&);