1
Fork 0
mirror of https://github.com/RGBCube/serenity synced 2025-07-27 07:57:46 +00:00

LibVideo/VP9: Specialize transforms on their block size

Previously, the block sizes would be checked at runtime to
determine the transform size to apply for residuals. Making the block
sizes into constant expressions allows all the loops to be unrolled
and reduces branching significantly.

This results in about a 26% speed improvement (~18s -> ~13.2s) on an
intra-heavy test video.
This commit is contained in:
Zaggy1024 2023-04-16 11:07:06 -05:00 committed by Tim Flynn
parent 5b4c1056f1
commit f6764beead
2 changed files with 90 additions and 47 deletions

View file

@ -81,9 +81,12 @@ private:
// (8.6.2) Reconstruct process
DecoderErrorOr<void> reconstruct(u8 plane, BlockContext const&, u32 transform_block_x, u32 transform_block_y, TransformSize transform_block_size, TransformSet);
template<u8 log2_of_block_size>
DecoderErrorOr<void> reconstruct_templated(u8 plane, BlockContext const&, u32 transform_block_x, u32 transform_block_y, TransformSet);
// (8.7) Inverse transform process
DecoderErrorOr<void> inverse_transform_2d(BlockContext const&, Span<Intermediate> dequantized, u8 log2_of_block_size, TransformSet);
template<u8 log2_of_block_size>
DecoderErrorOr<void> inverse_transform_2d(BlockContext const&, Span<Intermediate> dequantized, TransformSet);
// (8.7.1) 1D Transforms
// (8.7.1.1) Butterfly functions
@ -107,16 +110,20 @@ private:
inline DecoderErrorOr<void> inverse_walsh_hadamard_transform(Span<Intermediate> data, u8 log2_of_block_size, u8 shift);
// (8.7.1.2) Inverse DCT array permutation process
inline DecoderErrorOr<void> inverse_discrete_cosine_transform_array_permutation(Span<Intermediate> data, u8 log2_of_block_size);
template<u8 log2_of_block_size>
inline DecoderErrorOr<void> inverse_discrete_cosine_transform_array_permutation(Span<Intermediate> data);
// (8.7.1.3) Inverse DCT process
inline DecoderErrorOr<void> inverse_discrete_cosine_transform(Span<Intermediate> data, u8 log2_of_block_size);
template<u8 log2_of_block_size>
inline DecoderErrorOr<void> inverse_discrete_cosine_transform(Span<Intermediate> data);
// (8.7.1.4) This process performs the in-place permutation of the array T of length 2^n which is required as the first step of
// the inverse ADST.
inline void inverse_asymmetric_discrete_sine_transform_input_array_permutation(Span<Intermediate> data, u8 log2_of_block_size);
template<u8 log2_of_block_size>
inline void inverse_asymmetric_discrete_sine_transform_input_array_permutation(Span<Intermediate> data);
// (8.7.1.5) This process performs the in-place permutation of the array T of length 2^n which is required before the final
// step of the inverse ADST.
inline void inverse_asymmetric_discrete_sine_transform_output_array_permutation(Span<Intermediate> data, u8 log2_of_block_size);
template<u8 log2_of_block_size>
inline void inverse_asymmetric_discrete_sine_transform_output_array_permutation(Span<Intermediate> data);
// (8.7.1.6) This process does an in-place transform of the array T to perform an inverse ADST.
inline void inverse_asymmetric_discrete_sine_transform_4(Span<Intermediate> data);
@ -127,7 +134,8 @@ private:
// results.
inline DecoderErrorOr<void> inverse_asymmetric_discrete_sine_transform_16(Span<Intermediate> data);
// (8.7.1.9) This process performs an in-place inverse ADST process on the array T of size 2^n for 2 ≤ n ≤ 4.
inline DecoderErrorOr<void> inverse_asymmetric_discrete_sine_transform(Span<Intermediate> data, u8 log2_of_block_size);
template<u8 log2_of_block_size>
inline DecoderErrorOr<void> inverse_asymmetric_discrete_sine_transform(Span<Intermediate> data);
/* (8.10) Reference Frame Update Process */
DecoderErrorOr<void> update_reference_frames(FrameContext const&);