LibVideo: Make all VP9 block intermediates stack-allocated arrays

This has two benefits: - I observed a ~34% decrease in decoding time when running TestVP9Decode. - Removing all of these silly Vector fields helps simplify the code relationships among the functions in Decoder.cpp. It'll also be much easier to make these static with template specializations, if that turns out to be a worthwhile performance improvement.
2025-09-17 05:46:17 +00:00 · 2022-11-15 03:36:19 -06:00 · 2022-11-15 03:36:19 -06:00 · 7514e49c17
commit 7514e49c17
parent c922d21ecb
2 changed files with 105 additions and 138 deletions
--- a/Userland/Libraries/LibVideo/VP9/Decoder.h
+++ b/Userland/Libraries/LibVideo/VP9/Decoder.h
@ -36,6 +36,12 @@ public:
 private:
    typedef i32 Intermediate;

+    // Based on the maximum size resulting from num_4x4_blocks_wide_lookup.
+    static constexpr size_t maximum_block_dimensions = 64ULL;
+    static constexpr size_t maximum_block_size = maximum_block_dimensions * maximum_block_dimensions;
+    // Based on the maximum for TXSize.
+    static constexpr size_t maximum_transform_size = 32ULL * 32ULL;
+
    DecoderErrorOr<void> decode_frame(ReadonlyBytes);
    DecoderErrorOr<void> create_video_frame();

@ -64,7 +70,7 @@ private:
    // (8.5.2.3) Motion vector scaling process
    DecoderErrorOr<MotionVector> scale_motion_vector(u8 plane, u8 ref_list, u32 x, u32 y, MotionVector vector);
    // From (8.5.1) Inter prediction process, steps 2-5
-    DecoderErrorOr<void> predict_inter_block(u8 plane, u8 ref_list, u32 x, u32 y, u32 width, u32 height, u32 block_index, Vector<u16>& buffer);
+    DecoderErrorOr<void> predict_inter_block(u8 plane, u8 ref_list, u32 x, u32 y, u32 width, u32 height, u32 block_index, Span<u16> block_buffer);

    /* (8.6) Reconstruction and Dequantization */

@ -82,7 +88,7 @@ private:
    DecoderErrorOr<void> reconstruct(u8 plane, u32 transform_block_x, u32 transform_block_y, TXSize transform_block_size);

    // (8.7) Inverse transform process
-    DecoderErrorOr<void> inverse_transform_2d(Vector<Intermediate>& dequantized, u8 log2_of_block_size);
+    DecoderErrorOr<void> inverse_transform_2d(Span<Intermediate> dequantized, u8 log2_of_block_size);

    // (8.7.1) 1D Transforms
    // (8.7.1.1) Butterfly functions
@ -90,17 +96,17 @@ private:
    inline i32 cos64(u8 angle);
    inline i32 sin64(u8 angle);
    // The function B( a, b, angle, 0 ) performs a butterfly rotation.
-    inline void butterfly_rotation_in_place(Vector<Intermediate>& data, size_t index_a, size_t index_b, u8 angle, bool flip);
+    inline void butterfly_rotation_in_place(Span<Intermediate> data, size_t index_a, size_t index_b, u8 angle, bool flip);
    // The function H( a, b, 0 ) performs a Hadamard rotation.
-    inline void hadamard_rotation_in_place(Vector<Intermediate>& data, size_t index_a, size_t index_b, bool flip);
+    inline void hadamard_rotation_in_place(Span<Intermediate> data, size_t index_a, size_t index_b, bool flip);
    // The function SB( a, b, angle, 0 ) performs a butterfly rotation.
    // Spec defines the source as array T, and the destination array as S.
    template<typename S, typename D>
-    inline void butterfly_rotation(Vector<S>& source, Vector<D>& destination, size_t index_a, size_t index_b, u8 angle, bool flip);
+    inline void butterfly_rotation(Span<S> source, Span<D> destination, size_t index_a, size_t index_b, u8 angle, bool flip);
    // The function SH( a, b ) performs a Hadamard rotation and rounding.
    // Spec defines the source array as S, and the destination array as T.
    template<typename S, typename D>
-    inline void hadamard_rotation(Vector<S>& source, Vector<D>& destination, size_t index_a, size_t index_b);
+    inline void hadamard_rotation(Span<S> source, Span<D> destination, size_t index_a, size_t index_b);

    template<typename T>
    inline i32 round_2(T value, u8 bits);
@ -109,30 +115,30 @@ private:
    inline bool check_intermediate_bounds(Intermediate value);

    // (8.7.1.10) This process does an in-place Walsh-Hadamard transform of the array T (of length 4).
-    inline DecoderErrorOr<void> inverse_walsh_hadamard_transform(Vector<Intermediate>& data, u8 log2_of_block_size, u8 shift);
+    inline DecoderErrorOr<void> inverse_walsh_hadamard_transform(Span<Intermediate> data, u8 log2_of_block_size, u8 shift);

    // (8.7.1.2) Inverse DCT array permutation process
-    inline DecoderErrorOr<void> inverse_discrete_cosine_transform_array_permutation(Vector<Intermediate>& data, u8 log2_of_block_size);
+    inline DecoderErrorOr<void> inverse_discrete_cosine_transform_array_permutation(Span<Intermediate> data, u8 log2_of_block_size);
    // (8.7.1.3) Inverse DCT process
-    inline DecoderErrorOr<void> inverse_discrete_cosine_transform(Vector<Intermediate>& data, u8 log2_of_block_size);
+    inline DecoderErrorOr<void> inverse_discrete_cosine_transform(Span<Intermediate> data, u8 log2_of_block_size);

    // (8.7.1.4) This process performs the in-place permutation of the array T of length 2 n which is required as the first step of
    // the inverse ADST.
-    inline void inverse_asymmetric_discrete_sine_transform_input_array_permutation(Vector<Intermediate>& data, Vector<Intermediate>& temp, u8 log2_of_block_size);
+    inline void inverse_asymmetric_discrete_sine_transform_input_array_permutation(Span<Intermediate> data, u8 log2_of_block_size);
    // (8.7.1.5) This process performs the in-place permutation of the array T of length 2 n which is required before the final
    // step of the inverse ADST.
-    inline void inverse_asymmetric_discrete_sine_transform_output_array_permutation(Vector<Intermediate>& data, Vector<Intermediate>& temp, u8 log2_of_block_size);
+    inline void inverse_asymmetric_discrete_sine_transform_output_array_permutation(Span<Intermediate> data, u8 log2_of_block_size);

    // (8.7.1.6) This process does an in-place transform of the array T to perform an inverse ADST.
-    inline void inverse_asymmetric_discrete_sine_transform_4(Vector<Intermediate>& data);
+    inline void inverse_asymmetric_discrete_sine_transform_4(Span<Intermediate> data);
    // (8.7.1.7) This process does an in-place transform of the array T using a higher precision array S for intermediate
    // results.
-    inline DecoderErrorOr<void> inverse_asymmetric_discrete_sine_transform_8(Vector<Intermediate>& data);
+    inline DecoderErrorOr<void> inverse_asymmetric_discrete_sine_transform_8(Span<Intermediate> data);
    // (8.7.1.8) This process does an in-place transform of the array T using a higher precision array S for intermediate
    // results.
-    inline DecoderErrorOr<void> inverse_asymmetric_discrete_sine_transform_16(Vector<Intermediate>& data);
+    inline DecoderErrorOr<void> inverse_asymmetric_discrete_sine_transform_16(Span<Intermediate> data);
    // (8.7.1.9) This process performs an in-place inverse ADST process on the array T of size 2 n for 2 ≤ n ≤ 4.
-    inline DecoderErrorOr<void> inverse_asymmetric_discrete_sine_transform(Vector<Intermediate>& data, u8 log2_of_block_size);
+    inline DecoderErrorOr<void> inverse_asymmetric_discrete_sine_transform(Span<Intermediate> data, u8 log2_of_block_size);

    /* (8.10) Reference Frame Update Process */
    DecoderErrorOr<void> update_reference_frames();
@ -141,34 +147,7 @@ private:

    NonnullOwnPtr<Parser> m_parser;

-    struct {
-        // FIXME: We may be able to consolidate some of these to reduce memory consumption.
-
-        // FIXME: Create a new struct to store these buffers, specifying size and providing
-        //        helper functions to get values at coordinates. All *_at(row, column)
-        //        functions in Decoder.cpp and functions returning row * width + column
-        //        should be replaced if possible.
-
-        Vector<Intermediate> dequantized;
-        Vector<Intermediate> row_or_column;
-
-        // predict_intra
-        Vector<Intermediate> above_row;
-        Vector<Intermediate> left_column;
-        Vector<Intermediate> predicted_samples;
-
-        // transforms (dct, adst)
-        Vector<Intermediate> transform_temp;
-        Vector<i64> adst_temp;
-
-        // predict_inter
-        Vector<u16> inter_horizontal;
-        Vector<u16> inter_predicted;
-        Vector<u16> inter_predicted_compound;
-
-        Vector<Intermediate> intermediate[3];
-        Vector<u16> output[3];
-    } m_buffers;
+    Vector<u16> m_output_buffers[3];

    Queue<NonnullOwnPtr<VideoFrame>, 1> m_video_frame_queue;
 };