LibVideo/VP9: Implement unscaled fast paths in inter prediction

Inter-prediction convolution filters are selected based on the
subpixel position determined for the motion vector relative to the
block being predicted. Subpixel position 0 uses only the single sample
at the center of the convolution and averages in no other samples.
Let's call this a copy.
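
To see why, note that the subpel_filters row for subpixel position 0
(visible in the LookupTables.h hunk below) is { 0, 0, 0, 128, 0, 0, 0, 0 },
so the convolution followed by the rounded right shift by 7 simply
reproduces the center sample. A small worked example using the
rounded_right_shift helper from this change:

    // Subpixel position 0: only the center tap (128) of the 8-tap filter is non-zero.
    u16 sample = 200;                                         // any 8-bit sample value
    i32 accumulated_samples = 128 * sample;                   // the other seven taps contribute 0
    i32 result = rounded_right_shift(accumulated_samples, 7); // (128 * 200 + 64) >> 7 == 200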

Reference frames can also be a different size than the frame being
predicted, but in almost every case the scale is 1:1 for every frame
of a video.

Taking into account these facts, we can create multiple fast paths for
inter prediction. These fast paths are only active when scaling is 1:1.
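
In outline, the branch structure added to predict_inter_block looks
like the sketch below. This is condensed: copy_x/copy_y mean the
motion vector has no subpixel offset in that dimension,
unscaled_x/unscaled_y mean the scaled step is 16 (1:1), and the work
in each branch is only summarized in comments; the real code uses the
inline lambdas shown in the diff.

    if (unscaled_x && unscaled_y && bit_depth == 8) {
        if (copy_x && copy_y) {
            // Straight memcpy of each row from the reference plane into the block buffer.
            return {};
        }
        if (copy_y) {
            // horizontal_convolution_unscaled() writes directly into the block buffer.
            return {};
        }
        if (copy_x) {
            // vertical_convolution_unscaled() writes directly into the block buffer.
            return {};
        }
        // Both passes, stepping by whole pixels: horizontal convolution into the
        // intermediate buffer, then vertical convolution into the block buffer.
        return {};
    }
    // Otherwise, fall back to the general scaled convolutions, which keep
    // 32-bit accumulators and per-sample subpixel bookkeeping.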

If we are doing a copy in both dimensions, then we can do a straight
memcpy from the reference frame to the output block buffer. In videos
where there is no motion, this is a dramatic speedup.
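
Since the reference plane and the output block buffer have different
strides, the copy still happens one row at a time; a minimal sketch of
the loop (the same shape as the code in the diff):

    auto const* reference_scan_line = reference_start; // top-left sample of this block in the reference plane
    auto* destination_scan_line = block_buffer.data();
    for (auto row = 0u; row < height; row++) {
        memcpy(destination_scan_line, reference_scan_line, width * sizeof(*destination_scan_line));
        reference_scan_line += reference_frame_width;  // reference plane stride (includes the MV_BORDER padding)
        destination_scan_line += width;                // output block stride
    }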

If we are doing a copy in only one dimension, we can do a single
convolution pass and write its averaged result directly into the
output block buffer.

If we aren't doing a copy in either dimension, we can still cut out a
few operations from the convolution loops, since we only need to
advance our samples by whole pixels instead of subpixels.
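
Concretely, when scaled_step_x is 16 the subpixel phase is identical
for every output column, so the filter row can be picked once and the
source pointer advanced by exactly one sample per column, with none of
the per-column shift-and-mask bookkeeping the scaled loop needs. Below
is a minimal sketch of one output row: a hypothetical standalone
helper that mirrors the unscaled horizontal lambda in the diff, but
with a 32-bit accumulator, using the clip_1() and rounded_right_shift()
helpers from Decoder.cpp.

    static void filter_one_row_unscaled(u16* destination, u16 const* source, u32 width,
        i16 const (&filter_row)[8], u8 bit_depth)
    {
        source -= 3; // center the 8-tap filter on the current sample
        for (u32 column = 0; column < width; column++) {
            i32 accumulated_samples = 0;
            for (u32 t = 0; t < 8; t++)
                accumulated_samples += filter_row[t] * source[t];
            destination[column] = clip_1(bit_depth, rounded_right_shift(accumulated_samples, 7));
            source++; // advance by one whole sample; the subpixel phase never changes
        }
    }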

These fast paths result in about a 34% improvement (~31.2s -> ~20.6s)
in a video which relies heavily on intra-predicted blocks due to high
motion. In videos with less motion, the improvement will be even
greater.

Also, note that the accumulators in these faster loops are only 16-bit.
High bit-depth videos will overflow those, so for now the fast path is
only used for 8-bit videos.
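
For a rough sense of scale (an illustrative bound, not a measurement
from this change): a 10-bit sample can be as large as 1023, so the
center tap alone contributes 128 * 1023 = 130,944, already well past
the 32,767 limit of a signed 16-bit accumulator, while the 32-bit
accumulators used in the scaled path have plenty of headroom.
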
Zaggy1024 2023-04-16 08:39:05 -05:00 committed by Tim Flynn
parent 8cd72ad1ed
commit 8ad0dff5c2
3 changed files with 157 additions and 52 deletions

@@ -207,6 +207,13 @@ DecoderErrorOr<NonnullOwnPtr<VideoFrame>> Decoder::get_decoded_frame()
return m_video_frame_queue.dequeue();
}
template<typename T>
static inline i32 rounded_right_shift(T value, u8 bits)
{
value = (value + static_cast<T>(1u << (bits - 1u))) >> bits;
return static_cast<i32>(value);
}
u8 Decoder::merge_prob(u8 pre_prob, u32 count_0, u32 count_1, u8 count_sat, u8 max_update_factor)
{
auto total_decode_count = count_0 + count_1;
@@ -863,6 +870,7 @@ DecoderErrorOr<void> Decoder::predict_inter_block(u8 plane, BlockContext const&
auto x_scale = reference_frame.x_scale;
auto y_scale = reference_frame.y_scale;
// The amount of subpixels between each sample of this block. Non-16 values will cause the output to be scaled.
auto scaled_step_x = reference_frame.scaled_step_x;
auto scaled_step_y = reference_frame.scaled_step_y;
@@ -901,68 +909,175 @@ DecoderErrorOr<void> Decoder::predict_inter_block(u8 plane, BlockContext const&
auto& reference_frame_buffer = reference_frame.frame_planes[plane];
auto reference_frame_width = y_size_to_uv_size(subsampling_x, reference_frame.size.width()) + MV_BORDER * 2;
auto block_buffer_at = [&](u32 row, u32 column) -> u16& {
return block_buffer[row * width + column];
};
// The variable lastX is set equal to ( (RefFrameWidth[ refIdx ] + subX) >> subX) - 1.
// The variable lastY is set equal to ( (RefFrameHeight[ refIdx ] + subY) >> subY) - 1.
// (lastX and lastY specify the coordinates of the bottom right sample of the reference plane.)
// Ad-hoc: These variables are not needed, since the reference frame is expanded to contain the samples that
// may be referenced by motion vectors on the edge of the frame.
// The variable intermediateHeight specifying the height required for the intermediate array is set equal to (((h -
// 1) * yStep + 15) >> 4) + 8.
static constexpr auto maximum_intermediate_height = (((maximum_block_dimensions - 1) * maximum_scaled_step + 15) >> 4) + 8;
auto intermediate_height = (((height - 1) * scaled_step_y + 15) >> 4) + 8;
VERIFY(intermediate_height <= maximum_intermediate_height);
// The sub-sample interpolation is effected via two one-dimensional convolutions. First a horizontal filter is used
// to build up a temporary array, and then this array is vertically filtered to obtain the final prediction. The
// fractional parts of the motion vectors determine the filtering process. If the fractional part is zero, then the
// filtering is equivalent to a straight sample copy.
// The filtering is applied as follows:
constexpr auto sample_offset = 3;
auto subpixel_row_from_reference_row = [offset_scaled_block_y](u32 row) {
return (offset_scaled_block_y >> SUBPEL_BITS) + static_cast<i32>(row);
};
auto reference_index_for_row = [reference_frame_width](i32 row) {
return static_cast<size_t>(MV_BORDER + row) * reference_frame_width;
};
// The variable intermediateHeight specifying the height required for the intermediate array is set equal to (((h -
// 1) * yStep + 15) >> 4) + 8.
static constexpr auto maximum_intermediate_height = (((maximum_block_dimensions - 1) * maximum_scaled_step + 15) >> 4) + 8;
auto const intermediate_height = (((height - 1) * scaled_step_y + 15) >> 4) + 8;
VERIFY(intermediate_height <= maximum_intermediate_height);
// Check our reference frame bounds before starting the loop.
auto const last_possible_reference_index = reference_index_for_row(subpixel_row_from_reference_row(intermediate_height - sample_offset));
VERIFY(reference_frame_buffer.size() >= last_possible_reference_index);
VERIFY(block_buffer.size() >= static_cast<size_t>(width) * height);
auto const reference_block_x = MV_BORDER + (offset_scaled_block_x >> SUBPEL_BITS);
auto const reference_block_y = MV_BORDER + (offset_scaled_block_y >> SUBPEL_BITS);
auto const reference_subpixel_x = offset_scaled_block_x & SUBPEL_MASK;
auto const reference_subpixel_y = offset_scaled_block_y & SUBPEL_MASK;
// OPTIMIZATION: If the fractional part of a component of the motion vector is 0, we want to do a fast path
// skipping one or both of the convolutions.
bool const copy_x = reference_subpixel_x == 0;
bool const copy_y = reference_subpixel_y == 0;
bool const unscaled_x = scaled_step_x == 16;
bool const unscaled_y = scaled_step_y == 16;
// The array intermediate is specified as follows:
// Note: Height is specified by `intermediate_height`, width is specified by `width`
Array<u16, maximum_intermediate_height * maximum_block_dimensions> intermediate_buffer;
auto intermediate_buffer_at = [&](u32 row, u32 column) -> u16& {
return intermediate_buffer[row * width + column];
auto const bit_depth = block_context.frame_context.color_config.bit_depth;
auto const* reference_start = reference_frame_buffer.data() + reference_block_y * reference_frame_width + reference_block_x;
// FIXME: We are using 16-bit accumulators for speed in these loops, but when accumulating for a high bit-depth video, they will overflow.
// Instead of hardcoding them, the Decoder class should have the bit depth as a template parameter, and the accumulators can select
// a size based on whether the bit depth > 8.
if (unscaled_x && unscaled_y && bit_depth == 8) {
if (copy_x && copy_y) {
// We can memcpy here to avoid doing any real work.
auto const* reference_scan_line = &reference_frame_buffer[reference_block_y * reference_frame_width + reference_block_x];
auto* destination_scan_line = block_buffer.data();
for (auto row = 0u; row < height; row++) {
memcpy(destination_scan_line, reference_scan_line, width * sizeof(*destination_scan_line));
reference_scan_line += reference_frame_width;
destination_scan_line += width;
}
return {};
}
auto horizontal_convolution_unscaled = [](auto bit_depth, auto* destination, auto width, auto height, auto const* source, auto source_stride, auto filter, auto subpixel_x) {
source -= sample_offset;
auto const source_end_skip = source_stride - width;
for (auto row = 0u; row < height; row++) {
for (auto column = 0u; column < width; column++) {
i16 accumulated_samples = 0;
for (auto t = 0; t < 8; t++) {
auto sample = source[t];
accumulated_samples += subpel_filters[filter][subpixel_x][t] * sample;
}
*destination = clip_1(bit_depth, rounded_right_shift(accumulated_samples, 7));
source++;
destination++;
}
source += source_end_skip;
}
};
if (copy_y) {
horizontal_convolution_unscaled(bit_depth, block_buffer.data(), width, height, reference_start, reference_frame_width, block_context.interpolation_filter, reference_subpixel_x);
return {};
}
auto vertical_convolution_unscaled = [](auto bit_depth, auto* destination, auto width, auto height, auto const* source, auto source_stride, auto filter, auto subpixel_y) {
auto const source_end_skip = source_stride - width;
for (auto row = 0u; row < height; row++) {
for (auto column = 0u; column < width; column++) {
auto const* scan_column = source;
i16 accumulated_samples = 0;
for (auto t = 0; t < 8; t++) {
auto sample = *scan_column;
accumulated_samples += subpel_filters[filter][subpixel_y][t] * sample;
scan_column += source_stride;
}
*destination = clip_1(bit_depth, rounded_right_shift(accumulated_samples, 7));
source++;
destination++;
}
source += source_end_skip;
}
};
if (copy_x) {
vertical_convolution_unscaled(bit_depth, block_buffer.data(), width, height, reference_start - (sample_offset * reference_frame_width), reference_frame_width, block_context.interpolation_filter, reference_subpixel_y);
return {};
}
horizontal_convolution_unscaled(bit_depth, intermediate_buffer.data(), width, intermediate_height, reference_start - (sample_offset * reference_frame_width), reference_frame_width, block_context.interpolation_filter, reference_subpixel_x);
vertical_convolution_unscaled(bit_depth, block_buffer.data(), width, height, intermediate_buffer.data(), width, block_context.interpolation_filter, reference_subpixel_y);
return {};
}
// NOTE: Accumulators below are 32-bit to allow high bit-depth videos to decode without overflows.
// These should be changed when the accumulators above are.
auto horizontal_convolution_scaled = [](auto bit_depth, auto* destination, auto width, auto height, auto const* source, auto source_stride, auto filter, auto subpixel_x, auto scale_x) {
source -= sample_offset;
for (auto row = 0u; row < height; row++) {
auto scan_subpixel = subpixel_x;
for (auto column = 0u; column < width; column++) {
auto const* scan_line = source + (scan_subpixel >> 4);
i32 accumulated_samples = 0;
for (auto t = 0; t < 8; t++) {
auto sample = scan_line[t];
accumulated_samples += subpel_filters[filter][scan_subpixel & SUBPEL_MASK][t] * sample;
}
*destination = clip_1(bit_depth, rounded_right_shift(accumulated_samples, 7));
destination++;
scan_subpixel += scale_x;
}
source += source_stride;
}
};
// Check our reference frame bounds before starting the loop.
auto last_possible_reference = (MV_BORDER + (offset_scaled_block_y >> 4) + static_cast<i32>(intermediate_height - 1) - 3) * reference_frame_width;
VERIFY(reference_frame_buffer.size() >= last_possible_reference);
auto vertical_convolution_scaled = [](auto bit_depth, auto* destination, auto width, auto height, auto const* source, auto source_stride, auto filter, auto subpixel_y, auto scale_y) {
for (auto row = 0u; row < height; row++) {
auto const* source_column_base = source + (subpixel_y >> SUBPEL_BITS) * source_stride;
for (auto row = 0u; row < intermediate_height; row++) {
auto reference_row = (offset_scaled_block_y >> 4) + static_cast<i32>(row) - 3;
u16 const* scan_line = &reference_frame_buffer[static_cast<size_t>(MV_BORDER + reference_row) * reference_frame_width];
for (auto column = 0u; column < width; column++) {
auto const* scan_column = source_column_base + column;
i32 accumulated_samples = 0;
for (auto t = 0; t < 8; t++) {
auto sample = *scan_column;
accumulated_samples += subpel_filters[filter][subpixel_y & SUBPEL_MASK][t] * sample;
scan_column += source_stride;
}
for (auto column = 0u; column < width; column++) {
auto samples_start = offset_scaled_block_x + static_cast<i32>(scaled_step_x * column);
i32 accumulated_samples = 0;
for (auto t = 0u; t < 8u; t++) {
auto sample = scan_line[MV_BORDER + (samples_start >> 4) + static_cast<i32>(t) - 3];
accumulated_samples += subpel_filters[block_context.interpolation_filter][samples_start & 15][t] * sample;
*destination = clip_1(bit_depth, rounded_right_shift(accumulated_samples, 7));
destination++;
}
intermediate_buffer_at(row, column) = clip_1(block_context.frame_context.color_config.bit_depth, rounded_right_shift(accumulated_samples, 7));
subpixel_y += scale_y;
}
}
};
for (auto row = 0u; row < height; row++) {
for (auto column = 0u; column < width; column++) {
auto samples_start = (offset_scaled_block_y & 15) + static_cast<i32>(scaled_step_y * row);
auto const* scan_column = &intermediate_buffer_at(samples_start >> 4, column);
auto const* subpel_filters_for_samples = subpel_filters[block_context.interpolation_filter][samples_start & 15];
i32 accumulated_samples = 0;
for (auto t = 0u; t < 8u; t++) {
auto sample = *scan_column;
accumulated_samples += subpel_filters_for_samples[t] * sample;
scan_column += width;
}
block_buffer_at(row, column) = clip_1(block_context.frame_context.color_config.bit_depth, rounded_right_shift(accumulated_samples, 7));
}
}
horizontal_convolution_scaled(bit_depth, intermediate_buffer.data(), width, intermediate_height, reference_start - (sample_offset * reference_frame_width), reference_frame_width, block_context.interpolation_filter, offset_scaled_block_x & SUBPEL_MASK, scaled_step_x);
vertical_convolution_scaled(bit_depth, block_buffer.data(), width, height, intermediate_buffer.data(), width, block_context.interpolation_filter, reference_subpixel_y, scaled_step_y);
return {};
}
@@ -1193,13 +1308,6 @@ inline i32 Decoder::sin64(u8 angle)
return cos64(angle - 32u);
}
template<typename T>
inline i32 Decoder::rounded_right_shift(T value, u8 bits)
{
value = (value + static_cast<T>(1u << (bits - 1u))) >> bits;
return static_cast<i32>(value);
}
// (8.7.1.1) The function B( a, b, angle, 0 ) performs a butterfly rotation.
inline void Decoder::butterfly_rotation_in_place(Span<Intermediate> data, size_t index_a, size_t index_b, u8 angle, bool flip)
{

@@ -103,9 +103,6 @@ private:
template<typename S, typename D>
inline void hadamard_rotation(Span<S> source, Span<D> destination, size_t index_a, size_t index_b);
template<typename T>
inline i32 rounded_right_shift(T value, u8 bits);
// (8.7.1.10) This process does an in-place Walsh-Hadamard transform of the array T (of length 4).
inline DecoderErrorOr<void> inverse_walsh_hadamard_transform(Span<Intermediate> data, u8 log2_of_block_size, u8 shift);

@@ -336,7 +336,7 @@ static constexpr u8 counter_to_context[19] = {
};
// Coefficients used by predict_inter
static constexpr i32 subpel_filters[4][16][8] = {
static constexpr i16 subpel_filters[4][16][8] = {
{ { 0, 0, 0, 128, 0, 0, 0, 0 },
{ 0, 1, -5, 126, 8, -3, 1, 0 },
{ -1, 3, -10, 122, 18, -6, 2, 0 },