From 036eb82acab1d599fad69654a98e9fdf9aacb2cf Mon Sep 17 00:00:00 2001 From: Zaggy1024 Date: Thu, 13 Apr 2023 20:51:24 -0500 Subject: [PATCH] LibVideo/VP9: Implement threaded tile column decoding This adds a new WorkerThread class to run one task asynchronously, and allow waiting for that thread to finish its work. TileContexts are placed into multiple tile column vectors with their streams to read from pre-created. Once those are ready, the threads can start their work on each vector separately. The main thread waits for those tasks to finish, then sums up the syntax element counts for each tile that was decoded. --- Userland/Libraries/LibVideo/CMakeLists.txt | 2 +- .../Libraries/LibVideo/VP9/ContextStorage.h | 5 ++ Userland/Libraries/LibVideo/VP9/Parser.cpp | 67 +++++++++++++++++-- Userland/Libraries/LibVideo/VP9/Parser.h | 5 +- 4 files changed, 70 insertions(+), 9 deletions(-) diff --git a/Userland/Libraries/LibVideo/CMakeLists.txt b/Userland/Libraries/LibVideo/CMakeLists.txt index 6b658cbfb0..3d1dcb9854 100644 --- a/Userland/Libraries/LibVideo/CMakeLists.txt +++ b/Userland/Libraries/LibVideo/CMakeLists.txt @@ -15,4 +15,4 @@ set(SOURCES ) serenity_lib(LibVideo video) -target_link_libraries(LibVideo PRIVATE LibAudio LibCore LibIPC LibGfx) +target_link_libraries(LibVideo PRIVATE LibAudio LibCore LibIPC LibGfx LibThreading) diff --git a/Userland/Libraries/LibVideo/VP9/ContextStorage.h b/Userland/Libraries/LibVideo/VP9/ContextStorage.h index 634e98e554..5bb0d709e0 100644 --- a/Userland/Libraries/LibVideo/VP9/ContextStorage.h +++ b/Userland/Libraries/LibVideo/VP9/ContextStorage.h @@ -149,6 +149,11 @@ public: return m_storage[index_at(row, column)]; } + void assign(u32 row, u32 column, T&& value) + { + new (&m_storage[index_at(row, column)]) T(move(value)); + } + template void copy_to(Vector2D& other, Function function) const { diff --git a/Userland/Libraries/LibVideo/VP9/Parser.cpp b/Userland/Libraries/LibVideo/VP9/Parser.cpp index 360423d7d1..4e660564ff 100644 --- a/Userland/Libraries/LibVideo/VP9/Parser.cpp +++ b/Userland/Libraries/LibVideo/VP9/Parser.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "Context.h" #include "Decoder.h" @@ -18,6 +19,9 @@ # pragma GCC optimize("O3") #endif +// Beware, threading is unstable in Serenity with smp=on, and performs worse than with it off. +#define VP9_TILE_THREADING + namespace Video::VP9 { #define TRY_READ(expression) DECODER_TRY(DecoderErrorCategory::Corrupted, expression) @@ -857,8 +861,8 @@ static u32 get_tile_offset(u32 tile_start, u32 frame_size_in_blocks, u32 tile_si DecoderErrorOr Parser::decode_tiles(FrameContext& frame_context) { auto log2_dimensions = frame_context.log2_of_tile_counts; - auto tile_cols = 1 << log2_dimensions.width(); - auto tile_rows = 1 << log2_dimensions.height(); + auto tile_cols = 1u << log2_dimensions.width(); + auto tile_rows = 1u << log2_dimensions.height(); PartitionContext above_partition_context = DECODER_TRY_ALLOC(PartitionContext::create(superblocks_to_blocks(frame_context.superblock_columns()))); NonZeroTokens above_non_zero_tokens = DECODER_TRY_ALLOC(create_non_zero_tokens(blocks_to_sub_blocks(frame_context.columns()), frame_context.color_config.subsampling_x)); @@ -868,8 +872,15 @@ DecoderErrorOr Parser::decode_tiles(FrameContext& frame_context) // then run through each column of tiles in top to bottom order afterward. Each column can be sent to a worker thread // for execution. Each worker thread will want to create a set of above contexts sized to its tile width, then provide // those to each tile as it decodes them. - for (auto tile_row = 0; tile_row < tile_rows; tile_row++) { - for (auto tile_col = 0; tile_col < tile_cols; tile_col++) { + Vector, 4> tile_workloads; + DECODER_TRY_ALLOC(tile_workloads.try_ensure_capacity(tile_cols)); + for (auto tile_col = 0u; tile_col < tile_cols; tile_col++) { + tile_workloads.append({}); + DECODER_TRY_ALLOC(tile_workloads[tile_col].try_ensure_capacity(tile_rows)); + } + + for (auto tile_row = 0u; tile_row < tile_rows; tile_row++) { + for (auto tile_col = 0u; tile_col < tile_cols; tile_col++) { auto last_tile = (tile_row == tile_rows - 1) && (tile_col == tile_cols - 1); size_t tile_size; if (last_tile) @@ -887,12 +898,54 @@ DecoderErrorOr Parser::decode_tiles(FrameContext& frame_context) auto above_non_zero_tokens_view = create_non_zero_tokens_view(above_non_zero_tokens, blocks_to_sub_blocks(columns_start), blocks_to_sub_blocks(columns_end - columns_start), frame_context.color_config.subsampling_x); auto above_segmentation_ids_for_tile = safe_slice(above_segmentation_ids.span(), columns_start, columns_end - columns_start); - auto tile_context = TRY(TileContext::try_create(frame_context, tile_size, rows_start, rows_end, columns_start, columns_end, above_partition_context_for_tile, above_non_zero_tokens_view, above_segmentation_ids_for_tile)); - TRY(decode_tile(tile_context)); - *frame_context.counter += *tile_context.counter; + tile_workloads[tile_col].append(TRY(TileContext::try_create(frame_context, tile_size, rows_start, rows_end, columns_start, columns_end, above_partition_context_for_tile, above_non_zero_tokens_view, above_segmentation_ids_for_tile))); TRY_READ(frame_context.bit_stream.discard(tile_size)); } } + + auto decode_tile_column = [this, tile_rows](auto& column_workloads) -> DecoderErrorOr { + VERIFY(column_workloads.size() == tile_rows); + for (auto tile_row = 0u; tile_row < tile_rows; tile_row++) + TRY(decode_tile(column_workloads[tile_row])); + return {}; + }; + +#ifdef VP9_TILE_THREADING + auto const worker_count = tile_cols - 1; + + if (m_worker_threads.size() < worker_count) { + m_worker_threads.clear(); + m_worker_threads.ensure_capacity(worker_count); + for (auto i = 0u; i < worker_count; i++) + m_worker_threads.append(DECODER_TRY_ALLOC(Threading::WorkerThread::create("Decoder Worker"sv))); + } + VERIFY(m_worker_threads.size() >= worker_count); + + // Start tile column decoding tasks in thread workers starting from the second column. + for (auto tile_col = 1u; tile_col < tile_cols; tile_col++) { + auto& column_workload = tile_workloads[tile_col]; + m_worker_threads[tile_col - 1]->start_task([&decode_tile_column, &column_workload]() -> DecoderErrorOr { + return decode_tile_column(column_workload); + }); + } + + // Decode the first column in this thread. + TRY(decode_tile_column(tile_workloads[0])); + + for (auto& worker_thread : m_worker_threads) + TRY(worker_thread->wait_until_task_is_finished()); +#else + for (auto& column_workloads : tile_workloads) + TRY(decode_tile_column(column_workloads)); +#endif + + // Sum up all tile contexts' syntax element counters after all decodes have finished. + for (auto& tile_contexts : tile_workloads) { + for (auto& tile_context : tile_contexts) { + *frame_context.counter += *tile_context.counter; + } + } + return {}; } diff --git a/Userland/Libraries/LibVideo/VP9/Parser.h b/Userland/Libraries/LibVideo/VP9/Parser.h index c5d874853f..2e35dd176f 100644 --- a/Userland/Libraries/LibVideo/VP9/Parser.h +++ b/Userland/Libraries/LibVideo/VP9/Parser.h @@ -11,8 +11,9 @@ #include #include #include +#include #include -#include +#include #include "ContextStorage.h" #include "LookupTables.h" @@ -141,6 +142,8 @@ private: OwnPtr m_probability_tables; Decoder& m_decoder; + + Vector>> m_worker_threads; }; }