diff --git a/Userland/Libraries/LibVideo/VP9/Context.h b/Userland/Libraries/LibVideo/VP9/Context.h
index 4a8186d9f8..12619a1b85 100644
--- a/Userland/Libraries/LibVideo/VP9/Context.h
+++ b/Userland/Libraries/LibVideo/VP9/Context.h
@@ -202,6 +202,9 @@ struct BlockContext {
 
     InterpolationFilter interpolation_filter { EightTap };
     Array<MotionVectorPair, 4> sub_block_motion_vectors;
+
+    // Quantized coefficients of the transform block being decoded: zeroed and filled by Parser::tokens(), then read by Decoder::reconstruct().
+    Array<i32, 1024> residual_tokens;
 };
 
 struct BlockMotionVectorCandidateSet {
diff --git a/Userland/Libraries/LibVideo/VP9/Decoder.cpp b/Userland/Libraries/LibVideo/VP9/Decoder.cpp
index 745b391d35..c281a8a05d 100644
--- a/Userland/Libraries/LibVideo/VP9/Decoder.cpp
+++ b/Userland/Libraries/LibVideo/VP9/Decoder.cpp
@@ -1087,12 +1087,12 @@ DecoderErrorOr<void> Decoder::reconstruct(u8 plane, BlockContext const& block_co
             auto index = index_from_row_and_column(i, j, block_size);
             if (index == 0)
                 continue;
-            dequantized[index] = (m_parser->m_tokens[index] * ac_quant) / dq_denominator;
+            dequantized[index] = (block_context.residual_tokens[index] * ac_quant) / dq_denominator;
         }
     }
 
     // 2. Dequant[ 0 ][ 0 ] is set equal to ( Tokens[ 0 ] * get_dc_quant( plane ) ) / dqDenom
-    dequantized[0] = (m_parser->m_tokens[0] * get_dc_quantizer(block_context, plane)) / dq_denominator;
+    dequantized[0] = (block_context.residual_tokens[0] * get_dc_quantizer(block_context, plane)) / dq_denominator;
 
     // It is a requirement of bitstream conformance that the values written into the Dequant array in steps 1 and 2
     // are representable by a signed integer with 8 + BitDepth bits.
diff --git a/Userland/Libraries/LibVideo/VP9/Parser.cpp b/Userland/Libraries/LibVideo/VP9/Parser.cpp
index 801f5b475b..b14231934f 100644
--- a/Userland/Libraries/LibVideo/VP9/Parser.cpp
+++ b/Userland/Libraries/LibVideo/VP9/Parser.cpp
@@ -1353,6 +1353,7 @@ static TransformSet select_transform_type(BlockContext const& block_context, u8
 DecoderErrorOr<bool> Parser::residual(BlockContext& block_context, bool has_block_above, bool has_block_left)
 {
     bool block_had_non_zero_tokens = false;
+    Array<u8, 1024> token_cache;
     for (u8 plane = 0; plane < 3; plane++) {
         auto plane_subsampling_x = (plane > 0) ? block_context.frame_context.color_config.subsampling_x : 0;
         auto plane_subsampling_y = (plane > 0) ? block_context.frame_context.color_config.subsampling_y : 0;
@@ -1390,7 +1391,7 @@ DecoderErrorOr<bool> Parser::residual(BlockContext& block_context, bool has_bloc
                         TRY(m_decoder.predict_intra(plane, block_context, transform_x_in_px, transform_y_in_px, has_block_left || x > 0, has_block_above || y > 0, (x + transform_size_in_sub_blocks) < block_size_in_sub_blocks.width(), transform_size, sub_block_index));
                     if (!block_context.should_skip_residuals) {
                         auto transform_set = select_transform_type(block_context, plane, transform_size, sub_block_index);
-                        sub_block_had_non_zero_tokens = TRY(tokens(block_context, plane, transform_x_in_px, transform_y_in_px, transform_size, transform_set));
+                        sub_block_had_non_zero_tokens = TRY(tokens(block_context, plane, transform_x_in_px, transform_y_in_px, transform_size, transform_set, token_cache));
                         block_had_non_zero_tokens = block_had_non_zero_tokens || sub_block_had_non_zero_tokens;
                         TRY(m_decoder.reconstruct(plane, block_context, transform_x_in_px, transform_y_in_px, transform_size, transform_set));
                     }
@@ -1444,35 +1445,37 @@ static u16 const* get_scan(TransformSize transform_size, TransformSet transform_
     return default_scan_32x32;
 }
 
-DecoderErrorOr<bool> Parser::tokens(BlockContext& block_context, size_t plane, u32 start_x, u32 start_y, TransformSize transform_size, TransformSet transform_set)
+DecoderErrorOr<bool> Parser::tokens(BlockContext& block_context, size_t plane, u32 start_x, u32 start_y, TransformSize transform_size, TransformSet transform_set, Array<u8, 1024>& token_cache)
 {
-    u16 segment_eob = 16 << (transform_size << 1);
+    block_context.residual_tokens.fill(0);
+
     auto const* scan = get_scan(transform_size, transform_set);
-    auto check_eob = true;
+
+    auto check_for_more_coefficients = true;
     u16 coef_index = 0;
+    u16 segment_eob = 16 << (transform_size << 1);
     for (; coef_index < segment_eob; coef_index++) {
         auto pos = scan[coef_index];
         auto band = (transform_size == Transform_4x4) ? coefband_4x4[coef_index] : coefband_8x8plus[coef_index];
-        auto tokens_context = TreeParser::get_tokens_context(block_context.frame_context.color_config.subsampling_x, block_context.frame_context.color_config.subsampling_y, block_context.frame_context.rows(), block_context.frame_context.columns(), m_above_nonzero_context, m_left_nonzero_context, m_token_cache, transform_size, transform_set, plane, start_x, start_y, pos, block_context.is_inter_predicted(), band, coef_index);
-        if (check_eob) {
-            auto more_coefs = TRY_READ(TreeParser::parse_more_coefficients(*m_bit_stream, *m_probability_tables, *m_syntax_element_counter, tokens_context));
-            if (!more_coefs)
-                break;
-        }
+        auto tokens_context = TreeParser::get_tokens_context(block_context.frame_context.color_config.subsampling_x, block_context.frame_context.color_config.subsampling_y, block_context.frame_context.rows(), block_context.frame_context.columns(), m_above_nonzero_context, m_left_nonzero_context, token_cache, transform_size, transform_set, plane, start_x, start_y, pos, block_context.is_inter_predicted(), band, coef_index);
+
+        if (check_for_more_coefficients && !TRY_READ(TreeParser::parse_more_coefficients(*m_bit_stream, *m_probability_tables, *m_syntax_element_counter, tokens_context)))
+            break;
+
         auto token = TRY_READ(TreeParser::parse_token(*m_bit_stream, *m_probability_tables, *m_syntax_element_counter, tokens_context));
-        m_token_cache[pos] = energy_class[token];
+        token_cache[pos] = energy_class[token];
+
+        i32 coef;
         if (token == ZeroToken) {
-            m_tokens[pos] = 0;
-            check_eob = false;
+            coef = 0;
+            check_for_more_coefficients = false;
         } else {
-            i32 coef = TRY(read_coef(block_context.frame_context.color_config.bit_depth, token));
-            bool sign_bit = TRY_READ(m_bit_stream->read_literal(1));
-            m_tokens[pos] = sign_bit ? -coef : coef;
-            check_eob = true;
+            coef = TRY(read_coef(block_context.frame_context.color_config.bit_depth, token));
+            check_for_more_coefficients = true;
         }
+        block_context.residual_tokens[pos] = coef;
     }
-    for (u16 i = coef_index; i < segment_eob; i++)
-        m_tokens[scan[i]] = 0;
+
     return coef_index > 0;
 }
 
@@ -1480,7 +1483,7 @@ DecoderErrorOr<i32> Parser::read_coef(u8 bit_depth, Token token)
 {
     auto cat = extra_bits[token][0];
     auto num_extra = extra_bits[token][1];
-    u32 coef = extra_bits[token][2];
+    i32 coef = extra_bits[token][2];
     if (token == DctValCat6) {
         for (size_t e = 0; e < (u8)(bit_depth - 8); e++) {
             auto high_bit = TRY_READ(m_bit_stream->read_bool(255));
@@ -1491,6 +1494,8 @@ DecoderErrorOr<i32> Parser::read_coef(u8 bit_depth, Token token)
         auto coef_bit = TRY_READ(m_bit_stream->read_bool(cat_probs[cat][e]));
         coef += coef_bit << (num_extra - 1 - e);
     }
+    bool sign_bit = TRY_READ(m_bit_stream->read_literal(1));
+    coef = sign_bit ? -coef : coef;
     return coef;
 }
 
diff --git a/Userland/Libraries/LibVideo/VP9/Parser.h b/Userland/Libraries/LibVideo/VP9/Parser.h
index 8893f084e2..e48407b34c 100644
--- a/Userland/Libraries/LibVideo/VP9/Parser.h
+++ b/Userland/Libraries/LibVideo/VP9/Parser.h
@@ -122,7 +122,8 @@ private:
     DecoderErrorOr<MotionVector> read_motion_vector(BlockContext const&, BlockMotionVectorCandidates const&, ReferenceIndex);
     DecoderErrorOr<i32> read_single_motion_vector_component(u8 component);
     DecoderErrorOr<bool> residual(BlockContext&, bool has_block_above, bool has_block_left);
-    DecoderErrorOr<bool> tokens(BlockContext&, size_t plane, u32 x, u32 y, TransformSize, TransformSet);
+    // token_cache is caller-owned scratch space, passed by reference so the 1 KiB array is not copied for every transform block.
+    DecoderErrorOr<bool> tokens(BlockContext&, size_t plane, u32 x, u32 y, TransformSize, TransformSet, Array<u8, 1024>& token_cache);
     DecoderErrorOr<i32> read_coef(u8 bit_depth, Token token);
 
     /* (6.5) Motion Vector Prediction */
@@ -162,8 +163,6 @@ private:
 
     Vector<u16> m_frame_store[NUM_REF_FRAMES][3];
 
-    u8 m_token_cache[1024];
-    i32 m_tokens[1024];
     bool m_use_hp { false };
 
     bool m_use_prev_frame_mvs;
diff --git a/Userland/Libraries/LibVideo/VP9/TreeParser.cpp b/Userland/Libraries/LibVideo/VP9/TreeParser.cpp
index a236c61a21..5c8c571cdb 100644
--- a/Userland/Libraries/LibVideo/VP9/TreeParser.cpp
+++ b/Userland/Libraries/LibVideo/VP9/TreeParser.cpp
@@ -624,7 +624,7 @@ ErrorOr<bool> TreeParser::parse_motion_vector_hp(BitStream& bit_stream, Probabil
     return value;
 }
 
-TokensContext TreeParser::get_tokens_context(bool subsampling_x, bool subsampling_y, u32 rows, u32 columns, Array<Vector<bool>, 3> const& above_nonzero_context, Array<Vector<bool>, 3> const& left_nonzero_context, u8 token_cache[1024], TransformSize transform_size, TransformSet transform_set, u8 plane, u32 start_x, u32 start_y, u16 position, bool is_inter, u8 band, u16 coef_index)
+TokensContext TreeParser::get_tokens_context(bool subsampling_x, bool subsampling_y, u32 rows, u32 columns, Array<Vector<bool>, 3> const& above_nonzero_context, Array<Vector<bool>, 3> const& left_nonzero_context, Array<u8, 1024> const& token_cache, TransformSize transform_size, TransformSet transform_set, u8 plane, u32 start_x, u32 start_y, u16 position, bool is_inter, u8 band, u16 coef_index)
 {
     u8 context;
     if (coef_index == 0) {
diff --git a/Userland/Libraries/LibVideo/VP9/TreeParser.h b/Userland/Libraries/LibVideo/VP9/TreeParser.h
index 52c97d1adf..0fd29241aa 100644
--- a/Userland/Libraries/LibVideo/VP9/TreeParser.h
+++ b/Userland/Libraries/LibVideo/VP9/TreeParser.h
@@ -86,7 +86,8 @@ public:
     static ErrorOr<u8> parse_motion_vector_fr(BitStream&, ProbabilityTables const&, SyntaxElementCounter&, u8 component);
     static ErrorOr<bool> parse_motion_vector_hp(BitStream&, ProbabilityTables const&, SyntaxElementCounter&, u8 component, bool use_hp);
 
-    static TokensContext get_tokens_context(bool subsampling_x, bool subsampling_y, u32 rows, u32 columns, Array<Vector<bool>, 3> const& above_nonzero_context, Array<Vector<bool>, 3> const& left_nonzero_context, u8 token_cache[1024], TransformSize, TransformSet, u8 plane, u32 start_x, u32 start_y, u16 position, bool is_inter, u8 band, u16 coef_index);
+    // token_cache is taken by const reference: this is called once per coefficient, so a by-value parameter would copy the 1 KiB array each time.
+    static TokensContext get_tokens_context(bool subsampling_x, bool subsampling_y, u32 rows, u32 columns, Array<Vector<bool>, 3> const& above_nonzero_context, Array<Vector<bool>, 3> const& left_nonzero_context, Array<u8, 1024> const& token_cache, TransformSize, TransformSet, u8 plane, u32 start_x, u32 start_y, u16 position, bool is_inter, u8 band, u16 coef_index);
     static ErrorOr<bool> parse_more_coefficients(BitStream&, ProbabilityTables const&, SyntaxElementCounter&, TokensContext const& context);
     static ErrorOr<Token> parse_token(BitStream&, ProbabilityTables const&, SyntaxElementCounter&, TokensContext const& context);
 };