Optimize filter weight adjustment in YUV strategy.

Use the linear function `sum_square_diff * 3 / num_ref_pixels` to
replace the original lookup table. This improves readability and
makes the code easier to maintain. Along with this optimization,
function `av1_apply_temporal_filter_yonly()` is merged into function
`av1_apply_temporal_filter_yuv()`.

As the linear function may differ slightly from the original lookup
table (due to integer rounding), the performance may be affected
slightly, but the difference is negligible. Under speed-4, no clip in
any test set shows a performance change.

STATS_CHANGED

Change-Id: I2e1aa3d5b161c53babad492b3ffca8778fb007d0
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 0a99f70..3dfc6b5 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -284,7 +284,7 @@
   add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv";
 
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    add_proto qw/void av1_apply_temporal_filter_yuv/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int strength, const int use_subblock, const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+    add_proto qw/void av1_apply_temporal_filter_yuv/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const int strength, const int use_subblock, const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum, uint16_t *count";
     specialize qw/av1_apply_temporal_filter_yuv sse4_1/;
   }
 
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 46679ed..1abaa13 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -468,41 +468,7 @@
   }
 }
 
-// Magic numbers used to adjust the pixel-wise weight used in YUV filtering.
-// For now, it only supports 3x3 window for filtering.
-// The adjustment is performed with following steps:
-//   (1) For a particular pixel, compute the sum of squared difference between
-//       input frame and prediction in a small window (i.e., 3x3). There are
-//       three possible outcomes:
-//       (a) If the pixel locates in the middle of the plane, it has 9
-//           neighbours (self-included).
-//       (b) If the pixel locates on the edge of the plane, it has 6
-//           neighbours (self-included).
-//       (c) If the pixel locates on the corner of the plane, it has 4
-//           neighbours (self-included).
-//   (2) For Y-plane, it will also consider the squared difference from U-plane
-//       and V-plane at the corresponding position as reference. This leads to
-//       2 more neighbours.
-//   (3) For U-plane and V-plane, it will consider the squared difference from
-//       Y-plane at the corresponding position (after upsampling) as reference.
-//       This leads to 1 more (subsampling = 0) or 4 more (subsampling = 1)
-//       neighbours.
-//   (4) Find the modifier for adjustment from the lookup table according to
-//       number of reference pixels (neighbours) used. From above, the number
-//       of neighbours can be 9+2 (11), 6+2 (8), 4+2 (6), 9+1 (10), 6+1 (7),
-//       4+1 (5), 9+4 (13), 6+4 (10), 4+4 (8).
-// TODO(any): Not sure what index 4 and index 9 are for.
-static const uint32_t filter_weight_adjustment_lookup_table_yuv[14] = {
-  0, 0, 0, 0, 49152, 39322, 32768, 28087, 24576, 21846, 19661, 17874, 0, 15124
-};
-// Lookup table for high bit-depth.
-static const uint64_t highbd_filter_weight_adjustment_lookup_table_yuv[14] = {
-  0U,          0U,          0U,          0U,          3221225472U,
-  2576980378U, 2147483648U, 1840700270U, 1610612736U, 1431655766U,
-  1288490189U, 1171354718U, 0U,          991146300U
-};
-
-// Function to adjust the filter weight when applying YUV filter.
+// Function to adjust the filter weight when using the YUV strategy.
 // Inputs:
 //   filter_weight: Original filter weight.
 //   sum_square_diff: Sum of squared difference between input frame and
@@ -513,34 +479,21 @@
 //                   `filter_weight_adjustment_lookup_table_yuv` and
 //                   `highbd_filter_weight_adjustment_lookup_table_yuv`.
 //   strength: Strength for filter weight adjustment.
-//   is_high_bitdepth: Whether apply temporal filter to high bie-depth video.
 // Returns:
 //   Adjusted filter weight which will finally be used for filtering.
 static INLINE int adjust_filter_weight_yuv(const int filter_weight,
                                            const uint64_t sum_square_diff,
                                            const int num_ref_pixels,
-                                           const int strength,
-                                           const int is_high_bitdepth) {
-  assert(TF_YUV_FILTER_WINDOW_LENGTH == 3);
-  assert(num_ref_pixels >= 0 && num_ref_pixels <= 13);
-
-  const uint64_t multiplier =
-      is_high_bitdepth
-          ? highbd_filter_weight_adjustment_lookup_table_yuv[num_ref_pixels]
-          : filter_weight_adjustment_lookup_table_yuv[num_ref_pixels];
-  assert(multiplier != 0);
-
-  const uint32_t max_value = is_high_bitdepth ? UINT32_MAX : UINT16_MAX;
-  const int shift = is_high_bitdepth ? 32 : 16;
+                                           const int strength) {
   int modifier =
-      (int)((AOMMIN(sum_square_diff, max_value) * multiplier) >> shift);
-
+      (int)(AOMMIN(sum_square_diff * TF_YUV_FILTER_WEIGHT_SCALE, INT32_MAX)) /
+      num_ref_pixels;
   const int rounding = (1 << strength) >> 1;
   modifier = (modifier + rounding) >> strength;
   return (modifier >= 16) ? 0 : (16 - modifier) * filter_weight;
 }
 
-// Applies temporal filter to YUV planes.
+// Applies temporal filter with YUV strategy.
 // Inputs:
 //   frame_to_filter: Pointer to the frame to be filtered, which is used as
 //                    reference to compute squared differece from the predictor.
@@ -549,6 +502,7 @@
 //   block_size: Size of the block.
 //   mb_row: Row index of the block in the entire frame.
 //   mb_col: Column index of the block in the entire frame.
+//   num_planes: Number of planes in the frame.
 //   strength: Strength for filter weight adjustment.
 //   use_subblock: Whether to use 4 sub-blocks to replace the original block.
 //   subblock_filter_weights: The filter weights for each sub-block (row-major
@@ -560,14 +514,14 @@
 // Returns:
 //   Nothing will be returned. But the content to which `accum` and `pred`
 //   point will be modified.
-void av1_apply_temporal_filter_yuv_c(const YV12_BUFFER_CONFIG *frame_to_filter,
-                                     const MACROBLOCKD *mbd,
-                                     const BLOCK_SIZE block_size,
-                                     const int mb_row, const int mb_col,
-                                     const int strength, const int use_subblock,
-                                     const int *subblock_filter_weights,
-                                     const uint8_t *pred, uint32_t *accum,
-                                     uint16_t *count) {
+void av1_apply_temporal_filter_yuv_c(
+    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
+    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
+    const int num_planes, const int strength, const int use_subblock,
+    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
+    uint16_t *count) {
+  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
+
   // Block information.
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
@@ -575,14 +529,14 @@
   const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
   const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
 
-  // Allocate memory for pixel-wise squared differences for Y, U, V planes. All
-  // planes, regardless of the subsampling, are assigned with memory of size
-  // `mb_pels`.
-  uint32_t *square_diff = aom_memalign(16, 3 * mb_pels * sizeof(uint32_t));
-  memset(square_diff, 0, 3 * mb_pels * sizeof(square_diff[0]));
+  // Allocate memory for pixel-wise squared differences for all planes. Each
+  // plane, regardless of its subsampling, gets a buffer of `mb_pels` entries.
+  uint32_t *square_diff =
+      aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
+  memset(square_diff, 0, num_planes * mb_pels * sizeof(square_diff[0]));
 
   int plane_offset = 0;
-  for (int plane = 0; plane < 3; ++plane) {
+  for (int plane = 0; plane < num_planes; ++plane) {
     // Locate pixel on reference frame.
     const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
     const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
@@ -599,14 +553,11 @@
   assert(TF_YUV_FILTER_WINDOW_LENGTH % 2 == 1);
   const int half_window = TF_YUV_FILTER_WINDOW_LENGTH >> 1;
 
-  // Handle Y-plane, U-plane, V-plane in sequence.
+  // Handle planes in sequence.
   plane_offset = 0;
-  for (int plane = 0; plane < 3; ++plane) {
+  for (int plane = 0; plane < num_planes; ++plane) {
     const int subsampling_y = mbd->plane[plane].subsampling_y;
     const int subsampling_x = mbd->plane[plane].subsampling_x;
-    // Only 0 and 1 are supported for filter weight adjustment.
-    assert(subsampling_y == 0 || subsampling_y == 1);
-    assert(subsampling_x == 0 || subsampling_x == 1);
     const int h = mb_height >> subsampling_y;  // Plane height.
     const int w = mb_width >> subsampling_x;   // Plane width.
 
@@ -614,10 +565,6 @@
     int pred_idx = 0;
     for (int i = 0; i < h; ++i) {
       for (int j = 0; j < w; ++j) {
-        const int subblock_idx =
-            use_subblock ? (i >= h / 2) * 2 + (j >= w / 2) : 0;
-        const int filter_weight = subblock_filter_weights[subblock_idx];
-
         // non-local mean approach
         uint64_t sum_square_diff = 0;
         int num_ref_pixels = 0;
@@ -634,7 +581,7 @@
         }
 
         if (plane == 0) {  // Filter Y-plane using both U-plane and V-plane.
-          for (int p = 1; p < 3; ++p) {
+          for (int p = 1; p < num_planes; ++p) {
             const int ss_y_shift = mbd->plane[p].subsampling_y - subsampling_y;
             const int ss_x_shift = mbd->plane[p].subsampling_x - subsampling_x;
             const int yy = i >> ss_y_shift;  // Y-coord on UV-plane.
@@ -657,11 +604,15 @@
           }
         }
 
+        // Base filter weight estimated by motion search error.
+        const int subblock_idx =
+            use_subblock ? (i >= h / 2) * 2 + (j >= w / 2) : 0;
+        const int filter_weight = subblock_filter_weights[subblock_idx];
+
         const int idx = plane_offset + pred_idx;  // Index with plane shift.
         const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
         const int adjusted_weight = adjust_filter_weight_yuv(
-            filter_weight, sum_square_diff, num_ref_pixels, strength,
-            is_high_bitdepth);
+            filter_weight, sum_square_diff, num_ref_pixels, strength);
         accum[idx] += adjusted_weight * pred_value;
         count[idx] += adjusted_weight;
 
@@ -674,128 +625,9 @@
   aom_free(square_diff);
 }
 
-// Function to adjust the filter weight when applying filter to Y-plane only.
-// Inputs:
-//   filter_weight: Original filter weight.
-//   sum_square_diff: Sum of squared difference between input frame and
-//                    prediction. This field is computed pixel by pixel, and
-//                    is used as a reference for the filter weight adjustment.
-//   num_ref_pixels: Number of pixels used to compute the `sum_square_diff`.
-//   strength: Strength for filter weight adjustment.
-// Returns:
-//   Adjusted filter weight which will finally be used for filtering.
-static INLINE int adjust_filter_weight_yonly(const int filter_weight,
-                                             const uint64_t sum_square_diff,
-                                             const int num_ref_pixels,
-                                             const int strength) {
-  assert(TF_YONLY_FILTER_WINDOW_LENGTH == 3);
-
-  int modifier = (int)(AOMMIN(sum_square_diff * 3, INT32_MAX));
-  modifier /= num_ref_pixels;
-
-  const int rounding = (1 << strength) >> 1;
-  modifier = (modifier + rounding) >> strength;
-  return (modifier >= 16) ? 0 : (16 - modifier) * filter_weight;
-}
-
-// Applies temporal filter to Y-plane ONLY.
-// Different from the function `av1_apply_temporal_filter_yuv_c()`, this
-// function only applies temporal filter to Y-plane. This should be used when
-// the input video frame only has one plane.
-// Inputs:
-//   frame_to_filter: Pointer to the frame to be filtered, which is used as
-//                    reference to compute squared differece from the predictor.
-//   mbd: Pointer to the block for filtering, which is ONLY used to get
-//        subsampling information of Y-plane.
-//   block_size: Size of the block.
-//   mb_row: Row index of the block in the entire frame.
-//   mb_col: Column index of the block in the entire frame.
-//   strength: Strength for filter weight adjustment.
-//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
-//   subblock_filter_weights: The filter weights for each sub-block (row-major
-//                            order). If `use_subblock` is set as 0, the first
-//                            weight will be applied to the entire block.
-//   pred: Pointer to the well-built predictors.
-//   accum: Pointer to the pixel-wise accumulator for filtering.
-//   count: Pointer to the pixel-wise counter fot filtering.
-// Returns:
-//   Nothing will be returned. But the content to which `accum` and `pred`
-//   point will be modified.
-void av1_apply_temporal_filter_yonly(const YV12_BUFFER_CONFIG *frame_to_filter,
-                                     const MACROBLOCKD *mbd,
-                                     const BLOCK_SIZE block_size,
-                                     const int mb_row, const int mb_col,
-                                     const int strength, const int use_subblock,
-                                     const int *subblock_filter_weights,
-                                     const uint8_t *pred, uint32_t *accum,
-                                     uint16_t *count) {
-  // Block information.
-  const int mb_height = block_size_high[block_size];
-  const int mb_width = block_size_wide[block_size];
-  const int mb_pels = mb_height * mb_width;
-  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
-  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
-
-  // Y-plane information.
-  const int subsampling_y = mbd->plane[0].subsampling_y;
-  const int subsampling_x = mbd->plane[0].subsampling_x;
-  const int h = mb_height >> subsampling_y;
-  const int w = mb_width >> subsampling_x;
-
-  // Pre-compute squared difference before filtering.
-  const int frame_stride = frame_to_filter->y_stride;
-  const int frame_offset = mb_row * h * frame_stride + mb_col * w;
-  const uint8_t *ref = frame_to_filter->y_buffer;
-  uint32_t *square_diff = aom_memalign(16, mb_pels * sizeof(uint32_t));
-  memset(square_diff, 0, mb_pels * sizeof(square_diff[0]));
-  compute_square_diff(ref, frame_offset, frame_stride, pred, 0, w, h, w,
-                      is_high_bitdepth, square_diff);
-
-  // Get window size for pixel-wise filtering.
-  assert(TF_YONLY_FILTER_WINDOW_LENGTH % 2 == 1);
-  const int half_window = TF_YONLY_FILTER_WINDOW_LENGTH >> 1;
-
-  // Perform filtering.
-  int idx = 0;
-  for (int i = 0; i < h; ++i) {
-    for (int j = 0; j < w; ++j) {
-      const int subblock_idx =
-          use_subblock ? (i >= h / 2) * 2 + (j >= w / 2) : 0;
-      const int filter_weight = subblock_filter_weights[subblock_idx];
-
-      // non-local mean approach
-      uint64_t sum_square_diff = 0;
-      int num_ref_pixels = 0;
-
-      for (int wi = -half_window; wi <= half_window; ++wi) {
-        for (int wj = -half_window; wj <= half_window; ++wj) {
-          const int y = i + wi;  // Y-coord on the current plane.
-          const int x = j + wj;  // X-coord on the current plane.
-          if (y >= 0 && y < h && x >= 0 && x < w) {
-            sum_square_diff += square_diff[y * w + x];
-            ++num_ref_pixels;
-          }
-        }
-      }
-
-      const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
-      const int adjusted_weight = adjust_filter_weight_yonly(
-          filter_weight, sum_square_diff, num_ref_pixels, strength);
-      accum[idx] += adjusted_weight * pred_value;
-      count[idx] += adjusted_weight;
-
-      ++idx;
-    }
-  }
-
-  aom_free(square_diff);
-}
-
-// Applies temporal filter plane by plane.
-// Different from the function `av1_apply_temporal_filter_yuv_c()` and the
-// function `av1_apply_temporal_filter_yonly()`, this function applies temporal
-// filter to each plane independently. Besides, the strategy of filter weight
-// adjustment is different from the other two functions.
+// Applies temporal filter with plane-wise strategy.
+// The strategy of filter weight adjustment is different from the function
+// `av1_apply_temporal_filter_yuv_c()`.
 // Inputs:
 //   frame_to_filter: Pointer to the frame to be filtered, which is used as
 //                    reference to compute squared differece from the predictor.
@@ -933,20 +765,17 @@
 //   mb_row: Row index of the block in the entire frame.
 //   mb_col: Column index of the block in the entire frame.
 //   num_planes: Number of planes in the frame.
-//   use_planewise_strategy: Whether to use plane-wise temporal filtering
-//                           strategy. If set as 0, YUV or YONLY filtering will
-//                           be used (depending on number of planes).
-//   strength: Strength for filter weight adjustment. (Used in YUV filtering and
-//             YONLY filtering)
+//   use_planewise_strategy: Whether to use plane-wise strategy or YUV strategy.
+//   strength: Strength for filter weight adjustment. (Used in YUV strategy)
 //   use_subblock: Whether to use 4 sub-blocks to replace the original block.
-//                 (Used in YUV filtering and YONLY filtering)
+//                 (Used in YUV strategy)
 //   subblock_filter_weights: The filter weights for each sub-block (row-major
 //                            order). If `use_subblock` is set as 0, the first
 //                            weight will be applied to the entire block. (Used
-//                            in YUV filtering and YONLY filtering)
+//                            in YUV strategy)
 //   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
 //                 with each plane (in Y, U, V order). (Used in plane-wise
-//                 filtering)
+//                 strategy)
 //   pred: Pointer to the well-built predictors.
 //   accum: Pointer to the pixel-wise accumulator for filtering.
 //   count: Pointer to the pixel-wise counter fot filtering.
@@ -975,16 +804,18 @@
     }
   } else {  // Commonly used for low-resolution video.
     const int adj_strength = strength + 2 * (mbd->bd - 8);
-    if (num_planes == 1) {
-      av1_apply_temporal_filter_yonly(
-          frame_to_filter, mbd, block_size, mb_row, mb_col, adj_strength,
-          use_subblock, subblock_filter_weights, pred, accum, count);
-    } else if (num_planes == 3) {
-      av1_apply_temporal_filter_yuv(
-          frame_to_filter, mbd, block_size, mb_row, mb_col, adj_strength,
-          use_subblock, subblock_filter_weights, pred, accum, count);
+    if (num_planes == 3 && TF_YUV_FILTER_WEIGHT_SCALE == 3) {
+      av1_apply_temporal_filter_yuv(frame_to_filter, mbd, block_size, mb_row,
+                                    mb_col, num_planes, adj_strength,
+                                    use_subblock, subblock_filter_weights, pred,
+                                    accum, count);
     } else {
-      assert(0 && "Only support Y-plane and YUV-plane modes.");
+      // TODO(any): Update the SSE4 version to match the C function before
+      // using it here.
+      av1_apply_temporal_filter_yuv_c(frame_to_filter, mbd, block_size, mb_row,
+                                      mb_col, num_planes, adj_strength,
+                                      use_subblock, subblock_filter_weights,
+                                      pred, accum, count);
     }
   }
 }
@@ -1284,8 +1115,8 @@
 }
 
 // Estimates the strength for filter weight adjustment, which is used in YUV
-// filtering and YONLY filtering. This estimation is based on the pre-estimated
-// noise level of the to-filter frame.
+// strategy. This estimation is based on the pre-estimated noise level of the
+// to-filter frame.
 // Inputs:
 //   cpi: Pointer to the composed information of input video.
 //   noise_level: Noise level of the to-filter frame, estimated with Y-plane.
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index eb49ce0..ef86496 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h
@@ -22,13 +22,11 @@
 #define BH 32
 #define BW 32
 
-// Window size for temporal filtering on YUV planes.
+// Window size for YUV temporal filtering.
 // This is particually used for function `av1_apply_temporal_filter_yuv()`.
 #define TF_YUV_FILTER_WINDOW_LENGTH 3
-
-// Window size for temporal filtering on Y planes.
-// This is particually used for function `av1_apply_temporal_filter_yonly()`.
-#define TF_YONLY_FILTER_WINDOW_LENGTH 3
+// A scale factor used in YUV temporal filtering for weight adjustment.
+#define TF_YUV_FILTER_WEIGHT_SCALE 3
 
 #define TF_ENABLE_PLANEWISE_STRATEGY 1
 // Window size for plane-wise temporal filtering.
diff --git a/av1/encoder/x86/temporal_filter_sse4.c b/av1/encoder/x86/temporal_filter_sse4.c
index 5e43e43..e3f9f5f 100644
--- a/av1/encoder/x86/temporal_filter_sse4.c
+++ b/av1/encoder/x86/temporal_filter_sse4.c
@@ -2025,10 +2025,13 @@
 void av1_apply_temporal_filter_yuv_sse4_1(
     const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int strength, const int use_subblock,
+    const int num_planes, const int strength, const int use_subblock,
     const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
     uint16_t *count) {
   const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
+  // TODO(any): Support `num_planes != 3`, as the C implementation does.
+  assert(num_planes == 3);
+  (void)num_planes;
   if (is_high_bitdepth) {
     highbd_apply_temporal_filter_yuv(
         ref_frame, mbd, block_size, mb_row, mb_col, strength, use_subblock,
diff --git a/test/temporal_filter_yuv_test.cc b/test/temporal_filter_yuv_test.cc
index 15ffec3..dc17aaa 100644
--- a/test/temporal_filter_yuv_test.cc
+++ b/test/temporal_filter_yuv_test.cc
@@ -29,8 +29,8 @@
 typedef void (*TemporalFilterYUVFunc)(
     const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int strength, const int use_subblock, const int *blk_fw,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count);
+    const int num_planes, const int strength, const int use_subblock,
+    const int *blk_fw, const uint8_t *pred, uint32_t *accum, uint16_t *count);
 
 struct TemporalFilterWithBd {
   TemporalFilterWithBd(TemporalFilterYUVFunc func, int bitdepth)
@@ -414,6 +414,7 @@
   assert(block_width == MAX_WIDTH && MAX_WIDTH == 32);
   assert(block_height == MAX_HEIGHT && MAX_HEIGHT == 32);
   const BLOCK_SIZE block_size = BLOCK_32X32;
+  const int num_planes = 3;
   const int mb_pels = MAX_WIDTH * MAX_HEIGHT;
   const int mb_row = 0;
   const int mb_col = 0;
@@ -455,9 +456,9 @@
   memcpy(count + 1 * mb_pels, u_count, mb_pels * sizeof(uint16_t));
   memcpy(count + 2 * mb_pels, v_count, mb_pels * sizeof(uint16_t));
 
-  ASM_REGISTER_STATE_CHECK(filter_func_(ref_frame, mbd, block_size, mb_row,
-                                        mb_col, strength, use_subblock, blk_fw,
-                                        pred, accum, count));
+  ASM_REGISTER_STATE_CHECK(
+      filter_func_(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
+                   strength, use_subblock, blk_fw, pred, accum, count));
 
   memcpy(y_accum, accum + 0 * mb_pels, mb_pels * sizeof(uint32_t));
   memcpy(u_accum, accum + 1 * mb_pels, mb_pels * sizeof(uint32_t));
@@ -487,6 +488,7 @@
   assert(block_width == MAX_WIDTH && MAX_WIDTH == 32);
   assert(block_height == MAX_HEIGHT && MAX_HEIGHT == 32);
   const BLOCK_SIZE block_size = BLOCK_32X32;
+  const int num_planes = 3;
   const int mb_pels = MAX_WIDTH * MAX_HEIGHT;
   const int mb_row = 0;
   const int mb_col = 0;
@@ -529,9 +531,9 @@
   memcpy(count + 2 * mb_pels, v_count, mb_pels * sizeof(uint16_t));
   const uint8_t *pred = CONVERT_TO_BYTEPTR(pred16);
 
-  ASM_REGISTER_STATE_CHECK(filter_func_(ref_frame, mbd, block_size, mb_row,
-                                        mb_col, strength, use_subblock, blk_fw,
-                                        pred, accum, count));
+  ASM_REGISTER_STATE_CHECK(
+      filter_func_(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
+                   strength, use_subblock, blk_fw, pred, accum, count));
 
   memcpy(y_accum, accum + 0 * mb_pels, mb_pels * sizeof(uint32_t));
   memcpy(u_accum, accum + 1 * mb_pels, mb_pels * sizeof(uint32_t));