Clean up redundant functions in temporal filtering

This CL cleans up the redundant functions and unit tests after the
recent unification of the two filtering strategies. In particular, YUV
filtering strategy is removed and the filtering process is simplified.

NOTE: The encoding speed is not affected after the unification.

See https://aomedia-review.googlesource.com/c/aom/+/104826 for more
details on the strategy unification.

Change-Id: Idf9775a397dd97d36d35309c718c4c1b2f041661
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 2ab3496..024370a 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -347,8 +347,6 @@
             "${AOM_ROOT}/av1/encoder/x86/encodetxb_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/highbd_fwd_txfm_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/rdopt_sse4.c"
-            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_constants.h"
-            "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse4.c"
             "${AOM_ROOT}/av1/encoder/x86/pickrst_sse4.c")
 
 list(APPEND AOM_AV1_ENCODER_INTRIN_AVX2
@@ -432,10 +430,8 @@
                    "${AOM_ROOT}/av1/encoder/pass2_strategy.c"
                    "${AOM_ROOT}/av1/encoder/temporal_filter.c"
                    "${AOM_ROOT}/av1/encoder/temporal_filter.h"
-                   "${AOM_ROOT}/av1/encoder/temporal_filter_constants.h"
                    "${AOM_ROOT}/av1/encoder/tpl_model.c"
-                   "${AOM_ROOT}/av1/encoder/tpl_model.h"
-                   "${AOM_ROOT}/av1/encoder/x86/temporal_filter_sse4.c")
+                   "${AOM_ROOT}/av1/encoder/tpl_model.h")
 endif()
 
 # Setup AV1 common/decoder/encoder targets. The libaom target must exist before
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 760f667..7ab2984 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -287,13 +287,8 @@
   add_proto qw/int av1_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const struct aom_variance_vtable *fn_ptr, const MV *center_mv";
 
   if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    add_proto qw/void av1_apply_temporal_filter_yuv/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const int strength, const int use_subblock, const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum, uint16_t *count";
-    specialize qw/av1_apply_temporal_filter_yuv sse4_1/;
-  }
-
-  if (aom_config("CONFIG_REALTIME_ONLY") ne "yes") {
-    add_proto qw/void av1_apply_temporal_filter_planewise/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const int use_subblock, const int block_mse, const int *subblock_mses, const int q_factor, const uint8_t *pred, uint32_t *accum, uint16_t *count";
-    specialize qw/av1_apply_temporal_filter_planewise sse2 avx2/;
+    add_proto qw/void av1_apply_temporal_filter/, "const struct yv12_buffer_config *ref_frame, const struct macroblockd *mbd, const BLOCK_SIZE block_size, const int mb_row, const int mb_col, const int num_planes, const double *noise_levels, const int *subblock_mses, const int q_factor, const uint8_t *pred, uint32_t *accum, uint16_t *count";
+    specialize qw/av1_apply_temporal_filter sse2 avx2/;
   }
   add_proto qw/void av1_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, const qm_val_t * qm_ptr, const qm_val_t * iqm_ptr, int log_scale";
 
diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index e8c0b70..606adeb 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c
@@ -37,13 +37,24 @@
 
 // NOTE: All `tf` in this file means `temporal filtering`.
 
+// Forward Declaration.
+static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+                                         MV *subblock_mvs, int *subblock_mses);
+
 // Does motion search for blocks in temporal filtering. This is the first step
 // for temporal filtering. More specifically, given a frame to be filtered and
 // another frame as reference, this function searches the reference frame to
-// find out the most alike block as that from the frame to be filtered. This
+// find out the most similar block as that from the frame to be filtered. This
 // found block will be further used for weighted averaging.
 // NOTE: Besides doing motion search for the entire block, this function will
-// also do motion search for each 1/4 sub-block to get more precise prediction.
+//       also do motion search for each 1/4 sub-block to get more precise
+//       predictions. Then, this function will determine whether to use 4
+//       sub-blocks to replace the entire block. If we do need to split the
+//       entire block, 4 elements in `subblock_mvs` and `subblock_mses` refer to
+//       the searched motion vector and search error (MSE) w.r.t. each sub-block
+//       respectively. Otherwise, the 4 elements will be the same, all of which
+//       are assigned as the searched motion vector and search error (MSE) for
+//       the entire block.
 // Inputs:
 //   cpi: Pointer to the composed information of input video.
 //   frame_to_filter: Pointer to the frame to be filtered.
@@ -53,16 +64,17 @@
 //   mb_col: Column index of the block in the entire frame.
 //   ref_mv: Reference motion vector, which is commonly inherited from the
 //           motion search result of previous frame.
-//   subblock_mvs: Pointer to the result motion vectors for 4 sub-blocks.
+//   subblock_mvs: Pointer to the motion vectors for 4 sub-blocks.
 //   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
 // Returns:
-//   Search error (MSE) of the entire block.
-static int tf_motion_search(AV1_COMP *cpi,
-                            const YV12_BUFFER_CONFIG *frame_to_filter,
-                            const YV12_BUFFER_CONFIG *ref_frame,
-                            const BLOCK_SIZE block_size, const int mb_row,
-                            const int mb_col, MV *ref_mv, MV *subblock_mvs,
-                            int *subblock_mses) {
+//   Nothing will be returned. Results are saved in `subblock_mvs` and
+//   `subblock_mses`.
+static void tf_motion_search(AV1_COMP *cpi,
+                             const YV12_BUFFER_CONFIG *frame_to_filter,
+                             const YV12_BUFFER_CONFIG *ref_frame,
+                             const BLOCK_SIZE block_size, const int mb_row,
+                             const int mb_col, MV *ref_mv, MV *subblock_mvs,
+                             int *subblock_mses) {
   // Frame information
   const int min_frame_size = AOMMIN(cpi->common.width, cpi->common.height);
 
@@ -84,7 +96,6 @@
   // Parameters used for motion search.
   FULLPEL_MOTION_SEARCH_PARAMS full_ms_params;
   SUBPEL_MOTION_SEARCH_PARAMS ms_params;
-
   const search_site_config ss_cfg =
       cpi->mv_search_params.ss_cfg[SS_CFG_LOOKAHEAD];
   const SEARCH_METHODS full_search_method = NSTEP;
@@ -113,10 +124,9 @@
   int cost_list[5];
 
   // Do motion search.
-  // NOTE: In `av1_full_pixel_search()` and `find_fractional_mv_step()`, the
-  // searched result will be stored in `mb->best_mv`.
-  int_mv best_mv;
+  int_mv best_mv;  // Searched motion vector.
   int block_mse = INT_MAX;
+  MV block_mv = kZeroMv;
   mb->mv_cost_type = mv_cost_type;
 
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
@@ -140,7 +150,7 @@
         ref_frame->y_buffer + y_offset + mv_offset, y_stride,
         frame_to_filter->y_buffer + y_offset, y_stride, &sse);
     block_mse = DIVIDE_AND_ROUND(error, mb_pels);
-    mb->e_mbd.mi[0]->mv[0] = best_mv;
+    block_mv = best_mv.as_mv;
   } else {  // Do fractional search on the entire block and all sub-blocks.
     av1_make_default_subpel_ms_params(&ms_params, cpi, mb, block_size,
                                       &baseline_mv, cost_list);
@@ -151,7 +161,7 @@
         &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv, &best_mv.as_mv,
         &distortion, &sse, NULL);
     block_mse = DIVIDE_AND_ROUND(error, mb_pels);
-    mb->e_mbd.mi[0]->mv[0] = best_mv;
+    block_mv = best_mv.as_mv;
     *ref_mv = best_mv.as_mv;
     // On 4 sub-blocks.
     const BLOCK_SIZE subblock_size = ss_size_lookup[block_size][1][1];
@@ -199,49 +209,33 @@
   mbd->plane[0].pre[0] = ori_pre_buf;
   mb->mv_cost_type = ori_mv_cost_type;
 
-  return block_mse;
-}
+  // Make partition decision.
+  tf_determine_block_partition(block_mv, block_mse, subblock_mvs,
+                               subblock_mses);
 
-// Helper function to get weight according to thresholds.
-static INLINE int get_weight_by_thresh(const int value, const int low,
-                                       const int high) {
-  return value < low ? 2 : value < high ? 1 : 0;
-}
-
-// Gets filter weight for blocks in temporal filtering. The weights will be
-// assigned based on the motion search errors.
-// NOTE: Besides assigning filter weight for the block, this function will also
-// determine whether to split the entire block into 4 sub-blocks for further
-// filtering.
-// TODO(any): Many magic numbers are used in this function. They may be tuned
-// to improve the performance.
-// Inputs:
-//   block_mse: Motion search error (MSE) for the entire block.
-//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
-//   is_second_arf: Whether the to-filter frame is the second ARF. This field
-//                  will affect the filter weight for the to-filter frame.
-//   subblock_filter_weights: Pointer to the assigned filter weight for each
-//                            sub-block. If not using sub-blocks, the first
-//                            element will be used for the entire block.
-// Returns: Whether to use 4 sub-blocks to replace the original block.
-static int tf_get_filter_weight(const int block_mse, const int *subblock_mses,
-                                const int is_second_arf,
-                                int *subblock_filter_weights) {
-  // `block_mse` is initialized as INT_MAX and will be overwritten after the
-  // motion search with reference frame, therefore INT_MAX can ONLY be accessed
-  // by to-filter frame.
-  if (block_mse == INT_MAX) {
-    const int weight = TF_ENABLE_PLANEWISE_STRATEGY
-                           ? TF_PLANEWISE_FILTER_WEIGHT_SCALE
-                           : is_second_arf ? 64 : 32;
-    subblock_filter_weights[0] = subblock_filter_weights[1] =
-        subblock_filter_weights[2] = subblock_filter_weights[3] = weight;
-    return 0;
+  // Do not pass down the reference motion vector if error is too large.
+  const int thresh = (min_frame_size >= 720) ? 12 : 3;
+  if (block_mse > (thresh << (mbd->bd - 8))) {
+    *ref_mv = kZeroMv;
   }
+}
 
-  const int thresh_low = is_second_arf ? 20 : 40;
-  const int thresh_high = is_second_arf ? 40 : 80;
-
+// Determines whether to split the entire block to 4 sub-blocks for filtering.
+// In particular, this decision is made based on the comparison between the
+// motion search error of the entire block and the errors of all sub-blocks.
+// Inputs:
+//   block_mv: Motion vector for the entire block (ONLY as reference).
+//   block_mse: Motion search error (MSE) for the entire block (ONLY as
+//              reference).
+//   subblock_mvs: Pointer to the motion vectors for 4 sub-blocks (will be
+//                 modified based on the partition decision).
+//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks (will
+//                  be modified based on the partition decision).
+// Returns:
+//   Nothing will be returned. Results are saved in `subblock_mvs` and
+//   `subblock_mses`.
+static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+                                         MV *subblock_mvs, int *subblock_mses) {
   int min_subblock_mse = INT_MAX;
   int max_subblock_mse = INT_MIN;
   int sum_subblock_mse = 0;
@@ -249,20 +243,18 @@
     sum_subblock_mse += subblock_mses[i];
     min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
     max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
-    subblock_filter_weights[i] =
-        get_weight_by_thresh(subblock_mses[i], thresh_low, thresh_high);
   }
 
+  // TODO(any): The following magic numbers may be tuned to improve the
+  // performance OR find a way to get rid of these magic numbers.
   if (((block_mse * 15 < sum_subblock_mse * 4) &&
        max_subblock_mse - min_subblock_mse < 48) ||
       ((block_mse * 14 < sum_subblock_mse * 4) &&
        max_subblock_mse - min_subblock_mse < 24)) {  // No split.
-    const int weight = get_weight_by_thresh(block_mse, thresh_low, thresh_high);
-    subblock_filter_weights[0] = subblock_filter_weights[1] =
-        subblock_filter_weights[2] = subblock_filter_weights[3] = weight;
-    return 0;
-  } else {  // Do split.
-    return 1;
+    for (int i = 0; i < 4; ++i) {
+      subblock_mvs[i] = block_mv;
+      subblock_mses[i] = block_mse;
+    }
   }
 }
 
@@ -287,7 +279,6 @@
 //   mb_col: Column index of the block in the entire frame.
 //   num_planes: Number of planes in the frame.
 //   scale: Scaling factor.
-//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
 //   subblock_mvs: The motion vectors for each sub-block (row-major order).
 //   pred: Pointer to the predictor to build.
 // Returns:
@@ -298,8 +289,7 @@
                                const BLOCK_SIZE block_size, const int mb_row,
                                const int mb_col, const int num_planes,
                                const struct scale_factors *scale,
-                               const int use_subblock, const MV *subblock_mvs,
-                               uint8_t *pred) {
+                               const MV *subblock_mvs, uint8_t *pred) {
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
 
   // Information of the entire block.
@@ -310,16 +300,8 @@
   const int mb_x = mb_width * mb_col;                 // X-coord (Top-left).
   const int bit_depth = mbd->bd;                      // Bit depth.
   const int is_intrabc = 0;                           // Is intra-copied?
-  const int mb_mv_row = mbd->mi[0]->mv[0].as_mv.row;  // Motion vector (y).
-  const int mb_mv_col = mbd->mi[0]->mv[0].as_mv.col;  // Motion vector (x).
-  const MV mb_mv = { (int16_t)mb_mv_row, (int16_t)mb_mv_col };
   const int is_high_bitdepth = is_frame_high_bitdepth(ref_frame);
 
-  // Information of each sub-block (actually in use).
-  const int num_blocks = use_subblock ? 2 : 1;  // Num of blocks on each side.
-  const int block_height = mb_height >> (num_blocks - 1);  // Height.
-  const int block_width = mb_width >> (num_blocks - 1);    // Width.
-
   // Default interpolation filters.
   const int_interpfilters interp_filters =
       av1_broadcast_interp_filter(MULTITAP_SHARP);
@@ -334,8 +316,8 @@
     const int plane_w = mb_width >> subsampling_x;   // Plane width.
     const int plane_y = mb_y >> subsampling_y;       // Y-coord (Top-left).
     const int plane_x = mb_x >> subsampling_x;       // X-coord (Top-left).
-    const int h = block_height >> subsampling_y;     // Sub-block height.
-    const int w = block_width >> subsampling_x;      // Sub-block width.
+    const int h = plane_h >> 1;                      // Sub-block height.
+    const int w = plane_w >> 1;                      // Sub-block width.
     const int is_y_plane = (plane == 0);             // Is Y-plane?
 
     const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane],
@@ -343,12 +325,12 @@
                                     ref_frame->heights[is_y_plane ? 0 : 1],
                                     ref_frame->strides[is_y_plane ? 0 : 1] };
 
-    // Handle entire block or sub-blocks if needed.
+    // Handle each subblock.
     int subblock_idx = 0;
     for (int i = 0; i < plane_h; i += h) {
       for (int j = 0; j < plane_w; j += w) {
         // Choose proper motion vector.
-        const MV mv = use_subblock ? subblock_mvs[subblock_idx] : mb_mv;
+        const MV mv = subblock_mvs[subblock_idx++];
         assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
                mv.col >= INT16_MIN && mv.col <= INT16_MAX);
 
@@ -363,8 +345,6 @@
         inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
         av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j],
                                           plane_w, &mv, &inter_pred_params);
-
-        ++subblock_idx;
       }
     }
     plane_offset += mb_pels;
@@ -378,21 +358,16 @@
 //        subsampling information of all planes as well as the bit-depth.
 //   block_size: Size of the block.
 //   num_planes: Number of planes in the frame.
-//   filter_weight: Weight used for filtering.
 //   pred: Pointer to the well-built predictors.
 //   accum: Pointer to the pixel-wise accumulator for filtering.
 //   count: Pointer to the pixel-wise counter fot filtering.
 // Returns:
 //   Nothing will be returned. But the content to which `accum` and `pred`
 //   point will be modified.
-void av1_apply_temporal_filter_self(const MACROBLOCKD *mbd,
-                                    const BLOCK_SIZE block_size,
-                                    const int num_planes,
-                                    const int filter_weight,
-                                    const uint8_t *pred, uint32_t *accum,
-                                    uint16_t *count) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
+void tf_apply_temporal_filter_self(const MACROBLOCKD *mbd,
+                                   const BLOCK_SIZE block_size,
+                                   const int num_planes, const uint8_t *pred,
+                                   uint32_t *accum, uint16_t *count) {
   // Block information.
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
@@ -412,8 +387,8 @@
       for (int j = 0; j < w; ++j) {
         const int idx = plane_offset + pred_idx;  // Index with plane shift.
         const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
-        accum[idx] += filter_weight * pred_value;
-        count[idx] += filter_weight;
+        accum[idx] += TF_WEIGHT_SCALE * pred_value;
+        count[idx] += TF_WEIGHT_SCALE;
         ++pred_idx;
       }
     }
@@ -468,166 +443,7 @@
   }
 }
 
-// Function to adjust the filter weight when use YUV strategy.
-// Inputs:
-//   filter_weight: Original filter weight.
-//   sum_square_diff: Sum of squared difference between input frame and
-//                    prediction. This field is computed pixel by pixel, and
-//                    is used as a reference for the filter weight adjustment.
-//   num_ref_pixels: Number of pixels used to compute the `sum_square_diff`.
-//                   This field should align with the above lookup tables
-//                   `filter_weight_adjustment_lookup_table_yuv` and
-//                   `highbd_filter_weight_adjustment_lookup_table_yuv`.
-//   strength: Strength for filter weight adjustment.
-// Returns:
-//   Adjusted filter weight which will finally be used for filtering.
-static INLINE int adjust_filter_weight_yuv(const int filter_weight,
-                                           const uint64_t sum_square_diff,
-                                           const int num_ref_pixels,
-                                           const int strength) {
-  int modifier =
-      (int)(AOMMIN(sum_square_diff * TF_YUV_FILTER_WEIGHT_SCALE, INT32_MAX)) /
-      num_ref_pixels;
-  const int rounding = (1 << strength) >> 1;
-  modifier = (modifier + rounding) >> strength;
-  return (modifier >= 16) ? 0 : (16 - modifier) * filter_weight;
-}
-
-// Applies temporal filter with YUV strategy.
-// Inputs:
-//   frame_to_filter: Pointer to the frame to be filtered, which is used as
-//                    reference to compute squared differece from the predictor.
-//   mbd: Pointer to the block for filtering, which is ONLY used to get
-//        subsampling information of all YUV planes.
-//   block_size: Size of the block.
-//   mb_row: Row index of the block in the entire frame.
-//   mb_col: Column index of the block in the entire frame.
-//   num_planes: Number of planes in the frame.
-//   strength: Strength for filter weight adjustment.
-//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
-//   subblock_filter_weights: The filter weights for each sub-block (row-major
-//                            order). If `use_subblock` is set as 0, the first
-//                            weight will be applied to the entire block.
-//   pred: Pointer to the well-built predictors.
-//   accum: Pointer to the pixel-wise accumulator for filtering.
-//   count: Pointer to the pixel-wise counter fot filtering.
-// Returns:
-//   Nothing will be returned. But the content to which `accum` and `pred`
-//   point will be modified.
-void av1_apply_temporal_filter_yuv_c(
-    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const int strength, const int use_subblock,
-    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
-    uint16_t *count) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
-  // Block information.
-  const int mb_height = block_size_high[block_size];
-  const int mb_width = block_size_wide[block_size];
-  const int mb_pels = mb_height * mb_width;
-  const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
-  const uint16_t *pred16 = CONVERT_TO_SHORTPTR(pred);
-
-  // Allocate memory for pixel-wise squared differences for all planes. They,
-  // regardless of the subsampling, are assigned with memory of size `mb_pels`.
-  uint32_t *square_diff =
-      aom_memalign(16, num_planes * mb_pels * sizeof(uint32_t));
-  memset(square_diff, 0, num_planes * mb_pels * sizeof(square_diff[0]));
-
-  int plane_offset = 0;
-  for (int plane = 0; plane < num_planes; ++plane) {
-    // Locate pixel on reference frame.
-    const int plane_h = mb_height >> mbd->plane[plane].subsampling_y;
-    const int plane_w = mb_width >> mbd->plane[plane].subsampling_x;
-    const int frame_stride = frame_to_filter->strides[plane == 0 ? 0 : 1];
-    const int frame_offset = mb_row * plane_h * frame_stride + mb_col * plane_w;
-    const uint8_t *ref = frame_to_filter->buffers[plane];
-    compute_square_diff(ref, frame_offset, frame_stride, pred, plane_offset,
-                        plane_w, plane_h, plane_w, is_high_bitdepth,
-                        square_diff + plane_offset);
-    plane_offset += mb_pels;
-  }
-
-  // Get window size for pixel-wise filtering.
-  assert(TF_YUV_FILTER_WINDOW_LENGTH % 2 == 1);
-  const int half_window = TF_YUV_FILTER_WINDOW_LENGTH >> 1;
-
-  // Handle planes in sequence.
-  plane_offset = 0;
-  for (int plane = 0; plane < num_planes; ++plane) {
-    const int subsampling_y = mbd->plane[plane].subsampling_y;
-    const int subsampling_x = mbd->plane[plane].subsampling_x;
-    const int h = mb_height >> subsampling_y;  // Plane height.
-    const int w = mb_width >> subsampling_x;   // Plane width.
-
-    // Perform filtering.
-    int pred_idx = 0;
-    for (int i = 0; i < h; ++i) {
-      for (int j = 0; j < w; ++j) {
-        // non-local mean approach
-        uint64_t sum_square_diff = 0;
-        int num_ref_pixels = 0;
-
-        for (int wi = -half_window; wi <= half_window; ++wi) {
-          for (int wj = -half_window; wj <= half_window; ++wj) {
-            const int y = i + wi;  // Y-coord on the current plane.
-            const int x = j + wj;  // X-coord on the current plane.
-            if (y >= 0 && y < h && x >= 0 && x < w) {
-              sum_square_diff += square_diff[plane_offset + y * w + x];
-              ++num_ref_pixels;
-            }
-          }
-        }
-
-        if (plane == 0) {  // Filter Y-plane using both U-plane and V-plane.
-          for (int p = 1; p < num_planes; ++p) {
-            const int ss_y_shift = mbd->plane[p].subsampling_y - subsampling_y;
-            const int ss_x_shift = mbd->plane[p].subsampling_x - subsampling_x;
-            const int yy = i >> ss_y_shift;  // Y-coord on UV-plane.
-            const int xx = j >> ss_x_shift;  // X-coord on UV-plane.
-            const int ww = w >> ss_x_shift;  // Width of UV-plane.
-            sum_square_diff += square_diff[p * mb_pels + yy * ww + xx];
-            ++num_ref_pixels;
-          }
-        } else {  // Filter U-plane and V-plane using Y-plane.
-          const int ss_y_shift = subsampling_y - mbd->plane[0].subsampling_y;
-          const int ss_x_shift = subsampling_x - mbd->plane[0].subsampling_x;
-          for (int ii = 0; ii < (1 << ss_y_shift); ++ii) {
-            for (int jj = 0; jj < (1 << ss_x_shift); ++jj) {
-              const int yy = (i << ss_y_shift) + ii;  // Y-coord on Y-plane.
-              const int xx = (j << ss_x_shift) + jj;  // X-coord on Y-plane.
-              const int ww = w << ss_x_shift;         // Width of Y-plane.
-              sum_square_diff += square_diff[yy * ww + xx];
-              ++num_ref_pixels;
-            }
-          }
-        }
-
-        // Base filter weight estimated by motion search error.
-        const int subblock_idx =
-            use_subblock ? (i >= h / 2) * 2 + (j >= w / 2) : 0;
-        const int filter_weight = subblock_filter_weights[subblock_idx];
-
-        const int idx = plane_offset + pred_idx;  // Index with plane shift.
-        const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
-        const int adjusted_weight = adjust_filter_weight_yuv(
-            filter_weight, sum_square_diff, num_ref_pixels, strength);
-        accum[idx] += adjusted_weight * pred_value;
-        count[idx] += adjusted_weight;
-
-        ++pred_idx;
-      }
-    }
-    plane_offset += mb_pels;
-  }
-
-  aom_free(square_diff);
-}
-
-// Applies temporal filter with plane-wise strategy.
-// The strategy of filter weight adjustment is different from the function
-// `av1_apply_temporal_filter_yuv_c()`.
+// Applies temporal filtering.
 // Inputs:
 //   frame_to_filter: Pointer to the frame to be filtered, which is used as
 //                    reference to compute squared differece from the predictor.
@@ -639,8 +455,6 @@
 //   num_planes: Number of planes in the frame.
 //   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
 //                 with each plane (in Y, U, V order).
-//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
-//   block_mse: Motion search error (MSE) for the entire block.
 //   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
 //   q_factor: Quantization factor. This is actually the `q` defined in libaom,
 //             which is converted from `qindex`.
@@ -650,14 +464,11 @@
 // Returns:
 //   Nothing will be returned. But the content to which `accum` and `pred`
 //   point will be modified.
-void av1_apply_temporal_filter_planewise_c(
+void av1_apply_temporal_filter_c(
     const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_levels, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
+    const int num_planes, const double *noise_levels, const int *subblock_mses,
+    const int q_factor, const uint8_t *pred, uint32_t *accum, uint16_t *count) {
   // Block information.
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
@@ -686,8 +497,8 @@
   }
 
   // Get window size for pixel-wise filtering.
-  assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH % 2 == 1);
-  const int half_window = TF_PLANEWISE_FILTER_WINDOW_LENGTH >> 1;
+  assert(TF_WINDOW_LENGTH % 2 == 1);
+  const int half_window = TF_WINDOW_LENGTH >> 1;
 
   // Hyper-parameter for filter weight adjustment.
   const int frame_height = frame_to_filter->heights[0]
@@ -740,8 +551,7 @@
         if (mbd->bd > 8) sum_square_diff >>= (mbd->bd - 8) * (mbd->bd - 8);
         const double window_error = (double)(sum_square_diff) / num_ref_pixels;
         const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
-        const double block_error =
-            (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+        const double block_error = (double)subblock_mses[subblock_idx];
 
         // Control factor for non-local mean approach.
         const double r =
@@ -751,8 +561,7 @@
         // Compute filter weight.
         const double scaled_diff =
             AOMMAX(-(window_error + block_error / 10) / (2 * r * r * q), -15.0);
-        const int adjusted_weight =
-            (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+        const int adjusted_weight = (int)(exp(scaled_diff) * TF_WEIGHT_SCALE);
 
         const int idx = plane_offset + pred_idx;  // Index with plane shift.
         const int pred_value = is_high_bitdepth ? pred16[idx] : pred[idx];
@@ -768,93 +577,6 @@
   aom_free(square_diff);
 }
 
-// Computes temporal filter weights and accumulators from all reference frames
-// excluding the current frame to be filtered.
-// Inputs:
-//   frame_to_filter: Pointer to the frame to be filtered, which is used as
-//                    reference to compute squared differece from the predictor.
-//   mbd: Pointer to the block for filtering, which is ONLY used to get
-//        subsampling information of all planes and the bit-depth.
-//   block_size: Size of the block.
-//   mb_row: Row index of the block in the entire frame.
-//   mb_col: Column index of the block in the entire frame.
-//   num_planes: Number of planes in the frame.
-//   strength: Strength for filter weight adjustment. (Used in YUV strategy)
-//   use_subblock: Whether to use 4 sub-blocks to replace the original block.
-//                 (Used in YUV strategy)
-//   subblock_filter_weights: The filter weights for each sub-block (row-major
-//                            order). If `use_subblock` is set as 0, the first
-//                            weight will be applied to the entire block. (Used
-//                            in YUV strategy)
-//   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
-//                 with each plane (in Y, U, V order). (Used in plane-wise
-//                 strategy)
-//   block_mse: Motion search error (MSE) for the entire block.
-//   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks.
-//   q_factor: Quantization factor.
-//   pred: Pointer to the well-built predictors.
-//   accum: Pointer to the pixel-wise accumulator for filtering.
-//   count: Pointer to the pixel-wise counter fot filtering.
-// Returns:
-//   Nothing will be returned. But the content to which `accum` and `pred`
-//   point will be modified.
-void av1_apply_temporal_filter_others(
-    const YV12_BUFFER_CONFIG *frame_to_filter, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const int strength, const int use_subblock,
-    const int *subblock_filter_weights, const double *noise_levels,
-    const int block_mse, const int *subblock_mses, const int q_factor,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
-  // Determines whether the video is with `YUV 4:2:2` format, since avx2/sse2
-  // function only supports square block size.
-  int is_yuv422_format = 0;
-  for (int plane = 1; plane < num_planes; ++plane) {
-    if (mbd->plane[plane].subsampling_x != mbd->plane[plane].subsampling_y) {
-      is_yuv422_format = 1;
-      break;
-    }
-  }
-
-  if (TF_ENABLE_PLANEWISE_STRATEGY) {
-    // TODO(any): avx2 and sse2 version should be changed to align with C
-    // function before using.
-    if (is_frame_high_bitdepth(frame_to_filter) || block_size != BLOCK_32X32 ||
-        is_yuv422_format) {
-      av1_apply_temporal_filter_planewise_c(
-          frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
-          noise_levels, use_subblock, block_mse, subblock_mses, q_factor, pred,
-          accum, count);
-    } else {
-      av1_apply_temporal_filter_planewise(
-          frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
-          noise_levels, use_subblock, block_mse, subblock_mses, q_factor, pred,
-          accum, count);
-    }
-  } else {  // Commonly used for low-resolution video.
-    if (subblock_filter_weights[0] == 0 && subblock_filter_weights[1] == 0 &&
-        subblock_filter_weights[2] == 0 && subblock_filter_weights[3] == 0) {
-      return;
-    }
-    const int adj_strength = strength + 2 * (mbd->bd - 8);
-    if (num_planes == 3 && TF_YUV_FILTER_WEIGHT_SCALE == 3 &&
-        block_size != BLOCK_32X32) {
-      av1_apply_temporal_filter_yuv(frame_to_filter, mbd, block_size, mb_row,
-                                    mb_col, num_planes, adj_strength,
-                                    use_subblock, subblock_filter_weights, pred,
-                                    accum, count);
-    } else {
-      // TODO(any): sse4 version should be changed to align with C function
-      // before using.
-      av1_apply_temporal_filter_yuv_c(frame_to_filter, mbd, block_size, mb_row,
-                                      mb_col, num_planes, adj_strength,
-                                      use_subblock, subblock_filter_weights,
-                                      pred, accum, count);
-    }
-  }
-}
-
 // Normalizes the accumulated filtering result to produce the filtered frame.
 // Inputs:
 //   mbd: Pointer to the block for filtering, which is ONLY used to get
@@ -873,8 +595,6 @@
     const MACROBLOCKD *mbd, const BLOCK_SIZE block_size, const int mb_row,
     const int mb_col, const int num_planes, const uint32_t *accum,
     const uint16_t *count, YV12_BUFFER_CONFIG *result_buffer) {
-  assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
-
   // Block information.
   const int mb_height = block_size_high[block_size];
   const int mb_width = block_size_wide[block_size];
@@ -928,20 +648,19 @@
 //   num_frames: Number of frames in the frame buffer.
 //   filter_frame_idx: Index of the frame to be filtered.
 //   is_key_frame: Whether the to-filter is a key frame.
-//   is_second_arf: Whether the to-filter frame is the second ARF. This field
-//                  is ONLY used for assigning filter weight.
 //   block_size: Block size used for temporal filtering.
 //   scale: Scaling factor.
-//   strength: Pre-estimated strength for filter weight adjustment.
 //   noise_levels: Pointer to the noise levels of the to-filter frame, estimated
 //                 with each plane (in Y, U, V order).
 // Returns:
 //   Difference between filtered frame and the original frame.
-static FRAME_DIFF tf_do_filtering(
-    AV1_COMP *cpi, YV12_BUFFER_CONFIG **frames, const int num_frames,
-    const int filter_frame_idx, const int is_key_frame, const int is_second_arf,
-    const BLOCK_SIZE block_size, const struct scale_factors *scale,
-    const int strength, const double *noise_levels) {
+static FRAME_DIFF tf_do_filtering(AV1_COMP *cpi, YV12_BUFFER_CONFIG **frames,
+                                  const int num_frames,
+                                  const int filter_frame_idx,
+                                  const int is_key_frame,
+                                  const BLOCK_SIZE block_size,
+                                  const struct scale_factors *scale,
+                                  const double *noise_levels) {
   // Basic information.
   const YV12_BUFFER_CONFIG *const frame_to_filter = frames[filter_frame_idx];
   const int frame_height = frame_to_filter->y_crop_height;
@@ -957,6 +676,12 @@
   assert(num_planes >= 1 && num_planes <= MAX_MB_PLANE);
   const int is_high_bitdepth = is_frame_high_bitdepth(frame_to_filter);
 
+  // Quantization factor used in temporal filtering.
+  const FRAME_TYPE frame_type =
+      (cpi->common.current_frame.frame_number > 1) ? INTER_FRAME : KEY_FRAME;
+  const int q_factor = (int)av1_convert_qindex_to_q(
+      cpi->rc.avg_frame_qindex[frame_type], cpi->common.seq_params.bit_depth);
+
   // Save input state.
   MACROBLOCK *const mb = &cpi->td.mb;
   MACROBLOCKD *const mbd = &mb->e_mbd;
@@ -966,6 +691,17 @@
   }
   MB_MODE_INFO **input_mb_mode_info = mbd->mi;
 
+  // Determine whether the video is in `YUV 4:2:2` format, since the avx2/sse2
+  // functions only support square block sizes. The C function is used instead
+  // for videos in `YUV 4:2:2` format.
+  int is_yuv422_format = 0;
+  for (int plane = 1; plane < num_planes; ++plane) {
+    if (mbd->plane[plane].subsampling_x != mbd->plane[plane].subsampling_y) {
+      is_yuv422_format = 1;
+      break;
+    }
+  }
+
   // Setup.
   mbd->block_ref_scale_factors[0] = scale;
   mbd->block_ref_scale_factors[1] = scale;
@@ -1003,49 +739,39 @@
 
         // Motion search.
         MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
-        int subblock_filter_weights[4] = { 0, 0, 0, 0 };
-        int block_mse = INT_MAX;
         int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
-
         if (frame == filter_frame_idx) {  // Frame to be filtered.
-          // Set motion vector as 0 for the frame to be filtered.
-          mbd->mi[0]->mv[0].as_mv = kZeroMv;
           // Change ref_mv sign for following frames.
           ref_mv.row *= -1;
           ref_mv.col *= -1;
         } else {  // Other reference frames.
-          block_mse = tf_motion_search(cpi, frame_to_filter, frames[frame],
-                                       block_size, mb_row, mb_col, &ref_mv,
-                                       subblock_mvs, subblock_mses);
-          // Do not pass down the reference motion vector if error is too large.
-          const int thresh = AOMMIN(frame_height, frame_width) >= 720 ? 12 : 3;
-          if (block_mse > (thresh << (mbd->bd - 8))) {
-            ref_mv = kZeroMv;
-          }
+          tf_motion_search(cpi, frame_to_filter, frames[frame], block_size,
+                           mb_row, mb_col, &ref_mv, subblock_mvs,
+                           subblock_mses);
         }
-
-        // Build predictor.
-        int use_subblock = tf_get_filter_weight(
-            block_mse, subblock_mses, is_second_arf, subblock_filter_weights);
         tf_build_predictor(frames[frame], mbd, block_size, mb_row, mb_col,
-                           num_planes, scale, use_subblock, subblock_mvs, pred);
+                           num_planes, scale, subblock_mvs, pred);
 
         // Perform weighted averaging.
         if (frame == filter_frame_idx) {  // Frame to be filtered.
-          av1_apply_temporal_filter_self(mbd, block_size, num_planes,
-                                         subblock_filter_weights[0], pred,
-                                         accum, count);
+          tf_apply_temporal_filter_self(mbd, block_size, num_planes, pred,
+                                        accum, count);
         } else {  // Other reference frames.
-          const FRAME_TYPE frame_type =
-              (cpi->common.current_frame.frame_number > 1) ? INTER_FRAME
-                                                           : KEY_FRAME;
-          const int q_factor =
-              (int)av1_convert_qindex_to_q(cpi->rc.avg_frame_qindex[frame_type],
-                                           cpi->common.seq_params.bit_depth);
-          av1_apply_temporal_filter_others(
-              frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
-              strength, use_subblock, subblock_filter_weights, noise_levels,
-              block_mse, subblock_mses, q_factor, pred, accum, count);
+          // TODO(any): The avx2/sse2 versions should be aligned with the C
+          // function before wider use. Currently, the avx2/sse2 functions
+          // only support a 32x32 block size, a 5x5 filtering window, 8-bit
+          // encoding, and videos that are not in `YUV 4:2:2` format; all
+          // other cases fall back to the C function.
+          if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5 &&
+              !is_frame_high_bitdepth(frame_to_filter) && !is_yuv422_format) {
+            av1_apply_temporal_filter(
+                frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+                noise_levels, subblock_mses, q_factor, pred, accum, count);
+          } else {
+            av1_apply_temporal_filter_c(
+                frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
+                noise_levels, subblock_mses, q_factor, pred, accum, count);
+          }
         }
       }
 
@@ -1298,9 +1024,9 @@
     av1_setup_scale_factors_for_frame(
         &sf, frames[0]->y_crop_width, frames[0]->y_crop_height,
         frames[0]->y_crop_width, frames[0]->y_crop_height);
-    diff = tf_do_filtering(cpi, frames, num_frames_for_filtering,
-                           filter_frame_idx, is_key_frame, is_second_arf,
-                           TF_BLOCK_SIZE, &sf, strength, noise_levels);
+    diff =
+        tf_do_filtering(cpi, frames, num_frames_for_filtering, filter_frame_idx,
+                        is_key_frame, TF_BLOCK_SIZE, &sf, noise_levels);
   }
 
   if (is_key_frame) {  // Key frame should always be filtered.
diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index 5a6bde2..df01fbd 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h
@@ -25,19 +25,11 @@
 // Block size used in temporal filtering.
 #define TF_BLOCK_SIZE BLOCK_32X32
 
-// Window size for YUV temporal filtering.
-// This is particually used for function `av1_apply_temporal_filter_yuv()`.
-#define TF_YUV_FILTER_WINDOW_LENGTH 3
-// A scale factor used in YUV temporal filtering for weight adjustment.
-#define TF_YUV_FILTER_WEIGHT_SCALE 3
-
-#define TF_ENABLE_PLANEWISE_STRATEGY 1
-// Window size for plane-wise temporal filtering.
-// This is particually used for function `av1_apply_temporal_filter_planewise()`
-#define TF_PLANEWISE_FILTER_WINDOW_LENGTH 5
-// A scale factor used in plane-wise temporal filtering to raise the filter
-// weight from `double` with range [0, 1] to `int` with range [0, 1000].
-#define TF_PLANEWISE_FILTER_WEIGHT_SCALE 1000
+// Window size for temporal filtering.
+#define TF_WINDOW_LENGTH 5
+// A scale factor used in temporal filtering to raise the filter weight from
+// `double` with range [0, 1] to `int` with range [0, 1000].
+#define TF_WEIGHT_SCALE 1000
 
 #define NOISE_ESTIMATION_EDGE_THRESHOLD 50
 // Estimates noise level from a given frame using a single plane (Y, U, or V).
diff --git a/av1/encoder/x86/temporal_filter_avx2.c b/av1/encoder/x86/temporal_filter_avx2.c
index a11f791..6cfea6a 100644
--- a/av1/encoder/x86/temporal_filter_avx2.c
+++ b/av1/encoder/x86/temporal_filter_avx2.c
@@ -127,14 +127,14 @@
   return _mm_extract_epi32(v128a, 0);
 }
 
-static void apply_temporal_filter_planewise(
+static void apply_temporal_filter(
     const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
     const unsigned int stride2, const int block_width, const int block_height,
-    const double sigma, const int decay_control, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
-    unsigned int *accumulator, uint16_t *count, uint16_t *luma_sq_error,
-    uint16_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift) {
-  assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5);
+    const double sigma, const int decay_control, const int *subblock_mses,
+    const int q_factor, unsigned int *accumulator, uint16_t *count,
+    uint16_t *luma_sq_error, uint16_t *chroma_sq_error, int plane,
+    int ss_x_shift, int ss_y_shift) {
+  assert(TF_WINDOW_LENGTH == 5);
   assert(((block_width == 32) && (block_height == 32)) ||
          ((block_width == 16) && (block_height == 16)));
   if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL);
@@ -203,8 +203,7 @@
       const int pixel_value = frame2[i * stride2 + j];
 
       int diff_sse = acc_5x5_sse[i][j];
-      int num_ref_pixels =
-          TF_PLANEWISE_FILTER_WINDOW_LENGTH * TF_PLANEWISE_FILTER_WINDOW_LENGTH;
+      int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH;
 
       // Filter U-plane and V-plane using Y-plane. This is because motion
       // search is only done on Y-plane, so the information from Y-plane will
@@ -223,13 +222,11 @@
       const double window_error = (double)(diff_sse) / num_ref_pixels;
       const int subblock_idx =
           (i >= block_height / 2) * 2 + (j >= block_width / 2);
-      const double block_error =
-          (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+      const double block_error = (double)subblock_mses[subblock_idx];
 
       const double scaled_diff =
           AOMMAX(-(window_error + block_error / 10) / (2 * h * h * q), -15.0);
-      const int adjusted_weight =
-          (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+      const int adjusted_weight = (int)(exp(scaled_diff) * TF_WEIGHT_SCALE);
 
       count[k] += adjusted_weight;
       accumulator[k] += adjusted_weight * pixel_value;
@@ -237,12 +234,11 @@
   }
 }
 
-void av1_apply_temporal_filter_planewise_avx2(
+void av1_apply_temporal_filter_avx2(
     const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_levels, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+    const int num_planes, const double *noise_levels, const int *subblock_mses,
+    const int q_factor, const uint8_t *pred, uint32_t *accum, uint16_t *count) {
   const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
   if (is_high_bitdepth) {
     assert(0 && "Only support low bit-depth with avx2!");
@@ -273,12 +269,11 @@
     const int ss_y_shift =
         mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
 
-    apply_temporal_filter_planewise(
-        ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
-        noise_levels[plane], decay_control, use_subblock, block_mse,
-        subblock_mses, q_factor, accum + mb_pels * plane,
-        count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
-        ss_x_shift, ss_y_shift);
+    apply_temporal_filter(ref, frame_stride, pred + mb_pels * plane, plane_w,
+                          plane_w, plane_h, noise_levels[plane], decay_control,
+                          subblock_mses, q_factor, accum + mb_pels * plane,
+                          count + mb_pels * plane, luma_sq_error,
+                          chroma_sq_error, plane, ss_x_shift, ss_y_shift);
   }
   if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
 }
diff --git a/av1/encoder/x86/temporal_filter_constants.h b/av1/encoder/x86/temporal_filter_constants.h
deleted file mode 100644
index 7cd61d7..0000000
--- a/av1/encoder/x86/temporal_filter_constants.h
+++ /dev/null
@@ -1,407 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#ifndef AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
-#define AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
-
-// Division using multiplication and shifting. The C implementation does:
-// modifier *= 3;
-// modifier /= index;
-// where 'modifier' is a set of summed values and 'index' is the number of
-// summed values.
-//
-// This equation works out to (m * 3) / i which reduces to:
-// m * 3/4
-// m * 1/2
-// m * 1/3
-//
-// By pairing the multiply with a down shift by 16 (_mm_mulhi_epu16):
-// m * C / 65536
-// we can create a C to replicate the division.
-//
-// m * 49152 / 65536 = m * 3/4
-// m * 32758 / 65536 = m * 1/2
-// m * 21846 / 65536 = m * 0.3333
-//
-// These are loaded using an instruction expecting int16_t values but are used
-// with _mm_mulhi_epu16(), which treats them as unsigned.
-#define NEIGHBOR_CONSTANT_4 (int16_t)49152
-#define NEIGHBOR_CONSTANT_5 (int16_t)39322
-#define NEIGHBOR_CONSTANT_6 (int16_t)32768
-#define NEIGHBOR_CONSTANT_7 (int16_t)28087
-#define NEIGHBOR_CONSTANT_8 (int16_t)24576
-#define NEIGHBOR_CONSTANT_9 (int16_t)21846
-#define NEIGHBOR_CONSTANT_10 (int16_t)19661
-#define NEIGHBOR_CONSTANT_11 (int16_t)17874
-#define NEIGHBOR_CONSTANT_13 (int16_t)15124
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_5, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_5
-};
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_7,  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7,
-  NEIGHBOR_CONSTANT_7, NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_1[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
-};
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11
-};
-
-DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_8,
-  NEIGHBOR_CONSTANT_8, NEIGHBOR_CONSTANT_6
-};
-
-DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_2[8]) = {
-  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_11,
-  NEIGHBOR_CONSTANT_11, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_CORNER_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_CORNER_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, LEFT_EDGE_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13
-};
-
-DECLARE_ALIGNED(16, static const int16_t, RIGHT_EDGE_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_EDGE_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const int16_t, MIDDLE_CENTER_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13
-};
-
-DECLARE_ALIGNED(16, static const int16_t, TWO_CORNER_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_8,  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_10,
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const int16_t, TWO_EDGE_NEIGHBORS_PLUS_4[8]) = {
-  NEIGHBOR_CONSTANT_10, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_13,
-  NEIGHBOR_CONSTANT_13, NEIGHBOR_CONSTANT_10
-};
-
-static const int16_t *const LUMA_LEFT_COLUMN_NEIGHBORS[2] = {
-  LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const LUMA_RIGHT_COLUMN_NEIGHBORS[2] = {
-  RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-  LEFT_CORNER_NEIGHBORS_PLUS_1, LEFT_EDGE_NEIGHBORS_PLUS_1
-};
-
-static const int16_t *const CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  MIDDLE_EDGE_NEIGHBORS_PLUS_1, MIDDLE_CENTER_NEIGHBORS_PLUS_1
-};
-
-static const int16_t *const CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-  RIGHT_CORNER_NEIGHBORS_PLUS_1, RIGHT_EDGE_NEIGHBORS_PLUS_1
-};
-
-static const int16_t *const CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-  LEFT_CORNER_NEIGHBORS_PLUS_2, LEFT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  MIDDLE_EDGE_NEIGHBORS_PLUS_2, MIDDLE_CENTER_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-  RIGHT_CORNER_NEIGHBORS_PLUS_2, RIGHT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = {
-  TWO_CORNER_NEIGHBORS_PLUS_2, TWO_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const int16_t *const CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-  LEFT_CORNER_NEIGHBORS_PLUS_4, LEFT_EDGE_NEIGHBORS_PLUS_4
-};
-
-static const int16_t *const CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  MIDDLE_EDGE_NEIGHBORS_PLUS_4, MIDDLE_CENTER_NEIGHBORS_PLUS_4
-};
-
-static const int16_t *const CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-  RIGHT_CORNER_NEIGHBORS_PLUS_4, RIGHT_EDGE_NEIGHBORS_PLUS_4
-};
-
-static const int16_t *const CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS[2] = {
-  TWO_CORNER_NEIGHBORS_PLUS_4, TWO_EDGE_NEIGHBORS_PLUS_4
-};
-
-#define HIGHBD_NEIGHBOR_CONSTANT_4 (uint32_t)3221225472U
-#define HIGHBD_NEIGHBOR_CONSTANT_5 (uint32_t)2576980378U
-#define HIGHBD_NEIGHBOR_CONSTANT_6 (uint32_t)2147483648U
-#define HIGHBD_NEIGHBOR_CONSTANT_7 (uint32_t)1840700270U
-#define HIGHBD_NEIGHBOR_CONSTANT_8 (uint32_t)1610612736U
-#define HIGHBD_NEIGHBOR_CONSTANT_9 (uint32_t)1431655766U
-#define HIGHBD_NEIGHBOR_CONSTANT_10 (uint32_t)1288490189U
-#define HIGHBD_NEIGHBOR_CONSTANT_11 (uint32_t)1171354718U
-#define HIGHBD_NEIGHBOR_CONSTANT_13 (uint32_t)991146300U
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_5, HIGHBD_NEIGHBOR_CONSTANT_7,
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7,
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_5
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7,
-  HIGHBD_NEIGHBOR_CONSTANT_7, HIGHBD_NEIGHBOR_CONSTANT_7
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_6, HIGHBD_NEIGHBOR_CONSTANT_8,
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8,
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_6
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_11,
-  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11,
-  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8,
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11,
-  HIGHBD_NEIGHBOR_CONSTANT_11, HIGHBD_NEIGHBOR_CONSTANT_11
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_8, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_8
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_13,
-  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13,
-  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10,
-  HIGHBD_NEIGHBOR_CONSTANT_10, HIGHBD_NEIGHBOR_CONSTANT_10
-};
-
-DECLARE_ALIGNED(16, static const uint32_t,
-                HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4[4]) = {
-  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13,
-  HIGHBD_NEIGHBOR_CONSTANT_13, HIGHBD_NEIGHBOR_CONSTANT_13
-};
-
-static const uint32_t *const HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const uint32_t *const HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2
-};
-
-static const uint32_t *const HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2
-};
-
-static const uint32_t *const HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_1
-};
-
-static const uint32_t *const HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_1, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_1
-};
-
-static const uint32_t *const HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-  HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_1, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_1
-};
-
-static const uint32_t
-    *const HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_2
-    };
-
-static const uint32_t
-    *const HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_2, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_2
-    };
-
-static const uint32_t
-    *const HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_2, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_2
-    };
-
-static const uint32_t
-    *const HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_LEFT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_LEFT_EDGE_NEIGHBORS_PLUS_4
-    };
-
-static const uint32_t
-    *const HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_MIDDLE_EDGE_NEIGHBORS_PLUS_4, HIGHBD_MIDDLE_CENTER_NEIGHBORS_PLUS_4
-    };
-
-static const uint32_t
-    *const HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS[2] = {
-      HIGHBD_RIGHT_CORNER_NEIGHBORS_PLUS_4, HIGHBD_RIGHT_EDGE_NEIGHBORS_PLUS_4
-    };
-
-#define DIST_STRIDE ((BW) + 2)
-#endif  // AOM_AV1_ENCODER_X86_TEMPORAL_FILTER_CONSTANTS_H_
diff --git a/av1/encoder/x86/temporal_filter_sse2.c b/av1/encoder/x86/temporal_filter_sse2.c
index 98a6b82..bf34df9 100644
--- a/av1/encoder/x86/temporal_filter_sse2.c
+++ b/av1/encoder/x86/temporal_filter_sse2.c
@@ -102,14 +102,14 @@
   return _mm_cvtsi128_si32(veca);
 }
 
-static void apply_temporal_filter_planewise(
+static void apply_temporal_filter(
     const uint8_t *frame1, const unsigned int stride, const uint8_t *frame2,
     const unsigned int stride2, const int block_width, const int block_height,
-    const double sigma, const int decay_control, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
-    unsigned int *accumulator, uint16_t *count, uint16_t *luma_sq_error,
-    uint16_t *chroma_sq_error, int plane, int ss_x_shift, int ss_y_shift) {
-  assert(TF_PLANEWISE_FILTER_WINDOW_LENGTH == 5);
+    const double sigma, const int decay_control, const int *subblock_mses,
+    const int q_factor, unsigned int *accumulator, uint16_t *count,
+    uint16_t *luma_sq_error, uint16_t *chroma_sq_error, int plane,
+    int ss_x_shift, int ss_y_shift) {
+  assert(TF_WINDOW_LENGTH == 5);
   assert(((block_width == 32) && (block_height == 32)) ||
          ((block_width == 16) && (block_height == 16)));
   if (plane > PLANE_TYPE_Y) assert(chroma_sq_error != NULL);
@@ -180,8 +180,7 @@
       const int pixel_value = frame2[i * stride2 + j];
 
       int diff_sse = acc_5x5_sse[i][j];
-      int num_ref_pixels =
-          TF_PLANEWISE_FILTER_WINDOW_LENGTH * TF_PLANEWISE_FILTER_WINDOW_LENGTH;
+      int num_ref_pixels = TF_WINDOW_LENGTH * TF_WINDOW_LENGTH;
 
       // Filter U-plane and V-plane using Y-plane. This is because motion
       // search is only done on Y-plane, so the information from Y-plane will
@@ -201,13 +200,11 @@
       const double window_error = (double)(diff_sse) / num_ref_pixels;
       const int subblock_idx =
           (i >= block_height / 2) * 2 + (j >= block_width / 2);
-      const double block_error =
-          (double)(use_subblock ? subblock_mses[subblock_idx] : block_mse);
+      const double block_error = (double)subblock_mses[subblock_idx];
 
       const double scaled_diff =
           AOMMAX(-(window_error + block_error / 10) / (2 * h * h * q), -15.0);
-      const int adjusted_weight =
-          (int)(exp(scaled_diff) * TF_PLANEWISE_FILTER_WEIGHT_SCALE);
+      const int adjusted_weight = (int)(exp(scaled_diff) * TF_WEIGHT_SCALE);
 
       count[k] += adjusted_weight;
       accumulator[k] += adjusted_weight * pixel_value;
@@ -215,12 +212,11 @@
   }
 }
 
-void av1_apply_temporal_filter_planewise_sse2(
+void av1_apply_temporal_filter_sse2(
     const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_levels, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count) {
+    const int num_planes, const double *noise_levels, const int *subblock_mses,
+    const int q_factor, const uint8_t *pred, uint32_t *accum, uint16_t *count) {
   const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
   if (is_high_bitdepth) {
     assert(0 && "Only support low bit-depth with sse2!");
@@ -251,12 +247,11 @@
     const int ss_y_shift =
         mbd->plane[plane].subsampling_y - mbd->plane[0].subsampling_y;
 
-    apply_temporal_filter_planewise(
-        ref, frame_stride, pred + mb_pels * plane, plane_w, plane_w, plane_h,
-        noise_levels[plane], decay_control, use_subblock, block_mse,
-        subblock_mses, q_factor, accum + mb_pels * plane,
-        count + mb_pels * plane, luma_sq_error, chroma_sq_error, plane,
-        ss_x_shift, ss_y_shift);
+    apply_temporal_filter(ref, frame_stride, pred + mb_pels * plane, plane_w,
+                          plane_w, plane_h, noise_levels[plane], decay_control,
+                          subblock_mses, q_factor, accum + mb_pels * plane,
+                          count + mb_pels * plane, luma_sq_error,
+                          chroma_sq_error, plane, ss_x_shift, ss_y_shift);
   }
   if (chroma_sq_error != NULL) aom_free(chroma_sq_error);
 }
diff --git a/av1/encoder/x86/temporal_filter_sse4.c b/av1/encoder/x86/temporal_filter_sse4.c
deleted file mode 100644
index e3f9f5f..0000000
--- a/av1/encoder/x86/temporal_filter_sse4.c
+++ /dev/null
@@ -1,2044 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <assert.h>
-#include <smmintrin.h>
-
-#include "config/av1_rtcd.h"
-#include "aom/aom_integer.h"
-#include "av1/encoder/encoder.h"
-#include "av1/encoder/temporal_filter.h"
-#include "av1/encoder/x86/temporal_filter_constants.h"
-
-//////////////////////////
-// Low bit-depth Begins //
-//////////////////////////
-
-// Read in 8 pixels from a and b as 8-bit unsigned integers, compute the
-// difference squared, and store as unsigned 16-bit integer to dst.
-static INLINE void store_dist_8(const uint8_t *a, const uint8_t *b,
-                                uint16_t *dst) {
-  const __m128i a_reg = _mm_loadl_epi64((const __m128i *)a);
-  const __m128i b_reg = _mm_loadl_epi64((const __m128i *)b);
-
-  const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
-  const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
-
-  __m128i dist_first;
-
-  dist_first = _mm_sub_epi16(a_first, b_first);
-  dist_first = _mm_mullo_epi16(dist_first, dist_first);
-
-  _mm_storeu_si128((__m128i *)dst, dist_first);
-}
-
-static INLINE void store_dist_16(const uint8_t *a, const uint8_t *b,
-                                 uint16_t *dst) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
-  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
-
-  const __m128i a_first = _mm_cvtepu8_epi16(a_reg);
-  const __m128i a_second = _mm_unpackhi_epi8(a_reg, zero);
-  const __m128i b_first = _mm_cvtepu8_epi16(b_reg);
-  const __m128i b_second = _mm_unpackhi_epi8(b_reg, zero);
-
-  __m128i dist_first, dist_second;
-
-  dist_first = _mm_sub_epi16(a_first, b_first);
-  dist_second = _mm_sub_epi16(a_second, b_second);
-  dist_first = _mm_mullo_epi16(dist_first, dist_first);
-  dist_second = _mm_mullo_epi16(dist_second, dist_second);
-
-  _mm_storeu_si128((__m128i *)dst, dist_first);
-  _mm_storeu_si128((__m128i *)(dst + 8), dist_second);
-}
-
-static INLINE void read_dist_8(const uint16_t *dist, __m128i *dist_reg) {
-  *dist_reg = _mm_loadu_si128((const __m128i *)dist);
-}
-
-static INLINE void read_dist_16(const uint16_t *dist, __m128i *reg_first,
-                                __m128i *reg_second) {
-  read_dist_8(dist, reg_first);
-  read_dist_8(dist + 8, reg_second);
-}
-
-// Average the value based on the number of values summed (9 for pixels away
-// from the border, 4 for pixels in corners, and 6 for other edge values).
-//
-// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
-// by weight.
-static __m128i average_8(__m128i sum, const __m128i *mul_constants,
-                         const int strength, const int rounding,
-                         const int weight) {
-  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
-  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
-  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
-  const __m128i weight_u16 = _mm_set1_epi16(weight);
-  const __m128i sixteen = _mm_set1_epi16(16);
-
-  // modifier * 3 / index;
-  sum = _mm_mulhi_epu16(sum, *mul_constants);
-
-  sum = _mm_adds_epu16(sum, rounding_u16);
-  sum = _mm_srl_epi16(sum, strength_u128);
-
-  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
-  // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
-  // So this needs to use the epu16 version which did not come until SSE4.
-  sum = _mm_min_epu16(sum, sixteen);
-
-  sum = _mm_sub_epi16(sixteen, sum);
-
-  return _mm_mullo_epi16(sum, weight_u16);
-}
-
-static __m128i average_4_4(__m128i sum, const __m128i *mul_constants,
-                           const int strength, const int rounding,
-                           const int weight_0, const int weight_1) {
-  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
-  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
-  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
-  const __m128i weight_u16 =
-      _mm_setr_epi16(weight_0, weight_0, weight_0, weight_0, weight_1, weight_1,
-                     weight_1, weight_1);
-  const __m128i sixteen = _mm_set1_epi16(16);
-
-  // modifier * 3 / index;
-  sum = _mm_mulhi_epu16(sum, *mul_constants);
-
-  sum = _mm_adds_epu16(sum, rounding_u16);
-  sum = _mm_srl_epi16(sum, strength_u128);
-
-  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
-  // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
-  // So this needs to use the epu16 version which did not come until SSE4.
-  sum = _mm_min_epu16(sum, sixteen);
-
-  sum = _mm_sub_epi16(sixteen, sum);
-
-  return _mm_mullo_epi16(sum, weight_u16);
-}
-
-static INLINE void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
-                              const __m128i *mul_constants_0,
-                              const __m128i *mul_constants_1,
-                              const int strength, const int rounding,
-                              const int weight) {
-  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
-  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
-  const __m128i weight_u16 = _mm_set1_epi16(weight);
-  const __m128i sixteen = _mm_set1_epi16(16);
-  __m128i input_0, input_1;
-
-  input_0 = _mm_mulhi_epu16(*sum_0_u16, *mul_constants_0);
-  input_0 = _mm_adds_epu16(input_0, rounding_u16);
-
-  input_1 = _mm_mulhi_epu16(*sum_1_u16, *mul_constants_1);
-  input_1 = _mm_adds_epu16(input_1, rounding_u16);
-
-  input_0 = _mm_srl_epi16(input_0, strength_u128);
-  input_1 = _mm_srl_epi16(input_1, strength_u128);
-
-  input_0 = _mm_min_epu16(input_0, sixteen);
-  input_1 = _mm_min_epu16(input_1, sixteen);
-  input_0 = _mm_sub_epi16(sixteen, input_0);
-  input_1 = _mm_sub_epi16(sixteen, input_1);
-
-  *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16);
-  *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16);
-}
-
-// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
-static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
-                                   uint16_t *count, uint32_t *accumulator) {
-  const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
-  const __m128i zero = _mm_setzero_si128();
-  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
-  __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
-  __m128i pred_0_u32, pred_1_u32;
-  __m128i accum_0_u32, accum_1_u32;
-
-  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
-  _mm_storeu_si128((__m128i *)count, count_u16);
-
-  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
-
-  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
-  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
-
-  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
-  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
-
-  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
-  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
-
-  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
-  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
-}
-
-static INLINE void accumulate_and_store_16(const __m128i sum_0_u16,
-                                           const __m128i sum_1_u16,
-                                           const uint8_t *pred, uint16_t *count,
-                                           uint32_t *accumulator) {
-  const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred);
-  const __m128i zero = _mm_setzero_si128();
-  __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count),
-          count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8));
-  __m128i pred_0_u16 = _mm_cvtepu8_epi16(pred_u8),
-          pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero);
-  __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32;
-  __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;
-
-  count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16);
-  _mm_storeu_si128((__m128i *)count, count_0_u16);
-
-  count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16);
-  _mm_storeu_si128((__m128i *)(count + 8), count_1_u16);
-
-  pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16);
-  pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16);
-
-  pred_0_u32 = _mm_cvtepu16_epi32(pred_0_u16);
-  pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero);
-  pred_2_u32 = _mm_cvtepu16_epi32(pred_1_u16);
-  pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero);
-
-  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
-  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
-  accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8));
-  accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12));
-
-  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
-  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
-  accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32);
-  accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32);
-
-  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
-  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
-  _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32);
-  _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32);
-}
-
-// Read in 8 pixels from y_dist. For each index i, compute y_dist[i-1] +
-// y_dist[i] + y_dist[i+1] and store in sum as 16-bit unsigned int.
-static INLINE void get_sum_8(const uint16_t *y_dist, __m128i *sum) {
-  __m128i dist_reg, dist_left, dist_right;
-
-  dist_reg = _mm_loadu_si128((const __m128i *)y_dist);
-  dist_left = _mm_loadu_si128((const __m128i *)(y_dist - 1));
-  dist_right = _mm_loadu_si128((const __m128i *)(y_dist + 1));
-
-  *sum = _mm_adds_epu16(dist_reg, dist_left);
-  *sum = _mm_adds_epu16(*sum, dist_right);
-}
-
-// Read in 16 pixels from y_dist. For each index i, compute y_dist[i-1] +
-// y_dist[i] + y_dist[i+1]. Store the result for first 8 pixels in sum_first and
-// the rest in sum_second.
-static INLINE void get_sum_16(const uint16_t *y_dist, __m128i *sum_first,
-                              __m128i *sum_second) {
-  get_sum_8(y_dist, sum_first);
-  get_sum_8(y_dist + 8, sum_second);
-}
-
-// Read in a row of chroma values corresponds to a row of 16 luma values.
-static INLINE void read_chroma_dist_row_16(int ss_x, const uint16_t *u_dist,
-                                           const uint16_t *v_dist,
-                                           __m128i *u_first, __m128i *u_second,
-                                           __m128i *v_first,
-                                           __m128i *v_second) {
-  if (!ss_x) {
-    // If there is no chroma subsampling in the horizontal direction, then we
-    // need to load 16 entries from chroma.
-    read_dist_16(u_dist, u_first, u_second);
-    read_dist_16(v_dist, v_first, v_second);
-  } else {  // ss_x == 1
-    // Otherwise, we only need to load 8 entries
-    __m128i u_reg, v_reg;
-
-    read_dist_8(u_dist, &u_reg);
-
-    *u_first = _mm_unpacklo_epi16(u_reg, u_reg);
-    *u_second = _mm_unpackhi_epi16(u_reg, u_reg);
-
-    read_dist_8(v_dist, &v_reg);
-
-    *v_first = _mm_unpacklo_epi16(v_reg, v_reg);
-    *v_second = _mm_unpackhi_epi16(v_reg, v_reg);
-  }
-}
-
-// Horizontal add unsigned 16-bit ints in src and store them as signed 32-bit
-// int in dst.
-static INLINE void hadd_epu16(__m128i *src, __m128i *dst) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i shift_right = _mm_srli_si128(*src, 2);
-
-  const __m128i odd = _mm_blend_epi16(shift_right, zero, 170);
-  const __m128i even = _mm_blend_epi16(*src, zero, 170);
-
-  *dst = _mm_add_epi32(even, odd);
-}
-
-// Add a row of luma distortion to 8 corresponding chroma mods.
-static INLINE void add_luma_dist_to_8_chroma_mod(const uint16_t *y_dist,
-                                                 int ss_x, int ss_y,
-                                                 __m128i *u_mod,
-                                                 __m128i *v_mod) {
-  __m128i y_reg;
-  if (!ss_x) {
-    read_dist_8(y_dist, &y_reg);
-    if (ss_y == 1) {
-      __m128i y_tmp;
-      read_dist_8(y_dist + DIST_STRIDE, &y_tmp);
-
-      y_reg = _mm_adds_epu16(y_reg, y_tmp);
-    }
-  } else {
-    __m128i y_first, y_second;
-    read_dist_16(y_dist, &y_first, &y_second);
-    if (ss_y == 1) {
-      __m128i y_tmp_0, y_tmp_1;
-      read_dist_16(y_dist + DIST_STRIDE, &y_tmp_0, &y_tmp_1);
-
-      y_first = _mm_adds_epu16(y_first, y_tmp_0);
-      y_second = _mm_adds_epu16(y_second, y_tmp_1);
-    }
-
-    hadd_epu16(&y_first, &y_first);
-    hadd_epu16(&y_second, &y_second);
-
-    y_reg = _mm_packus_epi32(y_first, y_second);
-  }
-
-  *u_mod = _mm_adds_epu16(*u_mod, y_reg);
-  *v_mod = _mm_adds_epu16(*v_mod, y_reg);
-}
-
-// Apply temporal filter to the luma components. This performs temporal
-// filtering on a luma block of 16 X block_height. Use blk_fw as an array of
-// size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL,
-// else use top_weight for top half, and bottom weight for bottom half.
-static void apply_temporal_filter_luma_16(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
-    uint16_t *y_count, const uint16_t *y_dist, const uint16_t *u_dist,
-    const uint16_t *v_dist, const int16_t *const *neighbors_first,
-    const int16_t *const *neighbors_second, int top_weight, int bottom_weight,
-    const int *blk_fw) {
-  const int rounding = (1 << strength) >> 1;
-  int weight = top_weight;
-
-  __m128i mul_first, mul_second;
-
-  __m128i sum_row_1_first, sum_row_1_second;
-  __m128i sum_row_2_first, sum_row_2_second;
-  __m128i sum_row_3_first, sum_row_3_second;
-
-  __m128i u_first, u_second;
-  __m128i v_first, v_second;
-
-  __m128i sum_row_first;
-  __m128i sum_row_second;
-
-  // Loop variables
-  unsigned int h;
-
-  assert(strength >= 0);
-  assert(strength <= 6);
-
-  assert(block_width == 16);
-
-  (void)block_width;
-
-  // First row
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
-
-  // Add luma values
-  get_sum_16(y_dist, &sum_row_2_first, &sum_row_2_second);
-  get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
-
-  sum_row_first = _mm_adds_epu16(sum_row_2_first, sum_row_3_first);
-  sum_row_second = _mm_adds_epu16(sum_row_2_second, sum_row_3_second);
-
-  // Add chroma values
-  read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
-                          &v_second);
-
-  sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
-  sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
-
-  sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
-  sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    sum_row_first =
-        average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
-    sum_row_second =
-        average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
-  } else {
-    average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
-               strength, rounding, weight);
-  }
-  accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
-                          y_accum);
-
-  y_src += y_src_stride;
-  y_pre += y_pre_stride;
-  y_count += y_pre_stride;
-  y_accum += y_pre_stride;
-  y_dist += DIST_STRIDE;
-
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
-  u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
-  v_dist += DIST_STRIDE;
-
-  // Then all the rows except the last one
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
-
-  for (h = 1; h < block_height - 1; ++h) {
-    // Move the weight to bottom half
-    if (!use_whole_blk && h == block_height / 2) {
-      if (blk_fw) {
-        blk_fw += 2;
-      } else {
-        weight = bottom_weight;
-      }
-    }
-    // Shift the rows up
-    sum_row_1_first = sum_row_2_first;
-    sum_row_1_second = sum_row_2_second;
-    sum_row_2_first = sum_row_3_first;
-    sum_row_2_second = sum_row_3_second;
-
-    // Add luma values to the modifier
-    sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
-    sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
-
-    get_sum_16(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
-
-    sum_row_first = _mm_adds_epu16(sum_row_first, sum_row_3_first);
-    sum_row_second = _mm_adds_epu16(sum_row_second, sum_row_3_second);
-
-    // Add chroma values to the modifier
-    if (ss_y == 0 || h % 2 == 0) {
-      // Only calculate the new chroma distortion if we are at a pixel that
-      // corresponds to a new chroma row
-      read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second,
-                              &v_first, &v_second);
-
-      u_src += uv_src_stride;
-      u_pre += uv_pre_stride;
-      u_dist += DIST_STRIDE;
-      v_src += uv_src_stride;
-      v_pre += uv_pre_stride;
-      v_dist += DIST_STRIDE;
-    }
-
-    sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
-    sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
-    sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
-    sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
-
-    // Get modifier and store result
-    if (blk_fw) {
-      sum_row_first =
-          average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
-      sum_row_second =
-          average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
-    } else {
-      average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
-                 strength, rounding, weight);
-    }
-    accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
-                            y_accum);
-
-    y_src += y_src_stride;
-    y_pre += y_pre_stride;
-    y_count += y_pre_stride;
-    y_accum += y_pre_stride;
-    y_dist += DIST_STRIDE;
-  }
-
-  // The last row
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
-
-  // Shift the rows up
-  sum_row_1_first = sum_row_2_first;
-  sum_row_1_second = sum_row_2_second;
-  sum_row_2_first = sum_row_3_first;
-  sum_row_2_second = sum_row_3_second;
-
-  // Add luma values to the modifier
-  sum_row_first = _mm_adds_epu16(sum_row_1_first, sum_row_2_first);
-  sum_row_second = _mm_adds_epu16(sum_row_1_second, sum_row_2_second);
-
-  // Add chroma values to the modifier
-  if (ss_y == 0) {
-    // Only calculate the new chroma distortion if we are at a pixel that
-    // corresponds to a new chroma row
-    read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second, &v_first,
-                            &v_second);
-  }
-
-  sum_row_first = _mm_adds_epu16(sum_row_first, u_first);
-  sum_row_second = _mm_adds_epu16(sum_row_second, u_second);
-  sum_row_first = _mm_adds_epu16(sum_row_first, v_first);
-  sum_row_second = _mm_adds_epu16(sum_row_second, v_second);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    sum_row_first =
-        average_8(sum_row_first, &mul_first, strength, rounding, blk_fw[0]);
-    sum_row_second =
-        average_8(sum_row_second, &mul_second, strength, rounding, blk_fw[1]);
-  } else {
-    average_16(&sum_row_first, &sum_row_second, &mul_first, &mul_second,
-               strength, rounding, weight);
-  }
-  accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
-                          y_accum);
-}
-
-// Perform temporal filter for the luma component.
-static void apply_temporal_filter_luma(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *y_accum, uint16_t *y_count, const uint16_t *y_dist,
-    const uint16_t *u_dist, const uint16_t *v_dist) {
-  unsigned int blk_col = 0, uv_blk_col = 0;
-  const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
-  const unsigned int mid_width = block_width >> 1,
-                     last_width = block_width - blk_col_step;
-  int top_weight = blk_fw[0],
-      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
-  const int16_t *const *neighbors_first;
-  const int16_t *const *neighbors_second;
-
-  if (block_width == 16) {
-    // Special Case: The blockwidth is 16 and we are operating on a row of 16
-    // chroma pixels. In this case, we can't use the usualy left-midle-right
-    // pattern. We also don't support splitting now.
-    neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
-    neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
-    if (use_whole_blk) {
-      apply_temporal_filter_luma_16(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
-          block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-          y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-          v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-          bottom_weight, NULL);
-    } else {
-      apply_temporal_filter_luma_16(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
-          block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-          y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-          v_dist + uv_blk_col, neighbors_first, neighbors_second, 0, 0, blk_fw);
-    }
-
-    return;
-  }
-
-  // Left
-  neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
-  neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
-  apply_temporal_filter_luma_16(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
-      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
-      neighbors_second, top_weight, bottom_weight, NULL);
-
-  blk_col += blk_col_step;
-  uv_blk_col += uv_blk_col_step;
-
-  // Middle First
-  neighbors_first = LUMA_MIDDLE_COLUMN_NEIGHBORS;
-  for (; blk_col < mid_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    apply_temporal_filter_luma_16(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
-        ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight, NULL);
-  }
-
-  if (!use_whole_blk) {
-    top_weight = blk_fw[1];
-    bottom_weight = blk_fw[3];
-  }
-
-  // Middle Second
-  for (; blk_col < last_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    apply_temporal_filter_luma_16(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
-        ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight, NULL);
-  }
-
-  // Right
-  neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
-  apply_temporal_filter_luma_16(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
-      use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
-      neighbors_second, top_weight, bottom_weight, NULL);
-}
-
-// Apply temporal filter to the chroma components. This performs temporal
-// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
-// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
-// else use top_weight for top half, and bottom weight for bottom half.
-static void apply_temporal_filter_chroma_8(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int uv_block_width,
-    unsigned int uv_block_height, int ss_x, int ss_y, int strength,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
-    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
-    const int16_t *const *neighbors, int top_weight, int bottom_weight,
-    const int *blk_fw) {
-  const int rounding = (1 << strength) >> 1;
-  int weight = top_weight;
-
-  __m128i mul;
-
-  __m128i u_sum_row_1, u_sum_row_2, u_sum_row_3;
-  __m128i v_sum_row_1, v_sum_row_2, v_sum_row_3;
-
-  __m128i u_sum_row, v_sum_row;
-
-  // Loop variable
-  unsigned int h;
-
-  (void)uv_block_width;
-
-  // First row
-  mul = _mm_loadu_si128((const __m128i *)neighbors[0]);
-
-  // Add chroma values
-  get_sum_8(u_dist, &u_sum_row_2);
-  get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
-
-  u_sum_row = _mm_adds_epu16(u_sum_row_2, u_sum_row_3);
-
-  get_sum_8(v_dist, &v_sum_row_2);
-  get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
-
-  v_sum_row = _mm_adds_epu16(v_sum_row_2, v_sum_row_3);
-
-  // Add luma values
-  add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    u_sum_row =
-        average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
-    v_sum_row =
-        average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
-  } else {
-    u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
-    v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
-  }
-  accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
-  accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
-
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
-  u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
-  v_dist += DIST_STRIDE;
-  u_count += uv_pre_stride;
-  u_accum += uv_pre_stride;
-  v_count += uv_pre_stride;
-  v_accum += uv_pre_stride;
-
-  y_src += y_src_stride * (1 + ss_y);
-  y_pre += y_pre_stride * (1 + ss_y);
-  y_dist += DIST_STRIDE * (1 + ss_y);
-
-  // Then all the rows except the last one
-  mul = _mm_loadu_si128((const __m128i *)neighbors[1]);
-
-  for (h = 1; h < uv_block_height - 1; ++h) {
-    // Move the weight pointer to the bottom half of the blocks
-    if (h == uv_block_height / 2) {
-      if (blk_fw) {
-        blk_fw += 2;
-      } else {
-        weight = bottom_weight;
-      }
-    }
-
-    // Shift the rows up
-    u_sum_row_1 = u_sum_row_2;
-    u_sum_row_2 = u_sum_row_3;
-
-    v_sum_row_1 = v_sum_row_2;
-    v_sum_row_2 = v_sum_row_3;
-
-    // Add chroma values
-    u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
-    get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3);
-    u_sum_row = _mm_adds_epu16(u_sum_row, u_sum_row_3);
-
-    v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
-    get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3);
-    v_sum_row = _mm_adds_epu16(v_sum_row, v_sum_row_3);
-
-    // Add luma values
-    add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
-
-    // Get modifier and store result
-    if (blk_fw) {
-      u_sum_row = average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0],
-                              blk_fw[1]);
-      v_sum_row = average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0],
-                              blk_fw[1]);
-    } else {
-      u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
-      v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
-    }
-
-    accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
-    accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
-
-    u_src += uv_src_stride;
-    u_pre += uv_pre_stride;
-    u_dist += DIST_STRIDE;
-    v_src += uv_src_stride;
-    v_pre += uv_pre_stride;
-    v_dist += DIST_STRIDE;
-    u_count += uv_pre_stride;
-    u_accum += uv_pre_stride;
-    v_count += uv_pre_stride;
-    v_accum += uv_pre_stride;
-
-    y_src += y_src_stride * (1 + ss_y);
-    y_pre += y_pre_stride * (1 + ss_y);
-    y_dist += DIST_STRIDE * (1 + ss_y);
-  }
-
-  // The last row
-  mul = _mm_loadu_si128((const __m128i *)neighbors[0]);
-
-  // Shift the rows up
-  u_sum_row_1 = u_sum_row_2;
-  u_sum_row_2 = u_sum_row_3;
-
-  v_sum_row_1 = v_sum_row_2;
-  v_sum_row_2 = v_sum_row_3;
-
-  // Add chroma values
-  u_sum_row = _mm_adds_epu16(u_sum_row_1, u_sum_row_2);
-  v_sum_row = _mm_adds_epu16(v_sum_row_1, v_sum_row_2);
-
-  // Add luma values
-  add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row, &v_sum_row);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    u_sum_row =
-        average_4_4(u_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
-    v_sum_row =
-        average_4_4(v_sum_row, &mul, strength, rounding, blk_fw[0], blk_fw[1]);
-  } else {
-    u_sum_row = average_8(u_sum_row, &mul, strength, rounding, weight);
-    v_sum_row = average_8(v_sum_row, &mul, strength, rounding, weight);
-  }
-
-  accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
-  accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
-}
-
-// Perform temporal filter on the U and V planes, 8 chroma columns at a time.
-static void apply_temporal_filter_chroma(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
-    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
-  const unsigned int uv_width = block_width >> ss_x,
-                     uv_height = block_height >> ss_y;
-
-  unsigned int blk_col = 0, uv_blk_col = 0;
-  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
-  const unsigned int uv_mid_width = uv_width >> 1,
-                     uv_last_width = uv_width - uv_blk_col_step;
-  int top_weight = blk_fw[0],
-      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
-  const int16_t *const *neighbors;
-
-  if (uv_width == 8) {
-    // Special Case: We are subsampling in x direction on a 16x16 block. Since
-    // we are operating on a row of 8 chroma pixels, we can't use the usual
-    // left-middle-right pattern.
-    assert(ss_x);
-
-    if (ss_y) {
-      neighbors = CHROMA_DOUBLE_SS_SINGLE_COLUMN_NEIGHBORS;
-    } else {
-      neighbors = CHROMA_SINGLE_SS_SINGLE_COLUMN_NEIGHBORS;
-    }
-
-    if (use_whole_blk) {
-      apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-          top_weight, bottom_weight, NULL);  // NULL: use top/bottom weights
-    } else {
-      apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-          0, 0, blk_fw);  // per-subblock weights come from blk_fw instead
-    }
-
-    return;
-  }
-
-  // Left column: left-edge neighbor table, weights blk_fw[0] / bottom_weight
-  if (ss_x && ss_y) {
-    neighbors = CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors = CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
-  } else {
-    neighbors = CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
-  }
-
-  apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
-      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
-      bottom_weight, NULL);
-
-  blk_col += blk_col_step;
-  uv_blk_col += uv_blk_col_step;
-
-  // Middle First: interior columns before the horizontal midpoint
-  if (ss_x && ss_y) {
-    neighbors = CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors = CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else {
-    neighbors = CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
-  }
-
-  for (; uv_blk_col < uv_mid_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-        top_weight, bottom_weight, NULL);
-  }
-  // From the midpoint on, use the second pair of subblock weights
-  if (!use_whole_blk) {
-    top_weight = blk_fw[1];
-    bottom_weight = blk_fw[3];
-  }
-
-  // Middle Second: interior columns from the midpoint up to the last column
-  for (; uv_blk_col < uv_last_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-        top_weight, bottom_weight, NULL);
-  }
-
-  // Right column: right-edge neighbor table
-  if (ss_x && ss_y) {
-    neighbors = CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors = CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
-  } else {
-    neighbors = CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
-  }
-
-  apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
-      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
-      bottom_weight, NULL);
-}
-// Apply the YUV temporal filter to one block, updating accum and count.
-static void apply_temporal_filter_yuv(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int strength, const int use_subblock,
-    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
-    uint16_t *count) {
-  const int use_whole_blk = !use_subblock;
-  const int *blk_fw = subblock_filter_weights;
-
-  // Block information (Y-plane).
-  const unsigned int block_height = block_size_high[block_size];
-  const unsigned int block_width = block_size_wide[block_size];
-  const int mb_pels = block_height * block_width;
-  const int y_src_stride = ref_frame->y_stride;
-  const int y_pre_stride = block_width;
-  const int mb_y_src_offset =
-      mb_row * block_height * ref_frame->y_stride + mb_col * block_width;
-
-  // Block information (UV-plane).
-  const int ss_y = mbd->plane[1].subsampling_y;
-  const int ss_x = mbd->plane[1].subsampling_x;
-  const unsigned int uv_height = block_height >> ss_y;
-  const unsigned int uv_width = block_width >> ss_x;
-  const int uv_src_stride = ref_frame->uv_stride;
-  const int uv_pre_stride = block_width >> ss_x;
-  const int mb_uv_src_offset =
-      mb_row * uv_height * ref_frame->uv_stride + mb_col * uv_width;
-  // pred, accum and count hold Y, U and V back to back, mb_pels apart.
-  const uint8_t *y_src = ref_frame->y_buffer + mb_y_src_offset;
-  const uint8_t *u_src = ref_frame->u_buffer + mb_uv_src_offset;
-  const uint8_t *v_src = ref_frame->v_buffer + mb_uv_src_offset;
-  const uint8_t *y_pre = pred;
-  const uint8_t *u_pre = pred + mb_pels;
-  const uint8_t *v_pre = pred + mb_pels * 2;
-  uint32_t *y_accum = accum;
-  uint32_t *u_accum = accum + mb_pels;
-  uint32_t *v_accum = accum + mb_pels * 2;
-  uint16_t *y_count = count;
-  uint16_t *u_count = count + mb_pels;
-  uint16_t *v_count = count + mb_pels * 2;
-  // Same values as uv_height / uv_width above.
-  const unsigned int chroma_height = block_height >> ss_y,
-                     chroma_width = block_width >> ss_x;
-  // Zero-initialised squared-difference buffers, one per plane.
-  DECLARE_ALIGNED(16, uint16_t, y_dist[BH * DIST_STRIDE]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, u_dist[BH * DIST_STRIDE]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, v_dist[BH * DIST_STRIDE]) = { 0 };
-  const int *blk_fw_ptr = blk_fw;
-  // Start at index 1 (presumably so left-neighbor reads hit a zero column).
-  uint16_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
-           *v_dist_ptr = v_dist + 1;
-  const uint8_t *y_src_ptr = y_src, *u_src_ptr = u_src, *v_src_ptr = v_src;
-  const uint8_t *y_pre_ptr = y_pre, *u_pre_ptr = u_pre, *v_pre_ptr = v_pre;
-
-  // Loop variables
-  unsigned int row, blk_col;
-
-  assert(block_width <= BW && "block width too large");
-  assert(block_height <= BH && "block height too large");
-  assert(block_width % 16 == 0 && "block width must be multiple of 16");
-  assert(block_height % 2 == 0 && "block height must be even");
-  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
-         "invalid chroma subsampling");
-  assert(strength >= 0 && strength <= 6 && "invalid temporal filter strength");
-  assert(blk_fw[0] >= 0 && "filter weight must be positive");
-  assert(
-      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
-      "subblock filter weight must be positive");
-  assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2");
-  assert(
-      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
-      "subblock filter weight must be less than 2");
-
-  // Precompute the squared differences, 16 luma pixels at a time
-  for (row = 0; row < block_height; row++) {
-    for (blk_col = 0; blk_col < block_width; blk_col += 16) {
-      store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
-                    y_dist_ptr + blk_col);
-    }
-    y_src_ptr += y_src_stride;
-    y_pre_ptr += y_pre_stride;
-    y_dist_ptr += DIST_STRIDE;
-  }
-  // Same for both chroma planes, 8 pixels at a time
-  for (row = 0; row < chroma_height; row++) {
-    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
-      store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
-                   u_dist_ptr + blk_col);
-      store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
-                   v_dist_ptr + blk_col);
-    }
-
-    u_src_ptr += uv_src_stride;
-    u_pre_ptr += uv_pre_stride;
-    u_dist_ptr += DIST_STRIDE;
-    v_src_ptr += uv_src_stride;
-    v_pre_ptr += uv_pre_stride;
-    v_dist_ptr += DIST_STRIDE;
-  }
-  // Rewind the distortion pointers to the first stored row
-  y_dist_ptr = y_dist + 1;
-  u_dist_ptr = u_dist + 1;
-  v_dist_ptr = v_dist + 1;
-
-  apply_temporal_filter_luma(y_src, y_src_stride, y_pre, y_pre_stride, u_src,
-                             v_src, uv_src_stride, u_pre, v_pre, uv_pre_stride,
-                             block_width, block_height, ss_x, ss_y, strength,
-                             blk_fw_ptr, use_whole_blk, y_accum, y_count,
-                             y_dist_ptr, u_dist_ptr, v_dist_ptr);
-
-  apply_temporal_filter_chroma(
-      y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
-      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
-      strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count,
-      y_dist_ptr, u_dist_ptr, v_dist_ptr);
-}
-
-////////////////////////
-// Low bit-depth Ends //
-////////////////////////
-
-///////////////////////////
-// High bit-depth Begins //
-///////////////////////////
-
-// Store (a - b)**2 for 8 16-bit pixels as 8 32-bit values in dst.
-static INLINE void highbd_store_dist_8(const uint16_t *a, const uint16_t *b,
-                                       uint32_t *dst) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i a_reg = _mm_loadu_si128((const __m128i *)a);
-  const __m128i b_reg = _mm_loadu_si128((const __m128i *)b);
-  // Zero-extend the low (first) and high (second) four pixels to 32 bits.
-  const __m128i a_first = _mm_cvtepu16_epi32(a_reg);
-  const __m128i a_second = _mm_unpackhi_epi16(a_reg, zero);
-  const __m128i b_first = _mm_cvtepu16_epi32(b_reg);
-  const __m128i b_second = _mm_unpackhi_epi16(b_reg, zero);
-
-  __m128i dist_first, dist_second;
-  // Difference, then square; mullo keeps the low 32 bits of each product.
-  dist_first = _mm_sub_epi32(a_first, b_first);
-  dist_second = _mm_sub_epi32(a_second, b_second);
-  dist_first = _mm_mullo_epi32(dist_first, dist_first);
-  dist_second = _mm_mullo_epi32(dist_second, dist_second);
-
-  _mm_storeu_si128((__m128i *)dst, dist_first);
-  _mm_storeu_si128((__m128i *)(dst + 4), dist_second);
-}
-
-// For 4 pixels, sum each distortion with its left and right neighbors.
-static INLINE void highbd_get_sum_4(const uint32_t *dist, __m128i *sum) {
-  __m128i dist_reg, dist_left, dist_right;
-  // Three unaligned loads covering dist[-1..2], dist[0..3] and dist[1..4].
-  dist_reg = _mm_loadu_si128((const __m128i *)dist);
-  dist_left = _mm_loadu_si128((const __m128i *)(dist - 1));
-  dist_right = _mm_loadu_si128((const __m128i *)(dist + 1));
-
-  *sum = _mm_add_epi32(dist_reg, dist_left);
-  *sum = _mm_add_epi32(*sum, dist_right);
-}
-
-static INLINE void highbd_get_sum_8(const uint32_t *dist, __m128i *sum_first,
-                                    __m128i *sum_second) {
-  highbd_get_sum_4(dist, sum_first);  // neighbor sums for pixels 0-3
-  highbd_get_sum_4(dist + 4, sum_second);  // neighbor sums for pixels 4-7
-}
-
-// Average the value based on the number of values summed (9 for pixels away
-// from the border, 4 for pixels in corners, and 6 for other edge values, plus
-// however many values from y/uv plane are).
-//
-// Then add the rounding factor, shift right by strength, clamp to 16, invert
-// (16 - value) and multiply by weight. Handles 4 32-bit lanes at a time.
-static INLINE void highbd_average_4(__m128i *output, const __m128i *sum,
-                                    const __m128i *mul_constants,
-                                    const int strength, const int rounding,
-                                    const int weight) {
-  // _mm_srl_epi32 uses the lower 64 bit value for the shift.
-  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
-  const __m128i rounding_u32 = _mm_set1_epi32(rounding);
-  const __m128i weight_u32 = _mm_set1_epi32(weight);
-  const __m128i sixteen = _mm_set1_epi32(16);
-  const __m128i zero = _mm_setzero_si128();
-
-  // modifier * 3 / index, computed as (sum * mul_constant) >> 32 per lane
-  const __m128i sum_lo = _mm_unpacklo_epi32(*sum, zero);
-  const __m128i sum_hi = _mm_unpackhi_epi32(*sum, zero);
-  const __m128i const_lo = _mm_unpacklo_epi32(*mul_constants, zero);
-  const __m128i const_hi = _mm_unpackhi_epi32(*mul_constants, zero);
-
-  const __m128i mul_lo = _mm_mul_epu32(sum_lo, const_lo);
-  const __m128i mul_lo_div = _mm_srli_epi64(mul_lo, 32);
-  const __m128i mul_hi = _mm_mul_epu32(sum_hi, const_hi);
-  const __m128i mul_hi_div = _mm_srli_epi64(mul_hi, 32);
-
-  // Now we have
-  //   mul_lo_div: 00 a1 00 a0
-  //   mul_hi_div: 00 a3 00 a2
-  // Unpack as 64 bit words to get even and odd elements
-  //   mul_even: 00 a2 00 a0
-  //   mul_odd:  00 a3 00 a1
-  // Then we can shift and OR the results to get everything in 32-bits
-  const __m128i mul_even = _mm_unpacklo_epi64(mul_lo_div, mul_hi_div);
-  const __m128i mul_odd = _mm_unpackhi_epi64(mul_lo_div, mul_hi_div);
-  const __m128i mul_odd_shift = _mm_slli_si128(mul_odd, 4);
-  const __m128i mul = _mm_or_si128(mul_even, mul_odd_shift);
-
-  // Round: add the rounding term, then shift right by strength
-  *output = _mm_add_epi32(mul, rounding_u32);
-  *output = _mm_srl_epi32(*output, strength_u128);
-
-  // Clamp to 16, invert (16 - value) and multiply with the weight
-  *output = _mm_min_epu32(*output, sixteen);
-  *output = _mm_sub_epi32(sixteen, *output);
-  *output = _mm_mullo_epi32(*output, weight_u32);
-}
-
-static INLINE void highbd_average_8(__m128i *output_0, __m128i *output_1,
-                                    const __m128i *sum_0_u32,
-                                    const __m128i *sum_1_u32,
-                                    const __m128i *mul_constants_0,
-                                    const __m128i *mul_constants_1,
-                                    const int strength, const int rounding,
-                                    const int weight) {
-  highbd_average_4(output_0, sum_0_u32, mul_constants_0, strength, rounding,
-                   weight);  // first 4 lanes
-  highbd_average_4(output_1, sum_1_u32, mul_constants_1, strength, rounding,
-                   weight);  // second 4 lanes, same strength and weight
-}
-
-// Add the 8 weights to 'count'; add weight * 'pred' to 'accumulator'.
-static INLINE void highbd_accumulate_and_store_8(const __m128i sum_first_u32,
-                                                 const __m128i sum_second_u32,
-                                                 const uint16_t *pred,
-                                                 uint16_t *count,
-                                                 uint32_t *accumulator) {
-  // Cast down to 16-bit ints (pack with unsigned saturation)
-  const __m128i sum_u16 = _mm_packus_epi32(sum_first_u32, sum_second_u32);
-  const __m128i zero = _mm_setzero_si128();
-
-  __m128i pred_u16 = _mm_loadu_si128((const __m128i *)pred);
-  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
-
-  __m128i pred_0_u32, pred_1_u32;
-  __m128i accum_0_u32, accum_1_u32;
-  // Saturating add, so the 16-bit counts clamp instead of wrapping
-  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
-  _mm_storeu_si128((__m128i *)count, count_u16);
-  // NOTE(review): mullo_epi16 keeps the low 16 bits; confirm no overflow.
-  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);
-
-  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
-  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);
-
-  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
-  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
-
-  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
-  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
-
-  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
-  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
-}
-
-static INLINE void highbd_read_dist_4(const uint32_t *dist, __m128i *dist_reg) {
-  *dist_reg = _mm_loadu_si128((const __m128i *)dist);  // unaligned, 4 x u32
-}
-
-static INLINE void highbd_read_dist_8(const uint32_t *dist, __m128i *reg_first,
-                                      __m128i *reg_second) {
-  highbd_read_dist_4(dist, reg_first);  // dist[0..3]
-  highbd_read_dist_4(dist + 4, reg_second);  // dist[4..7]
-}
-
-static INLINE void highbd_read_chroma_dist_row_8(
-    int ss_x, const uint32_t *u_dist, const uint32_t *v_dist, __m128i *u_first,
-    __m128i *u_second, __m128i *v_first, __m128i *v_second) {
-  if (!ss_x) {
-    // If there is no chroma subsampling in the horizontal direction, then we
-    // need to load 8 entries from chroma.
-    highbd_read_dist_8(u_dist, u_first, u_second);
-    highbd_read_dist_8(v_dist, v_first, v_second);
-  } else {  // ss_x == 1
-    // Otherwise, we only need to load 4 entries and duplicate each of them
-    __m128i u_reg, v_reg;
-
-    highbd_read_dist_4(u_dist, &u_reg);
-
-    *u_first = _mm_unpacklo_epi32(u_reg, u_reg);
-    *u_second = _mm_unpackhi_epi32(u_reg, u_reg);
-
-    highbd_read_dist_4(v_dist, &v_reg);
-
-    *v_first = _mm_unpacklo_epi32(v_reg, v_reg);
-    *v_second = _mm_unpackhi_epi32(v_reg, v_reg);
-  }
-}
-// Filter an 8-wide luma column of block_height rows into y_accum/y_count.
-static void highbd_apply_temporal_filter_luma_8(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
-    uint16_t *y_count, const uint32_t *y_dist, const uint32_t *u_dist,
-    const uint32_t *v_dist, const uint32_t *const *neighbors_first,
-    const uint32_t *const *neighbors_second, int top_weight,
-    int bottom_weight) {
-  const int rounding = (1 << strength) >> 1;
-  int weight = top_weight;
-  // Per-lane multipliers from the neighbor tables (first/second 4 pixels)
-  __m128i mul_first, mul_second;
-  // Horizontal 3-tap luma sums for the previous, current and next row
-  __m128i sum_row_1_first, sum_row_1_second;
-  __m128i sum_row_2_first, sum_row_2_second;
-  __m128i sum_row_3_first, sum_row_3_second;
-  // Chroma distortions matching the current 8 luma pixels
-  __m128i u_first, u_second;
-  __m128i v_first, v_second;
-
-  __m128i sum_row_first;
-  __m128i sum_row_second;
-
-  // Loop variables
-  unsigned int h;
-
-  assert(strength >= 0 && strength <= 14 &&
-         "invalid adjusted temporal filter strength");
-  assert(block_width == 8);
-
-  (void)block_width;
-
-  // First row: no row above, so only the current and next rows are summed
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
-
-  // Add luma values
-  highbd_get_sum_8(y_dist, &sum_row_2_first, &sum_row_2_second);
-  highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
-
-  // We don't need to saturate here because the maximum value is UINT12_MAX ** 2
-  // * 9 ~= 2**24 * 9 < 2 ** 28 < INT32_MAX
-  sum_row_first = _mm_add_epi32(sum_row_2_first, sum_row_3_first);
-  sum_row_second = _mm_add_epi32(sum_row_2_second, sum_row_3_second);
-
-  // Add chroma values
-  highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
-                                &v_first, &v_second);
-
-  // Max value here is 2 ** 24 * (9 + 2), so no saturation is needed
-  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
-  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
-
-  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
-  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
-
-  // Get modifier and store result
-  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
-                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
-                   weight);
-
-  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
-                                y_accum);
-
-  y_src += y_src_stride;
-  y_pre += y_pre_stride;
-  y_count += y_pre_stride;
-  y_accum += y_pre_stride;
-  y_dist += DIST_STRIDE;
-
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
-  u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
-  v_dist += DIST_STRIDE;
-
-  // Then all the rows except the last one
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[1]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[1]);
-
-  for (h = 1; h < block_height - 1; ++h) {
-    // Move the weight to bottom half
-    if (!use_whole_blk && h == block_height / 2) {
-      weight = bottom_weight;
-    }
-    // Shift the rows up
-    sum_row_1_first = sum_row_2_first;
-    sum_row_1_second = sum_row_2_second;
-    sum_row_2_first = sum_row_3_first;
-    sum_row_2_second = sum_row_3_second;
-
-    // Add luma values to the modifier
-    sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
-    sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
-
-    highbd_get_sum_8(y_dist + DIST_STRIDE, &sum_row_3_first, &sum_row_3_second);
-
-    sum_row_first = _mm_add_epi32(sum_row_first, sum_row_3_first);
-    sum_row_second = _mm_add_epi32(sum_row_second, sum_row_3_second);
-
-    // Add chroma values to the modifier
-    if (ss_y == 0 || h % 2 == 0) {
-      // Only calculate the new chroma distortion if we are at a pixel that
-      // corresponds to a new chroma row
-      highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
-                                    &v_first, &v_second);
-
-      u_src += uv_src_stride;
-      u_pre += uv_pre_stride;
-      u_dist += DIST_STRIDE;
-      v_src += uv_src_stride;
-      v_pre += uv_pre_stride;
-      v_dist += DIST_STRIDE;
-    }
-
-    sum_row_first = _mm_add_epi32(sum_row_first, u_first);
-    sum_row_second = _mm_add_epi32(sum_row_second, u_second);
-    sum_row_first = _mm_add_epi32(sum_row_first, v_first);
-    sum_row_second = _mm_add_epi32(sum_row_second, v_second);
-
-    // Get modifier and store result
-    highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
-                     &sum_row_second, &mul_first, &mul_second, strength,
-                     rounding, weight);
-    highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
-                                  y_accum);
-
-    y_src += y_src_stride;
-    y_pre += y_pre_stride;
-    y_count += y_pre_stride;
-    y_accum += y_pre_stride;
-    y_dist += DIST_STRIDE;
-  }
-
-  // The last row: no row below, so only the previous and current rows are summed
-  mul_first = _mm_loadu_si128((const __m128i *)neighbors_first[0]);
-  mul_second = _mm_loadu_si128((const __m128i *)neighbors_second[0]);
-
-  // Shift the rows up
-  sum_row_1_first = sum_row_2_first;
-  sum_row_1_second = sum_row_2_second;
-  sum_row_2_first = sum_row_3_first;
-  sum_row_2_second = sum_row_3_second;
-
-  // Add luma values to the modifier
-  sum_row_first = _mm_add_epi32(sum_row_1_first, sum_row_2_first);
-  sum_row_second = _mm_add_epi32(sum_row_1_second, sum_row_2_second);
-
-  // Add chroma values to the modifier
-  if (ss_y == 0) {
-    // Without vertical subsampling the last row has its own chroma row;
-    // otherwise the values read for the previous luma row are reused
-    highbd_read_chroma_dist_row_8(ss_x, u_dist, v_dist, &u_first, &u_second,
-                                  &v_first, &v_second);
-  }
-
-  sum_row_first = _mm_add_epi32(sum_row_first, u_first);
-  sum_row_second = _mm_add_epi32(sum_row_second, u_second);
-  sum_row_first = _mm_add_epi32(sum_row_first, v_first);
-  sum_row_second = _mm_add_epi32(sum_row_second, v_second);
-
-  // Get modifier and store result
-  highbd_average_8(&sum_row_first, &sum_row_second, &sum_row_first,
-                   &sum_row_second, &mul_first, &mul_second, strength, rounding,
-                   weight);
-  highbd_accumulate_and_store_8(sum_row_first, sum_row_second, y_pre, y_count,
-                                y_accum);
-}
-
-// Perform temporal filter for the luma component, 8 columns at a time.
-static void highbd_apply_temporal_filter_luma(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *y_accum, uint16_t *y_count, const uint32_t *y_dist,
-    const uint32_t *u_dist, const uint32_t *v_dist) {
-  unsigned int blk_col = 0, uv_blk_col = 0;
-  const unsigned int blk_col_step = 8, uv_blk_col_step = 8 >> ss_x;
-  const unsigned int mid_width = block_width >> 1,
-                     last_width = block_width - blk_col_step;
-  int top_weight = blk_fw[0],
-      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
-  const uint32_t *const *neighbors_first;
-  const uint32_t *const *neighbors_second;
-
-  // Left: pixels 0-3 use the left-edge table, pixels 4-7 the middle table
-  neighbors_first = HIGHBD_LUMA_LEFT_COLUMN_NEIGHBORS;
-  neighbors_second = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
-  highbd_apply_temporal_filter_luma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
-      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
-      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-      neighbors_first, neighbors_second, top_weight, bottom_weight);
-
-  blk_col += blk_col_step;
-  uv_blk_col += uv_blk_col_step;
-
-  // Middle First: interior columns before the midpoint, middle table for all 8
-  neighbors_first = HIGHBD_LUMA_MIDDLE_COLUMN_NEIGHBORS;
-  for (; blk_col < mid_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    highbd_apply_temporal_filter_luma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
-        block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight);
-  }
-  // From the midpoint on, use the second pair of subblock weights
-  if (!use_whole_blk) {
-    top_weight = blk_fw[1];
-    bottom_weight = blk_fw[3];
-  }
-
-  // Middle Second: interior columns from the midpoint up to the last column
-  for (; blk_col < last_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    highbd_apply_temporal_filter_luma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, blk_col_step,
-        block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight);
-  }
-
-  // Right: pixels 0-3 keep the middle table, pixels 4-7 use the right-edge one
-  neighbors_second = HIGHBD_LUMA_RIGHT_COLUMN_NEIGHBORS;
-  highbd_apply_temporal_filter_luma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, blk_col_step, block_height, ss_x, ss_y,
-      strength, use_whole_blk, y_accum + blk_col, y_count + blk_col,
-      y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-      neighbors_first, neighbors_second, top_weight, bottom_weight);
-}
-
-// Add a row of luma distortion that corresponds to 8 chroma mods. If we are
-// subsampling in x direction, then we have 16 lumas, else we have 8.
-static INLINE void highbd_add_luma_dist_to_8_chroma_mod(
-    const uint32_t *y_dist, int ss_x, int ss_y, __m128i *u_mod_fst,
-    __m128i *u_mod_snd, __m128i *v_mod_fst, __m128i *v_mod_snd) {
-  __m128i y_reg_fst, y_reg_snd;
-  if (!ss_x) {
-    highbd_read_dist_8(y_dist, &y_reg_fst, &y_reg_snd);
-    if (ss_y == 1) {
-      __m128i y_tmp_fst, y_tmp_snd;
-      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
-      y_reg_fst = _mm_add_epi32(y_reg_fst, y_tmp_fst);
-      y_reg_snd = _mm_add_epi32(y_reg_snd, y_tmp_snd);
-    }
-  } else {
-    // Temporaries holding 8 luma distortions before the pairwise reduction
-    __m128i y_fst, y_snd;
-
-    // First 8 lumas (plus the row below when ss_y == 1)
-    highbd_read_dist_8(y_dist, &y_fst, &y_snd);
-    if (ss_y == 1) {
-      __m128i y_tmp_fst, y_tmp_snd;
-      highbd_read_dist_8(y_dist + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
-
-      y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
-      y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
-    }
-    // hadd sums adjacent pairs, turning 8 luma values into 4 chroma-sized ones
-    y_reg_fst = _mm_hadd_epi32(y_fst, y_snd);
-
-    // Second 8 lumas, reduced the same way
-    highbd_read_dist_8(y_dist + 8, &y_fst, &y_snd);
-    if (ss_y == 1) {
-      __m128i y_tmp_fst, y_tmp_snd;
-      highbd_read_dist_8(y_dist + 8 + DIST_STRIDE, &y_tmp_fst, &y_tmp_snd);
-
-      y_fst = _mm_add_epi32(y_fst, y_tmp_fst);
-      y_snd = _mm_add_epi32(y_snd, y_tmp_snd);
-    }
-
-    y_reg_snd = _mm_hadd_epi32(y_fst, y_snd);
-  }
-  // The same luma contribution is added to both the U and the V modifiers
-  *u_mod_fst = _mm_add_epi32(*u_mod_fst, y_reg_fst);
-  *u_mod_snd = _mm_add_epi32(*u_mod_snd, y_reg_snd);
-  *v_mod_fst = _mm_add_epi32(*v_mod_fst, y_reg_fst);
-  *v_mod_snd = _mm_add_epi32(*v_mod_snd, y_reg_snd);
-}
-
-// Apply temporal filter to the chroma components. This performs temporal
-// filtering on a chroma block of 8 X uv_height. If blk_fw is not NULL, use
-// blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
-// else use top_weight for top half, and bottom weight for bottom half.
-static void highbd_apply_temporal_filter_chroma_8(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int uv_block_width,
-    unsigned int uv_block_height, int ss_x, int ss_y, int strength,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
-    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist,
-    const uint32_t *const *neighbors_fst, const uint32_t *const *neighbors_snd,
-    int top_weight, int bottom_weight, const int *blk_fw) {
-  const int rounding = (1 << strength) >> 1;
-  int weight = top_weight;
-
-  __m128i mul_fst, mul_snd;
-
-  __m128i u_sum_row_1_fst, u_sum_row_2_fst, u_sum_row_3_fst;
-  __m128i v_sum_row_1_fst, v_sum_row_2_fst, v_sum_row_3_fst;
-  __m128i u_sum_row_1_snd, u_sum_row_2_snd, u_sum_row_3_snd;
-  __m128i v_sum_row_1_snd, v_sum_row_2_snd, v_sum_row_3_snd;
-
-  __m128i u_sum_row_fst, v_sum_row_fst;
-  __m128i u_sum_row_snd, v_sum_row_snd;
-
-  // Loop variable
-  unsigned int h;
-
-  (void)uv_block_width;
-
-  // First row
-  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]);
-  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]);
-
-  // Add chroma values
-  highbd_get_sum_8(u_dist, &u_sum_row_2_fst, &u_sum_row_2_snd);
-  highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
-
-  u_sum_row_fst = _mm_add_epi32(u_sum_row_2_fst, u_sum_row_3_fst);
-  u_sum_row_snd = _mm_add_epi32(u_sum_row_2_snd, u_sum_row_3_snd);
-
-  highbd_get_sum_8(v_dist, &v_sum_row_2_fst, &v_sum_row_2_snd);
-  highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
-
-  v_sum_row_fst = _mm_add_epi32(v_sum_row_2_fst, v_sum_row_3_fst);
-  v_sum_row_snd = _mm_add_epi32(v_sum_row_2_snd, v_sum_row_3_snd);
-
-  // Add luma values
-  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
-                                       &u_sum_row_snd, &v_sum_row_fst,
-                                       &v_sum_row_snd);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
-                     rounding, blk_fw[0]);
-    highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
-                     rounding, blk_fw[1]);
-
-    highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
-                     rounding, blk_fw[0]);
-    highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
-                     rounding, blk_fw[1]);
-
-  } else {
-    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
-                     &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                     weight);
-    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
-                     &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                     weight);
-  }
-  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
-                                u_accum);
-  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
-                                v_accum);
-
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
-  u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
-  v_dist += DIST_STRIDE;
-  u_count += uv_pre_stride;
-  u_accum += uv_pre_stride;
-  v_count += uv_pre_stride;
-  v_accum += uv_pre_stride;
-
-  y_src += y_src_stride * (1 + ss_y);
-  y_pre += y_pre_stride * (1 + ss_y);
-  y_dist += DIST_STRIDE * (1 + ss_y);
-
-  // Then all the rows except the last one
-  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[1]);
-  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[1]);
-
-  for (h = 1; h < uv_block_height - 1; ++h) {
-    // Move the weight pointer to the bottom half of the blocks
-    if (h == uv_block_height / 2) {
-      if (blk_fw) {
-        blk_fw += 2;
-      } else {
-        weight = bottom_weight;
-      }
-    }
-
-    // Shift the rows up
-    u_sum_row_1_fst = u_sum_row_2_fst;
-    u_sum_row_2_fst = u_sum_row_3_fst;
-    u_sum_row_1_snd = u_sum_row_2_snd;
-    u_sum_row_2_snd = u_sum_row_3_snd;
-
-    v_sum_row_1_fst = v_sum_row_2_fst;
-    v_sum_row_2_fst = v_sum_row_3_fst;
-    v_sum_row_1_snd = v_sum_row_2_snd;
-    v_sum_row_2_snd = v_sum_row_3_snd;
-
-    // Add chroma values
-    u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
-    u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
-    highbd_get_sum_8(u_dist + DIST_STRIDE, &u_sum_row_3_fst, &u_sum_row_3_snd);
-    u_sum_row_fst = _mm_add_epi32(u_sum_row_fst, u_sum_row_3_fst);
-    u_sum_row_snd = _mm_add_epi32(u_sum_row_snd, u_sum_row_3_snd);
-
-    v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
-    v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
-    highbd_get_sum_8(v_dist + DIST_STRIDE, &v_sum_row_3_fst, &v_sum_row_3_snd);
-    v_sum_row_fst = _mm_add_epi32(v_sum_row_fst, v_sum_row_3_fst);
-    v_sum_row_snd = _mm_add_epi32(v_sum_row_snd, v_sum_row_3_snd);
-
-    // Add luma values
-    highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
-                                         &u_sum_row_snd, &v_sum_row_fst,
-                                         &v_sum_row_snd);
-
-    // Get modifier and store result
-    if (blk_fw) {
-      highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
-                       rounding, blk_fw[0]);
-      highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
-                       rounding, blk_fw[1]);
-
-      highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
-                       rounding, blk_fw[0]);
-      highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
-                       rounding, blk_fw[1]);
-
-    } else {
-      highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
-                       &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                       weight);
-      highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
-                       &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                       weight);
-    }
-
-    highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
-                                  u_accum);
-    highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
-                                  v_accum);
-
-    u_src += uv_src_stride;
-    u_pre += uv_pre_stride;
-    u_dist += DIST_STRIDE;
-    v_src += uv_src_stride;
-    v_pre += uv_pre_stride;
-    v_dist += DIST_STRIDE;
-    u_count += uv_pre_stride;
-    u_accum += uv_pre_stride;
-    v_count += uv_pre_stride;
-    v_accum += uv_pre_stride;
-
-    y_src += y_src_stride * (1 + ss_y);
-    y_pre += y_pre_stride * (1 + ss_y);
-    y_dist += DIST_STRIDE * (1 + ss_y);
-  }
-
-  // The last row
-  mul_fst = _mm_loadu_si128((const __m128i *)neighbors_fst[0]);
-  mul_snd = _mm_loadu_si128((const __m128i *)neighbors_snd[0]);
-
-  // Shift the rows up
-  u_sum_row_1_fst = u_sum_row_2_fst;
-  u_sum_row_2_fst = u_sum_row_3_fst;
-  u_sum_row_1_snd = u_sum_row_2_snd;
-  u_sum_row_2_snd = u_sum_row_3_snd;
-
-  v_sum_row_1_fst = v_sum_row_2_fst;
-  v_sum_row_2_fst = v_sum_row_3_fst;
-  v_sum_row_1_snd = v_sum_row_2_snd;
-  v_sum_row_2_snd = v_sum_row_3_snd;
-
-  // Add chroma values
-  u_sum_row_fst = _mm_add_epi32(u_sum_row_1_fst, u_sum_row_2_fst);
-  v_sum_row_fst = _mm_add_epi32(v_sum_row_1_fst, v_sum_row_2_fst);
-  u_sum_row_snd = _mm_add_epi32(u_sum_row_1_snd, u_sum_row_2_snd);
-  v_sum_row_snd = _mm_add_epi32(v_sum_row_1_snd, v_sum_row_2_snd);
-
-  // Add luma values
-  highbd_add_luma_dist_to_8_chroma_mod(y_dist, ss_x, ss_y, &u_sum_row_fst,
-                                       &u_sum_row_snd, &v_sum_row_fst,
-                                       &v_sum_row_snd);
-
-  // Get modifier and store result
-  if (blk_fw) {
-    highbd_average_4(&u_sum_row_fst, &u_sum_row_fst, &mul_fst, strength,
-                     rounding, blk_fw[0]);
-    highbd_average_4(&u_sum_row_snd, &u_sum_row_snd, &mul_snd, strength,
-                     rounding, blk_fw[1]);
-
-    highbd_average_4(&v_sum_row_fst, &v_sum_row_fst, &mul_fst, strength,
-                     rounding, blk_fw[0]);
-    highbd_average_4(&v_sum_row_snd, &v_sum_row_snd, &mul_snd, strength,
-                     rounding, blk_fw[1]);
-
-  } else {
-    highbd_average_8(&u_sum_row_fst, &u_sum_row_snd, &u_sum_row_fst,
-                     &u_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                     weight);
-    highbd_average_8(&v_sum_row_fst, &v_sum_row_snd, &v_sum_row_fst,
-                     &v_sum_row_snd, &mul_fst, &mul_snd, strength, rounding,
-                     weight);
-  }
-
-  highbd_accumulate_and_store_8(u_sum_row_fst, u_sum_row_snd, u_pre, u_count,
-                                u_accum);
-  highbd_accumulate_and_store_8(v_sum_row_fst, v_sum_row_snd, v_pre, v_count,
-                                v_accum);
-}
-
-// Perform temporal filter for the chroma components.
-static void highbd_apply_temporal_filter_chroma(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
-    const uint32_t *y_dist, const uint32_t *u_dist, const uint32_t *v_dist) {
-  const unsigned int uv_width = block_width >> ss_x,
-                     uv_height = block_height >> ss_y;
-
-  unsigned int blk_col = 0, uv_blk_col = 0;
-  const unsigned int uv_blk_col_step = 8, blk_col_step = 8 << ss_x;
-  const unsigned int uv_mid_width = uv_width >> 1,
-                     uv_last_width = uv_width - uv_blk_col_step;
-  int top_weight = blk_fw[0],
-      bottom_weight = use_whole_blk ? blk_fw[0] : blk_fw[2];
-  const uint32_t *const *neighbors_fst;
-  const uint32_t *const *neighbors_snd;
-
-  if (uv_width == 8) {
-    // Special Case: We are subsampling in x direction on a 16x16 block. Since
-    // we are operating on a row of 8 chroma pixels, we can't use the usual
-    // left-middle-right pattern.
-    assert(ss_x);
-
-    if (ss_y) {
-      neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
-      neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
-    } else {
-      neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
-      neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
-    }
-
-    if (use_whole_blk) {
-      highbd_apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-          neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
-    } else {
-      highbd_apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-          neighbors_fst, neighbors_snd, 0, 0, blk_fw);
-    }
-
-    return;
-  }
-
-  // Left
-  if (ss_x && ss_y) {
-    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_LEFT_COLUMN_NEIGHBORS;
-    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_LEFT_COLUMN_NEIGHBORS;
-    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else {
-    neighbors_fst = HIGHBD_CHROMA_NO_SS_LEFT_COLUMN_NEIGHBORS;
-    neighbors_snd = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
-  }
-
-  highbd_apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
-      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
-      top_weight, bottom_weight, NULL);
-
-  blk_col += blk_col_step;
-  uv_blk_col += uv_blk_col_step;
-
-  // Middle First
-  if (ss_x && ss_y) {
-    neighbors_fst = HIGHBD_CHROMA_DOUBLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors_fst = HIGHBD_CHROMA_SINGLE_SS_MIDDLE_COLUMN_NEIGHBORS;
-  } else {
-    neighbors_fst = HIGHBD_CHROMA_NO_SS_MIDDLE_COLUMN_NEIGHBORS;
-  }
-
-  for (; uv_blk_col < uv_mid_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    highbd_apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
-  }
-
-  if (!use_whole_blk) {
-    top_weight = blk_fw[1];
-    bottom_weight = blk_fw[3];
-  }
-
-  // Middle Second
-  for (; uv_blk_col < uv_last_width;
-       blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
-    highbd_apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col,
-        neighbors_fst, neighbors_snd, top_weight, bottom_weight, NULL);
-  }
-
-  // Right
-  if (ss_x && ss_y) {
-    neighbors_snd = HIGHBD_CHROMA_DOUBLE_SS_RIGHT_COLUMN_NEIGHBORS;
-  } else if (ss_x || ss_y) {
-    neighbors_snd = HIGHBD_CHROMA_SINGLE_SS_RIGHT_COLUMN_NEIGHBORS;
-  } else {
-    neighbors_snd = HIGHBD_CHROMA_NO_SS_RIGHT_COLUMN_NEIGHBORS;
-  }
-
-  highbd_apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
-      v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
-      u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_fst, neighbors_snd,
-      top_weight, bottom_weight, NULL);
-}
-
-static void highbd_apply_temporal_filter_yuv(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int strength, const int use_subblock,
-    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
-    uint16_t *count) {
-  const int use_whole_blk = !use_subblock;
-  const int *blk_fw = subblock_filter_weights;
-
-  // Block information (Y-plane).
-  const unsigned int block_height = block_size_high[block_size];
-  const unsigned int block_width = block_size_wide[block_size];
-  const int mb_pels = block_height * block_width;
-  const int y_src_stride = ref_frame->y_stride;
-  const int y_pre_stride = block_width;
-  const int mb_y_src_offset =
-      mb_row * block_height * ref_frame->y_stride + mb_col * block_width;
-
-  // Block information (UV-plane).
-  const int ss_y = mbd->plane[1].subsampling_y;
-  const int ss_x = mbd->plane[1].subsampling_x;
-  const unsigned int uv_height = block_height >> ss_y;
-  const unsigned int uv_width = block_width >> ss_x;
-  const int uv_src_stride = ref_frame->uv_stride;
-  const int uv_pre_stride = block_width >> ss_x;
-  const int mb_uv_src_offset =
-      mb_row * uv_height * ref_frame->uv_stride + mb_col * uv_width;
-
-  const uint8_t *y_src = ref_frame->y_buffer + mb_y_src_offset;
-  const uint8_t *u_src = ref_frame->u_buffer + mb_uv_src_offset;
-  const uint8_t *v_src = ref_frame->v_buffer + mb_uv_src_offset;
-  const uint8_t *y_pre = pred;
-  const uint8_t *u_pre = pred + mb_pels;
-  const uint8_t *v_pre = pred + mb_pels * 2;
-  uint32_t *y_accum = accum;
-  uint32_t *u_accum = accum + mb_pels;
-  uint32_t *v_accum = accum + mb_pels * 2;
-  uint16_t *y_count = count;
-  uint16_t *u_count = count + mb_pels;
-  uint16_t *v_count = count + mb_pels * 2;
-
-  const unsigned int chroma_height = block_height >> ss_y,
-                     chroma_width = block_width >> ss_x;
-
-  DECLARE_ALIGNED(16, uint32_t, y_dist[BH * DIST_STRIDE]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, u_dist[BH * DIST_STRIDE]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, v_dist[BH * DIST_STRIDE]) = { 0 };
-
-  uint32_t *y_dist_ptr = y_dist + 1, *u_dist_ptr = u_dist + 1,
-           *v_dist_ptr = v_dist + 1;
-  const uint16_t *y_src_ptr = CONVERT_TO_SHORTPTR(y_src),
-                 *u_src_ptr = CONVERT_TO_SHORTPTR(u_src),
-                 *v_src_ptr = CONVERT_TO_SHORTPTR(v_src);
-  const uint16_t *y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre),
-                 *u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre),
-                 *v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre);
-
-  // Loop variables
-  unsigned int row, blk_col;
-
-  assert(block_width <= BW && "block width too large");
-  assert(block_height <= BH && "block height too large");
-  assert(block_width % 16 == 0 && "block width must be multiple of 16");
-  assert(block_height % 2 == 0 && "block height must be even");
-  assert((ss_x == 0 || ss_x == 1) && (ss_y == 0 || ss_y == 1) &&
-         "invalid chroma subsampling");
-  assert(strength >= 0 && strength <= 14 &&
-         "invalid adjusted temporal filter strength");
-  assert(blk_fw[0] >= 0 && "filter weight must be positive");
-  assert(
-      (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
-      "subblock filter weight must be positive");
-  assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2");
-  assert(
-      (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
-      "subblock filter weight must be less than 2");
-
-  // Precompute the difference squared
-  for (row = 0; row < block_height; row++) {
-    for (blk_col = 0; blk_col < block_width; blk_col += 8) {
-      highbd_store_dist_8(y_src_ptr + blk_col, y_pre_ptr + blk_col,
-                          y_dist_ptr + blk_col);
-    }
-    y_src_ptr += y_src_stride;
-    y_pre_ptr += y_pre_stride;
-    y_dist_ptr += DIST_STRIDE;
-  }
-
-  for (row = 0; row < chroma_height; row++) {
-    for (blk_col = 0; blk_col < chroma_width; blk_col += 8) {
-      highbd_store_dist_8(u_src_ptr + blk_col, u_pre_ptr + blk_col,
-                          u_dist_ptr + blk_col);
-      highbd_store_dist_8(v_src_ptr + blk_col, v_pre_ptr + blk_col,
-                          v_dist_ptr + blk_col);
-    }
-
-    u_src_ptr += uv_src_stride;
-    u_pre_ptr += uv_pre_stride;
-    u_dist_ptr += DIST_STRIDE;
-    v_src_ptr += uv_src_stride;
-    v_pre_ptr += uv_pre_stride;
-    v_dist_ptr += DIST_STRIDE;
-  }
-
-  y_src_ptr = CONVERT_TO_SHORTPTR(y_src),
-  u_src_ptr = CONVERT_TO_SHORTPTR(u_src),
-  v_src_ptr = CONVERT_TO_SHORTPTR(v_src);
-  y_pre_ptr = CONVERT_TO_SHORTPTR(y_pre),
-  u_pre_ptr = CONVERT_TO_SHORTPTR(u_pre),
-  v_pre_ptr = CONVERT_TO_SHORTPTR(v_pre);
-
-  y_dist_ptr = y_dist + 1;
-  u_dist_ptr = u_dist + 1;
-  v_dist_ptr = v_dist + 1;
-
-  highbd_apply_temporal_filter_luma(
-      y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr,
-      uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width,
-      block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, y_accum,
-      y_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
-
-  highbd_apply_temporal_filter_chroma(
-      y_src_ptr, y_src_stride, y_pre_ptr, y_pre_stride, u_src_ptr, v_src_ptr,
-      uv_src_stride, u_pre_ptr, v_pre_ptr, uv_pre_stride, block_width,
-      block_height, ss_x, ss_y, strength, blk_fw, use_whole_blk, u_accum,
-      u_count, v_accum, v_count, y_dist_ptr, u_dist_ptr, v_dist_ptr);
-}
-
-/////////////////////////
-// High bit-depth Ends //
-/////////////////////////
-
-void av1_apply_temporal_filter_yuv_sse4_1(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const int strength, const int use_subblock,
-    const int *subblock_filter_weights, const uint8_t *pred, uint32_t *accum,
-    uint16_t *count) {
-  const int is_high_bitdepth = ref_frame->flags & YV12_FLAG_HIGHBITDEPTH;
-  // TODO(any): Need to support when `num_planes != 3`, like C implementation.
-  assert(num_planes == 3);
-  (void)num_planes;
-  if (is_high_bitdepth) {
-    highbd_apply_temporal_filter_yuv(
-        ref_frame, mbd, block_size, mb_row, mb_col, strength, use_subblock,
-        subblock_filter_weights, pred, accum, count);
-  } else {
-    apply_temporal_filter_yuv(ref_frame, mbd, block_size, mb_row, mb_col,
-                              strength, use_subblock, subblock_filter_weights,
-                              pred, accum, count);
-  }
-}
diff --git a/test/temporal_filter_planewise_test.cc b/test/temporal_filter_test.cc
similarity index 73%
rename from test/temporal_filter_planewise_test.cc
rename to test/temporal_filter_test.cc
index c3f3e9e..060c370 100644
--- a/test/temporal_filter_planewise_test.cc
+++ b/test/temporal_filter_test.cc
@@ -37,22 +37,19 @@
 #if !CONFIG_REALTIME_ONLY
 namespace {
 
-typedef void (*TemporalFilterPlanewiseFunc)(
+typedef void (*TemporalFilterFunc)(
     const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
     const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const double *noise_level, const int use_subblock,
-    const int block_mse, const int *subblock_mses, const int q_factor,
-    const uint8_t *pred, uint32_t *accum, uint16_t *count);
-typedef libaom_test::FuncParam<TemporalFilterPlanewiseFunc>
-    TemporalFilterPlanewiseFuncParam;
+    const int num_planes, const double *noise_level, const int *subblock_mses,
+    const int q_factor, const uint8_t *pred, uint32_t *accum, uint16_t *count);
+typedef libaom_test::FuncParam<TemporalFilterFunc> TemporalFilterFuncParam;
 
-typedef std::tuple<TemporalFilterPlanewiseFuncParam, int>
-    TemporalFilterPlanewiseWithParam;
+typedef std::tuple<TemporalFilterFuncParam, int> TemporalFilterWithParam;
 
-class TemporalFilterPlanewiseTest
-    : public ::testing::TestWithParam<TemporalFilterPlanewiseWithParam> {
+class TemporalFilterTest
+    : public ::testing::TestWithParam<TemporalFilterWithParam> {
  public:
-  virtual ~TemporalFilterPlanewiseTest() {}
+  virtual ~TemporalFilterTest() {}
   virtual void SetUp() {
     params_ = GET_PARAM(0);
     rnd_.Reset(ACMRandom::DeterministicSeed());
@@ -90,14 +87,14 @@
   }
 
  protected:
-  TemporalFilterPlanewiseFuncParam params_;
+  TemporalFilterFuncParam params_;
   uint8_t *src1_;
   uint8_t *src2_;
   ACMRandom rnd_;
 };
 
-void TemporalFilterPlanewiseTest::RunTest(int isRandom, int width, int height,
-                                          int run_times) {
+void TemporalFilterTest::RunTest(int isRandom, int width, int height,
+                                 int run_times) {
   aom_usec_timer ref_timer, test_timer;
   for (int k = 0; k < 3; k++) {
     const int stride = width;
@@ -125,8 +122,6 @@
 
     assert(width == 32 && height == 32);
     const BLOCK_SIZE block_size = BLOCK_32X32;
-    const int use_subblock = 0;
-    const int block_mse = 20;
     const int subblock_mses[4] = { 15, 16, 17, 18 };
     const int q_factor = 12;
     const int mb_row = 0;
@@ -148,18 +143,18 @@
     mbd->bd = 8;
 
     params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                     sigma, use_subblock, block_mse, subblock_mses, q_factor,
-                     src2_, accumulator_ref, count_ref);
+                     sigma, subblock_mses, q_factor, src2_, accumulator_ref,
+                     count_ref);
     params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                     sigma, use_subblock, block_mse, subblock_mses, q_factor,
-                     src2_, accumulator_mod, count_mod);
+                     sigma, subblock_mses, q_factor, src2_, accumulator_mod,
+                     count_mod);
 
     if (run_times > 1) {
       aom_usec_timer_start(&ref_timer);
       for (int j = 0; j < run_times; j++) {
         params_.ref_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                         sigma, use_subblock, block_mse, subblock_mses,
-                         q_factor, src2_, accumulator_ref, count_ref);
+                         sigma, subblock_mses, q_factor, src2_, accumulator_ref,
+                         count_ref);
       }
       aom_usec_timer_mark(&ref_timer);
       const int elapsed_time_c =
@@ -168,8 +163,8 @@
       aom_usec_timer_start(&test_timer);
       for (int j = 0; j < run_times; j++) {
         params_.tst_func(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                         sigma, use_subblock, block_mse, subblock_mses,
-                         q_factor, src2_, accumulator_mod, count_mod);
+                         sigma, subblock_mses, q_factor, src2_, accumulator_mod,
+                         count_mod);
       }
       aom_usec_timer_mark(&test_timer);
       const int elapsed_time_simd =
@@ -200,41 +195,37 @@
   }
 }
 
-TEST_P(TemporalFilterPlanewiseTest, OperationCheck) {
+TEST_P(TemporalFilterTest, OperationCheck) {
   for (int height = 32; height <= 32; height = height * 2) {
     RunTest(1, height, height, 1);  // GenRandomData
   }
 }
 
-TEST_P(TemporalFilterPlanewiseTest, ExtremeValues) {
+TEST_P(TemporalFilterTest, ExtremeValues) {
   for (int height = 32; height <= 32; height = height * 2) {
     RunTest(0, height, height, 1);
   }
 }
 
-TEST_P(TemporalFilterPlanewiseTest, DISABLED_Speed) {
+TEST_P(TemporalFilterTest, DISABLED_Speed) {
   for (int height = 32; height <= 32; height = height * 2) {
     RunTest(1, height, height, 100000);
   }
 }
 
 #if HAVE_AVX2
-TemporalFilterPlanewiseFuncParam temporal_filter_planewise_test_avx2[] = {
-  TemporalFilterPlanewiseFuncParam(&av1_apply_temporal_filter_planewise_c,
-                                   &av1_apply_temporal_filter_planewise_avx2)
-};
-INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterPlanewiseTest,
-                         Combine(ValuesIn(temporal_filter_planewise_test_avx2),
+TemporalFilterFuncParam temporal_filter_test_avx2[] = { TemporalFilterFuncParam(
+    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_avx2) };
+INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterTest,
+                         Combine(ValuesIn(temporal_filter_test_avx2),
                                  Range(64, 65, 4)));
 #endif  // HAVE_AVX2
 
 #if HAVE_SSE2
-TemporalFilterPlanewiseFuncParam temporal_filter_planewise_test_sse2[] = {
-  TemporalFilterPlanewiseFuncParam(&av1_apply_temporal_filter_planewise_c,
-                                   &av1_apply_temporal_filter_planewise_sse2)
-};
-INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterPlanewiseTest,
-                         Combine(ValuesIn(temporal_filter_planewise_test_sse2),
+TemporalFilterFuncParam temporal_filter_test_sse2[] = { TemporalFilterFuncParam(
+    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_sse2) };
+INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest,
+                         Combine(ValuesIn(temporal_filter_test_sse2),
                                  Range(64, 65, 4)));
 #endif  // HAVE_SSE2
 
diff --git a/test/temporal_filter_yuv_test.cc b/test/temporal_filter_yuv_test.cc
deleted file mode 100644
index dc17aaa..0000000
--- a/test/temporal_filter_yuv_test.cc
+++ /dev/null
@@ -1,841 +0,0 @@
-/*
- * Copyright (c) 2019, Alliance for Open Media. All rights reserved
- *
- * This source code is subject to the terms of the BSD 2 Clause License and
- * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
- * was not distributed with this source code in the LICENSE file, you can
- * obtain it at www.aomedia.org/license/software. If the Alliance for Open
- * Media Patent License 1.0 was not distributed with this source code in the
- * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
- */
-
-#include <ostream>
-
-#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
-
-#include "config/av1_rtcd.h"
-#include "test/acm_random.h"
-#include "test/register_state_check.h"
-#include "aom_ports/aom_timer.h"
-#include "aom_ports/mem.h"
-
-namespace {
-
-using ::libaom_test::ACMRandom;
-
-const int MAX_WIDTH = 32;
-const int MAX_HEIGHT = 32;
-
-typedef void (*TemporalFilterYUVFunc)(
-    const YV12_BUFFER_CONFIG *ref_frame, const MACROBLOCKD *mbd,
-    const BLOCK_SIZE block_size, const int mb_row, const int mb_col,
-    const int num_planes, const int strength, const int use_subblock,
-    const int *blk_fw, const uint8_t *pred, uint32_t *accum, uint16_t *count);
-
-struct TemporalFilterWithBd {
-  TemporalFilterWithBd(TemporalFilterYUVFunc func, int bitdepth)
-      : temporal_filter(func), bd(bitdepth) {}
-
-  TemporalFilterYUVFunc temporal_filter;
-  int bd;
-};
-
-std::ostream &operator<<(std::ostream &os, const TemporalFilterWithBd &tf) {
-  return os << "Bitdepth: " << tf.bd;
-}
-
-int GetFilterWeight(unsigned int row, unsigned int col,
-                    unsigned int block_height, unsigned int block_width,
-                    const int *const blk_fw, int use_32x32) {
-  if (use_32x32) {
-    return blk_fw[0];
-  }
-
-  return blk_fw[2 * (row >= block_height / 2) + (col >= block_width / 2)];
-}
-
-template <typename PixelType>
-int GetModIndex(int sum_dist, int index, int rounding, int strength,
-                int filter_weight) {
-  int mod = sum_dist * 3 / index;
-  mod += rounding;
-  mod >>= strength;
-
-  mod = AOMMIN(16, mod);
-
-  mod = 16 - mod;
-  mod *= filter_weight;
-
-  return mod;
-}
-
-// Lowbitdepth version
-template <>
-int GetModIndex<uint8_t>(int sum_dist, int index, int rounding, int strength,
-                         int filter_weight) {
-  unsigned int index_mult[14] = { 0,     0,     0,     0,     49152,
-                                  39322, 32768, 28087, 24576, 21846,
-                                  19661, 17874, 0,     15124 };
-
-  assert(index >= 0 && index <= 13);
-  assert(index_mult[index] != 0);
-
-  int mod = (clamp(sum_dist, 0, UINT16_MAX) * index_mult[index]) >> 16;
-  mod += rounding;
-  mod >>= strength;
-
-  mod = AOMMIN(16, mod);
-
-  mod = 16 - mod;
-  mod *= filter_weight;
-
-  return mod;
-}
-
-// Highbitdepth version
-template <>
-int GetModIndex<uint16_t>(int sum_dist, int index, int rounding, int strength,
-                          int filter_weight) {
-  int64_t index_mult[14] = { 0U,          0U,          0U,          0U,
-                             3221225472U, 2576980378U, 2147483648U, 1840700270U,
-                             1610612736U, 1431655766U, 1288490189U, 1171354718U,
-                             0U,          991146300U };
-
-  assert(index >= 0 && index <= 13);
-  assert(index_mult[index] != 0);
-
-  int mod = static_cast<int>((sum_dist * index_mult[index]) >> 32);
-  mod += rounding;
-  mod >>= strength;
-
-  mod = AOMMIN(16, mod);
-
-  mod = 16 - mod;
-  mod *= filter_weight;
-
-  return mod;
-}
-
-template <typename PixelType>
-void SetArray(PixelType *pixel_array, int width, int height, int stride,
-              int val) {
-  for (int row = 0; row < height; row++) {
-    for (int col = 0; col < width; col++) {
-      pixel_array[col] = val;
-    }
-    pixel_array += stride;
-  }
-}
-
-template <typename PixelType>
-void SetArray(PixelType *pixel_array, int width, int height, int stride,
-              ACMRandom *rnd, int low_val, int high_val) {
-  EXPECT_LE(low_val, high_val);
-
-  for (int row = 0; row < height; row++) {
-    for (int col = 0; col < width; col++) {
-      const int val =
-          static_cast<int>((*rnd).PseudoUniform(high_val - low_val));
-      pixel_array[col] = low_val + val;
-    }
-    pixel_array += stride;
-  }
-}
-
-template <typename ValueType>
-bool CheckArrayEqual(const ValueType *arr_1, const ValueType *arr_2, int width,
-                     int height, int stride_1, int stride_2) {
-  for (int row = 0; row < height; row++) {
-    for (int col = 0; col < width; col++) {
-      if (arr_1[col] != arr_2[col]) {
-        return false;
-      }
-    }
-    arr_1 += stride_1;
-    arr_2 += stride_2;
-  }
-  return true;
-}
-
-template <typename ValueType>
-void PrintArrayDiff(const ValueType *arr_1, const ValueType *arr_2, int width,
-                    int height, int stride_1, int stride_2) {
-  const ValueType *arr_1_start = arr_1, *arr_2_start = arr_2;
-
-  printf("Array 1:\n");
-  for (int row = 0; row < height; ++row) {
-    for (int col = 0; col < width; ++col) {
-      if (arr_1[col] != arr_2[col]) {
-        printf("*%3d", arr_1[col]);
-      } else {
-        printf("%4d", arr_1[col]);
-      }
-    }
-    printf("\n");
-    arr_1 += stride_1;
-    arr_2 += stride_2;
-  }
-
-  arr_1 = arr_1_start;
-  arr_2 = arr_2_start;
-
-  printf("Array 2:\n");
-  for (int row = 0; row < height; ++row) {
-    for (int col = 0; col < width; ++col) {
-      if (arr_1[col] != arr_2[col]) {
-        printf("*%3d", arr_2[col]);
-      } else {
-        printf("%4d", arr_2[col]);
-      }
-    }
-    printf("\n");
-    arr_1 += stride_1;
-    arr_2 += stride_2;
-  }
-
-  arr_1 = arr_1_start;
-  arr_2 = arr_2_start;
-  printf("Difference:\n");
-  for (int row = 0; row < height; ++row) {
-    for (int col = 0; col < width; ++col) {
-      printf("%4d", arr_1[col] - arr_2[col]);
-    }
-    printf("\n");
-    arr_1 += stride_1;
-    arr_2 += stride_2;
-  }
-}
-
-template <typename PixelType>
-void ApplyReferenceFilter(const PixelType *y_src, const PixelType *y_pre,
-                          const PixelType *u_src, const PixelType *v_src,
-                          const PixelType *u_pre, const PixelType *v_pre,
-                          unsigned int block_width, unsigned int block_height,
-                          int ss_x, int ss_y, int strength,
-                          const int *const blk_fw, int use_32x32,
-                          uint32_t *y_accum, uint16_t *y_count,
-                          uint32_t *u_accum, uint16_t *u_count,
-                          uint32_t *v_accum, uint16_t *v_count) {
-  const int uv_block_width = block_width >> ss_x,
-            uv_block_height = block_height >> ss_y;
-  const int y_src_stride = block_width, y_pre_stride = block_width;
-  const int uv_src_stride = uv_block_width, uv_pre_stride = uv_block_width;
-  const int y_diff_stride = block_width, uv_diff_stride = uv_block_width;
-  const int y_count_stride = block_width, u_count_stride = uv_block_width,
-            v_count_stride = uv_block_width;
-  const int y_accum_stride = block_width, u_accum_stride = uv_block_width,
-            v_accum_stride = uv_block_width;
-
-  int y_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  int u_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  int v_dif[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-
-  const int rounding = (1 << strength) >> 1;
-
-  // Get the square diffs
-  for (int row = 0; row < (int)block_height; row++) {
-    for (int col = 0; col < (int)block_width; col++) {
-      const int diff =
-          y_src[row * y_src_stride + col] - y_pre[row * y_pre_stride + col];
-      y_dif[row * y_diff_stride + col] = diff * diff;
-    }
-  }
-
-  for (int row = 0; row < (int)uv_block_height; row++) {
-    for (int col = 0; col < (int)uv_block_width; col++) {
-      const int u_diff =
-          u_src[row * uv_src_stride + col] - u_pre[row * uv_pre_stride + col];
-      const int v_diff =
-          v_src[row * uv_src_stride + col] - v_pre[row * uv_pre_stride + col];
-      u_dif[row * uv_diff_stride + col] = u_diff * u_diff;
-      v_dif[row * uv_diff_stride + col] = v_diff * v_diff;
-    }
-  }
-
-  // Apply the filter to luma
-  for (int row = 0; row < (int)block_height; row++) {
-    for (int col = 0; col < (int)block_width; col++) {
-      const int uv_row = row >> ss_y;
-      const int uv_col = col >> ss_x;
-      const int filter_weight = GetFilterWeight(row, col, block_height,
-                                                block_width, blk_fw, use_32x32);
-
-      // First we get the modifier for the current y pixel
-      const int y_pixel = y_pre[row * y_pre_stride + col];
-      int y_num_used = 0;
-      int y_mod = 0;
-
-      // Sum the neighboring 3x3 y pixels
-      for (int row_step = -1; row_step <= 1; row_step++) {
-        for (int col_step = -1; col_step <= 1; col_step++) {
-          const int sub_row = row + row_step;
-          const int sub_col = col + col_step;
-
-          if (sub_row >= 0 && sub_row < (int)block_height && sub_col >= 0 &&
-              sub_col < (int)block_width) {
-            y_mod += y_dif[sub_row * y_diff_stride + sub_col];
-            y_num_used++;
-          }
-        }
-      }
-
-      // Sum the corresponding uv pixels to the current y modifier
-      // Note we are rounding down instead of rounding to the nearest pixel.
-      y_mod += u_dif[uv_row * uv_diff_stride + uv_col];
-      y_mod += v_dif[uv_row * uv_diff_stride + uv_col];
-
-      y_num_used += 2;
-
-      // Set the modifier
-      y_mod = GetModIndex<PixelType>(y_mod, y_num_used, rounding, strength,
-                                     filter_weight);
-
-      // Accumulate the result
-      y_count[row * y_count_stride + col] += y_mod;
-      y_accum[row * y_accum_stride + col] += y_mod * y_pixel;
-    }
-  }
-
-  // Apply the filter to chroma
-  for (int uv_row = 0; uv_row < (int)uv_block_height; uv_row++) {
-    for (int uv_col = 0; uv_col < (int)uv_block_width; uv_col++) {
-      const int y_row = uv_row << ss_y;
-      const int y_col = uv_col << ss_x;
-      const int filter_weight = GetFilterWeight(
-          uv_row, uv_col, uv_block_height, uv_block_width, blk_fw, use_32x32);
-
-      const int u_pixel = u_pre[uv_row * uv_pre_stride + uv_col];
-      const int v_pixel = v_pre[uv_row * uv_pre_stride + uv_col];
-
-      int uv_num_used = 0;
-      int u_mod = 0, v_mod = 0;
-
-      // Sum the neighboring 3x3 chromal pixels to the chroma modifier
-      for (int row_step = -1; row_step <= 1; row_step++) {
-        for (int col_step = -1; col_step <= 1; col_step++) {
-          const int sub_row = uv_row + row_step;
-          const int sub_col = uv_col + col_step;
-
-          if (sub_row >= 0 && sub_row < uv_block_height && sub_col >= 0 &&
-              sub_col < uv_block_width) {
-            u_mod += u_dif[sub_row * uv_diff_stride + sub_col];
-            v_mod += v_dif[sub_row * uv_diff_stride + sub_col];
-            uv_num_used++;
-          }
-        }
-      }
-
-      // Sum all the luma pixels associated with the current luma pixel
-      for (int row_step = 0; row_step < 1 + ss_y; row_step++) {
-        for (int col_step = 0; col_step < 1 + ss_x; col_step++) {
-          const int sub_row = y_row + row_step;
-          const int sub_col = y_col + col_step;
-          const int y_diff = y_dif[sub_row * y_diff_stride + sub_col];
-
-          u_mod += y_diff;
-          v_mod += y_diff;
-          uv_num_used++;
-        }
-      }
-
-      // Set the modifier
-      u_mod = GetModIndex<PixelType>(u_mod, uv_num_used, rounding, strength,
-                                     filter_weight);
-      v_mod = GetModIndex<PixelType>(v_mod, uv_num_used, rounding, strength,
-                                     filter_weight);
-
-      // Accumulate the result
-      u_count[uv_row * u_count_stride + uv_col] += u_mod;
-      u_accum[uv_row * u_accum_stride + uv_col] += u_mod * u_pixel;
-      v_count[uv_row * v_count_stride + uv_col] += v_mod;
-      v_accum[uv_row * v_accum_stride + uv_col] += v_mod * v_pixel;
-    }
-  }
-}
-
-class TemporalFilterYUVTest
-    : public ::testing::TestWithParam<TemporalFilterWithBd> {
- public:
-  virtual void SetUp() {
-    filter_func_ = GetParam().temporal_filter;
-    bd_ = GetParam().bd;
-    use_highbd_ = (bd_ != 8);
-
-    rnd_.Reset(ACMRandom::DeterministicSeed());
-    saturate_test_ = 0;
-    num_repeats_ = 10;
-
-    ASSERT_TRUE(bd_ == 8 || bd_ == 10 || bd_ == 12);
-  }
-
- protected:
-  template <typename PixelType>
-  void CompareTestWithParam(int width, int height, int ss_x, int ss_y,
-                            int filter_strength, int use_32x32,
-                            const int *filter_weight);
-  template <typename PixelType>
-  void RunTestFilterWithParam(int width, int height, int ss_x, int ss_y,
-                              int filter_strength, int use_32x32,
-                              const int *filter_weight);
-  template <typename PixelType>
-  void ApplyTestFilter(const PixelType *y_src, int y_src_stride,
-                       const PixelType *y_pre, int y_pre_stride,
-                       const PixelType *u_src, const PixelType *v_src,
-                       int uv_src_stride, const PixelType *u_pre,
-                       const PixelType *v_pre, int uv_pre_stride,
-                       unsigned int block_width, unsigned int block_height,
-                       int ss_x, int ss_y, int strength, const int *blk_fw,
-                       int use_32x32, uint32_t *y_accum, uint16_t *y_count,
-                       uint32_t *u_accumu, uint16_t *u_count, uint32_t *v_accum,
-                       uint16_t *v_count);
-
-  TemporalFilterYUVFunc filter_func_;
-  ACMRandom rnd_;
-  int saturate_test_;
-  int num_repeats_;
-  int use_highbd_;
-  int bd_;
-};
-
-template <>
-void TemporalFilterYUVTest::ApplyTestFilter<uint8_t>(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
-    uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
-    uint32_t *v_accum, uint16_t *v_count) {
-  (void)block_width;
-  (void)block_height;
-  (void)y_src_stride;
-  (void)uv_src_stride;
-
-  assert(block_width == MAX_WIDTH && MAX_WIDTH == 32);
-  assert(block_height == MAX_HEIGHT && MAX_HEIGHT == 32);
-  const BLOCK_SIZE block_size = BLOCK_32X32;
-  const int num_planes = 3;
-  const int mb_pels = MAX_WIDTH * MAX_HEIGHT;
-  const int mb_row = 0;
-  const int mb_col = 0;
-  const int use_subblock = !(use_32x32);
-
-  YV12_BUFFER_CONFIG *ref_frame =
-      (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG));
-  ref_frame->strides[0] = y_pre_stride;
-  ref_frame->strides[1] = uv_pre_stride;
-  const int alloc_size = MAX_MB_PLANE * mb_pels;
-  DECLARE_ALIGNED(16, uint8_t, src[alloc_size]);
-  ref_frame->buffer_alloc = src;
-  ref_frame->buffers[0] = ref_frame->buffer_alloc + 0 * mb_pels;
-  ref_frame->buffers[1] = ref_frame->buffer_alloc + 1 * mb_pels;
-  ref_frame->buffers[2] = ref_frame->buffer_alloc + 2 * mb_pels;
-  ref_frame->flags = bd_ > 8 ? YV12_FLAG_HIGHBITDEPTH : 0;
-
-  MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD));
-  mbd->plane[0].subsampling_y = 0;
-  mbd->plane[0].subsampling_x = 0;
-  mbd->plane[1].subsampling_y = ss_y;
-  mbd->plane[1].subsampling_x = ss_x;
-  mbd->plane[2].subsampling_y = ss_y;
-  mbd->plane[2].subsampling_x = ss_x;
-
-  DECLARE_ALIGNED(16, uint8_t, pred[alloc_size]);
-  DECLARE_ALIGNED(16, uint32_t, accum[alloc_size]);
-  DECLARE_ALIGNED(16, uint16_t, count[alloc_size]);
-  memcpy(src + 0 * mb_pels, y_src, mb_pels * sizeof(uint8_t));
-  memcpy(src + 1 * mb_pels, u_src, mb_pels * sizeof(uint8_t));
-  memcpy(src + 2 * mb_pels, v_src, mb_pels * sizeof(uint8_t));
-  memcpy(pred + 0 * mb_pels, y_pre, mb_pels * sizeof(uint8_t));
-  memcpy(pred + 1 * mb_pels, u_pre, mb_pels * sizeof(uint8_t));
-  memcpy(pred + 2 * mb_pels, v_pre, mb_pels * sizeof(uint8_t));
-  memcpy(accum + 0 * mb_pels, y_accum, mb_pels * sizeof(uint32_t));
-  memcpy(accum + 1 * mb_pels, u_accum, mb_pels * sizeof(uint32_t));
-  memcpy(accum + 2 * mb_pels, v_accum, mb_pels * sizeof(uint32_t));
-  memcpy(count + 0 * mb_pels, y_count, mb_pels * sizeof(uint16_t));
-  memcpy(count + 1 * mb_pels, u_count, mb_pels * sizeof(uint16_t));
-  memcpy(count + 2 * mb_pels, v_count, mb_pels * sizeof(uint16_t));
-
-  ASM_REGISTER_STATE_CHECK(
-      filter_func_(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                   strength, use_subblock, blk_fw, pred, accum, count));
-
-  memcpy(y_accum, accum + 0 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(u_accum, accum + 1 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(v_accum, accum + 2 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(y_count, count + 0 * mb_pels, mb_pels * sizeof(uint16_t));
-  memcpy(u_count, count + 1 * mb_pels, mb_pels * sizeof(uint16_t));
-  memcpy(v_count, count + 2 * mb_pels, mb_pels * sizeof(uint16_t));
-
-  free(ref_frame);
-  free(mbd);
-}
-
-template <>
-void TemporalFilterYUVTest::ApplyTestFilter<uint16_t>(
-    const uint16_t *y_src, int y_src_stride, const uint16_t *y_pre,
-    int y_pre_stride, const uint16_t *u_src, const uint16_t *v_src,
-    int uv_src_stride, const uint16_t *u_pre, const uint16_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_32x32,
-    uint32_t *y_accum, uint16_t *y_count, uint32_t *u_accum, uint16_t *u_count,
-    uint32_t *v_accum, uint16_t *v_count) {
-  (void)block_width;
-  (void)block_height;
-  (void)y_src_stride;
-  (void)uv_src_stride;
-
-  assert(block_width == MAX_WIDTH && MAX_WIDTH == 32);
-  assert(block_height == MAX_HEIGHT && MAX_HEIGHT == 32);
-  const BLOCK_SIZE block_size = BLOCK_32X32;
-  const int num_planes = 3;
-  const int mb_pels = MAX_WIDTH * MAX_HEIGHT;
-  const int mb_row = 0;
-  const int mb_col = 0;
-  const int use_subblock = !(use_32x32);
-
-  YV12_BUFFER_CONFIG *ref_frame =
-      (YV12_BUFFER_CONFIG *)malloc(sizeof(YV12_BUFFER_CONFIG));
-  ref_frame->strides[0] = y_pre_stride;
-  ref_frame->strides[1] = uv_pre_stride;
-  const int alloc_size = MAX_MB_PLANE * mb_pels;
-  DECLARE_ALIGNED(16, uint16_t, src16[alloc_size]);
-  ref_frame->buffer_alloc = CONVERT_TO_BYTEPTR(src16);
-  ref_frame->buffers[0] = ref_frame->buffer_alloc + 0 * mb_pels;
-  ref_frame->buffers[1] = ref_frame->buffer_alloc + 1 * mb_pels;
-  ref_frame->buffers[2] = ref_frame->buffer_alloc + 2 * mb_pels;
-  ref_frame->flags = bd_ > 8 ? YV12_FLAG_HIGHBITDEPTH : 0;
-
-  MACROBLOCKD *mbd = (MACROBLOCKD *)malloc(sizeof(MACROBLOCKD));
-  mbd->plane[0].subsampling_y = 0;
-  mbd->plane[0].subsampling_x = 0;
-  mbd->plane[1].subsampling_y = ss_y;
-  mbd->plane[1].subsampling_x = ss_x;
-  mbd->plane[2].subsampling_y = ss_y;
-  mbd->plane[2].subsampling_x = ss_x;
-
-  DECLARE_ALIGNED(16, uint16_t, pred16[alloc_size]);
-  DECLARE_ALIGNED(16, uint32_t, accum[alloc_size]);
-  DECLARE_ALIGNED(16, uint16_t, count[alloc_size]);
-  memcpy(src16 + 0 * mb_pels, y_src, mb_pels * sizeof(uint16_t));
-  memcpy(src16 + 1 * mb_pels, u_src, mb_pels * sizeof(uint16_t));
-  memcpy(src16 + 2 * mb_pels, v_src, mb_pels * sizeof(uint16_t));
-  memcpy(pred16 + 0 * mb_pels, y_pre, mb_pels * sizeof(uint16_t));
-  memcpy(pred16 + 1 * mb_pels, u_pre, mb_pels * sizeof(uint16_t));
-  memcpy(pred16 + 2 * mb_pels, v_pre, mb_pels * sizeof(uint16_t));
-  memcpy(accum + 0 * mb_pels, y_accum, mb_pels * sizeof(uint32_t));
-  memcpy(accum + 1 * mb_pels, u_accum, mb_pels * sizeof(uint32_t));
-  memcpy(accum + 2 * mb_pels, v_accum, mb_pels * sizeof(uint32_t));
-  memcpy(count + 0 * mb_pels, y_count, mb_pels * sizeof(uint16_t));
-  memcpy(count + 1 * mb_pels, u_count, mb_pels * sizeof(uint16_t));
-  memcpy(count + 2 * mb_pels, v_count, mb_pels * sizeof(uint16_t));
-  const uint8_t *pred = CONVERT_TO_BYTEPTR(pred16);
-
-  ASM_REGISTER_STATE_CHECK(
-      filter_func_(ref_frame, mbd, block_size, mb_row, mb_col, num_planes,
-                   strength, use_subblock, blk_fw, pred, accum, count));
-
-  memcpy(y_accum, accum + 0 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(u_accum, accum + 1 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(v_accum, accum + 2 * mb_pels, mb_pels * sizeof(uint32_t));
-  memcpy(y_count, count + 0 * mb_pels, mb_pels * sizeof(uint16_t));
-  memcpy(u_count, count + 1 * mb_pels, mb_pels * sizeof(uint16_t));
-  memcpy(v_count, count + 2 * mb_pels, mb_pels * sizeof(uint16_t));
-
-  free(ref_frame);
-  free(mbd);
-}
-
-template <typename PixelType>
-void TemporalFilterYUVTest::CompareTestWithParam(int width, int height,
-                                                 int ss_x, int ss_y,
-                                                 int filter_strength,
-                                                 int use_32x32,
-                                                 const int *filter_weight) {
-  const int uv_width = width >> ss_x, uv_height = height >> ss_y;
-  const int y_stride = width, uv_stride = uv_width;
-
-  DECLARE_ALIGNED(16, PixelType, y_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, PixelType, y_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, y_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, y_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, y_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, y_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-
-  DECLARE_ALIGNED(16, PixelType, u_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, PixelType, u_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, u_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, u_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, u_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, u_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-
-  DECLARE_ALIGNED(16, PixelType, v_src[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, PixelType, v_pre[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, v_count_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, v_accum_ref[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint16_t, v_count_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-  DECLARE_ALIGNED(16, uint32_t, v_accum_tst[MAX_WIDTH * MAX_HEIGHT]) = { 0 };
-
-  for (int repeats = 0; repeats < num_repeats_; repeats++) {
-    if (saturate_test_) {
-      const int max_val = (1 << bd_) - 1;
-      SetArray(y_src, width, height, y_stride, max_val);
-      SetArray(y_pre, width, height, y_stride, 0);
-      SetArray(u_src, uv_width, uv_height, uv_stride, max_val);
-      SetArray(u_pre, uv_width, uv_height, uv_stride, 0);
-      SetArray(v_src, uv_width, uv_height, uv_stride, max_val);
-      SetArray(v_pre, uv_width, uv_height, uv_stride, 0);
-    } else {
-      const int max_val = 7 << (bd_ - 8);
-      SetArray(y_src, width, height, y_stride, &rnd_, 0, max_val);
-      SetArray(y_pre, width, height, y_stride, &rnd_, 0, max_val);
-      SetArray(u_src, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
-      SetArray(u_pre, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
-      SetArray(v_src, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
-      SetArray(v_pre, uv_width, uv_height, uv_stride, &rnd_, 0, max_val);
-    }
-
-    ApplyReferenceFilter<PixelType>(
-        y_src, y_pre, u_src, v_src, u_pre, v_pre, width, height, ss_x, ss_y,
-        filter_strength, filter_weight, use_32x32, y_accum_ref, y_count_ref,
-        u_accum_ref, u_count_ref, v_accum_ref, v_count_ref);
-
-    ApplyTestFilter(y_src, y_stride, y_pre, y_stride, u_src, v_src, uv_stride,
-                    u_pre, v_pre, uv_stride, width, height, ss_x, ss_y,
-                    filter_strength, filter_weight, use_32x32, y_accum_tst,
-                    y_count_tst, u_accum_tst, u_count_tst, v_accum_tst,
-                    v_count_tst);
-
-    EXPECT_TRUE(CheckArrayEqual(y_accum_tst, y_accum_ref, width, height,
-                                y_stride, y_stride));
-    EXPECT_TRUE(CheckArrayEqual(y_count_tst, y_count_ref, width, height,
-                                y_stride, y_stride));
-    EXPECT_TRUE(CheckArrayEqual(u_accum_tst, u_accum_ref, uv_width, uv_height,
-                                uv_stride, uv_stride));
-    EXPECT_TRUE(CheckArrayEqual(u_count_tst, u_count_ref, uv_width, uv_height,
-                                uv_stride, uv_stride));
-    EXPECT_TRUE(CheckArrayEqual(v_accum_tst, v_accum_ref, uv_width, uv_height,
-                                uv_stride, uv_stride));
-    EXPECT_TRUE(CheckArrayEqual(v_count_tst, v_count_ref, uv_width, uv_height,
-                                uv_stride, uv_stride));
-
-    if (HasFailure()) {
-      if (use_32x32) {
-        printf("SS_X: %d, SS_Y: %d, Strength: %d, Weight: %d\n", ss_x, ss_y,
-               filter_strength, *filter_weight);
-      } else {
-        printf("SS_X: %d, SS_Y: %d, Strength: %d, Weights: %d,%d,%d,%d\n", ss_x,
-               ss_y, filter_strength, filter_weight[0], filter_weight[1],
-               filter_weight[2], filter_weight[3]);
-      }
-
-      PrintArrayDiff(y_accum_ref, y_accum_tst, width, height, y_stride,
-                     y_stride);
-      PrintArrayDiff(y_count_ref, y_count_tst, width, height, y_stride,
-                     y_stride);
-      PrintArrayDiff(u_accum_ref, v_accum_tst, uv_width, uv_height, uv_stride,
-                     uv_stride);
-      PrintArrayDiff(u_count_ref, v_count_tst, uv_width, uv_height, uv_stride,
-                     uv_stride);
-      PrintArrayDiff(u_accum_ref, v_accum_tst, uv_width, uv_height, uv_stride,
-                     uv_stride);
-      PrintArrayDiff(u_count_ref, v_count_tst, uv_width, uv_height, uv_stride,
-                     uv_stride);
-
-      return;
-    }
-  }
-}
-
-template <typename PixelType>
-void TemporalFilterYUVTest::RunTestFilterWithParam(int width, int height,
-                                                   int ss_x, int ss_y,
-                                                   int filter_strength,
-                                                   int use_32x32,
-                                                   const int *filter_weight) {
-  PixelType y_src[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  PixelType y_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint16_t y_count[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint32_t y_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-
-  PixelType u_src[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  PixelType u_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint16_t u_count[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint32_t u_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-
-  PixelType v_src[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  PixelType v_pre[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint16_t v_count[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-  uint32_t v_accum[MAX_WIDTH * MAX_HEIGHT] = { 0 };
-
-  SetArray(y_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-  SetArray(y_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-  SetArray(u_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-  SetArray(u_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-  SetArray(v_src, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-  SetArray(v_pre, width, height, MAX_WIDTH, &rnd_, 0, 7 << (bd_ = 8));
-
-  for (int repeats = 0; repeats < num_repeats_; repeats++) {
-    ApplyTestFilter(y_src, MAX_WIDTH, y_pre, MAX_WIDTH, u_src, v_src, MAX_WIDTH,
-                    u_pre, v_pre, MAX_WIDTH, width, height, ss_x, ss_y,
-                    filter_strength, filter_weight, use_32x32, y_accum, y_count,
-                    u_accum, u_count, v_accum, v_count);
-  }
-}
-
-TEST_P(TemporalFilterYUVTest, Use32x32) {
-  const int width = 32, height = 32;
-  const int use_32x32 = 1;
-
-  for (int ss_x = 0; ss_x <= 1; ss_x++) {
-    for (int ss_y = 0; ss_y <= 1; ss_y++) {
-      for (int filter_strength = 0; filter_strength <= 6;
-           filter_strength += 2) {
-        for (int filter_weight = 0; filter_weight <= 2; filter_weight++) {
-          if (use_highbd_) {
-            const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
-            CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
-                                           adjusted_strength, use_32x32,
-                                           &filter_weight);
-          } else {
-            CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
-                                          filter_strength, use_32x32,
-                                          &filter_weight);
-          }
-          ASSERT_FALSE(HasFailure());
-        }
-      }
-    }
-  }
-}
-
-TEST_P(TemporalFilterYUVTest, Use16x16) {
-  const int width = 32, height = 32;
-  const int use_32x32 = 0;
-
-  for (int ss_x = 0; ss_x <= 1; ss_x++) {
-    for (int ss_y = 0; ss_y <= 1; ss_y++) {
-      for (int filter_idx = 0; filter_idx < 3 * 3 * 3 * 3; filter_idx++) {
-        // Set up the filter
-        int filter_weight[4];
-        int filter_idx_cp = filter_idx;
-        for (int idx = 0; idx < 4; idx++) {
-          filter_weight[idx] = filter_idx_cp % 3;
-          filter_idx_cp /= 3;
-        }
-
-        // Test each parameter
-        for (int filter_strength = 0; filter_strength <= 6;
-             filter_strength += 2) {
-          if (use_highbd_) {
-            const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
-            CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
-                                           adjusted_strength, use_32x32,
-                                           filter_weight);
-          } else {
-            CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
-                                          filter_strength, use_32x32,
-                                          filter_weight);
-          }
-
-          ASSERT_FALSE(HasFailure());
-        }
-      }
-    }
-  }
-}
-
-TEST_P(TemporalFilterYUVTest, SaturationTest) {
-  const int width = 32, height = 32;
-  const int use_32x32 = 1;
-  const int filter_weight = 1;
-  saturate_test_ = 1;
-
-  for (int ss_x = 0; ss_x <= 1; ss_x++) {
-    for (int ss_y = 0; ss_y <= 1; ss_y++) {
-      for (int filter_strength = 0; filter_strength <= 6;
-           filter_strength += 2) {
-        if (use_highbd_) {
-          const int adjusted_strength = filter_strength + 2 * (bd_ - 8);
-          CompareTestWithParam<uint16_t>(width, height, ss_x, ss_y,
-                                         adjusted_strength, use_32x32,
-                                         &filter_weight);
-        } else {
-          CompareTestWithParam<uint8_t>(width, height, ss_x, ss_y,
-                                        filter_strength, use_32x32,
-                                        &filter_weight);
-        }
-
-        ASSERT_FALSE(HasFailure());
-      }
-    }
-  }
-}
-
-TEST_P(TemporalFilterYUVTest, DISABLED_Speed) {
-  const int width = 32, height = 32;
-  num_repeats_ = 1000;
-
-  for (int use_32x32 = 0; use_32x32 <= 1; use_32x32++) {
-    const int num_filter_weights = use_32x32 ? 3 : 3 * 3 * 3 * 3;
-    for (int ss_x = 0; ss_x <= 1; ss_x++) {
-      for (int ss_y = 0; ss_y <= 1; ss_y++) {
-        for (int filter_idx = 0; filter_idx < num_filter_weights;
-             filter_idx++) {
-          // Set up the filter
-          int filter_weight[4];
-          int filter_idx_cp = filter_idx;
-          for (int idx = 0; idx < 4; idx++) {
-            filter_weight[idx] = filter_idx_cp % 3;
-            filter_idx_cp /= 3;
-          }
-
-          // Test each parameter
-          for (int filter_strength = 0; filter_strength <= 6;
-               filter_strength += 2) {
-            aom_usec_timer timer;
-            aom_usec_timer_start(&timer);
-
-            if (use_highbd_) {
-              RunTestFilterWithParam<uint16_t>(width, height, ss_x, ss_y,
-                                               filter_strength, use_32x32,
-                                               filter_weight);
-            } else {
-              RunTestFilterWithParam<uint8_t>(width, height, ss_x, ss_y,
-                                              filter_strength, use_32x32,
-                                              filter_weight);
-            }
-
-            aom_usec_timer_mark(&timer);
-            const int elapsed_time =
-                static_cast<int>(aom_usec_timer_elapsed(&timer));
-
-            printf(
-                "Bitdepth: %d, Use 32X32: %d, SS_X: %d, SS_Y: %d, Weight Idx: "
-                "%d, Strength: %d, Time: %5d\n",
-                bd_, use_32x32, ss_x, ss_y, filter_idx, filter_strength,
-                elapsed_time);
-          }
-        }
-      }
-    }
-  }
-}
-
-INSTANTIATE_TEST_SUITE_P(
-    C, TemporalFilterYUVTest,
-    ::testing::Values(
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 8),
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 10),
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_c, 12)));
-
-#if HAVE_SSE4_1
-INSTANTIATE_TEST_SUITE_P(
-    SSE4_1, TemporalFilterYUVTest,
-    ::testing::Values(
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 8),
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 10),
-        TemporalFilterWithBd(&av1_apply_temporal_filter_yuv_sse4_1, 12)));
-#endif  // HAVE_SSE4_1
-
-}  // namespace
diff --git a/test/test.cmake b/test/test.cmake
index d4d3b29..6cfb196 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -131,12 +131,10 @@
                 "${AOM_ROOT}/test/segment_binarization_sync.cc"
                 "${AOM_ROOT}/test/superframe_test.cc"
                 "${AOM_ROOT}/test/tile_independence_test.cc"
-                "${AOM_ROOT}/test/temporal_filter_planewise_test.cc"
-                "${AOM_ROOT}/test/temporal_filter_yuv_test.cc")
+                "${AOM_ROOT}/test/temporal_filter_test.cc")
     if(CONFIG_REALTIME_ONLY)
       list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES
-                       "${AOM_ROOT}/test/cnn_test.cc"
-                       "${AOM_ROOT}/test/temporal_filter_yuv_test.cc")
+                       "${AOM_ROOT}/test/cnn_test.cc")
     endif()
     if(NOT CONFIG_AV1_HIGHBITDEPTH)
       list(REMOVE_ITEM AOM_UNIT_TEST_COMMON_SOURCES