Temporal filter improvement The hierarchical TF block sizes are added for better temporal filter performance. For HD set, this gives coding gain: -0.4% SSIM, -0.07% PSNR, -0.21% VMAF/VMAF_neg. SIMD for 64x64 block TF function will be added. STATS_CHANGED Change-Id: Id7628b6105f19fff0ac16491ad36b9ea5009df01

diff --git a/av1/encoder/temporal_filter.c b/av1/encoder/temporal_filter.c
index 4437110..58dbf70 100644
--- a/av1/encoder/temporal_filter.c
+++ b/av1/encoder/temporal_filter.c

@@ -48,8 +48,90 @@
 
 // Forward Declaration.
 static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+                                         const MV *midblock_mvs,
+                                         const int *midblock_mses,
                                          MV *subblock_mvs, int *subblock_mses);
 
+// Do motion search for 1 subblock. The block size can be 32x32 or 16x16.
+static void subblock_motion_search(
+    AV1_COMP *cpi, MACROBLOCK *mb, const YV12_BUFFER_CONFIG *frame_to_filter,
+    const YV12_BUFFER_CONFIG *ref_frame, BLOCK_SIZE tf_block_size, int mb_row,
+    int mb_col, BLOCK_SIZE subblock_size, int idx, int subblock_ofst_i,
+    int subblock_ofst_j, FULLPEL_MOTION_SEARCH_PARAMS *full_ms_params,
+    SUBPEL_MOTION_SEARCH_PARAMS *ms_params, const MV *ref_mv,
+    FULLPEL_MV start_mv, const search_site_config *search_site_cfg,
+    SEARCH_METHODS search_method, MV_COST_TYPE mv_cost_type, int step_param,
+    SUBPEL_SEARCH_TYPE subpel_ms_type, MV *subblock_mvs, int *subblock_mses,
+    FULLPEL_MV_STATS *best_mv_stats, int q) {
+  MACROBLOCKD *const mbd = &mb->e_mbd;
+
+  const int mb_height = block_size_high[tf_block_size];
+  const int mb_width = block_size_wide[tf_block_size];
+  const int mi_h = mi_size_high_log2[tf_block_size];
+  const int mi_w = mi_size_wide_log2[tf_block_size];
+
+  const int y_stride = frame_to_filter->y_stride;
+  assert(y_stride == ref_frame->y_stride);
+  const int y_offset = mb_row * mb_height * y_stride + mb_col * mb_width;
+
+  const int subblock_height = block_size_high[subblock_size];
+  const int subblock_width = block_size_wide[subblock_size];
+  const int subblock_pels = subblock_height * subblock_width;
+
+  int_mv best_mv;  // Searched motion vector.
+  unsigned int sse, error;
+  int distortion;
+  int cost_list[5];
+
+  av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
+                        (mb_row << mi_h) + (subblock_ofst_i >> MI_SIZE_LOG2),
+                        (subblock_height >> MI_SIZE_LOG2),
+                        cpi->oxcf.border_in_pixels);
+  av1_set_mv_col_limits(&cpi->common.mi_params, &mb->mv_limits,
+                        (mb_col << mi_w) + (subblock_ofst_j >> MI_SIZE_LOG2),
+                        (subblock_width >> MI_SIZE_LOG2),
+                        cpi->oxcf.border_in_pixels);
+  const int boffset = subblock_ofst_i * y_stride + subblock_ofst_j;
+  mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + boffset;
+  mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + boffset;
+  av1_make_default_fullpel_ms_params(full_ms_params, cpi, mb, subblock_size,
+                                     ref_mv, start_mv, search_site_cfg,
+                                     search_method,
+                                     /*fine_search_interval=*/0);
+  full_ms_params->run_mesh_search = 1;
+  full_ms_params->mv_cost_params.mv_cost_type = mv_cost_type;
+
+  // May need to re-tune prune_mesh_search after increase TF
+  // block to 64x64.
+  if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
+    // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
+    full_ms_params->prune_mesh_search = (q <= 20) ? 0 : 1;
+    full_ms_params->mesh_search_mv_diff_threshold = 2;
+  }
+
+  av1_full_pixel_search(start_mv, full_ms_params, step_param,
+                        cond_cost_list(cpi, cost_list), &best_mv.as_fullmv,
+                        best_mv_stats, NULL);
+
+  av1_make_default_subpel_ms_params(ms_params, cpi, mb, subblock_size, ref_mv,
+                                    cost_list);
+  ms_params->forced_stop = EIGHTH_PEL;
+  ms_params->var_params.subpel_search_type = subpel_ms_type;
+  // Since we are merely refining the result from full pixel
+  // search, we don't need regularization for subpel search
+  ms_params->mv_cost_params.mv_cost_type = MV_COST_NONE;
+  best_mv_stats->err_cost = 0;
+
+  MV subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
+  assert(av1_is_subpelmv_in_range(&ms_params->mv_limits, subpel_start_mv));
+  error = cpi->mv_search_params.find_fractional_mv_step(
+      &mb->e_mbd, &cpi->common, ms_params, subpel_start_mv, best_mv_stats,
+      &best_mv.as_mv, &distortion, &sse, NULL);
+
+  subblock_mses[idx] = DIVIDE_AND_ROUND(error, subblock_pels);
+  subblock_mvs[idx] = best_mv.as_mv;
+}
+
 // This function returns the minimum and maximum log variances for 4x4 sub
 // blocks in the current block.
 static inline void get_log_var_4x4sub_blk(
@@ -236,6 +318,10 @@
   FULLPEL_MV_STATS best_mv_stats;
   int block_mse = INT_MAX;
   MV block_mv = kZeroMv;
+  // 32x32 block motion search results.
+  int midblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+  MV midblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
+
   const int q = get_q(cpi);
 
   av1_make_default_fullpel_ms_params(&full_ms_params, cpi, mb, block_size,
@@ -289,54 +375,49 @@
     *is_dc_diff_large = 50 * error < sse;
     if (src_var <= 2 * (int64_t)distortion) *is_low_cntras = 1;
 
+    // On 4 mid-blocks in the 64x64 tf block.
     if (allow_me_for_sub_blks) {
-      // On 4 sub-blocks.
-      const BLOCK_SIZE subblock_size = av1_ss_size_lookup[block_size][1][1];
-      const int subblock_height = block_size_high[subblock_size];
-      const int subblock_width = block_size_wide[subblock_size];
-      const int subblock_pels = subblock_height * subblock_width;
-      start_mv = get_fullmv_from_mv(ref_mv);
+      // midblock_size is 32x32, which corresponds to each 32x32 block in the
+      // 64x64 block.
+      const BLOCK_SIZE midblock_size = av1_ss_size_lookup[block_size][1][1];
+      const int midblock_height = block_size_high[midblock_size];
+      const int midblock_width = block_size_wide[midblock_size];
+      int midblock_idx = 0;
+      for (int i = 0; i < mb_height; i += midblock_height) {
+        for (int j = 0; j < mb_width; j += midblock_width) {
+          start_mv = get_fullmv_from_mv(ref_mv);
 
-      int subblock_idx = 0;
-      for (int i = 0; i < mb_height; i += subblock_height) {
-        for (int j = 0; j < mb_width; j += subblock_width) {
-          const int offset = i * y_stride + j;
-          mb->plane[0].src.buf = frame_to_filter->y_buffer + y_offset + offset;
-          mbd->plane[0].pre[0].buf = ref_frame->y_buffer + y_offset + offset;
-          av1_make_default_fullpel_ms_params(
-              &full_ms_params, cpi, mb, subblock_size, &baseline_mv, start_mv,
-              search_site_cfg, search_method,
-              /*fine_search_interval=*/0);
-          full_ms_params.run_mesh_search = 1;
-          full_ms_params.mv_cost_params.mv_cost_type = mv_cost_type;
+          subblock_motion_search(
+              cpi, mb, frame_to_filter, ref_frame, block_size, mb_row, mb_col,
+              midblock_size, midblock_idx, i, j, &full_ms_params, &ms_params,
+              &baseline_mv, start_mv, search_site_cfg, search_method,
+              mv_cost_type, step_param, subpel_search_type, midblock_mvs,
+              midblock_mses, &best_mv_stats, q);
 
-          if (cpi->sf.mv_sf.prune_mesh_search == PRUNE_MESH_SEARCH_LVL_1) {
-            // Enable prune_mesh_search based on q for PRUNE_MESH_SEARCH_LVL_1.
-            full_ms_params.prune_mesh_search = (q <= 20) ? 0 : 1;
-            full_ms_params.mesh_search_mv_diff_threshold = 2;
+          // On 4 sub-blocks in 1 mid-block.
+          {
+            const BLOCK_SIZE subblock_size =
+                av1_ss_size_lookup[midblock_size][1][1];
+            const int subblock_height = block_size_high[subblock_size];
+            const int subblock_width = block_size_wide[subblock_size];
+
+            start_mv = get_fullmv_from_mv(&midblock_mvs[midblock_idx]);
+
+            int bidx = midblock_idx * 4;
+            for (int bi = 0; bi < midblock_height; bi += subblock_height) {
+              for (int bj = 0; bj < midblock_width; bj += subblock_width) {
+                subblock_motion_search(
+                    cpi, mb, frame_to_filter, ref_frame, block_size, mb_row,
+                    mb_col, subblock_size, bidx, (i + bi), (j + bj),
+                    &full_ms_params, &ms_params, &baseline_mv, start_mv,
+                    search_site_cfg, search_method, mv_cost_type, step_param,
+                    subpel_search_type, subblock_mvs, subblock_mses,
+                    &best_mv_stats, q);
+                ++bidx;
+              }
+            }
           }
-          av1_full_pixel_search(start_mv, &full_ms_params, step_param,
-                                cond_cost_list(cpi, cost_list),
-                                &best_mv.as_fullmv, &best_mv_stats, NULL);
-
-          av1_make_default_subpel_ms_params(&ms_params, cpi, mb, subblock_size,
-                                            &baseline_mv, cost_list);
-          ms_params.forced_stop = EIGHTH_PEL;
-          ms_params.var_params.subpel_search_type = subpel_search_type;
-          // Since we are merely refining the result from full pixel search, we
-          // don't need regularization for subpel search
-          ms_params.mv_cost_params.mv_cost_type = MV_COST_NONE;
-          best_mv_stats.err_cost = 0;
-
-          subpel_start_mv = get_mv_from_fullmv(&best_mv.as_fullmv);
-          assert(
-              av1_is_subpelmv_in_range(&ms_params.mv_limits, subpel_start_mv));
-          error = cpi->mv_search_params.find_fractional_mv_step(
-              &mb->e_mbd, &cpi->common, &ms_params, subpel_start_mv,
-              &best_mv_stats, &best_mv.as_mv, &distortion, &sse, NULL);
-          subblock_mses[subblock_idx] = DIVIDE_AND_ROUND(error, subblock_pels);
-          subblock_mvs[subblock_idx] = best_mv.as_mv;
-          ++subblock_idx;
+          ++midblock_idx;
         }
       }
     }
@@ -348,11 +429,11 @@
 
   // Make partition decision.
   if (allow_me_for_sub_blks) {
-    tf_determine_block_partition(block_mv, block_mse, subblock_mvs,
-                                 subblock_mses);
+    tf_determine_block_partition(block_mv, block_mse, midblock_mvs,
+                                 midblock_mses, subblock_mvs, subblock_mses);
   } else {
-    // Copy 32X32 block mv and mse values to sub blocks
-    for (int i = 0; i < 4; ++i) {
+    // Copy 64X64 block mv and mse values to sub blocks
+    for (int i = 0; i < 16; ++i) {
       subblock_mvs[i] = block_mv;
       subblock_mses[i] = block_mse;
     }
@@ -372,6 +453,8 @@
 //   block_mv: Motion vector for the entire block (ONLY as reference).
 //   block_mse: Motion search error (MSE) for the entire block (ONLY as
 //              reference).
+//   midblock_mvs: Pointer to the motion vectors for 4 mid-blocks.
+//   midblock_mses: Pointer to the search errors (MSE) for 4 mid-blocks.
 //   subblock_mvs: Pointer to the motion vectors for 4 sub-blocks (will be
 //                 modified based on the partition decision).
 //   subblock_mses: Pointer to the search errors (MSE) for 4 sub-blocks (will
@@ -380,23 +463,53 @@
 //   Nothing will be returned. Results are saved in `subblock_mvs` and
 //   `subblock_mses`.
 static void tf_determine_block_partition(const MV block_mv, const int block_mse,
+                                         const MV *midblock_mvs,
+                                         const int *midblock_mses,
                                          MV *subblock_mvs, int *subblock_mses) {
   int min_subblock_mse = INT_MAX;
   int max_subblock_mse = INT_MIN;
   int64_t sum_subblock_mse = 0;
-  for (int i = 0; i < 4; ++i) {
+  int i;
+
+  // Go through 4 32x32 blocks.
+  for (int idx = 0; idx < 4; ++idx) {
+    min_subblock_mse = INT_MAX;
+    max_subblock_mse = INT_MIN;
+    sum_subblock_mse = 0;
+
+    const int sub_idx = idx * 4;
+    for (i = sub_idx; i < sub_idx + 4; ++i) {
+      sum_subblock_mse += subblock_mses[i];
+      min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
+      max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
+    }
+
+    if (((midblock_mses[idx] * 15 <= sum_subblock_mse * 4) &&
+         max_subblock_mse - min_subblock_mse < 48) ||
+        ((midblock_mses[idx] * 14 <= sum_subblock_mse * 4) &&
+         max_subblock_mse - min_subblock_mse < 24)) {  // No split.
+      for (i = sub_idx; i < sub_idx + 4; ++i) {
+        subblock_mvs[i] = midblock_mvs[idx];
+        subblock_mses[i] = midblock_mses[idx];
+      }
+    }
+  }
+
+  min_subblock_mse = INT_MAX;
+  max_subblock_mse = INT_MIN;
+  sum_subblock_mse = 0;
+  for (i = 0; i < 16; ++i) {
     sum_subblock_mse += subblock_mses[i];
     min_subblock_mse = AOMMIN(min_subblock_mse, subblock_mses[i]);
     max_subblock_mse = AOMMAX(max_subblock_mse, subblock_mses[i]);
   }
 
-  // TODO(any): The following magic numbers may be tuned to improve the
-  // performance OR find a way to get rid of these magic numbers.
-  if (((block_mse * 15 < sum_subblock_mse * 4) &&
-       max_subblock_mse - min_subblock_mse < 48) ||
-      ((block_mse * 14 < sum_subblock_mse * 4) &&
-       max_subblock_mse - min_subblock_mse < 24)) {  // No split.
-    for (int i = 0; i < 4; ++i) {
+  if (((block_mse * 15 <= sum_subblock_mse) &&
+       (max_subblock_mse - min_subblock_mse) * 16 < sum_subblock_mse * 3) ||
+      ((block_mse * 14 <= sum_subblock_mse) &&
+       (max_subblock_mse - min_subblock_mse) * 8 <
+           sum_subblock_mse)) {  // No split.
+    for (i = 0; i < 16; ++i) {
       subblock_mvs[i] = block_mv;
       subblock_mses[i] = block_mse;
     }
@@ -466,35 +579,45 @@
     const int plane_w = mb_width >> subsampling_x;   // Plane width.
     const int plane_y = mb_y >> subsampling_y;       // Y-coord (Top-left).
     const int plane_x = mb_x >> subsampling_x;       // X-coord (Top-left).
-    const int h = plane_h >> 1;                      // Sub-block height.
-    const int w = plane_w >> 1;                      // Sub-block width.
-    const int is_y_plane = (plane == 0);             // Is Y-plane?
+
+    const int h32 = plane_h >> 1;  // 32x32 sub-block height.
+    const int w32 = plane_w >> 1;  // 32x32 sub-block width.
+    const int h16 = plane_h >> 2;  // 16x16 sub-block height.
+    const int w16 = plane_w >> 2;  // 16x16 sub-block width.
+
+    const int is_y_plane = (plane == 0);  // Is Y-plane?
 
     const struct buf_2d ref_buf = { NULL, ref_frame->buffers[plane],
                                     ref_frame->widths[is_y_plane ? 0 : 1],
                                     ref_frame->heights[is_y_plane ? 0 : 1],
                                     ref_frame->strides[is_y_plane ? 0 : 1] };
 
-    // Handle each subblock.
-    int subblock_idx = 0;
-    for (int i = 0; i < plane_h; i += h) {
-      for (int j = 0; j < plane_w; j += w) {
-        // Choose proper motion vector.
-        const MV mv = subblock_mvs[subblock_idx++];
-        assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
-               mv.col >= INT16_MIN && mv.col <= INT16_MAX);
+    const int sub_y[4] = { 0, 0, h32, h32 };
+    const int sub_x[4] = { 0, w32, 0, w32 };
+    // Handle each 16x16 subblock.
+    for (int idx = 0; idx < 4; ++idx) {
+      int subblock_idx = idx * 4;
+      for (int i = 0; i < h32; i += h16) {
+        for (int j = 0; j < w32; j += w16) {
+          // Choose proper motion vector.
+          const MV mv = subblock_mvs[subblock_idx++];
+          assert(mv.row >= INT16_MIN && mv.row <= INT16_MAX &&
+                 mv.col >= INT16_MIN && mv.col <= INT16_MAX);
 
-        const int y = plane_y + i;
-        const int x = plane_x + j;
+          const int y = plane_y + sub_y[idx] + i;
+          const int x = plane_x + sub_x[idx] + j;
 
-        // Build predictior for each sub-block on current plane.
-        InterPredParams inter_pred_params;
-        av1_init_inter_params(&inter_pred_params, w, h, y, x, subsampling_x,
-                              subsampling_y, bit_depth, is_high_bitdepth,
-                              is_intrabc, scale, &ref_buf, interp_filters);
-        inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
-        av1_enc_build_one_inter_predictor(&pred[plane_offset + i * plane_w + j],
-                                          plane_w, &mv, &inter_pred_params);
+          // Build predictior for each sub-block on current plane.
+          InterPredParams inter_pred_params;
+          av1_init_inter_params(&inter_pred_params, w16, h16, y, x,
+                                subsampling_x, subsampling_y, bit_depth,
+                                is_high_bitdepth, is_intrabc, scale, &ref_buf,
+                                interp_filters);
+          inter_pred_params.conv_params = get_conv_params(0, plane, bit_depth);
+          av1_enc_build_one_inter_predictor(
+              &pred[plane_offset + (sub_y[idx] + i) * plane_w + sub_x[idx] + j],
+              plane_w, &mv, &inter_pred_params);
+        }
       }
     }
     plane_offset += plane_h * plane_w;
@@ -711,16 +834,6 @@
     const double n_decay = 0.5 + log(2 * noise_levels[plane] + 5.0);
     decay_factor[plane] = 1 / (n_decay * q_decay * s_decay);
   }
-  double d_factor[4] = { 0 };
-  for (int subblock_idx = 0; subblock_idx < 4; subblock_idx++) {
-    // Larger motion vector -> smaller filtering weight.
-    const MV mv = subblock_mvs[subblock_idx];
-    const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
-    double distance_threshold = min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD;
-    distance_threshold = AOMMAX(distance_threshold, 1);
-    d_factor[subblock_idx] = distance / distance_threshold;
-    d_factor[subblock_idx] = AOMMAX(d_factor[subblock_idx], 1);
-  }
 
   // Allocate memory for pixel-wise squared differences. They,
   // regardless of the subsampling, are assigned with memory of size `mb_pels`.
@@ -796,14 +909,26 @@
 
         // Combine window error and block error, and normalize it.
         const double window_error = sum_square_diff * inv_num_ref_pixels;
-        const int subblock_idx = (i >= h / 2) * 2 + (j >= w / 2);
+
+        // 16x16 block index
+        const int y32 = i / (h / 2);
+        const int x32 = j / (w / 2);
+        const int y16 = (i % (h / 2)) / (h / 4);
+        const int x16 = (j % (w / 2)) / (w / 4);
+        const int subblock_idx = (y32 * 2 + x32) * 4 + (y16 * 2 + x16);
         const double block_error = (double)subblock_mses[subblock_idx];
         const double combined_error =
             weight_factor * window_error + block_error * inv_factor;
 
+        // Larger motion vector -> smaller filtering weight.
+        const MV mv = subblock_mvs[subblock_idx];
+        const double distance = sqrt(pow(mv.row, 2) + pow(mv.col, 2));
+        const double distance_threshold =
+            (double)AOMMAX(min_frame_size * TF_SEARCH_DISTANCE_THRESHOLD, 1);
+        const double d_factor = AOMMAX(distance / distance_threshold, 1);
+
         // Compute filter weight.
-        double scaled_error =
-            combined_error * d_factor[subblock_idx] * decay_factor[plane];
+        double scaled_error = combined_error * d_factor * decay_factor[plane];
         scaled_error = AOMMIN(scaled_error, 7);
         int weight;
         if (tf_wgt_calc_lvl == 0) {
@@ -930,6 +1055,17 @@
   const GF_GROUP *gf_group = &cpi->ppi->gf_group;
   const FRAME_TYPE frame_type = gf_group->frame_type[cpi->gf_frame_index];
 
+  // Determine whether the video is with `YUV 4:2:2` format, since the avx2/sse2
+  // function only supports square block size. We will use C function instead
+  // for videos with `YUV 4:2:2` format.
+  int is_yuv422_format = 0;
+  for (int plane = 1; plane < num_planes; ++plane) {
+    if (mbd->plane[plane].subsampling_x != mbd->plane[plane].subsampling_y) {
+      is_yuv422_format = 1;
+      break;
+    }
+  }
+
   // Do filtering.
   FRAME_DIFF *diff = &td->tf_data.diff;
   av1_set_mv_row_limits(&cpi->common.mi_params, &mb->mv_limits,
@@ -967,8 +1103,16 @@
       if (frames[frame] == NULL) continue;
 
       // Motion search.
-      MV subblock_mvs[4] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv };
-      int subblock_mses[4] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX };
+      // block size is 64x64. 16 16x16 in 1 64x64.
+      // Store motion search results in 16x16 units.
+      MV subblock_mvs[16] = { kZeroMv, kZeroMv, kZeroMv, kZeroMv,
+                              kZeroMv, kZeroMv, kZeroMv, kZeroMv,
+                              kZeroMv, kZeroMv, kZeroMv, kZeroMv,
+                              kZeroMv, kZeroMv, kZeroMv, kZeroMv };
+      int subblock_mses[16] = { INT_MAX, INT_MAX, INT_MAX, INT_MAX,
+                                INT_MAX, INT_MAX, INT_MAX, INT_MAX,
+                                INT_MAX, INT_MAX, INT_MAX, INT_MAX,
+                                INT_MAX, INT_MAX, INT_MAX, INT_MAX };
       int is_dc_diff_large = 0;
       int is_low_cntras = 0;
 
@@ -1006,7 +1150,8 @@
         // only supports 32x32 block size and 5x5 filtering window.
         if (is_frame_high_bitdepth(frame_to_filter)) {  // for high bit-depth
 #if CONFIG_AV1_HIGHBITDEPTH
-          if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
+          if (!is_yuv422_format && TF_BLOCK_SIZE == BLOCK_32X32 &&
+              TF_WINDOW_LENGTH == 5) {
             av1_highbd_apply_temporal_filter(
                 frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
                 noise_levels, subblock_mvs, subblock_mses, q_factor,
@@ -1022,7 +1167,8 @@
 #endif  // CONFIG_AV1_HIGHBITDEPTH
         } else {
           // for 8-bit
-          if (TF_BLOCK_SIZE == BLOCK_32X32 && TF_WINDOW_LENGTH == 5) {
+          if (!is_yuv422_format && TF_BLOCK_SIZE == BLOCK_32X32 &&
+              TF_WINDOW_LENGTH == 5) {
             av1_apply_temporal_filter(
                 frame_to_filter, mbd, block_size, mb_row, mb_col, num_planes,
                 noise_levels, subblock_mvs, subblock_mses, q_factor,

diff --git a/av1/encoder/temporal_filter.h b/av1/encoder/temporal_filter.h
index 9585942..3f93ab0 100644
--- a/av1/encoder/temporal_filter.h
+++ b/av1/encoder/temporal_filter.h

@@ -30,7 +30,7 @@
 #define BW 32
 
 // Block size used in temporal filtering.
-#define TF_BLOCK_SIZE BLOCK_32X32
+#define TF_BLOCK_SIZE BLOCK_64X64
 
 // Window size for temporal filtering.
 #define TF_WINDOW_LENGTH 5

diff --git a/test/temporal_filter_test.cc b/test/temporal_filter_test.cc
index 2e0ae0f..8d9afaa 100644
--- a/test/temporal_filter_test.cc
+++ b/test/temporal_filter_test.cc

@@ -131,9 +131,9 @@
                                  ColorFormat color_fmt) {
   aom_usec_timer ref_timer, test_timer;
   const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
-  static_assert(block_size == BLOCK_32X32, "");
-  const int width = 32;
-  const int height = 32;
+  static_assert(block_size == BLOCK_64X64, "");
+  const int width = 64;
+  const int height = 64;
   int num_planes = MAX_MB_PLANE;
   int subsampling_x = 0;
   int subsampling_y = 0;
@@ -174,7 +174,7 @@
     memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
     memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
 
-    static_assert(width == 32 && height == 32, "");
+    static_assert(width == 64 && height == 64, "");
     const MV subblock_mvs[4] = { { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 } };
     const int subblock_mses[4] = { 15, 16, 17, 18 };
     const int q_factor = 12;
@@ -284,39 +284,46 @@
   RunTest(1, 100000, I444);
 }
 
-#if HAVE_AVX2
-TemporalFilterFuncParam temporal_filter_test_avx2[] = { TemporalFilterFuncParam(
-    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_avx2) };
-INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterTest,
-                         Combine(ValuesIn(temporal_filter_test_avx2),
-                                 Values(0, 1)));
-#endif  // HAVE_AVX2
+// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function
+// needs to be updated.
+// #if HAVE_AVX2
+// TemporalFilterFuncParam temporal_filter_test_avx2[] = {
+// TemporalFilterFuncParam(
+//    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_avx2) };
+// INSTANTIATE_TEST_SUITE_P(AVX2, TemporalFilterTest,
+//                         Combine(ValuesIn(temporal_filter_test_avx2),
+//                                 Values(0, 1)));
+// #endif  // HAVE_AVX2
+//
+// #if HAVE_SSE2
+// TemporalFilterFuncParam temporal_filter_test_sse2[] = {
+// TemporalFilterFuncParam(
+//    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_sse2) };
+// INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest,
+//                         Combine(ValuesIn(temporal_filter_test_sse2),
+//                                 Values(0, 1)));
+// #endif  // HAVE_SSE2
 
-#if HAVE_SSE2
-TemporalFilterFuncParam temporal_filter_test_sse2[] = { TemporalFilterFuncParam(
-    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_sse2) };
-INSTANTIATE_TEST_SUITE_P(SSE2, TemporalFilterTest,
-                         Combine(ValuesIn(temporal_filter_test_sse2),
-                                 Values(0, 1)));
-#endif  // HAVE_SSE2
+// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function
+// needs to be updated.
+// #if HAVE_NEON
+// TemporalFilterFuncParam temporal_filter_test_neon[] = {
+// TemporalFilterFuncParam(
+//    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_neon) };
+// INSTANTIATE_TEST_SUITE_P(NEON, TemporalFilterTest,
+//                         Combine(ValuesIn(temporal_filter_test_neon),
+//                                 Values(0, 1)));
+// #endif  // HAVE_NEON
 
-#if HAVE_NEON
-TemporalFilterFuncParam temporal_filter_test_neon[] = { TemporalFilterFuncParam(
-    &av1_apply_temporal_filter_c, &av1_apply_temporal_filter_neon) };
-INSTANTIATE_TEST_SUITE_P(NEON, TemporalFilterTest,
-                         Combine(ValuesIn(temporal_filter_test_neon),
-                                 Values(0, 1)));
-#endif  // HAVE_NEON
-
-#if HAVE_NEON_DOTPROD
-TemporalFilterFuncParam temporal_filter_test_neon_dotprod[] = {
-  TemporalFilterFuncParam(&av1_apply_temporal_filter_c,
-                          &av1_apply_temporal_filter_neon_dotprod)
-};
-INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, TemporalFilterTest,
-                         Combine(ValuesIn(temporal_filter_test_neon_dotprod),
-                                 Values(0, 1)));
-#endif  // HAVE_NEON_DOTPROD
+// #if HAVE_NEON_DOTPROD
+// TemporalFilterFuncParam temporal_filter_test_neon_dotprod[] = {
+//   TemporalFilterFuncParam(&av1_apply_temporal_filter_c,
+//                           &av1_apply_temporal_filter_neon_dotprod)
+// };
+// INSTANTIATE_TEST_SUITE_P(NEON_DOTPROD, TemporalFilterTest,
+//                          Combine(ValuesIn(temporal_filter_test_neon_dotprod),
+//                                  Values(0, 1)));
+// #endif  // HAVE_NEON_DOTPROD
 
 #if HAVE_AVX2 || HAVE_NEON
 // Width and height for which av1_estimate_noise_from_single_plane() will be
@@ -514,9 +521,9 @@
                                     ColorFormat color_fmt) {
   aom_usec_timer ref_timer, test_timer;
   const BLOCK_SIZE block_size = TF_BLOCK_SIZE;
-  static_assert(block_size == BLOCK_32X32, "");
-  const int width = 32;
-  const int height = 32;
+  static_assert(block_size == BLOCK_64X64, "");
+  const int width = 64;
+  const int height = 64;
   int num_planes = MAX_MB_PLANE;
   int subsampling_x = 0;
   int subsampling_y = 0;
@@ -557,7 +564,7 @@
     memset(accumulator_mod, 0, 1024 * 3 * sizeof(accumulator_mod[0]));
     memset(count_mod, 0, 1024 * 3 * sizeof(count_mod[0]));
 
-    static_assert(width == 32 && height == 32, "");
+    static_assert(width == 64 && height == 64, "");
     const MV subblock_mvs[4] = { { 0, 0 }, { 5, 5 }, { 7, 8 }, { 2, 10 } };
     const int subblock_mses[4] = { 15, 16, 17, 18 };
     const int q_factor = 12;
@@ -667,34 +674,39 @@
   RunTest(1, 100000, 10, I422);
   RunTest(1, 100000, 10, I444);
 }
-#if HAVE_SSE2
-HBDTemporalFilterFuncParam HBDtemporal_filter_test_sse2[] = {
-  HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
-                             &av1_highbd_apply_temporal_filter_sse2)
-};
-INSTANTIATE_TEST_SUITE_P(SSE2, HBDTemporalFilterTest,
-                         Combine(ValuesIn(HBDtemporal_filter_test_sse2),
-                                 Values(0, 1)));
-#endif  // HAVE_SSE2
-#if HAVE_AVX2
-HBDTemporalFilterFuncParam HBDtemporal_filter_test_avx2[] = {
-  HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
-                             &av1_highbd_apply_temporal_filter_avx2)
-};
-INSTANTIATE_TEST_SUITE_P(AVX2, HBDTemporalFilterTest,
-                         Combine(ValuesIn(HBDtemporal_filter_test_avx2),
-                                 Values(0, 1)));
-#endif  // HAVE_AVX2
 
-#if HAVE_NEON
-HBDTemporalFilterFuncParam HBDtemporal_filter_test_neon[] = {
-  HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
-                             &av1_highbd_apply_temporal_filter_neon)
-};
-INSTANTIATE_TEST_SUITE_P(NEON, HBDTemporalFilterTest,
-                         Combine(ValuesIn(HBDtemporal_filter_test_neon),
-                                 Values(0, 1)));
-#endif  // HAVE_NEON
+// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function
+// needs to be updated.
+// #if HAVE_SSE2
+// HBDTemporalFilterFuncParam HBDtemporal_filter_test_sse2[] = {
+//  HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
+//                             &av1_highbd_apply_temporal_filter_sse2)
+//};
+// INSTANTIATE_TEST_SUITE_P(SSE2, HBDTemporalFilterTest,
+//                         Combine(ValuesIn(HBDtemporal_filter_test_sse2),
+//                                 Values(0, 1)));
+// #endif  // HAVE_SSE2
+// #if HAVE_AVX2
+// HBDTemporalFilterFuncParam HBDtemporal_filter_test_avx2[] = {
+//  HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
+//                             &av1_highbd_apply_temporal_filter_avx2)
+//};
+// INSTANTIATE_TEST_SUITE_P(AVX2, HBDTemporalFilterTest,
+//                         Combine(ValuesIn(HBDtemporal_filter_test_avx2),
+//                                 Values(0, 1)));
+// #endif  // HAVE_AVX2
+
+// av1_apply_temporal_filter_c works on 64x64 TF block now, the SIMD function
+// needs to be updated.
+// #if HAVE_NEON
+// HBDTemporalFilterFuncParam HBDtemporal_filter_test_neon[] = {
+//  HBDTemporalFilterFuncParam(&av1_highbd_apply_temporal_filter_c,
+//                             &av1_highbd_apply_temporal_filter_neon)
+//};
+// INSTANTIATE_TEST_SUITE_P(NEON, HBDTemporalFilterTest,
+//                         Combine(ValuesIn(HBDtemporal_filter_test_neon),
+//                                 Values(0, 1)));
+// #endif  // HAVE_NEON
 
 using HBDEstimateNoiseFunc = double (*)(const uint16_t *src, int height,
                                         int width, int stride, int bit_depth,