Revert "Revert "AV1 RT: Dual filter for nonRD in RT mode""

This reverts commit 5d1b573bdf32a95835383ddfc70f101a8318ad14.

Bringing back AV1 RT Dual filter with fixing test failures

Change-Id: I5ea08b0bd6958a9e4353b0eecb6a5c42093e4a73
diff --git a/av1/common/av1_loopfilter.c b/av1/common/av1_loopfilter.c
index fbee8da..18ae0f2 100644
--- a/av1/common/av1_loopfilter.c
+++ b/av1/common/av1_loopfilter.c
@@ -462,6 +462,84 @@
   }
 }
 
+void av1_filter_block_plane_vert_rt(const AV1_COMMON *const cm,
+                                    const MACROBLOCKD *const xd,
+                                    const int plane,
+                                    const MACROBLOCKD_PLANE *const plane_ptr,
+                                    const uint32_t mi_row,
+                                    const uint32_t mi_col) {
+  const uint32_t scale_horz = plane_ptr->subsampling_x;
+  const uint32_t scale_vert = plane_ptr->subsampling_y;
+  uint8_t *const dst_ptr = plane_ptr->dst.buf;
+  const int dst_stride = plane_ptr->dst.stride;
+  const int plane_mi_rows =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert);
+  const int plane_mi_cols =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz);
+  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+                             (MAX_MIB_SIZE >> scale_vert));
+  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
+                             (MAX_MIB_SIZE >> scale_horz));
+  assert(!plane);
+  assert(!(y_range % 2));
+  for (int y = 0; y < y_range; y += 2) {
+    uint8_t *p = dst_ptr + y * MI_SIZE * dst_stride;
+    for (int x = 0; x < x_range;) {
+      // inner loop always filter vertical edges in a MI block. If MI size
+      // is 8x8, it will filter the vertical edge aligned with a 8x8 block.
+      // If 4x4 transform is used, it will then filter the internal edge
+      //  aligned with a 4x4 block
+      const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
+      const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
+      uint32_t advance_units;
+      TX_SIZE tx_size;
+      AV1_DEBLOCKING_PARAMETERS params;
+      memset(&params, 0, sizeof(params));
+
+      tx_size =
+          set_lpf_parameters(&params, ((ptrdiff_t)1 << scale_horz), cm, xd,
+                             VERT_EDGE, curr_x, curr_y, plane, plane_ptr);
+      if (tx_size == TX_INVALID) {
+        params.filter_length = 0;
+        tx_size = TX_4X4;
+      }
+
+      switch (params.filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_lpf_vertical_4_dual(p, dst_stride, params.mblim, params.lim,
+                                  params.hev_thr, params.mblim, params.lim,
+                                  params.hev_thr);
+          break;
+        case 6:  // apply 6-tap filter for chroma plane only
+          assert(plane != 0);
+          aom_lpf_vertical_6_dual(p, dst_stride, params.mblim, params.lim,
+                                  params.hev_thr, params.mblim, params.lim,
+                                  params.hev_thr);
+          break;
+        // apply 8-tap filtering
+        case 8:
+          aom_lpf_vertical_8_dual(p, dst_stride, params.mblim, params.lim,
+                                  params.hev_thr, params.mblim, params.lim,
+                                  params.hev_thr);
+          break;
+        // apply 14-tap filtering
+        case 14:
+          aom_lpf_vertical_14_dual(p, dst_stride, params.mblim, params.lim,
+                                   params.hev_thr, params.mblim, params.lim,
+                                   params.hev_thr);
+          break;
+        // no filtering
+        default: break;
+      }
+      // advance the destination pointer
+      advance_units = tx_size_wide_unit[tx_size];
+      x += advance_units;
+      p += advance_units * MI_SIZE;
+    }
+  }
+}
+
 void av1_filter_block_plane_horz(const AV1_COMMON *const cm,
                                  const MACROBLOCKD *const xd, const int plane,
                                  const MACROBLOCKD_PLANE *const plane_ptr,
@@ -584,6 +662,84 @@
   }
 }
 
+void av1_filter_block_plane_horz_rt(const AV1_COMMON *const cm,
+                                    const MACROBLOCKD *const xd,
+                                    const int plane,
+                                    const MACROBLOCKD_PLANE *const plane_ptr,
+                                    const uint32_t mi_row,
+                                    const uint32_t mi_col) {
+  const uint32_t scale_horz = plane_ptr->subsampling_x;
+  const uint32_t scale_vert = plane_ptr->subsampling_y;
+  uint8_t *const dst_ptr = plane_ptr->dst.buf;
+  const int dst_stride = plane_ptr->dst.stride;
+  const int plane_mi_rows =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_rows, scale_vert);
+  const int plane_mi_cols =
+      ROUND_POWER_OF_TWO(cm->mi_params.mi_cols, scale_horz);
+  const int y_range = AOMMIN((int)(plane_mi_rows - (mi_row >> scale_vert)),
+                             (MAX_MIB_SIZE >> scale_vert));
+  const int x_range = AOMMIN((int)(plane_mi_cols - (mi_col >> scale_horz)),
+                             (MAX_MIB_SIZE >> scale_horz));
+  assert(!plane);
+  for (int x = 0; x < x_range; x += 2) {
+    uint8_t *p = dst_ptr + x * MI_SIZE;
+    for (int y = 0; y < y_range;) {
+      // inner loop always filter vertical edges in a MI block. If MI size
+      // is 8x8, it will first filter the vertical edge aligned with a 8x8
+      // block. If 4x4 transform is used, it will then filter the internal
+      // edge aligned with a 4x4 block
+      const uint32_t curr_x = ((mi_col * MI_SIZE) >> scale_horz) + x * MI_SIZE;
+      const uint32_t curr_y = ((mi_row * MI_SIZE) >> scale_vert) + y * MI_SIZE;
+      uint32_t advance_units;
+      TX_SIZE tx_size;
+      AV1_DEBLOCKING_PARAMETERS params;
+      memset(&params, 0, sizeof(params));
+
+      tx_size = set_lpf_parameters(
+          &params, (cm->mi_params.mi_stride << scale_vert), cm, xd, HORZ_EDGE,
+          curr_x, curr_y, plane, plane_ptr);
+      if (tx_size == TX_INVALID) {
+        params.filter_length = 0;
+        tx_size = TX_4X4;
+      }
+
+      switch (params.filter_length) {
+        // apply 4-tap filtering
+        case 4:
+          aom_lpf_horizontal_4_dual(p, dst_stride, params.mblim, params.lim,
+                                    params.hev_thr, params.mblim, params.lim,
+                                    params.hev_thr);
+          break;
+        // apply 6-tap filtering
+        case 6:
+          assert(plane != 0);
+          aom_lpf_horizontal_6_dual(p, dst_stride, params.mblim, params.lim,
+                                    params.hev_thr, params.mblim, params.lim,
+                                    params.hev_thr);
+          break;
+        // apply 8-tap filtering
+        case 8:
+          aom_lpf_horizontal_8_dual(p, dst_stride, params.mblim, params.lim,
+                                    params.hev_thr, params.mblim, params.lim,
+                                    params.hev_thr);
+          break;
+        // apply 14-tap filtering
+        case 14:
+          aom_lpf_horizontal_14_dual(p, dst_stride, params.mblim, params.lim,
+                                     params.hev_thr, params.mblim, params.lim,
+                                     params.hev_thr);
+          break;
+        // no filtering
+        default: break;
+      }
+      // advance the destination pointer
+      advance_units = tx_size_high_unit[tx_size];
+      y += advance_units;
+      p += advance_units * dst_stride * MI_SIZE;
+    }
+  }
+}
+
 void av1_filter_block_plane_vert_test(const AV1_COMMON *const cm,
                                       const MACROBLOCKD *const xd,
                                       const int plane,
@@ -673,7 +829,7 @@
 #if CONFIG_LPF_MASK
                              int is_decoding,
 #endif
-                             int plane_start, int plane_end) {
+                             int plane_start, int plane_end, int is_realtime) {
   struct macroblockd_plane *pd = xd->plane;
   const int col_start = 0;
   const int col_end = cm->mi_params.mi_cols;
@@ -728,7 +884,6 @@
       continue;
     else if (plane == 2 && !(cm->lf.filter_level_v))
       continue;
-
     if (cm->lf.combine_vert_horz_lf) {
       // filter all vertical and horizontal edges in every 128x128 super block
       for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
@@ -736,22 +891,57 @@
           // filter vertical edges
           av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer,
                                mi_row, mi_col, plane, plane + 1);
+#if CONFIG_AV1_HIGHBITDEPTH
+          (void)is_realtime;
           av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
                                       mi_col);
+#else
+          if (is_realtime && !plane) {
+            av1_filter_block_plane_vert_rt(cm, xd, plane, &pd[plane], mi_row,
+                                           mi_col);
+
+          } else {
+            av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
+                                        mi_col);
+          }
+#endif
           // filter horizontal edges
           if (mi_col - MAX_MIB_SIZE >= 0) {
             av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer,
                                  mi_row, mi_col - MAX_MIB_SIZE, plane,
                                  plane + 1);
+#if CONFIG_AV1_HIGHBITDEPTH
+            (void)is_realtime;
             av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
                                         mi_col - MAX_MIB_SIZE);
+#else
+            if (is_realtime && !plane) {
+              av1_filter_block_plane_horz_rt(cm, xd, plane, &pd[plane], mi_row,
+                                             mi_col - MAX_MIB_SIZE);
+            } else {
+              av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
+                                          mi_col - MAX_MIB_SIZE);
+            }
+#endif
           }
         }
         // filter horizontal edges
         av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer, mi_row,
                              mi_col - MAX_MIB_SIZE, plane, plane + 1);
+#if CONFIG_AV1_HIGHBITDEPTH
+        (void)is_realtime;
         av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
                                     mi_col - MAX_MIB_SIZE);
+#else
+        if (is_realtime && !plane) {
+          av1_filter_block_plane_horz_rt(cm, xd, plane, &pd[plane], mi_row,
+                                         mi_col - MAX_MIB_SIZE);
+
+        } else {
+          av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
+                                      mi_col - MAX_MIB_SIZE);
+        }
+#endif
       }
     } else {
       // filter all vertical edges in every 128x128 super block
@@ -759,8 +949,19 @@
         for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
           av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer,
                                mi_row, mi_col, plane, plane + 1);
+#if CONFIG_AV1_HIGHBITDEPTH
+          (void)is_realtime;
           av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
                                       mi_col);
+#else
+          if (is_realtime && !plane) {
+            av1_filter_block_plane_vert_rt(cm, xd, plane, &pd[plane], mi_row,
+                                           mi_col);
+          } else {
+            av1_filter_block_plane_vert(cm, xd, plane, &pd[plane], mi_row,
+                                        mi_col);
+          }
+#endif
         }
       }
 
@@ -769,8 +970,20 @@
         for (mi_col = col_start; mi_col < col_end; mi_col += MAX_MIB_SIZE) {
           av1_setup_dst_planes(pd, cm->seq_params->sb_size, frame_buffer,
                                mi_row, mi_col, plane, plane + 1);
+#if CONFIG_AV1_HIGHBITDEPTH
+          (void)is_realtime;
           av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
                                       mi_col);
+#else
+          if (is_realtime && !plane) {
+            av1_filter_block_plane_horz_rt(cm, xd, plane, &pd[plane], mi_row,
+                                           mi_col);
+
+          } else {
+            av1_filter_block_plane_horz(cm, xd, plane, &pd[plane], mi_row,
+                                        mi_col);
+          }
+#endif
         }
       }
     }
@@ -782,7 +995,8 @@
 #if CONFIG_LPF_MASK
                            int is_decoding,
 #endif
-                           int plane_start, int plane_end, int partial_frame) {
+                           int plane_start, int plane_end, int partial_frame,
+                           int is_realtime) {
   int start_mi_row, end_mi_row, mi_rows_to_filter;
 
   start_mi_row = 0;
@@ -798,5 +1012,5 @@
 #if CONFIG_LPF_MASK
                    is_decoding,
 #endif
-                   plane_start, plane_end);
+                   plane_start, plane_end, is_realtime);
 }
diff --git a/av1/common/av1_loopfilter.h b/av1/common/av1_loopfilter.h
index ca16bbe..ed4453b 100644
--- a/av1/common/av1_loopfilter.h
+++ b/av1/common/av1_loopfilter.h
@@ -151,7 +151,7 @@
 #else
 void av1_loop_filter_frame(YV12_BUFFER_CONFIG *frame, struct AV1Common *cm,
                            struct macroblockd *xd, int plane_start,
-                           int plane_end, int partial_frame);
+                           int plane_end, int partial_frame, int is_realtime);
 #endif
 
 void av1_filter_block_plane_vert(const struct AV1Common *const cm,
@@ -164,6 +164,20 @@
                                  const MACROBLOCKD_PLANE *const plane_ptr,
                                  const uint32_t mi_row, const uint32_t mi_col);
 
+void av1_filter_block_plane_vert_rt(const struct AV1Common *const cm,
+                                    const MACROBLOCKD *const xd,
+                                    const int plane,
+                                    const MACROBLOCKD_PLANE *const plane_ptr,
+                                    const uint32_t mi_row,
+                                    const uint32_t mi_col);
+
+void av1_filter_block_plane_horz_rt(const struct AV1Common *const cm,
+                                    const MACROBLOCKD *const xd,
+                                    const int plane,
+                                    const MACROBLOCKD_PLANE *const plane_ptr,
+                                    const uint32_t mi_row,
+                                    const uint32_t mi_col);
+
 uint8_t av1_get_filter_level(const struct AV1Common *cm,
                              const loop_filter_info_n *lfi_n, const int dir_idx,
                              int plane, const MB_MODE_INFO *mbmi);
diff --git a/av1/common/thread_common.c b/av1/common/thread_common.c
index ef23450..0c45749 100644
--- a/av1/common/thread_common.c
+++ b/av1/common/thread_common.c
@@ -266,7 +266,7 @@
 #if CONFIG_LPF_MASK
                             int is_decoding,
 #endif
-                            int plane_start, int plane_end) {
+                            int plane_start, int plane_end, int is_realtime) {
   int mi_row, plane, dir;
   AV1LfMTInfo *lf_job_queue = lf_sync->job_queue;
   lf_sync->jobs_enqueued = 0;
@@ -293,6 +293,7 @@
         lf_job_queue->mi_row = mi_row;
         lf_job_queue->plane = plane;
         lf_job_queue->dir = dir;
+        lf_job_queue->is_realtime = is_realtime;
         lf_job_queue++;
         lf_sync->jobs_enqueued++;
       }
@@ -327,7 +328,7 @@
   const int sb_cols =
       ALIGN_POWER_OF_TWO(cm->mi_params.mi_cols, MAX_MIB_SIZE_LOG2) >>
       MAX_MIB_SIZE_LOG2;
-  int mi_row, mi_col, plane, dir;
+  int mi_row, mi_col, plane, dir, is_realtime;
   int r, c;
 
   while (1) {
@@ -338,6 +339,7 @@
       plane = cur_job_info->plane;
       dir = cur_job_info->dir;
       r = mi_row >> MAX_MIB_SIZE_LOG2;
+      is_realtime = cur_job_info->is_realtime && !plane;
 
       if (dir == 0) {
         for (mi_col = 0; mi_col < cm->mi_params.mi_cols;
@@ -346,9 +348,20 @@
 
           av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer,
                                mi_row, mi_col, plane, plane + 1);
-
+#if CONFIG_AV1_HIGHBITDEPTH
+          (void)is_realtime;
           av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
                                       mi_col);
+#else
+          if (is_realtime) {
+            av1_filter_block_plane_vert_rt(cm, xd, plane, &planes[plane],
+                                           mi_row, mi_col);
+
+          } else {
+            av1_filter_block_plane_vert(cm, xd, plane, &planes[plane], mi_row,
+                                        mi_col);
+          }
+#endif
           sync_write(lf_sync, r, c, sb_cols, plane);
         }
       } else if (dir == 1) {
@@ -366,8 +379,19 @@
 
           av1_setup_dst_planes(planes, cm->seq_params->sb_size, frame_buffer,
                                mi_row, mi_col, plane, plane + 1);
+#if CONFIG_AV1_HIGHBITDEPTH
+          (void)is_realtime;
           av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
                                       mi_col);
+#else
+          if (is_realtime) {
+            av1_filter_block_plane_horz_rt(cm, xd, plane, &planes[plane],
+                                           mi_row, mi_col);
+          } else {
+            av1_filter_block_plane_horz(cm, xd, plane, &planes[plane], mi_row,
+                                        mi_col);
+          }
+#endif
         }
       }
     } else {
@@ -460,7 +484,7 @@
                                 int is_decoding,
 #endif
                                 AVxWorker *workers, int nworkers,
-                                AV1LfSync *lf_sync) {
+                                AV1LfSync *lf_sync, int is_realtime) {
   const AVxWorkerInterface *const winterface = aom_get_worker_interface();
 #if CONFIG_LPF_MASK
   int sb_rows;
@@ -496,7 +520,7 @@
 #if CONFIG_LPF_MASK
                   is_decoding,
 #endif
-                  plane_start, plane_end);
+                  plane_start, plane_end, is_realtime);
 
   // Set up loopfilter thread data.
   for (i = num_workers - 1; i >= 0; --i) {
@@ -539,7 +563,7 @@
                               int is_decoding,
 #endif
                               AVxWorker *workers, int num_workers,
-                              AV1LfSync *lf_sync) {
+                              AV1LfSync *lf_sync, int is_realtime) {
   int start_mi_row, end_mi_row, mi_rows_to_filter;
 
   start_mi_row = 0;
@@ -581,7 +605,7 @@
   }
 #else
   loop_filter_rows_mt(frame, cm, xd, start_mi_row, end_mi_row, plane_start,
-                      plane_end, workers, num_workers, lf_sync);
+                      plane_end, workers, num_workers, lf_sync, is_realtime);
 #endif
 }
 
diff --git a/av1/common/thread_common.h b/av1/common/thread_common.h
index b04f4fd..bcb4b87 100644
--- a/av1/common/thread_common.h
+++ b/av1/common/thread_common.h
@@ -28,6 +28,7 @@
   int mi_row;
   int plane;
   int dir;
+  int is_realtime;
 } AV1LfMTInfo;
 
 // Loopfilter row synchronization
@@ -157,7 +158,7 @@
                               int is_decoding,
 #endif
                               AVxWorker *workers, int num_workers,
-                              AV1LfSync *lf_sync);
+                              AV1LfSync *lf_sync, int is_realtime);
 
 #if !CONFIG_REALTIME_ONLY
 void av1_loop_restoration_filter_frame_mt(YV12_BUFFER_CONFIG *frame,
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index c4000bd..b5818f5 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -5293,13 +5293,13 @@
 #if CONFIG_LPF_MASK
             1,
 #endif
-            pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync);
+            pbi->tile_workers, pbi->num_workers, &pbi->lf_row_sync, 0);
       } else {
         av1_loop_filter_frame(&cm->cur_frame->buf, cm, &pbi->dcb.xd,
 #if CONFIG_LPF_MASK
                               1,
 #endif
-                              0, num_planes, 0);
+                              0, num_planes, 0, 0);
       }
     }
 
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 23f725c..d51339a 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -2184,6 +2184,14 @@
   const int use_restoration = cm->seq_params->enable_restoration &&
                               !cm->features.all_lossless &&
                               !cm->tiles.large_scale;
+  const int cur_width = cm->cur_frame->width;
+  const int cur_height = cm->cur_frame->height;
+  const int cur_width_mib = cm->mi_params.mi_cols * MI_SIZE;
+  const int cur_height_mib = cm->mi_params.mi_rows * MI_SIZE;
+  const int is_realtime =
+      cpi->sf.rt_sf.use_nonrd_pick_mode && !(cm->mi_params.mi_cols % 2) &&
+      !(cm->mi_params.mi_rows % 2) && (cur_width_mib - cur_width < MI_SIZE) &&
+      (cur_height_mib - cur_height < MI_SIZE);
 
   struct loopfilter *lf = &cm->lf;
 
@@ -2206,13 +2214,13 @@
                                0,
 #endif
                                mt_info->workers, num_workers,
-                               &mt_info->lf_row_sync);
+                               &mt_info->lf_row_sync, is_realtime);
     else
       av1_loop_filter_frame(&cm->cur_frame->buf, cm, xd,
 #if CONFIG_LPF_MASK
                             0,
 #endif
-                            0, num_planes, 0);
+                            0, num_planes, 0, is_realtime);
   }
 #if CONFIG_COLLECT_COMPONENT_TIMING
   end_timing(cpi, loop_filter_time);
diff --git a/av1/encoder/picklpf.c b/av1/encoder/picklpf.c
index f2168c4..9858e20 100644
--- a/av1/encoder/picklpf.c
+++ b/av1/encoder/picklpf.c
@@ -78,13 +78,13 @@
                              0,
 #endif
                              mt_info->workers, num_workers,
-                             &mt_info->lf_row_sync);
+                             &mt_info->lf_row_sync, 0);
   else
     av1_loop_filter_frame(&cm->cur_frame->buf, cm, &cpi->td.mb.e_mbd,
 #if CONFIG_LPF_MASK
                           0,
 #endif
-                          plane, plane + 1, partial_frame);
+                          plane, plane + 1, partial_frame, 0);
 
   filt_err = aom_get_sse_plane(sd, &cm->cur_frame->buf, plane,
                                cm->seq_params->use_highbitdepth);