Optimize loop restoration filter

When there is no cdef and no superres, we can use an optimized loop
restoration filter. This patch removes the operations that save boundary
lines for each 64x64 processing unit. During setup and restore of the
stripe boundary, we only need to save and restore 1 line from above
and 1 line from below. The boundaries buffer is not used.
(TODO: do not allocate memory for the boundaries buffer in this case.)

This change does not alter the bitstream; test results confirm this.

Change-Id: I26588154fd8de79c094f91a86a512586c10e1876
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index 06a8acd..8f97223 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -253,7 +253,7 @@
 static void setup_processing_stripe_boundary(
     const RestorationTileLimits *limits, const RestorationStripeBoundaries *rsb,
     int rsb_row, int use_highbd, int h, uint8_t *data8, int data_stride,
-    RestorationLineBuffers *rlbs, int copy_above, int copy_below) {
+    RestorationLineBuffers *rlbs, int copy_above, int copy_below, int opt) {
   // Offsets within the line buffers. The buffer logically starts at column
   // -RESTORATION_EXTRA_HORZ so the 1st column (at x0 - RESTORATION_EXTRA_HORZ)
   // has column x0 in the buffer.
@@ -276,37 +276,66 @@
   // logical 64-pixel-high stripe which has been split into an 8-pixel high
   // stripe and a 56-pixel high stripe (the current one). So, in this case,
   // we want to leave the boundary alone!
-  if (copy_above) {
-    uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+  if (!opt) {
+    if (copy_above) {
+      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
 
-    for (int i = -RESTORATION_BORDER; i < 0; ++i) {
-      const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
-      const int buf_off = buf_x0_off + buf_row * buf_stride;
-      const uint8_t *buf = rsb->stripe_boundary_above + (buf_off << use_highbd);
-      uint8_t *dst8 = data8_tl + i * data_stride;
-      // Save old pixels, then replace with data from stripe_boundary_above
-      memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
-             REAL_PTR(use_highbd, dst8), line_size);
-      memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
+      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
+        const int buf_row = rsb_row + AOMMAX(i + RESTORATION_CTX_VERT, 0);
+        const int buf_off = buf_x0_off + buf_row * buf_stride;
+        const uint8_t *buf =
+            rsb->stripe_boundary_above + (buf_off << use_highbd);
+        uint8_t *dst8 = data8_tl + i * data_stride;
+        // Save old pixels, then replace with data from stripe_boundary_above
+        memcpy(rlbs->tmp_save_above[i + RESTORATION_BORDER],
+               REAL_PTR(use_highbd, dst8), line_size);
+        memcpy(REAL_PTR(use_highbd, dst8), buf, line_size);
+      }
     }
-  }
 
-  // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
-  // The second buffer row is repeated, so src_row gets the values 0, 1, 1
-  // for i = 0, 1, 2.
-  if (copy_below) {
-    const int stripe_end = limits->v_start + h;
-    uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
+    // Replace RESTORATION_BORDER pixels below the bottom of the stripe.
+    // The second buffer row is repeated, so src_row gets the values 0, 1, 1
+    // for i = 0, 1, 2.
+    if (copy_below) {
+      const int stripe_end = limits->v_start + h;
+      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
 
-    for (int i = 0; i < RESTORATION_BORDER; ++i) {
-      const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
-      const int buf_off = buf_x0_off + buf_row * buf_stride;
-      const uint8_t *src = rsb->stripe_boundary_below + (buf_off << use_highbd);
+      for (int i = 0; i < RESTORATION_BORDER; ++i) {
+        const int buf_row = rsb_row + AOMMIN(i, RESTORATION_CTX_VERT - 1);
+        const int buf_off = buf_x0_off + buf_row * buf_stride;
+        const uint8_t *src =
+            rsb->stripe_boundary_below + (buf_off << use_highbd);
 
-      uint8_t *dst8 = data8_bl + i * data_stride;
+        uint8_t *dst8 = data8_bl + i * data_stride;
+        // Save old pixels, then replace with data from stripe_boundary_below
+        memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
+        memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
+      }
+    }
+  } else {
+    if (copy_above) {
+      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+
+      // Only save and overwrite i=-RESTORATION_BORDER line.
+      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
+      // Save old pixels, then replace with a copy of the frame row below
+      memcpy(rlbs->tmp_save_above[0], REAL_PTR(use_highbd, dst8), line_size);
+      memcpy(REAL_PTR(use_highbd, dst8),
+             REAL_PTR(use_highbd,
+                      data8_tl + (-RESTORATION_BORDER + 1) * data_stride),
+             line_size);
+    }
+
+    if (copy_below) {
+      const int stripe_end = limits->v_start + h;
+      uint8_t *data8_bl = data8 + data_x0 + stripe_end * data_stride;
+
+      // Only save and overwrite i=2 line.
+      uint8_t *dst8 = data8_bl + 2 * data_stride;
       // Save old pixels, then replace with data from stripe_boundary_below
-      memcpy(rlbs->tmp_save_below[i], REAL_PTR(use_highbd, dst8), line_size);
-      memcpy(REAL_PTR(use_highbd, dst8), src, line_size);
+      memcpy(rlbs->tmp_save_below[2], REAL_PTR(use_highbd, dst8), line_size);
+      memcpy(REAL_PTR(use_highbd, dst8),
+             REAL_PTR(use_highbd, data8_bl + (2 - 1) * data_stride), line_size);
     }
   }
 }
@@ -327,31 +356,52 @@
 static void restore_processing_stripe_boundary(
     const RestorationTileLimits *limits, const RestorationLineBuffers *rlbs,
     int use_highbd, int h, uint8_t *data8, int data_stride, int copy_above,
-    int copy_below) {
+    int copy_below, int opt) {
   const int line_width =
       (limits->h_end - limits->h_start) + 2 * RESTORATION_EXTRA_HORZ;
   const int line_size = line_width << use_highbd;
 
   const int data_x0 = limits->h_start - RESTORATION_EXTRA_HORZ;
 
-  if (copy_above) {
-    uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
-    for (int i = -RESTORATION_BORDER; i < 0; ++i) {
-      uint8_t *dst8 = data8_tl + i * data_stride;
-      memcpy(REAL_PTR(use_highbd, dst8),
-             rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
+  if (!opt) {
+    if (copy_above) {
+      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+      for (int i = -RESTORATION_BORDER; i < 0; ++i) {
+        uint8_t *dst8 = data8_tl + i * data_stride;
+        memcpy(REAL_PTR(use_highbd, dst8),
+               rlbs->tmp_save_above[i + RESTORATION_BORDER], line_size);
+      }
     }
-  }
 
-  if (copy_below) {
-    const int stripe_bottom = limits->v_start + h;
-    uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
+    if (copy_below) {
+      const int stripe_bottom = limits->v_start + h;
+      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
 
-    for (int i = 0; i < RESTORATION_BORDER; ++i) {
-      if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
+      for (int i = 0; i < RESTORATION_BORDER; ++i) {
+        if (stripe_bottom + i >= limits->v_end + RESTORATION_BORDER) break;
 
-      uint8_t *dst8 = data8_bl + i * data_stride;
-      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
+        uint8_t *dst8 = data8_bl + i * data_stride;
+        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[i], line_size);
+      }
+    }
+  } else {
+    if (copy_above) {
+      uint8_t *data8_tl = data8 + data_x0 + limits->v_start * data_stride;
+
+      // Only restore i=-RESTORATION_BORDER line.
+      uint8_t *dst8 = data8_tl + (-RESTORATION_BORDER) * data_stride;
+      memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_above[0], line_size);
+    }
+
+    if (copy_below) {
+      const int stripe_bottom = limits->v_start + h;
+      uint8_t *data8_bl = data8 + data_x0 + stripe_bottom * data_stride;
+
+      // Only restore i=2 line.
+      if (stripe_bottom + 2 < limits->v_end + RESTORATION_BORDER) {
+        uint8_t *dst8 = data8_bl + 2 * data_stride;
+        memcpy(REAL_PTR(use_highbd, dst8), rlbs->tmp_save_below[2], line_size);
+      }
     }
   }
 }
@@ -992,7 +1042,7 @@
     const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
     const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
     int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
-    int dst_stride, int32_t *tmpbuf) {
+    int dst_stride, int32_t *tmpbuf, int opt) {
   RestorationType unit_rtype = rui->restoration_type;
 
   int unit_h = limits->v_end - limits->v_start;
@@ -1042,13 +1092,14 @@
 
     setup_processing_stripe_boundary(&remaining_stripes, rsb, rsb_row, highbd,
                                      h, data8, stride, rlbs, copy_above,
-                                     copy_below);
+                                     copy_below, opt);
 
     stripe_filter(rui, unit_w, h, procunit_width, data8_tl + i * stride, stride,
                   dst8_tl + i * dst_stride, dst_stride, tmpbuf, bit_depth);
 
     restore_processing_stripe_boundary(&remaining_stripes, rlbs, highbd, h,
-                                       data8, stride, copy_above, copy_below);
+                                       data8, stride, copy_above, copy_below,
+                                       opt);
 
     i += h;
   }
@@ -1083,11 +1134,11 @@
       limits, &rsi->unit_info[rest_unit_idx], &rsi->boundaries, ctxt->rlbs,
       tile_rect, ctxt->tile_stripe0, ctxt->ss_x, ctxt->ss_y, ctxt->highbd,
       ctxt->bit_depth, ctxt->data8, ctxt->data_stride, ctxt->dst8,
-      ctxt->dst_stride, ctxt->tmpbuf);
+      ctxt->dst_stride, ctxt->tmpbuf, rsi->opt);
 }
 
 void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
-                                       AV1_COMMON *cm) {
+                                       AV1_COMMON *cm, int opt) {
   assert(!cm->all_lossless);
   const int num_planes = av1_num_planes(cm);
   typedef void (*copy_fun)(const YV12_BUFFER_CONFIG *src,
@@ -1111,8 +1162,10 @@
   const int highbd = cm->use_highbitdepth;
 
   for (int plane = 0; plane < num_planes; ++plane) {
-    const RestorationInfo *rsi = &cm->rst_info[plane];
+    RestorationInfo *rsi = &cm->rst_info[plane];
     RestorationType rtype = rsi->frame_restoration_type;
+    rsi->opt = opt;
+
     if (rtype == RESTORE_NONE) {
       continue;
     }
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
index 2e20e50..8a765ea 100644
--- a/av1/common/restoration.h
+++ b/av1/common/restoration.h
@@ -226,6 +226,7 @@
   int vert_units_per_tile, horz_units_per_tile;
   RestorationUnitInfo *unit_info;
   RestorationStripeBoundaries boundaries;
+  int opt;
 } RestorationInfo;
 
 static INLINE void set_default_sgrproj(SgrprojInfo *sgrproj_info) {
@@ -286,10 +287,10 @@
     const RestorationStripeBoundaries *rsb, RestorationLineBuffers *rlbs,
     const AV1PixelRect *tile_rect, int tile_stripe0, int ss_x, int ss_y,
     int highbd, int bit_depth, uint8_t *data8, int stride, uint8_t *dst8,
-    int dst_stride, int32_t *tmpbuf);
+    int dst_stride, int32_t *tmpbuf, int opt);
 
 void av1_loop_restoration_filter_frame(YV12_BUFFER_CONFIG *frame,
-                                       struct AV1Common *cm);
+                                       struct AV1Common *cm, int opt);
 void av1_loop_restoration_precal();
 
 typedef void (*rest_unit_visitor_t)(const RestorationTileLimits *limits,
diff --git a/av1/decoder/decodeframe.c b/av1/decoder/decodeframe.c
index 7a501c7..5c2ae74 100644
--- a/av1/decoder/decodeframe.c
+++ b/av1/decoder/decodeframe.c
@@ -3330,14 +3330,15 @@
   }
 
   if (!(cm->allow_intrabc && NO_FILTER_FOR_IBC)) {
-    int do_loop_restoration =
+    const int do_loop_restoration =
         cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
         cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
         cm->rst_info[2].frame_restoration_type != RESTORE_NONE;
-    int do_cdef =
+    const int do_cdef =
         !cm->skip_loop_filter && !cm->coded_lossless &&
         (cm->cdef_bits || cm->cdef_strengths[0] || cm->cdef_uv_strengths[0]);
-    int do_superres = av1_superres_unscaled(cm) == 0;
+    const int do_superres = !av1_superres_unscaled(cm);
+    const int optimized_loop_restoration = !do_cdef && !do_superres;
 
     if (do_cdef || do_superres) {
       if (do_loop_restoration)
@@ -3349,19 +3350,15 @@
 
       if (do_loop_restoration) {
         av1_loop_restoration_save_boundary_lines(&pbi->cur_buf->buf, cm, 1);
-        av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf,
-                                          cm);
+        av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf, cm,
+                                          optimized_loop_restoration);
       }
     } else {
-      // TODO: no cdef and no superres case. Write an optimized version of
-      // loop_restoration_filter for this special case.
-      // (Placeholder)
-      if (do_loop_restoration) {
-        av1_loop_restoration_save_boundary_lines(&pbi->cur_buf->buf, cm, 0);
-        av1_loop_restoration_save_boundary_lines(&pbi->cur_buf->buf, cm, 1);
-        av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf,
-                                          cm);
-      }
+      // No cdef and no superres: use the optimized version of
+      // loop_restoration_filter.
+      if (do_loop_restoration)
+        av1_loop_restoration_filter_frame((YV12_BUFFER_CONFIG *)xd->cur_buf, cm,
+                                          optimized_loop_restoration);
     }
   }
 
diff --git a/av1/encoder/encoder.c b/av1/encoder/encoder.c
index 432e4ae..8dff4d9 100644
--- a/av1/encoder/encoder.c
+++ b/av1/encoder/encoder.c
@@ -4022,7 +4022,7 @@
     if (cm->rst_info[0].frame_restoration_type != RESTORE_NONE ||
         cm->rst_info[1].frame_restoration_type != RESTORE_NONE ||
         cm->rst_info[2].frame_restoration_type != RESTORE_NONE) {
-      av1_loop_restoration_filter_frame(cm->frame_to_show, cm);
+      av1_loop_restoration_filter_frame(cm->frame_to_show, cm, 0);
     }
   }
 }
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index 8f938c8..f41a473 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -157,12 +157,15 @@
   const int highbd = cm->use_highbitdepth;
 
   const YV12_BUFFER_CONFIG *fts = cm->frame_to_show;
+  // TODO(yunqing): For now, only the decoder uses the optimized LR filter.
+  // It could also be used in the encoder.
+  int opt = 0;
 
   av1_loop_restoration_filter_unit(
       limits, rui, &rsi->boundaries, &rlbs, tile_rect, rsc->tile_stripe0,
       is_uv && cm->subsampling_x, is_uv && cm->subsampling_y, highbd, bit_depth,
       fts->buffers[plane], fts->strides[is_uv], rsc->dst->buffers[plane],
-      rsc->dst->strides[is_uv], cm->rst_tmpbuf);
+      rsc->dst->strides[is_uv], cm->rst_tmpbuf, opt);
 
   return sse_restoration_unit(limits, rsc->src, rsc->dst, plane, highbd);
 }