Reduce line buffer size for Wiener filter.

This patch forces the vertical filtering for the top and bottom
rows of a processing unit for the Wiener filter to be 5-tap.
The 5-tap filter is derived from the primary 7-tap filter by forcing
the two end taps to zero and absorbing their weights into the
remaining taps to maintain normalization.
This effectively reduces the line buffer size for the luma Wiener
filter to 4 (from 6).

Change-Id: I5e21b58369777eabf553a8987387d112f98a5598
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index f1411db..2064330 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -146,6 +146,17 @@
            h_end - h_start);
 }
 
+// Convert 7-tap filter to 5-tap for top and bottom rows of a processing unit
+static void stepdown_wiener_kernel(const InterpKernel orig, InterpKernel vert) {
+  memcpy(vert, orig, sizeof(InterpKernel));
+  int delta = vert[0] / 2;
+  vert[1] += delta;
+  vert[WIENER_WIN - 2] += delta;
+  vert[2] += vert[0] - delta;
+  vert[WIENER_WIN - 3] += vert[0] - delta;
+  vert[0] = vert[WIENER_WIN - 1] = 0;
+}
+
 static void loop_wiener_filter_tile(uint8_t *data, int tile_idx, int width,
                                     int height, int stride,
                                     RestorationInternal *rst, uint8_t *dst,
@@ -161,6 +172,9 @@
                    dst_stride);
     return;
   }
+  InterpKernel vertical_topbot;
+  stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
+                         vertical_topbot);
   av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
                            tile_width, tile_height, width, height, 0, 0,
                            &h_start, &h_end, &v_start, &v_end);
@@ -172,15 +186,41 @@
       int h = AOMMIN(procunit_height, (v_end - i + 15) & ~15);
       const uint8_t *data_p = data + i * stride + j;
       uint8_t *dst_p = dst + i * dst_stride + j;
+// Use 5-tap vertical filtering for top and bottom rows in
+// processing unit
+#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
+      aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
+                                rst->rsi->wiener_info[tile_idx].hfilter, 16,
+                                vertical_topbot, 16, w, 1);
+#else
+      aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
+                            rst->rsi->wiener_info[tile_idx].hfilter, 16,
+                            vertical_topbot, 16, w, 1);
+#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
+      data_p += stride;
+      dst_p += dst_stride;
+// Note h is at least 16
 #if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
       aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
                                 rst->rsi->wiener_info[tile_idx].hfilter, 16,
                                 rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
-                                h);
+                                h - 2);
 #else
       aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
                             rst->rsi->wiener_info[tile_idx].hfilter, 16,
-                            rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h);
+                            rst->rsi->wiener_info[tile_idx].vfilter, 16, w,
+                            h - 2);
+#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
+      data_p += stride * (h - 2);
+      dst_p += dst_stride * (h - 2);
+#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
+      aom_convolve8_add_src_hip(data_p, stride, dst_p, dst_stride,
+                                rst->rsi->wiener_info[tile_idx].hfilter, 16,
+                                vertical_topbot, 16, w, 1);
+#else
+      aom_convolve8_add_src(data_p, stride, dst_p, dst_stride,
+                            rst->rsi->wiener_info[tile_idx].hfilter, 16,
+                            vertical_topbot, 16, w, 1);
 #endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
     }
 }
@@ -1011,6 +1051,9 @@
   av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
                            tile_width, tile_height, width, height, 0, 0,
                            &h_start, &h_end, &v_start, &v_end);
+  InterpKernel vertical_topbot;
+  stepdown_wiener_kernel(rst->rsi->wiener_info[tile_idx].vfilter,
+                         vertical_topbot);
   // Convolve the whole tile (done in blocks here to match the requirements
   // of the vectorized convolve functions, but the result is equivalent)
   for (i = v_start; i < v_end; i += procunit_height)
@@ -1019,16 +1062,45 @@
       int h = AOMMIN(procunit_height, (v_end - i + 15) & ~15);
       const uint16_t *data_p = data + i * stride + j;
       uint16_t *dst_p = dst + i * dst_stride + j;
+// Use 5-tap vertical filtering for top and bottom rows in
+// processing unit
 #if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
       aom_highbd_convolve8_add_src_hip(
           CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
           dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
-          rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h, bit_depth);
+          vertical_topbot, 16, w, 1, bit_depth);
+#else
+      aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
+                                   CONVERT_TO_BYTEPTR(dst_p), dst_stride,
+                                   rst->rsi->wiener_info[tile_idx].hfilter, 16,
+                                   vertical_topbot, 16, w, 1, bit_depth);
+#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
+      data_p += stride;
+      dst_p += dst_stride;
+// Note h is at least 16
+#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
+      aom_highbd_convolve8_add_src_hip(
+          CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
+          dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
+          rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h - 2, bit_depth);
 #else
       aom_highbd_convolve8_add_src(
           CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
           dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
-          rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h, bit_depth);
+          rst->rsi->wiener_info[tile_idx].vfilter, 16, w, h - 2, bit_depth);
+#endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
+      data_p += stride * (h - 2);
+      dst_p += dst_stride * (h - 2);
+#if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
+      aom_highbd_convolve8_add_src_hip(
+          CONVERT_TO_BYTEPTR(data_p), stride, CONVERT_TO_BYTEPTR(dst_p),
+          dst_stride, rst->rsi->wiener_info[tile_idx].hfilter, 16,
+          vertical_topbot, 16, w, 1, bit_depth);
+#else
+      aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
+                                   CONVERT_TO_BYTEPTR(dst_p), dst_stride,
+                                   rst->rsi->wiener_info[tile_idx].hfilter, 16,
+                                   vertical_topbot, 16, w, 1, bit_depth);
 #endif  // USE_WIENER_HIGH_INTERMEDIATE_PRECISION
     }
 }
diff --git a/av1/encoder/pickrst.c b/av1/encoder/pickrst.c
index e530137..9e682ea 100644
--- a/av1/encoder/pickrst.c
+++ b/av1/encoder/pickrst.c
@@ -358,8 +358,8 @@
   int ep, bestep = 0;
   int64_t err, besterr = -1;
   int exqd[2], bestxqd[2] = { 0, 0 };
-  int flt1_stride = width;
-  int flt2_stride = width;
+  int flt1_stride = ((width + 7) & ~7) + 8;
+  int flt2_stride = ((width + 7) & ~7) + 8;
   assert(pu_width == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
          pu_width == RESTORATION_PROC_UNIT_SIZE);
   assert(pu_height == (RESTORATION_PROC_UNIT_SIZE >> 1) ||
@@ -385,11 +385,11 @@
                                      flt1_stride, sgr_params[ep].corner,
                                      sgr_params[ep].edge);
 #else
-          av1_selfguided_restoration_highbd_c(
+          av1_selfguided_restoration_highbd(
               dat_p, w, h, dat_stride, flt1_p, flt1_stride, bit_depth,
               sgr_params[ep].r1, sgr_params[ep].e1, tmpbuf2);
 #endif  // USE_HIGHPASS_IN_SGRPROJ
-          av1_selfguided_restoration_highbd_c(
+          av1_selfguided_restoration_highbd(
               dat_p, w, h, dat_stride, flt2_p, flt2_stride, bit_depth,
               sgr_params[ep].r2, sgr_params[ep].e2, tmpbuf2);
         }
@@ -406,13 +406,13 @@
           av1_highpass_filter(dat_p, w, h, dat_stride, flt1_p, flt1_stride,
                               sgr_params[ep].corner, sgr_params[ep].edge);
 #else
-        av1_selfguided_restoration_c(dat_p, w, h, dat_stride, flt1_p,
-                                     flt1_stride, sgr_params[ep].r1,
-                                     sgr_params[ep].e1, tmpbuf2);
+        av1_selfguided_restoration(dat_p, w, h, dat_stride, flt1_p, flt1_stride,
+                                   sgr_params[ep].r1, sgr_params[ep].e1,
+                                   tmpbuf2);
 #endif  // USE_HIGHPASS_IN_SGRPROJ
-          av1_selfguided_restoration_c(dat_p, w, h, dat_stride, flt2_p,
-                                       flt2_stride, sgr_params[ep].r2,
-                                       sgr_params[ep].e2, tmpbuf2);
+          av1_selfguided_restoration(dat_p, w, h, dat_stride, flt2_p,
+                                     flt2_stride, sgr_params[ep].r2,
+                                     sgr_params[ep].e2, tmpbuf2);
         }
 #if CONFIG_HIGHBITDEPTH
     }