loop-restoration: Remove special case in Wiener filter

Remove the special case handling for the topmost/bottommost
rows in each processing unit. This causes slightly different
effects depending on whether striped-loop-restoration is enabled.

With striped-loop-restoration:
  Now that we explicitly fill in 3 rows of pixels above and below
  each stripe, we don't need to use stepdown_wiener_kernel.
  Instead, the duplication of the topmost/bottommost pixels
  accomplishes the same task, while making the code much cleaner.

  This patch should not cause a change in output, except in a
  couple of cases which were already questionable. In particular,
  it fixes bug #953, where the Wiener filter could not handle
  small processing units (<4 rows high).

Without striped-loop-restoration:
  The Wiener filter returns to using a full 3 pixels above/below
  the processing unit. In order to make sure there are enough
  pixels, we need to expand WIENER_BORDER_VERT to 3 pixels.

  This will result in a change in output, but the difference
  should be fairly minor.

BUG=aomedia:953

Change-Id: I9530ef55909246f7ba488b7ecfd92d59e776b2f9
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index 619a1b1..e93ca49 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -364,38 +364,6 @@
 #undef REAL_PTR
 #endif
 
-static void stepdown_wiener_kernel(const InterpKernel orig, InterpKernel vert,
-                                   int boundary_dist, int istop) {
-  memcpy(vert, orig, sizeof(InterpKernel));
-  switch (boundary_dist) {
-    case 0:
-      vert[WIENER_HALFWIN] += vert[2] + vert[1] + vert[0];
-      vert[2] = vert[1] = vert[0] = 0;
-      break;
-    case 1:
-      vert[2] += vert[1] + vert[0];
-      vert[1] = vert[0] = 0;
-      break;
-    case 2:
-      vert[1] += vert[0];
-      vert[0] = 0;
-      break;
-    default: break;
-  }
-  if (!istop) {
-    int tmp;
-    tmp = vert[0];
-    vert[0] = vert[WIENER_WIN - 1];
-    vert[WIENER_WIN - 1] = tmp;
-    tmp = vert[1];
-    vert[1] = vert[WIENER_WIN - 2];
-    vert[WIENER_WIN - 2] = tmp;
-    tmp = vert[2];
-    vert[2] = vert[WIENER_WIN - 3];
-    vert[WIENER_WIN - 3] = tmp;
-  }
-}
-
 #if USE_WIENER_HIGH_INTERMEDIATE_PRECISION
 #define wiener_convolve8_add_src aom_convolve8_add_src_hip
 #else
@@ -411,40 +379,13 @@
   (void)bit_depth;
   assert(bit_depth == 8);
 
-  const int mid_height =
-      stripe_height - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2;
-  assert(mid_height > 0);
   for (int j = 0; j < stripe_width; j += procunit_width) {
     int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
     const uint8_t *src_p = src + j;
     uint8_t *dst_p = dst + j;
-    for (int b = 0; b < WIENER_HALFWIN - WIENER_BORDER_VERT; ++b) {
-      InterpKernel vertical_top;
-      stepdown_wiener_kernel(rui->wiener_info.vfilter, vertical_top,
-                             WIENER_BORDER_VERT + b, 1);
-      wiener_convolve8_add_src(src_p, src_stride, dst_p, dst_stride,
-                               rui->wiener_info.hfilter, 16, vertical_top, 16,
-                               w, 1);
-      src_p += src_stride;
-      dst_p += dst_stride;
-    }
-
     wiener_convolve8_add_src(src_p, src_stride, dst_p, dst_stride,
                              rui->wiener_info.hfilter, 16,
-                             rui->wiener_info.vfilter, 16, w, mid_height);
-    src_p += src_stride * mid_height;
-    dst_p += dst_stride * mid_height;
-
-    for (int b = WIENER_HALFWIN - WIENER_BORDER_VERT - 1; b >= 0; --b) {
-      InterpKernel vertical_bot;
-      stepdown_wiener_kernel(rui->wiener_info.vfilter, vertical_bot,
-                             WIENER_BORDER_VERT + b, 0);
-      wiener_convolve8_add_src(src_p, src_stride, dst_p, dst_stride,
-                               rui->wiener_info.hfilter, 16, vertical_bot, 16,
-                               w, 1);
-      src_p += src_stride;
-      dst_p += dst_stride;
-    }
+                             rui->wiener_info.vfilter, 16, w, stripe_height);
   }
 }
 
@@ -1203,41 +1144,13 @@
                                         int bit_depth) {
   (void)tmpbuf;
 
-  const int mid_height =
-      stripe_height - (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2;
-  assert(mid_height > 0);
-
   for (int j = 0; j < stripe_width; j += procunit_width) {
     int w = AOMMIN(procunit_width, (stripe_width - j + 15) & ~15);
     const uint8_t *src8_p = src8 + j;
     uint8_t *dst8_p = dst8 + j;
-
-    for (int b = 0; b < WIENER_HALFWIN - WIENER_BORDER_VERT; ++b) {
-      InterpKernel vertical_top;
-      stepdown_wiener_kernel(rui->wiener_info.vfilter, vertical_top,
-                             WIENER_BORDER_VERT + b, 1);
-      wiener_highbd_convolve8_add_src(src8_p, src_stride, dst8_p, dst_stride,
-                                      rui->wiener_info.hfilter, 16,
-                                      vertical_top, 16, w, 1, bit_depth);
-      src8_p += src_stride;
-      dst8_p += dst_stride;
-    }
-    assert(stripe_height > (WIENER_HALFWIN - WIENER_BORDER_VERT) * 2);
     wiener_highbd_convolve8_add_src(
         src8_p, src_stride, dst8_p, dst_stride, rui->wiener_info.hfilter, 16,
-        rui->wiener_info.vfilter, 16, w, mid_height, bit_depth);
-    src8_p += src_stride * (mid_height);
-    dst8_p += dst_stride * (mid_height);
-    for (int b = WIENER_HALFWIN - WIENER_BORDER_VERT - 1; b >= 0; --b) {
-      InterpKernel vertical_bot;
-      stepdown_wiener_kernel(rui->wiener_info.vfilter, vertical_bot,
-                             WIENER_BORDER_VERT + b, 0);
-      wiener_highbd_convolve8_add_src(src8_p, src_stride, dst8_p, dst_stride,
-                                      rui->wiener_info.hfilter, 16,
-                                      vertical_bot, 16, w, 1, bit_depth);
-      src8_p += src_stride;
-      dst8_p += dst_stride;
-    }
+        rui->wiener_info.vfilter, 16, w, stripe_height, bit_depth);
   }
 }
 
@@ -1461,8 +1374,6 @@
     int h = setup_processing_stripe_boundary(&remaining_stripes, rsb,
                                              procunit_height, ss_y, highbd,
                                              data8, stride, rlbs);
-    // The wiener filter needs a height>=4 in order to not assert on mid_height
-    if (unit_rtype == RESTORE_WIENER) h = ALIGN_POWER_OF_TWO(h, 2);
 #else
     const int h = AOMMIN(procunit_height, (unit_h - i + 15) & ~15);
 #endif
diff --git a/av1/common/restoration.h b/av1/common/restoration.h
index e51aa19..98e5ddd 100644
--- a/av1/common/restoration.h
+++ b/av1/common/restoration.h
@@ -41,7 +41,7 @@
 #if CONFIG_STRIPED_LOOP_RESTORATION
 #define WIENER_BORDER_VERT 2  // Vertical border used for Wiener
 #else
-#define WIENER_BORDER_VERT 1  // Vertical border used for Wiener
+#define WIENER_BORDER_VERT 3  // Vertical border used for Wiener
 #endif
 #define WIENER_HALFWIN 3
 #define WIENER_BORDER_HORZ (WIENER_HALFWIN)  // Horizontal border for Wiener