Add new convolve variant for loop-restoration

The convolve filters generated by loop_wiener_filter_tile
are not compatible with some existing convolve implementations
(they can have coefficients >128, sums of (certain subsets of)
coefficients >128, etc.)

So we implement a new variant, which takes a filter with 128
subtracted from its central element and which adds an extra copy
of the source just before clipping to a pixel (reinstating the
128 we subtracted). This should be easy to adapt from the existing
convolve functions, and this patch includes SSE2 highbd and
SSSE3 lowbd implementations.

Change-Id: I0abf4c2915f0665c49d88fe450dbc77b783f69e1
diff --git a/aom_dsp/aom_convolve.c b/aom_dsp/aom_convolve.c
index a079d1b..addfc31 100644
--- a/aom_dsp/aom_convolve.c
+++ b/aom_dsp/aom_convolve.c
@@ -332,6 +332,122 @@
                       filter_y, y_step_q4, w, h);
 }
 
+#if CONFIG_LOOP_RESTORATION
+static void convolve_add_src_horiz(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const InterpKernel *x_filters, int x0_q4,
+                                   int x_step_q4, int w, int h) {
+  int x, y;
+  src -= SUBPEL_TAPS / 2 - 1;
+  for (y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; ++x) {
+      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+      dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
+                          src_x[SUBPEL_TAPS / 2 - 1]);
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void convolve_add_src_vert(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const InterpKernel *y_filters, int y0_q4,
+                                  int y_step_q4, int w, int h) {
+  int x, y;
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+
+  for (x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (y = 0; y < h; ++y) {
+      const unsigned char *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k)
+        sum += src_y[k * src_stride] * y_filter[k];
+      dst[y * dst_stride] =
+          clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
+                     src_y[(SUBPEL_TAPS / 2 - 1) * src_stride]);
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static void convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *const x_filters, int x0_q4,
+                             int x_step_q4, const InterpKernel *const y_filters,
+                             int y0_q4, int y_step_q4, int w, int h) {
+  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
+  int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
+
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
+
+  convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+                         temp, MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
+                         intermediate_height);
+  convolve_add_src_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
+                        dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+}
+
+void aom_convolve8_add_src_horiz_c(const uint8_t *src, ptrdiff_t src_stride,
+                                   uint8_t *dst, ptrdiff_t dst_stride,
+                                   const int16_t *filter_x, int x_step_q4,
+                                   const int16_t *filter_y, int y_step_q4,
+                                   int w, int h) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+  (void)filter_y;
+  (void)y_step_q4;
+
+  convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+                         x_step_q4, w, h);
+}
+
+void aom_convolve8_add_src_vert_c(const uint8_t *src, ptrdiff_t src_stride,
+                                  uint8_t *dst, ptrdiff_t dst_stride,
+                                  const int16_t *filter_x, int x_step_q4,
+                                  const int16_t *filter_y, int y_step_q4, int w,
+                                  int h) {
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+  (void)filter_x;
+  (void)x_step_q4;
+
+  convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y, y0_q4,
+                        y_step_q4, w, h);
+}
+
+void aom_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const int16_t *filter_x, int x_step_q4,
+                             const int16_t *filter_y, int y_step_q4, int w,
+                             int h) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+  convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+                   x_step_q4, filters_y, y0_q4, y_step_q4, w, h);
+}
+#endif  // CONFIG_LOOP_RESTORATION
+
 #if CONFIG_AOM_HIGHBITDEPTH
 static void highbd_convolve_horiz(const uint8_t *src8, ptrdiff_t src_stride,
                                   uint8_t *dst8, ptrdiff_t dst_stride,
@@ -597,4 +713,142 @@
     dst += dst_stride;
   }
 }
-#endif
+
+#if CONFIG_LOOP_RESTORATION
+static void highbd_convolve_add_src_horiz(const uint8_t *src8,
+                                          ptrdiff_t src_stride, uint8_t *dst8,
+                                          ptrdiff_t dst_stride,
+                                          const InterpKernel *x_filters,
+                                          int x0_q4, int x_step_q4, int w,
+                                          int h, int bd) {
+  int x, y;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  src -= SUBPEL_TAPS / 2 - 1;
+  for (y = 0; y < h; ++y) {
+    int x_q4 = x0_q4;
+    for (x = 0; x < w; ++x) {
+      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *const x_filter = x_filters[x_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k) sum += src_x[k] * x_filter[k];
+      dst[x] = clip_pixel_highbd(
+          ROUND_POWER_OF_TWO(sum, FILTER_BITS) + src_x[SUBPEL_TAPS / 2 - 1],
+          bd);
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+static void highbd_convolve_add_src_vert(const uint8_t *src8,
+                                         ptrdiff_t src_stride, uint8_t *dst8,
+                                         ptrdiff_t dst_stride,
+                                         const InterpKernel *y_filters,
+                                         int y0_q4, int y_step_q4, int w, int h,
+                                         int bd) {
+  int x, y;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  src -= src_stride * (SUBPEL_TAPS / 2 - 1);
+  for (x = 0; x < w; ++x) {
+    int y_q4 = y0_q4;
+    for (y = 0; y < h; ++y) {
+      const uint16_t *src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *const y_filter = y_filters[y_q4 & SUBPEL_MASK];
+      int k, sum = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k)
+        sum += src_y[k * src_stride] * y_filter[k];
+      dst[y * dst_stride] =
+          clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS) +
+                                src_y[(SUBPEL_TAPS / 2 - 1) * src_stride],
+                            bd);
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static void highbd_convolve_add_src(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const InterpKernel *const x_filters,
+                                    int x0_q4, int x_step_q4,
+                                    const InterpKernel *const y_filters,
+                                    int y0_q4, int y_step_q4, int w, int h,
+                                    int bd) {
+  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
+  // 2d filtering proceeds in 2 steps:
+  //   (1) Interpolate horizontally into an intermediate buffer, temp.
+  //   (2) Interpolate temp vertically to derive the sub-pixel result.
+  // Deriving the maximum number of rows in the temp buffer (135):
+  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+  // --Largest block size is 64x64 pixels.
+  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+  //   original frame (in 1/16th pixel units).
+  // --Must round-up because block may be located at sub-pixel position.
+  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
+  int intermediate_height =
+      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
+
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
+  assert(y_step_q4 <= 32);
+  assert(x_step_q4 <= 32);
+
+  highbd_convolve_add_src_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
+                                src_stride, CONVERT_TO_BYTEPTR(temp),
+                                MAX_SB_SIZE, x_filters, x0_q4, x_step_q4, w,
+                                intermediate_height, bd);
+  highbd_convolve_add_src_vert(
+      CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+      MAX_SB_SIZE, dst, dst_stride, y_filters, y0_q4, y_step_q4, w, h, bd);
+}
+
+void aom_highbd_convolve8_add_src_horiz_c(
+    const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
+    const int16_t *filter_y, int y_step_q4, int w, int h, int bd) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+  (void)filter_y;
+  (void)y_step_q4;
+
+  highbd_convolve_add_src_horiz(src, src_stride, dst, dst_stride, filters_x,
+                                x0_q4, x_step_q4, w, h, bd);
+}
+
+void aom_highbd_convolve8_add_src_vert_c(const uint8_t *src,
+                                         ptrdiff_t src_stride, uint8_t *dst,
+                                         ptrdiff_t dst_stride,
+                                         const int16_t *filter_x, int x_step_q4,
+                                         const int16_t *filter_y, int y_step_q4,
+                                         int w, int h, int bd) {
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+  (void)filter_x;
+  (void)x_step_q4;
+
+  highbd_convolve_add_src_vert(src, src_stride, dst, dst_stride, filters_y,
+                               y0_q4, y_step_q4, w, h, bd);
+}
+
+void aom_highbd_convolve8_add_src_c(const uint8_t *src, ptrdiff_t src_stride,
+                                    uint8_t *dst, ptrdiff_t dst_stride,
+                                    const int16_t *filter_x, int x_step_q4,
+                                    const int16_t *filter_y, int y_step_q4,
+                                    int w, int h, int bd) {
+  const InterpKernel *const filters_x = get_filter_base(filter_x);
+  const int x0_q4 = get_filter_offset(filter_x, filters_x);
+
+  const InterpKernel *const filters_y = get_filter_base(filter_y);
+  const int y0_q4 = get_filter_offset(filter_y, filters_y);
+
+  highbd_convolve_add_src(src, src_stride, dst, dst_stride, filters_x, x0_q4,
+                          x_step_q4, filters_y, y0_q4, y_step_q4, w, h, bd);
+}
+#endif  // CONFIG_LOOP_RESTORATION
+#endif  // CONFIG_AOM_HIGHBITDEPTH
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index de150e6..11c8109 100644
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -734,6 +734,16 @@
 specialize qw/aom_convolve8_avg_vert  sse2 ssse3/;
 specialize qw/aom_scaled_2d                ssse3/;
 
+if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
+  add_proto qw/void aom_convolve8_add_src/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+  add_proto qw/void aom_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+  add_proto qw/void aom_convolve8_add_src_vert/,  "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+
+  specialize qw/aom_convolve8_add_src         ssse3/;
+  specialize qw/aom_convolve8_add_src_horiz   ssse3/;
+  specialize qw/aom_convolve8_add_src_vert    ssse3/;
+}  # CONFIG_LOOP_RESTORATION
+
 # TODO(any): These need to be extended to up to 128x128 block sizes
 if (!(aom_config("CONFIG_AV1") eq "yes" && aom_config("CONFIG_EXT_PARTITION") eq "yes")) {
   specialize qw/aom_convolve_copy       neon dspr2 msa/;
@@ -770,6 +780,16 @@
 
   add_proto qw/void aom_highbd_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
   specialize qw/aom_highbd_convolve8_avg_vert/, "$sse2_x86_64";
+
+  if (aom_config("CONFIG_LOOP_RESTORATION") eq "yes") {
+    add_proto qw/void aom_highbd_convolve8_add_src/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+    add_proto qw/void aom_highbd_convolve8_add_src_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+    add_proto qw/void aom_highbd_convolve8_add_src_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
+
+    specialize qw/aom_highbd_convolve8_add_src         sse2/;
+    specialize qw/aom_highbd_convolve8_add_src_horiz   sse2/;
+    specialize qw/aom_highbd_convolve8_add_src_vert    sse2/;
+  }  # CONFIG_LOOP_RESTORATION
 }  # CONFIG_AOM_HIGHBITDEPTH
 
 #
diff --git a/aom_dsp/x86/aom_asm_stubs.c b/aom_dsp/x86/aom_asm_stubs.c
index 1b71a9f..97a7df4 100644
--- a/aom_dsp/x86/aom_asm_stubs.c
+++ b/aom_dsp/x86/aom_asm_stubs.c
@@ -159,5 +159,24 @@
 //                                    int w, int h, int bd);
 HIGH_FUN_CONV_2D(, sse2);
 HIGH_FUN_CONV_2D(avg_, sse2);
+
+#if CONFIG_LOOP_RESTORATION
+// SSE2 highbd convolve handles coefficients up to 32767: redirect add_src to
+// plain convolve8. NOTE: casts away const and briefly mutates the filters.
+void aom_highbd_convolve8_add_src_sse2(const uint8_t *src, ptrdiff_t src_stride,
+                                       uint8_t *dst, ptrdiff_t dst_stride,
+                                       const int16_t *filter_x, int x_step_q4,
+                                       const int16_t *filter_y, int y_step_q4,
+                                       int w, int h, int bd) {
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+  ((int16_t *)filter_x)[3] += 128;
+  ((int16_t *)filter_y)[3] += 128;
+  aom_highbd_convolve8_sse2(src, src_stride, dst, dst_stride, filter_x,
+                            x_step_q4, filter_y, y_step_q4, w, h, bd);
+  ((int16_t *)filter_x)[3] -= 128;
+  ((int16_t *)filter_y)[3] -= 128;
+}
+#endif  // CONFIG_LOOP_RESTORATION
 #endif  // CONFIG_AOM_HIGHBITDEPTH && ARCH_X86_64
 #endif  // HAVE_SSE2
diff --git a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
index 04a06e5..be37738 100644
--- a/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
+++ b/aom_dsp/x86/aom_subpixel_8t_intrin_ssse3.c
@@ -291,6 +291,14 @@
 filter8_1dfunction aom_filter_block1d8_h8_avg_ssse3;
 filter8_1dfunction aom_filter_block1d4_v8_avg_ssse3;
 filter8_1dfunction aom_filter_block1d4_h8_avg_ssse3;
+#if CONFIG_LOOP_RESTORATION
+filter8_1dfunction aom_filter_block1d16_v8_add_src_ssse3;
+filter8_1dfunction aom_filter_block1d16_h8_add_src_ssse3;
+filter8_1dfunction aom_filter_block1d8_v8_add_src_ssse3;
+filter8_1dfunction aom_filter_block1d8_h8_add_src_ssse3;
+filter8_1dfunction aom_filter_block1d4_v8_add_src_ssse3;
+filter8_1dfunction aom_filter_block1d4_h8_add_src_ssse3;
+#endif
 
 filter8_1dfunction aom_filter_block1d16_v2_ssse3;
 filter8_1dfunction aom_filter_block1d16_h2_ssse3;
@@ -331,6 +339,13 @@
 FUN_CONV_1D(avg_vert, y_step_q4, filter_y, v, src - src_stride * 3, avg_,
             ssse3);
 
+#if CONFIG_LOOP_RESTORATION
+FUN_CONV_1D_NO_BILINEAR(add_src_horiz, x_step_q4, filter_x, h, src, add_src_,
+                        ssse3);
+FUN_CONV_1D_NO_BILINEAR(add_src_vert, y_step_q4, filter_y, v,
+                        src - src_stride * 3, add_src_, ssse3);
+#endif
+
 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
                       out2, out3, out4, out5, out6, out7)                 \
   {                                                                       \
@@ -900,3 +915,6 @@
 //                              int w, int h);
 FUN_CONV_2D(, ssse3);
 FUN_CONV_2D(avg_, ssse3);
+#if CONFIG_LOOP_RESTORATION
+FUN_CONV_2D_NO_BILINEAR(add_src_, add_src_, ssse3);
+#endif
diff --git a/aom_dsp/x86/aom_subpixel_8t_ssse3.asm b/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
index 91febbc..357f374 100644
--- a/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
+++ b/aom_dsp/x86/aom_subpixel_8t_ssse3.asm
@@ -15,6 +15,7 @@
 
 SECTION_RODATA
 pw_64:    times 8 dw 64
+even_byte_mask: times 8 dw 0x00ff
 
 ; %define USE_PMULHRSW
 ; NOTE: pmulhrsw has a latency of 5 cycles.  Tests showed a performance loss
@@ -142,6 +143,14 @@
     paddsw              m0, m1
     paddsw              m0, krd
     psraw               m0, 7
+%ifidn %1, h8_add_src
+    pxor                 m3, m3
+    movu                 m4, [srcq]
+    movu                 m5, [srcq + sstrideq]
+    punpckldq            m4, m5 ; Bytes 0,1,2,3 from row 0, then 0,1,2,3 from row 1
+    punpcklbw            m4, m3
+    paddsw               m0, m4
+%endif
     packuswb            m0, m0
     psrldq              m1, m0, 4
 
@@ -178,6 +187,12 @@
     paddsw              m0, m1
     paddsw              m0, krd
     psraw               m0, 7
+%ifidn %1, h8_add_src
+    pxor                m3, m3
+    movu                m4, [srcq]
+    punpcklbw           m4, m3
+    paddsw              m0, m4
+%endif
     packuswb            m0, m0
 %ifidn %1, h8_avg
     movd                m4, [dstq]
@@ -235,6 +250,15 @@
     paddsw               m6, m0
     paddsw               m6, krd
     psraw                m6, 7
+%ifidn %1, h8_add_src
+    pxor                 m3, m3
+    movu                 m4, [srcq]
+    movu                 m5, [srcq + sstrideq]
+    punpcklbw            m4, m3
+    punpcklbw            m5, m3
+    paddsw               m1, m4
+    paddsw               m6, m5
+%endif
     packuswb             m1, m6
 %ifidn %1, h8_avg
     pavgb                m1, m2
@@ -269,6 +293,12 @@
     paddsw               m1, m4
     paddsw               m1, krd
     psraw                m1, 7
+%ifidn %1, h8_add_src
+    pxor                 m6, m6
+    movu                 m5, [srcq]
+    punpcklbw            m5, m6
+    paddsw               m1, m5
+%endif
     packuswb             m1, m1
 %ifidn %1, h8_avg
     movh                 m0, [dstq]
@@ -315,6 +345,14 @@
     paddsw        m4, krd
     psraw         m0, 7
     psraw         m4, 7
+%ifidn %1, h8_add_src
+    movu          m5, [srcq]
+    mova          m7, m5
+    pand          m5, [even_byte_mask]
+    psrlw         m7, 8
+    paddsw        m0, m5
+    paddsw        m4, m7
+%endif
     packuswb      m0, m0
     packuswb      m4, m4
     punpcklbw     m0, m4
@@ -337,6 +375,12 @@
 SUBPIX_HFILTER4  h8
 SUBPIX_HFILTER4  h8_avg
 
+%if CONFIG_LOOP_RESTORATION
+SUBPIX_HFILTER16 h8_add_src
+SUBPIX_HFILTER8  h8_add_src
+SUBPIX_HFILTER4  h8_add_src
+%endif
+
 ;-------------------------------------------------------------------------------
 
 ; TODO(Linfeng): Detect cpu type and choose the code with better performance.
@@ -413,12 +457,23 @@
     paddsw                   m0, krd
     psraw                    m0, 7
     paddsw                   m1, m5
+%ifidn %1, v8_add_src
+    pxor                     m6, m6
+    movu                     m4, [srcq]
+    punpcklbw                m4, m6
+    paddsw                   m0, m4
+%endif
     packuswb                 m0, m0
 
     paddsw                   m3, m7
     paddsw                   m1, m3
     paddsw                   m1, krd
     psraw                    m1, 7
+%ifidn %1, v8_add_src
+    movu                     m4, [src1q]
+    punpcklbw                m4, m6
+    paddsw                   m1, m4
+%endif
     lea                    srcq, [srcq + sstrideq * 2 ]
     lea                   src1q, [src1q + sstrideq * 2]
     packuswb                 m1, m1
@@ -462,6 +517,12 @@
     paddsw                   m0, m2
     paddsw                   m0, krd
     psraw                    m0, 7
+%ifidn %1, v8_add_src
+    pxor                     m6, m6
+    movu                     m4, [srcq]
+    punpcklbw                m4, m6
+    paddsw                   m0, m4
+%endif
     packuswb                 m0, m0
 %ifidn %1, v8_avg
     movx                     m1, [dstq]
@@ -643,6 +704,15 @@
     paddsw                   m3, m7
     paddsw                   m3, krd
     psraw                    m3, 7
+%ifidn %1, v8_add_src
+    pxor                     m6, m6
+    movu                     m4, [src1q + 2 * sstrideq] ; Fetch from 3 rows down
+    mova                     m5, m4
+    punpcklbw                m4, m6
+    punpckhbw                m5, m6
+    paddsw                   m0, m4
+    paddsw                   m3, m5
+%endif
     packuswb                 m0, m3
 
     add                    srcq, sstrideq
@@ -804,3 +874,10 @@
 SUBPIX_VFILTER   v8_avg, 8
 SUBPIX_VFILTER       v8, 4
 SUBPIX_VFILTER   v8_avg, 4
+
+%if (ARCH_X86 || X86_SUBPIX_VFILTER_PREFER_SLOW_CELERON) && \
+    CONFIG_LOOP_RESTORATION
+SUBPIX_VFILTER16 v8_add_src
+SUBPIX_VFILTER   v8_add_src, 8
+SUBPIX_VFILTER   v8_add_src, 4
+%endif
diff --git a/aom_dsp/x86/convolve.h b/aom_dsp/x86/convolve.h
index 09e4de8..ca2488d 100644
--- a/aom_dsp/x86/convolve.h
+++ b/aom_dsp/x86/convolve.h
@@ -31,11 +31,7 @@
     (void)x_step_q4;                                                         \
     (void)filter_y;                                                          \
     (void)y_step_q4;                                                         \
-    if (filter[3] >= 128) {                                                  \
-      aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x,   \
-                               x_step_q4, filter_y, y_step_q4, w, h);        \
-      return;                                                                \
-    }                                                                        \
+    assert((-128 <= filter[3]) && (filter[3] <= 127));                       \
     assert(step_q4 == 16);                                                   \
     if (filter[0] | filter[1] | filter[2]) {                                 \
       while (w >= 16) {                                                      \
@@ -93,11 +89,8 @@
       const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,                \
       ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,          \
       const int16_t *filter_y, int y_step_q4, int w, int h) {                \
-    if (filter_x[3] >= 128 || filter_y[3] >= 128) {                          \
-      aom_convolve8_##avg##c(src, src_stride, dst, dst_stride, filter_x,     \
-                             x_step_q4, filter_y, y_step_q4, w, h);          \
-      return;                                                                \
-    }                                                                        \
+    assert((-128 <= filter_x[3]) && (filter_x[3] <= 127));                   \
+    assert((-128 <= filter_y[3]) && (filter_y[3] <= 127));                   \
     assert(w <= MAX_SB_SIZE);                                                \
     assert(h <= MAX_SB_SIZE);                                                \
     assert(x_step_q4 == 16);                                                 \
@@ -122,8 +115,71 @@
     }                                                                        \
   }
 
-#if CONFIG_AOM_HIGHBITDEPTH
+#if CONFIG_LOOP_RESTORATION
+// convolve_add_src is only used by the Wiener filter, which will never
+// end up calling the bilinear functions (it uses a symmetric filter, so
+// the possible numbers of taps are 1,3,5,7)
+#define FUN_CONV_1D_NO_BILINEAR(name, step_q4, filter, dir, src_start, avg, \
+                                opt)                                        \
+  void aom_convolve8_##name##_##opt(                                        \
+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,               \
+      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,         \
+      const int16_t *filter_y, int y_step_q4, int w, int h) {               \
+    (void)filter_x;                                                         \
+    (void)x_step_q4;                                                        \
+    (void)filter_y;                                                         \
+    (void)y_step_q4;                                                        \
+    assert((-128 <= filter[3]) && (filter[3] <= 127));                      \
+    assert(step_q4 == 16);                                                  \
+    while (w >= 16) {                                                       \
+      aom_filter_block1d16_##dir##8_##avg##opt(src_start, src_stride, dst,  \
+                                               dst_stride, h, filter);      \
+      src += 16;                                                            \
+      dst += 16;                                                            \
+      w -= 16;                                                              \
+    }                                                                       \
+    while (w >= 8) {                                                        \
+      aom_filter_block1d8_##dir##8_##avg##opt(src_start, src_stride, dst,   \
+                                              dst_stride, h, filter);       \
+      src += 8;                                                             \
+      dst += 8;                                                             \
+      w -= 8;                                                               \
+    }                                                                       \
+    while (w >= 4) {                                                        \
+      aom_filter_block1d4_##dir##8_##avg##opt(src_start, src_stride, dst,   \
+                                              dst_stride, h, filter);       \
+      src += 4;                                                             \
+      dst += 4;                                                             \
+      w -= 4;                                                               \
+    }                                                                       \
+    if (w) {                                                                \
+      aom_convolve8_##name##_c(src, src_stride, dst, dst_stride, filter_x,  \
+                               x_step_q4, filter_y, y_step_q4, w, h);       \
+    }                                                                       \
+  }
 
+#define FUN_CONV_2D_NO_BILINEAR(type, htype, opt)                           \
+  void aom_convolve8_##type##opt(                                           \
+      const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,               \
+      ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,         \
+      const int16_t *filter_y, int y_step_q4, int w, int h) {               \
+    DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]);  \
+    assert((-128 <= filter_x[3]) && (filter_x[3] <= 127));                  \
+    assert((-128 <= filter_y[3]) && (filter_y[3] <= 127));                  \
+    assert(w <= MAX_SB_SIZE);                                               \
+    assert(h <= MAX_SB_SIZE);                                               \
+    assert(x_step_q4 == 16);                                                \
+    assert(y_step_q4 == 16);                                                \
+    aom_convolve8_##htype##horiz_##opt(                                     \
+        src - 3 * src_stride, src_stride, fdata2, MAX_SB_SIZE, filter_x,    \
+        x_step_q4, filter_y, y_step_q4, w, h + 7);                          \
+    aom_convolve8_##type##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \
+                                     dst, dst_stride, filter_x, x_step_q4,  \
+                                     filter_y, y_step_q4, w, h);            \
+  }
+#endif
+
+#if CONFIG_AOM_HIGHBITDEPTH
 typedef void highbd_filter8_1dfunction(const uint16_t *src_ptr,
                                        const ptrdiff_t src_pitch,
                                        uint16_t *output_ptr,
diff --git a/av1/common/restoration.c b/av1/common/restoration.c
index 6573380..7bc2930 100644
--- a/av1/common/restoration.c
+++ b/av1/common/restoration.c
@@ -208,6 +208,8 @@
   }
   hkernel[WIENER_WIN] = 0;
   vkernel[WIENER_WIN] = 0;
+  hkernel[3] -= 128;
+  vkernel[3] -= 128;
   av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
                            tile_width, tile_height, width, height, 0, 0,
                            &h_start, &h_end, &v_start, &v_end);
@@ -219,8 +221,8 @@
       int h = AOMMIN(MAX_SB_SIZE, (v_end - i + 15) & ~15);
       const uint8_t *data_p = data + i * stride + j;
       uint8_t *dst_p = dst + i * dst_stride + j;
-      aom_convolve8(data_p, stride, dst_p, dst_stride, hkernel, 16, vkernel, 16,
-                    w, h);
+      aom_convolve8_add_src(data_p, stride, dst_p, dst_stride, hkernel, 16,
+                            vkernel, 16, w, h);
     }
 }
 
@@ -779,6 +781,8 @@
   }
   hkernel[WIENER_WIN] = 0;
   vkernel[WIENER_WIN] = 0;
+  hkernel[3] -= 128;
+  vkernel[3] -= 128;
   av1_get_rest_tile_limits(tile_idx, 0, 0, rst->nhtiles, rst->nvtiles,
                            tile_width, tile_height, width, height, 0, 0,
                            &h_start, &h_end, &v_start, &v_end);
@@ -790,9 +794,9 @@
       int h = AOMMIN(MAX_SB_SIZE, (v_end - i + 15) & ~15);
       const uint16_t *data_p = data + i * stride + j;
       uint16_t *dst_p = dst + i * dst_stride + j;
-      aom_highbd_convolve8_c(CONVERT_TO_BYTEPTR(data_p), stride,
-                             CONVERT_TO_BYTEPTR(dst_p), dst_stride, hkernel, 16,
-                             vkernel, 16, w, h, bit_depth);
+      aom_highbd_convolve8_add_src(CONVERT_TO_BYTEPTR(data_p), stride,
+                                   CONVERT_TO_BYTEPTR(dst_p), dst_stride,
+                                   hkernel, 16, vkernel, 16, w, h, bit_depth);
     }
 }