Fix address sanitizer issue in av1_wiener_convolve_add_src_neon

This CL fixes a heap-buffer-overflow error in
av1_wiener_convolve_add_src_neon() reported by the unit test
NEON/AV1HiprecConvolveTest.CheckOutput/8.

The intrinsic function processes eight or four rows per iteration
for a given height. Hence, for some odd heights the function may
process more rows than the actual height, which causes the
out-of-bounds access. This CL resolves the issue by falling back to
single-row processing for such heights.
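
In outline, the fix reshapes the row loop as sketched below. This is
a minimal sketch only: filter_rows() and the kernel comments are
hypothetical stand-ins for the eight-row NEON kernel and for
process_row_for_horz_filtering() in the patch.

    static void filter_rows(int intermediate_height) {
      int height = intermediate_height;
      // Fast path: eight rows per iteration while enough rows remain.
      if (height > 7) {
        do {
          // eight-row NEON kernel runs here
          height -= 8;
        } while (height > 7);
      }
      // Fallback: one row at a time, so no row beyond the buffer is
      // ever read or written.
      while (height > 0) {
        // single-row kernel runs here
        height--;
      }
    }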

At the module level, the scaling numbers are similar to those of the
parent CL.

Change-Id: I934f2853c88bee0d6f6c9baa9c510db16cf6e6ee
(cherry picked from commit 876d70156206bc1b016f2d24d28fda5f3fde89b4)
diff --git a/av1/common/arm/wiener_convolve_neon.c b/av1/common/arm/wiener_convolve_neon.c
index 06e7555..0a12c88 100644
--- a/av1/common/arm/wiener_convolve_neon.c
+++ b/av1/common/arm/wiener_convolve_neon.c
@@ -22,25 +22,103 @@
 #include "av1/common/common.h"
 #include "av1/common/arm/convolve_neon.h"
 
+// One step of the symmetric 7-tap horizontal Wiener filter: the two
+// inputs sharing each filter coefficient are pre-added (vaddl) and the
+// center tap widened (vmovl) before the convolution.
+#define HORZ_FILTERING_CORE(t0, t1, t2, t3, t4, t5, t6, res)                 \
+  res0 = vreinterpretq_s16_u16(vaddl_u8(t0, t1));                            \
+  res1 = vreinterpretq_s16_u16(vaddl_u8(t2, t3));                            \
+  res2 = vreinterpretq_s16_u16(vaddl_u8(t4, t5));                            \
+  res3 = vreinterpretq_s16_u16(vmovl_u8(t6));                                \
+  res = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp, bd, \
+                                   conv_params->round_0);
+
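+// Runs the 7-tap vertical Wiener filter one row at a time, sliding the
+// s0..s6 window down by one row per iteration until height reaches
+// zero.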
+#define PROCESS_ROW_FOR_VERTICAL_FILTER                                      \
+  __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);                          \
+                                                                             \
+  do {                                                                       \
+    s7 = vld1q_s16(s);                                                       \
+    s += src_stride;                                                         \
+                                                                             \
+    t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp, \
+                                   bd, conv_params->round_1);                \
+    vst1_u8(d, t0);                                                          \
+    d += dst_stride;                                                         \
+                                                                             \
+    s0 = s1;                                                                 \
+    s1 = s2;                                                                 \
+    s2 = s3;                                                                 \
+    s3 = s4;                                                                 \
+    s4 = s5;                                                                 \
+    s5 = s6;                                                                 \
+    s6 = s7;                                                                 \
+    height--;                                                                \
+  } while (height > 0);
+
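+// Filters a block horizontally one row at a time. Used for all rows on
+// non-aarch64 targets, and on aarch64 for the rows left over after the
+// eight-row loop.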
+static INLINE void process_row_for_horz_filtering(
+    uint16_t *dst_ptr, int16_t *filter_x, const uint8_t *src_ptr,
+    ptrdiff_t src_stride, ptrdiff_t dst_stride, int round0_bits, int w,
+    int height, int bd) {
+  do {
+    __builtin_prefetch(src_ptr);
+
+    uint8x8_t tt0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
+
+    __builtin_prefetch(dst_ptr);
+
+    const uint8_t *ss = src_ptr + 8;
+    uint16_t *d_tmp = dst_ptr;
+    int width = w;
+
+    do {
+      uint8x8_t tt7 = vld1_u8(ss);  // a8 a9 a10 a11 a12 a13 a14 a15
+      uint8x8_t ttemp_0 = tt0;
+      tt0 = tt7;
+
+      uint8x8_t tt1 = vext_u8(ttemp_0, tt7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
+      uint8x8_t tt2 = vext_u8(ttemp_0, tt7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
+      uint8x8_t tt3 = vext_u8(ttemp_0, tt7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
+      uint8x8_t tt4 = vext_u8(ttemp_0, tt7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
+      uint8x8_t tt5 = vext_u8(ttemp_0, tt7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
+      uint8x8_t tt6 = vext_u8(ttemp_0, tt7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
+      tt7 = vext_u8(ttemp_0, tt7, 7);            // a7 a8 a9 a10 a11 a12 a13 a14
+
+      int16x8_t ttt0 = vreinterpretq_s16_u16(vaddl_u8(ttemp_0, tt6));
+      int16x8_t ttt1 = vreinterpretq_s16_u16(vaddl_u8(tt1, tt5));
+      int16x8_t ttt2 = vreinterpretq_s16_u16(vaddl_u8(tt2, tt4));
+      int16x8_t ttt3 = vreinterpretq_s16_u16(vmovl_u8(tt3));
+      uint16x8_t dd0 = wiener_convolve8_horiz_8x8(ttt0, ttt1, ttt2, ttt3,
+                                                  filter_x, bd, round0_bits);
+
+      vst1q_u16(d_tmp, dd0);
+
+      ss += 8;
+      d_tmp += 8;
+      width -= 8;
+    } while (width > 0);
+
+    src_ptr += src_stride;
+    dst_ptr += dst_stride;
+    height--;
+  } while (height > 0);
+}
+
 /* Wiener filter 2D
    Apply horizontal filter and store in a temporary buffer. When applying
    vertical filter, overwrite the original pixel values.
- */
+*/
 void av1_wiener_convolve_add_src_neon(const uint8_t *src, ptrdiff_t src_stride,
                                       uint8_t *dst, ptrdiff_t dst_stride,
                                       const int16_t *filter_x, int x_step_q4,
                                       const int16_t *filter_y, int y_step_q4,
                                       int w, int h,
                                       const ConvolveParams *conv_params) {
-  uint16_t *d_tmp;
   uint8_t *d;
   const uint8_t *src_ptr, *s_tmp;
   uint16_t *dst_ptr;
   (void)x_step_q4;
   (void)y_step_q4;
 
-  int width, height;
+  int height;
   const int bd = 8;
+  // Number of rows to be processed during horizontal filtering.
   const int intermediate_height = h + SUBPEL_TAPS - 1;
   const int center_tap = ((SUBPEL_TAPS - 1) / 2);
   int16_t filter_x_tmp[7], filter_y_tmp[7];
@@ -74,15 +152,15 @@
   src_ptr = s_tmp;
   height = intermediate_height;
 
-  /* if height is a multiple of 8 */
-  if (!(h & 7)) {
-    int16x8_t res0, res1, res2, res3;
-    uint16x8_t res4;
-    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
+  // For aarch64.
 #if defined(__aarch64__)
-    uint16x8_t res5, res6, res7, res8, res9, res10, res11;
-    uint8x8_t t8, t9, t10, t11, t12, t13, t14;
-
+  int processed_height = 0;
+  uint16_t *d_tmp;
+  int width, remaining_height;
+  // Start of horizontal filtering.
+  if (intermediate_height > 7) {
+    uint16x8_t res4, res5, res6, res7, res8, res9, res10, res11;
+    uint8x8_t t0, t1, t2, t3, t4, t5, t6, t7;
     do {
       const uint8_t *s;
 
@@ -112,64 +190,19 @@
       __builtin_prefetch(dst_ptr + 7 * dst_stride);
 
       do {
+        int16x8_t res0, res1, res2, res3;
+        uint8x8_t t8, t9, t10, t11, t12, t13, t14;
         load_u8_8x8(s, src_stride, &t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
         transpose_u8_8x8(&t7, &t8, &t9, &t10, &t11, &t12, &t13, &t14);
 
-        res0 = vreinterpretq_s16_u16(vaddl_u8(t0, t6));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                          bd, conv_params->round_0);
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(t1, t7));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t2, t6));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t3, t5));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t4));
-        res5 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                          bd, conv_params->round_0);
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(t2, t8));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t3, t7));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t4, t6));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t5));
-        res6 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                          bd, conv_params->round_0);
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(t3, t9));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t4, t8));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t5, t7));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t6));
-        res7 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                          bd, conv_params->round_0);
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(t4, t10));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t5, t9));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t6, t8));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t7));
-        res8 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                          bd, conv_params->round_0);
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(t5, t11));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t6, t10));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t7, t9));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t8));
-        res9 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                          bd, conv_params->round_0);
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(t6, t12));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t7, t11));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t8, t10));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t9));
-        res10 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                           bd, conv_params->round_0);
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(t7, t13));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t8, t12));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t9, t11));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t10));
-        res11 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                           bd, conv_params->round_0);
+        HORZ_FILTERING_CORE(t0, t6, t1, t5, t2, t4, t3, res4)
+        HORZ_FILTERING_CORE(t1, t7, t2, t6, t3, t5, t4, res5)
+        HORZ_FILTERING_CORE(t2, t8, t3, t7, t4, t6, t5, res6)
+        HORZ_FILTERING_CORE(t3, t9, t4, t8, t5, t7, t6, res7)
+        HORZ_FILTERING_CORE(t4, t10, t5, t9, t6, t8, t7, res8)
+        HORZ_FILTERING_CORE(t5, t11, t6, t10, t7, t9, t8, res9)
+        HORZ_FILTERING_CORE(t6, t12, t7, t11, t8, t10, t9, res10)
+        HORZ_FILTERING_CORE(t7, t13, t8, t12, t9, t11, t10, res11)
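+        // res4..res11 hold eight filtered rows; they are transposed
+        // back to row-major order below and then stored.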
 
         transpose_u16_8x8(&res4, &res5, &res6, &res7, &res8, &res9, &res10,
                           &res11);
@@ -190,212 +223,19 @@
       src_ptr += 8 * src_stride;
       dst_ptr += 8 * MAX_SB_SIZE;
       height -= 8;
-    } while (height > 0);
-#else
-    uint8x8_t temp_0;
-
-    do {
-      const uint8_t *s;
-
-      __builtin_prefetch(src_ptr);
-
-      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
-      s = src_ptr + 8;
-      d_tmp = dst_ptr;
-      width = w;
-
-      __builtin_prefetch(dst_ptr);
-
-      do {
-        t7 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
-        temp_0 = t0;
-        t0 = t7;
-
-        t1 = vext_u8(temp_0, t7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
-        t2 = vext_u8(temp_0, t7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
-        t3 = vext_u8(temp_0, t7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
-        t4 = vext_u8(temp_0, t7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
-        t5 = vext_u8(temp_0, t7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
-        t6 = vext_u8(temp_0, t7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
-        t7 = vext_u8(temp_0, t7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
-
-        res0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
-        res1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
-        res2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
-        res3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        res4 = wiener_convolve8_horiz_8x8(res0, res1, res2, res3, filter_x_tmp,
-                                          bd, conv_params->round_0);
-
-        vst1q_u16(d_tmp, res4);
-
-        s += 8;
-        d_tmp += 8;
-        width -= 8;
-      } while (width > 0);
-      src_ptr += src_stride;
-      dst_ptr += MAX_SB_SIZE;
-      height--;
-    } while (height > 0);
-#endif
-  } else {
-    /*if height is a multiple of 4*/
-    const uint8_t *s;
-    int16x8_t tt0, tt1, tt2, tt3;
-    uint16x8_t d0;
-    uint8x8_t t0, t1, t2, t3;
-
-#if defined(__aarch64__)
-    uint16x4_t res0, res1, res2, res3, res4, res5, res6, res7;
-    uint16x8_t d1, d2, d3;
-    int16x4_t s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10;
-    int16x4_t s11, s12, s13, s14;
-    do {
-      __builtin_prefetch(src_ptr + 0 * src_stride);
-      __builtin_prefetch(src_ptr + 1 * src_stride);
-      __builtin_prefetch(src_ptr + 2 * src_stride);
-      __builtin_prefetch(src_ptr + 3 * src_stride);
-
-      load_u8_8x4(src_ptr, src_stride, &t0, &t1, &t2, &t3); /*8x4*/
-      transpose_u8_8x4(&t0, &t1, &t2,
-                       &t3); /*first 8 pixels of 4 rows transposed-- 4x8*/
-
-      tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-      tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-      tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-      tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
-      s0 = vget_low_s16(tt0);  /*pa0 pb0 pc0 pd0 -- pixel_a0*/
-      s1 = vget_low_s16(tt1);  /*pa1 pb1 pc1 pd1 */
-      s2 = vget_low_s16(tt2);  /*pa2 pb2 pc2 pd2 */
-      s3 = vget_low_s16(tt3);  /*pa3 pb3 pc3 pd3 */
-      s4 = vget_high_s16(tt0); /*pa4 pb4 pc4 pd4 */
-      s5 = vget_high_s16(tt1); /*pa5 pb5 pc5 pd5 */
-      s6 = vget_high_s16(tt2); /*pa6 pb6 pc6 pd6 */
-
-      __builtin_prefetch(dst_ptr + 0 * dst_stride);
-      __builtin_prefetch(dst_ptr + 1 * dst_stride);
-      __builtin_prefetch(dst_ptr + 2 * dst_stride);
-      __builtin_prefetch(dst_ptr + 3 * dst_stride);
-
-      s = src_ptr + 7;
-      d_tmp = dst_ptr;
-      width = w;
-
-      do {
-        load_u8_8x4(s, src_stride, &t0, &t1, &t2, &t3); /*8x4*/
-        transpose_u8_8x4(&t0, &t1, &t2, &t3);
-
-        tt0 = vreinterpretq_s16_u16(vmovl_u8(t0));
-        tt1 = vreinterpretq_s16_u16(vmovl_u8(t1));
-        tt2 = vreinterpretq_s16_u16(vmovl_u8(t2));
-        tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-
-        s7 = vget_low_s16(tt0); /*pa7  pb7  pc7  pd7  */ /*4x8*/
-        s8 = vget_low_s16(tt1);   /*pa8  pb8  pc8  pd8  */
-        s9 = vget_low_s16(tt2);   /*pa9  pb9  pc9  pd9  */
-        s10 = vget_low_s16(tt3);  /*pa10 pb10 pc10 pd10 */
-        s11 = vget_high_s16(tt0); /*pa11 pb11 pc11 pd11 */
-        s12 = vget_high_s16(tt1); /*pa12 pb12 pc12 pd12 */
-        s13 = vget_high_s16(tt2); /*pa13 pb13 pc13 pd13 */
-        s14 = vget_high_s16(tt3); /*pa14 pb14 pc14 pd14 */
-
-        res0 = wiener_convolve8_horiz_4x8(
-            s0, s1, s2, s3, s4, s5, s6, filter_x_tmp, bd, conv_params->round_0);
-        res1 = wiener_convolve8_horiz_4x8(
-            s1, s2, s3, s4, s5, s6, s7, filter_x_tmp, bd, conv_params->round_0);
-        res2 = wiener_convolve8_horiz_4x8(
-            s2, s3, s4, s5, s6, s7, s8, filter_x_tmp, bd, conv_params->round_0);
-        res3 = wiener_convolve8_horiz_4x8(
-            s3, s4, s5, s6, s7, s8, s9, filter_x_tmp, bd, conv_params->round_0);
-        res4 =
-            wiener_convolve8_horiz_4x8(s4, s5, s6, s7, s8, s9, s10,
-                                       filter_x_tmp, bd, conv_params->round_0);
-        res5 =
-            wiener_convolve8_horiz_4x8(s5, s6, s7, s8, s9, s10, s11,
-                                       filter_x_tmp, bd, conv_params->round_0);
-        res6 =
-            wiener_convolve8_horiz_4x8(s6, s7, s8, s9, s10, s11, s12,
-                                       filter_x_tmp, bd, conv_params->round_0);
-        res7 =
-            wiener_convolve8_horiz_4x8(s7, s8, s9, s10, s11, s12, s13,
-                                       filter_x_tmp, bd, conv_params->round_0);
-
-        transpose_u16_4x8(&res0, &res1, &res2, &res3, &res4, &res5, &res6,
-                          &res7, &d0, &d1, &d2, &d3);
-
-        store_u16_8x4(d_tmp, MAX_SB_SIZE, d0, d1, d2, d3);
-
-        s0 = s8;
-        s1 = s9;
-        s2 = s10;
-        s3 = s11;
-        s4 = s12;
-        s5 = s13;
-        s6 = s14;
-        s += 8;
-        d_tmp += 8;
-        width -= 8;
-      } while (width > 0);
-
-      src_ptr += 4 * src_stride;
-      dst_ptr += 4 * MAX_SB_SIZE;
-      height -= 4;
-    } while (height > 0);
-#else
-    uint8x8_t temp_0, t4, t5, t6, t7;
-
-    do {
-      __builtin_prefetch(src_ptr);
-
-      t0 = vld1_u8(src_ptr);  // a0 a1 a2 a3 a4 a5 a6 a7
-
-      __builtin_prefetch(dst_ptr);
-
-      s = src_ptr + 8;
-      d_tmp = dst_ptr;
-      width = w;
-
-      do {
-        t7 = vld1_u8(s);  // a8 a9 a10 a11 a12 a13 a14 a15
-        temp_0 = t0;
-        t0 = t7;
-
-        t1 = vext_u8(temp_0, t7, 1);  // a1 a2 a3 a4 a5 a6 a7 a8
-        t2 = vext_u8(temp_0, t7, 2);  // a2 a3 a4 a5 a6 a7 a8 a9
-        t3 = vext_u8(temp_0, t7, 3);  // a3 a4 a5 a6 a7 a8 a9 a10
-        t4 = vext_u8(temp_0, t7, 4);  // a4 a5 a6 a7 a8 a9 a10 a11
-        t5 = vext_u8(temp_0, t7, 5);  // a5 a6 a7 a8 a9 a10 a11 a12
-        t6 = vext_u8(temp_0, t7, 6);  // a6 a7 a8 a9 a10 a11 a12 a13
-        t7 = vext_u8(temp_0, t7, 7);  // a7 a8 a9 a10 a11 a12 a13 a14
-
-        tt0 = vreinterpretq_s16_u16(vaddl_u8(temp_0, t6));
-        tt1 = vreinterpretq_s16_u16(vaddl_u8(t1, t5));
-        tt2 = vreinterpretq_s16_u16(vaddl_u8(t2, t4));
-        tt3 = vreinterpretq_s16_u16(vmovl_u8(t3));
-        d0 = wiener_convolve8_horiz_8x8(tt0, tt1, tt2, tt3, filter_x_tmp, bd,
-                                        conv_params->round_0);
-
-        vst1q_u16(d_tmp, d0);
-
-        s += 8;
-        d_tmp += 8;
-        width -= 8;
-      } while (width > 0);
-
-      src_ptr += src_stride;
-      dst_ptr += MAX_SB_SIZE;
-      height -= 1;
-    } while (height > 0);
-#endif
+      processed_height += 8;
+    } while (height > 7);
   }
 
+  // Process the remaining rows for horizontal filtering.
+  remaining_height = intermediate_height - processed_height;
+  if (remaining_height)
+    process_row_for_horz_filtering(dst_ptr, filter_x_tmp, src_ptr, src_stride,
+                                   MAX_SB_SIZE, conv_params->round_0, w, height,
+                                   bd);
+
+  // Start of vertical filtering.
   {
-    int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
-    uint8x8_t t0;
-#if defined(__aarch64__)
-    int16x8_t s8, s9, s10;
-    uint8x8_t t1, t2, t3;
-#endif
     int16_t *src_tmp_ptr, *s;
     uint8_t *dst_tmp_ptr;
     height = h;
@@ -405,6 +245,8 @@
     src_stride = MAX_SB_SIZE;
 
     do {
+      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+      uint8x8_t t0;
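+      // Prime the seven-row window s0..s6 for the vertical filter.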
       s = src_tmp_ptr;
       s0 = vld1q_s16(s);
       s += src_stride;
@@ -423,8 +265,9 @@
       d = dst_tmp_ptr;
       height = h;
 
-#if defined(__aarch64__)
       do {
+        int16x8_t s8, s9, s10;
+        uint8x8_t t1, t2, t3;
         __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
         __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
         __builtin_prefetch(dst_tmp_ptr + 2 * dst_stride);
@@ -467,64 +310,55 @@
         height -= 4;
       } while (height > 3);
 
-      if (height != 0) {
-        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
-        __builtin_prefetch(dst_tmp_ptr + 1 * dst_stride);
-
-        do {
-          s7 = vld1q_s16(s);
-          s += src_stride;
-
-          t0 =
-              wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6,
-                                        filter_y_tmp, bd, conv_params->round_1);
-          vst1_u8(d, t0);
-          d += dst_stride;
-
-          s0 = s1;
-          s1 = s2;
-          s2 = s3;
-          s3 = s4;
-          s4 = s5;
-          s5 = s6;
-          s6 = s7;
-          height -= 1;
-        } while (height > 0);
+      if (height) {
+        PROCESS_ROW_FOR_VERTICAL_FILTER
       }
-
       src_tmp_ptr += 8;
       dst_tmp_ptr += 8;
-
       w -= 8;
     } while (w > 0);
-#else
-      do {
-        __builtin_prefetch(dst_tmp_ptr + 0 * dst_stride);
-
-        s7 = vld1q_s16(s);
-        s += src_stride;
-
-        t0 = wiener_convolve8_vert_4x8(s0, s1, s2, s3, s4, s5, s6, filter_y_tmp,
-                                       bd, conv_params->round_1);
-
-        vst1_u8(d, t0);
-        d += dst_stride;
-
-        s0 = s1;
-        s1 = s2;
-        s2 = s3;
-        s3 = s4;
-        s4 = s5;
-        s5 = s6;
-        s6 = s7;
-        height -= 1;
-      } while (height > 0);
-
-      src_tmp_ptr += 8;
-      dst_tmp_ptr += 8;
-
-      w -= 8;
-    } while (w > 0);
-#endif
   }
+#else
+  // Start of horizontal filtering.
+  process_row_for_horz_filtering(dst_ptr, filter_x_tmp, src_ptr, src_stride,
+                                 MAX_SB_SIZE, conv_params->round_0, w, height,
+                                 bd);
+
+  // Start of vertical filtering.
+  {
+    int16_t *src_tmp_ptr, *s;
+    uint8_t *dst_tmp_ptr;
+    src_tmp_ptr = (int16_t *)temp;
+    dst_tmp_ptr = dst;
+    src_stride = MAX_SB_SIZE;
+
+    do {
+      uint8x8_t t0;
+      int16x8_t s0, s1, s2, s3, s4, s5, s6, s7;
+      s = src_tmp_ptr;
+      s0 = vld1q_s16(s);
+      s += src_stride;
+      s1 = vld1q_s16(s);
+      s += src_stride;
+      s2 = vld1q_s16(s);
+      s += src_stride;
+      s3 = vld1q_s16(s);
+      s += src_stride;
+      s4 = vld1q_s16(s);
+      s += src_stride;
+      s5 = vld1q_s16(s);
+      s += src_stride;
+      s6 = vld1q_s16(s);
+      s += src_stride;
+      d = dst_tmp_ptr;
+      height = h;
+      PROCESS_ROW_FOR_VERTICAL_FILTER
+
+      src_tmp_ptr += 8;
+      dst_tmp_ptr += 8;
+
+      w -= 8;
+    } while (w > 0);
+  }
+#endif
 }