Add SSE2 vectorized warp filter for lowbd

End-to-end speed improvements (measured on tempete_cif.y4m;
20 frames for the encoder and all 260 frames for the decoder):

* GLOBAL_MOTION encoder: ~10% faster
* GLOBAL_MOTION decoder: 100-200% faster depending on bitrate
* WARPED_MOTION encoder: ~2.5% faster
* WARPED_MOTION decoder: ~20-40% faster depending on bitrate

The improvement in the GLOBAL_MOTION decoder is particularly
large because its runtime is dominated by calls to warp_plane().

This introduces minor changes to the output of the warp filter,
but these should be rare.

Change-Id: I5813ab9e90311e27587045153c32d400b6b9eb92
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 04dd817..abc812a 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -157,4 +157,8 @@
 AV1_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/filterintra_sse4.c
 endif
 
+ifneq ($(findstring yes,$(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION)),)
+AV1_COMMON_SRCS-$(HAVE_SSE2) += common/x86/warp_plane_sse2.c
+endif
+
 $(eval $(call rtcd_h_template,av1_rtcd,av1/common/av1_rtcd_defs.pl))
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index e4d1e6e..f081c19 100644
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -765,4 +765,12 @@
   specialize qw/od_filter_dering_orthogonal_8x8 sse4_1/;
 }
 
+# WARPED_MOTION / GLOBAL_MOTION functions
+
+if ((aom_config("CONFIG_WARPED_MOTION") eq "yes") ||
+    (aom_config("CONFIG_GLOBAL_MOTION") eq "yes")) {
+  add_proto qw/void warp_affine/, "int32_t *mat, uint8_t *ref, int width, int height, int stride, uint8_t *pred, int p_col, int p_row, int p_width, int p_height, int p_stride, int subsampling_x, int subsampling_y, int ref_frm, int32_t alpha, int32_t beta, int32_t gamma, int32_t delta";
+  specialize qw/warp_affine sse2/;
+}
+
 1;
diff --git a/av1/common/warped_motion.c b/av1/common/warped_motion.c
index daded0c..d1107d5 100644
--- a/av1/common/warped_motion.c
+++ b/av1/common/warped_motion.c
@@ -15,6 +15,7 @@
 #include <math.h>
 #include <assert.h>
 
+#include "./av1_rtcd.h"
 #include "av1/common/warped_motion.h"
 
 /* clang-format off */
@@ -412,7 +413,7 @@
 // [-1, 2) * WARPEDPIXEL_PREC_SHIFTS.
 // We need an extra 2 taps to fit this in, for a total of 8 taps.
 /* clang-format off */
-static const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS*3][8] = {
+const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS*3][8] = {
   // [-1, 0)
   { 0,   0, 128,   0,   0, 0, 0, 0 }, { 0, - 1, 128,   2, - 1, 0, 0, 0 },
   { 1, - 3, 127,   4, - 1, 0, 0, 0 }, { 1, - 4, 126,   6, - 2, 1, 0, 0 },
@@ -891,7 +892,146 @@
    within the block, the parameters must satisfy
    4 * |alpha| + 7 * |beta| <= 1   and   4 * |gamma| + 7 * |delta| <= 1
    for this filter to be applicable.
+
+   Note: warp_affine() assumes that the caller has done all of the relevant
+   checks, ie. that we have a ROTZOOM or AFFINE model, that wm[4] and wm[5]
+   are set appropriately (if using a ROTZOOM model), and that alpha, beta,
+   gamma, delta are all in range.
+
+   TODO(david.barker): Maybe support scaled references?
 */
+static INLINE int16_t saturate_int16(int32_t v) {
+  if (v > 32767)
+    return 32767;
+  else if (v < -32768)
+    return -32768;
+  return (int16_t)v;
+}
+
+void warp_affine_c(int32_t *mat, uint8_t *ref, int width, int height,
+                   int stride, uint8_t *pred, int p_col, int p_row, int p_width,
+                   int p_height, int p_stride, int subsampling_x,
+                   int subsampling_y, int ref_frm, int32_t alpha, int32_t beta,
+                   int32_t gamma, int32_t delta) {
+  int16_t tmp[15 * 8];
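+  // tmp holds the horizontal filter output for one 8x8 block: 15 rows
+  // (8 output rows plus 7 extra rows of context for the 8-tap vertical
+  // filter), each of 8 columns.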
+  int i, j, k, l, m;
+
+  /* Note: For this code to work, the left/right frame borders need to be
+     extended by at least 13 pixels each. By the time we get here, other
+     code will have set up this border, but an explicit check is left
+     here (commented out) for debugging purposes.
+  */
+  /*for (i = 0; i < height; ++i) {
+    for (j = 0; j < 13; ++j) {
+      assert(ref[i * stride - 13 + j] == ref[i * stride]);
+      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+    }
+  }*/
+
+  for (i = p_row; i < p_row + p_height; i += 8) {
+    for (j = p_col; j < p_col + p_width; j += 8) {
+      int32_t x4, y4, ix4, sx4, iy4, sy4;
+      if (subsampling_x)
+        x4 = ROUND_POWER_OF_TWO_SIGNED(
+            mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
+                (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+            1);
+      else
+        x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
+
+      if (subsampling_y)
+        y4 = ROUND_POWER_OF_TWO_SIGNED(
+            mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
+                (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+            1);
+      else
+        y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
+
+      ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+      // Horizontal filter
+      for (k = -7; k < 8; ++k) {
+        int iy = iy4 + k;
+        if (iy < 0)
+          iy = 0;
+        else if (iy > height - 1)
+          iy = height - 1;
+
+        if (ix4 <= -7) {
+          // In this case, the rightmost pixel sampled is in column
+          // ix4 + 3 + 7 - 3 = ix4 + 7 <= 0, ie. the entire block
+          // will sample only from the leftmost column
+          // (once border extension is taken into account)
+          for (l = 0; l < 8; ++l) {
+            tmp[(k + 7) * 8 + l] =
+                ref[iy * stride] * (1 << WARPEDPIXEL_FILTER_BITS);
+          }
+        } else if (ix4 >= width + 6) {
+          // In this case, the leftmost pixel sampled is in column
+          // ix4 - 4 + 0 - 3 = ix4 - 7 >= width - 1, ie. the entire block
+          // will sample only from the rightmost column
+          // (once border extension is taken into account)
+          for (l = 0; l < 8; ++l) {
+            tmp[(k + 7) * 8 + l] =
+                ref[iy * stride + (width - 1)] * (1 << WARPEDPIXEL_FILTER_BITS);
+          }
+        } else {
+          // If we get here, then
+          // the leftmost pixel sampled is
+          // ix4 - 4 + 0 - 3 = ix4 - 7 >= -13
+          // and the rightmost pixel sampled is at most
+          // ix4 + 3 + 7 - 3 = ix4 + 7 <= width + 12
+          // So, assuming that border extension has been done, we
+          // don't need to explicitly clamp values.
+          int sx = sx4 + alpha * (-4) + beta * k;
+
+          for (l = -4; l < 4; ++l) {
+            int ix = ix4 + l - 3;
+            // At this point, sx = sx4 + alpha * l + beta * k
+            const int16_t *coeffs =
+                warped_filter[ROUND_POWER_OF_TWO_SIGNED(sx,
+                                                        WARPEDDIFF_PREC_BITS) +
+                              WARPEDPIXEL_PREC_SHIFTS];
+            int32_t sum = 0;
+            for (m = 0; m < 8; ++m) {
+              sum += ref[iy * stride + ix + m] * coeffs[m];
+            }
+            tmp[(k + 7) * 8 + (l + 4)] = saturate_int16(sum);
+            sx += alpha;
+          }
+        }
+      }
+
+      // Vertical filter
+      for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+        int sy = sy4 + gamma * (-4) + delta * k;
+        for (l = -4; l < 4; ++l) {
+          uint8_t *p =
+              &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
+          // At this point, sy = sy4 + gamma * l + delta * k
+          const int16_t *coeffs = warped_filter[ROUND_POWER_OF_TWO_SIGNED(
+                                                    sy, WARPEDDIFF_PREC_BITS) +
+                                                WARPEDPIXEL_PREC_SHIFTS];
+          int32_t sum = 0;
+          for (m = 0; m < 8; ++m) {
+            sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
+          }
+          sum =
+              clip_pixel(ROUND_POWER_OF_TWO(sum, 2 * WARPEDPIXEL_FILTER_BITS));
+          if (ref_frm)
+            *p = ROUND_POWER_OF_TWO(*p + sum, 1);
+          else
+            *p = sum;
+          sy += gamma;
+        }
+      }
+    }
+  }
+}
+
 static void warp_plane(WarpedMotionParams *wm, uint8_t *ref, int width,
                        int height, int stride, uint8_t *pred, int p_col,
                        int p_row, int p_width, int p_height, int p_stride,
@@ -901,9 +1041,8 @@
     wm->wmmat[5] = wm->wmmat[2];
     wm->wmmat[4] = -wm->wmmat[3];
   }
-  if (wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) {
-    int32_t tmp[15 * 8];
-    int i, j, k, l, m;
+  if ((wm->wmtype == ROTZOOM || wm->wmtype == AFFINE) && x_scale == 16 &&
+      y_scale == 16) {
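+    // x_scale == 16 and y_scale == 16 correspond to an unscaled reference;
+    // scaled references still take the warp_plane_old() path below.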
     int32_t *mat = wm->wmmat;
     int32_t alpha, beta, gamma, delta;
 
@@ -932,78 +1071,9 @@
       return;
     }
 
-    for (i = p_row; i < p_row + p_height; i += 8) {
-      for (j = p_col; j < p_col + p_width; j += 8) {
-        int32_t x4, y4, ix4, sx4, iy4, sy4;
-        if (subsampling_x)
-          x4 = ROUND_POWER_OF_TWO_SIGNED(
-              mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
-                  (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
-              1);
-        else
-          x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
-
-        if (subsampling_y)
-          y4 = ROUND_POWER_OF_TWO_SIGNED(
-              mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
-                  (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
-              1);
-        else
-          y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
-
-        ix4 = x4 >> WARPEDMODEL_PREC_BITS;
-        sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-        iy4 = y4 >> WARPEDMODEL_PREC_BITS;
-        sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
-
-        // Horizontal filter
-        for (k = -7; k < 8; ++k) {
-          int iy = iy4 + k;
-          if (iy < 0)
-            iy = 0;
-          else if (iy > height - 1)
-            iy = height - 1;
-
-          for (l = -4; l < 4; ++l) {
-            int ix = ix4 + l;
-            int sx = ROUND_POWER_OF_TWO_SIGNED(sx4 + alpha * l + beta * k,
-                                               WARPEDDIFF_PREC_BITS);
-            const int16_t *coeffs = warped_filter[sx + WARPEDPIXEL_PREC_SHIFTS];
-            int32_t sum = 0;
-            for (m = 0; m < 8; ++m) {
-              if (ix + m - 3 < 0)
-                sum += ref[iy * stride] * coeffs[m];
-              else if (ix + m - 3 > width - 1)
-                sum += ref[iy * stride + width - 1] * coeffs[m];
-              else
-                sum += ref[iy * stride + ix + m - 3] * coeffs[m];
-            }
-            tmp[(k + 7) * 8 + (l + 4)] = sum;
-          }
-        }
-
-        // Vertical filter
-        for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
-          for (l = -4; l < AOMMIN(4, p_col + p_width - j - 4); ++l) {
-            uint8_t *p =
-                &pred[(i - p_row + k + 4) * p_stride + (j - p_col + l + 4)];
-            int sy = ROUND_POWER_OF_TWO_SIGNED(sy4 + gamma * l + delta * k,
-                                               WARPEDDIFF_PREC_BITS);
-            const int16_t *coeffs = warped_filter[sy + WARPEDPIXEL_PREC_SHIFTS];
-            int32_t sum = 0;
-            for (m = 0; m < 8; ++m) {
-              sum += tmp[(k + m + 4) * 8 + (l + 4)] * coeffs[m];
-            }
-            sum = clip_pixel(
-                ROUND_POWER_OF_TWO_SIGNED(sum, 2 * WARPEDPIXEL_FILTER_BITS));
-            if (ref_frm)
-              *p = ROUND_POWER_OF_TWO_SIGNED(*p + sum, 1);
-            else
-              *p = sum;
-          }
-        }
-      }
-    }
+    warp_affine(mat, ref, width, height, stride, pred, p_col, p_row, p_width,
+                p_height, p_stride, subsampling_x, subsampling_y, ref_frm,
+                alpha, beta, gamma, delta);
   } else {
     warp_plane_old(wm, ref, width, height, stride, pred, p_col, p_row, p_width,
                    p_height, p_stride, subsampling_x, subsampling_y, x_scale,
diff --git a/av1/common/warped_motion.h b/av1/common/warped_motion.h
index 76ab53f..24eeb92 100644
--- a/av1/common/warped_motion.h
+++ b/av1/common/warped_motion.h
@@ -28,6 +28,8 @@
 #define DEFAULT_WMTYPE AFFINE
 #endif  // CONFIG_WARPED_MOTION
 
+extern const int16_t warped_filter[WARPEDPIXEL_PREC_SHIFTS * 3][8];
+
 typedef void (*ProjectPointsFunc)(int32_t *mat, int *points, int *proj,
                                   const int n, const int stride_points,
                                   const int stride_proj,
diff --git a/av1/common/x86/warp_plane_sse2.c b/av1/common/x86/warp_plane_sse2.c
new file mode 100644
index 0000000..5ecc30e
--- /dev/null
+++ b/av1/common/x86/warp_plane_sse2.c
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2016, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <emmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/warped_motion.h"
+
+/* SSE2 version of the rotzoom/affine warp filter */
+void warp_affine_sse2(int32_t *mat, uint8_t *ref, int width, int height,
+                      int stride, uint8_t *pred, int p_col, int p_row,
+                      int p_width, int p_height, int p_stride,
+                      int subsampling_x, int subsampling_y, int ref_frm,
+                      int32_t alpha, int32_t beta, int32_t gamma,
+                      int32_t delta) {
+  __m128i tmp[15];
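+  // Each element of tmp holds one row (8 x int16_t) of horizontal filter
+  // output; 15 rows are needed to feed the 8-tap vertical filter for an
+  // 8x8 output block.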
+  int i, j, k, l;
+
+  /* Note: For this code to work, the left/right frame borders need to be
+     extended by at least 13 pixels each. By the time we get here, other
+     code will have set up this border, but an explicit check is left
+     here (commented out) for debugging purposes.
+  */
+  /*for (i = 0; i < height; ++i) {
+    for (j = 0; j < 13; ++j) {
+      assert(ref[i * stride - 13 + j] == ref[i * stride]);
+      assert(ref[i * stride + width + j] == ref[i * stride + (width - 1)]);
+    }
+  }*/
+
+  for (i = p_row; i < p_row + p_height; i += 8) {
+    for (j = p_col; j < p_col + p_width; j += 8) {
+      int32_t x4, y4, ix4, sx4, iy4, sy4;
+      if (subsampling_x)
+        x4 = ROUND_POWER_OF_TWO_SIGNED(
+            mat[2] * 2 * (j + 4) + mat[3] * 2 * (i + 4) + mat[0] +
+                (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+            1);
+      else
+        x4 = mat[2] * (j + 4) + mat[3] * (i + 4) + mat[0];
+
+      if (subsampling_y)
+        y4 = ROUND_POWER_OF_TWO_SIGNED(
+            mat[4] * 2 * (j + 4) + mat[5] * 2 * (i + 4) + mat[1] +
+                (mat[4] + mat[5] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+            1);
+      else
+        y4 = mat[4] * (j + 4) + mat[5] * (i + 4) + mat[1];
+
+      ix4 = x4 >> WARPEDMODEL_PREC_BITS;
+      sx4 = x4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+      iy4 = y4 >> WARPEDMODEL_PREC_BITS;
+      sy4 = y4 & ((1 << WARPEDMODEL_PREC_BITS) - 1);
+
+      // Horizontal filter
+      for (k = -7; k < 8; ++k) {
+        int iy = iy4 + k;
+        if (iy < 0)
+          iy = 0;
+        else if (iy > height - 1)
+          iy = height - 1;
+
+        if (ix4 <= -7) {
+          // In this case, the rightmost pixel sampled is in column
+          // ix4 + 3 + 7 - 3 = ix4 + 7 <= 0, ie. the entire block
+          // will sample only from the leftmost column
+          // (once border extension is taken into account)
+          for (l = 0; l < 8; ++l) {
+            ((int16_t *)tmp)[(k + 7) * 8 + l] =
+                ref[iy * stride] * (1 << WARPEDPIXEL_FILTER_BITS);
+          }
+        } else if (ix4 >= width + 6) {
+          // In this case, the leftmost pixel sampled is in column
+          // ix4 - 4 + 0 - 3 = ix4 - 7 >= width - 1, ie. the entire block
+          // will sample only from the rightmost column
+          // (once border extension is taken into account)
+          for (l = 0; l < 8; ++l) {
+            ((int16_t *)tmp)[(k + 7) * 8 + l] =
+                ref[iy * stride + (width - 1)] * (1 << WARPEDPIXEL_FILTER_BITS);
+          }
+        } else {
+          // If we get here, then
+          // the leftmost pixel sampled is
+          // ix4 - 4 + 0 - 3 = ix4 - 7 >= -13
+          // and the rightmost pixel sampled is at most
+          // ix4 + 3 + 7 - 3 = ix4 + 7 <= width + 12
+          // So, assuming that border extension has been done, we
+          // don't need to explicitly clamp values.
+          int sx = sx4 + alpha * (-4) + beta * k;
+
+          // Load per-pixel coefficients
+          __m128i coeffs[8];
+          for (l = 0; l < 8; ++l) {
+            coeffs[l] =
+                ((__m128i *)warped_filter)[ROUND_POWER_OF_TWO_SIGNED(
+                                               sx, WARPEDDIFF_PREC_BITS) +
+                                           WARPEDPIXEL_PREC_SHIFTS];
+            sx += alpha;
+          }
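+          // Note: indexing warped_filter as an array of __m128i relies on the
+          // table being 16-byte aligned (each 8-tap row is exactly 16 bytes).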
+
+          // Currently the coefficients are stored in the order
+          // 0 ... 7 for each pixel separately.
+          // Permute the coefficients into 8 registers:
+          // coeffs (0,1), (2,3), (4,5), (6,7) for
+          // i) the odd-index pixels, and ii) the even-index pixels
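+          // After this shuffle, coeff_a..coeff_h each hold one adjacent pair
+          // of taps for four same-parity output pixels, which is the operand
+          // layout _mm_madd_epi16 expects.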
+          {
+            // 0 1 0 1 2 3 2 3 for 0, 2
+            __m128i tmp_0 = _mm_unpacklo_epi32(coeffs[0], coeffs[2]);
+            // 0 1 0 1 2 3 2 3 for 1, 3
+            __m128i tmp_1 = _mm_unpacklo_epi32(coeffs[1], coeffs[3]);
+            // 0 1 0 1 2 3 2 3 for 4, 6
+            __m128i tmp_2 = _mm_unpacklo_epi32(coeffs[4], coeffs[6]);
+            // 0 1 0 1 2 3 2 3 for 5, 7
+            __m128i tmp_3 = _mm_unpacklo_epi32(coeffs[5], coeffs[7]);
+            // 4 5 4 5 6 7 6 7 for 0, 2
+            __m128i tmp_4 = _mm_unpackhi_epi32(coeffs[0], coeffs[2]);
+            // 4 5 4 5 6 7 6 7 for 1, 3
+            __m128i tmp_5 = _mm_unpackhi_epi32(coeffs[1], coeffs[3]);
+            // 4 5 4 5 6 7 6 7 for 4, 6
+            __m128i tmp_6 = _mm_unpackhi_epi32(coeffs[4], coeffs[6]);
+            // 4 5 4 5 6 7 6 7 for 5, 7
+            __m128i tmp_7 = _mm_unpackhi_epi32(coeffs[5], coeffs[7]);
+
+            // 0 1 0 1 0 1 0 1 for 0, 2, 4, 6
+            __m128i coeff_a = _mm_unpacklo_epi64(tmp_0, tmp_2);
+            // 0 1 0 1 0 1 0 1 for 1, 3, 5, 7
+            __m128i coeff_b = _mm_unpacklo_epi64(tmp_1, tmp_3);
+            // 2 3 2 3 2 3 2 3 for 0, 2, 4, 6
+            __m128i coeff_c = _mm_unpackhi_epi64(tmp_0, tmp_2);
+            // 2 3 2 3 2 3 2 3 for 1, 3, 5, 7
+            __m128i coeff_d = _mm_unpackhi_epi64(tmp_1, tmp_3);
+            // 4 5 4 5 4 5 4 5 for 0, 2, 4, 6
+            __m128i coeff_e = _mm_unpacklo_epi64(tmp_4, tmp_6);
+            // 4 5 4 5 4 5 4 5 for 1, 3, 5, 7
+            __m128i coeff_f = _mm_unpacklo_epi64(tmp_5, tmp_7);
+            // 6 7 6 7 6 7 6 7 for 0, 2, 4, 6
+            __m128i coeff_g = _mm_unpackhi_epi64(tmp_4, tmp_6);
+            // 6 7 6 7 6 7 6 7 for 1, 3, 5, 7
+            __m128i coeff_h = _mm_unpackhi_epi64(tmp_5, tmp_7);
+
+            // Load source pixels and widen to 16 bits. We want the pixels
+            // ref[iy * stride + ix4 - 7] ... ref[iy * stride + ix4 + 7]
+            // inclusive.
+            __m128i zero = _mm_setzero_si128();
+            __m128i src =
+                _mm_loadu_si128((__m128i *)(ref + iy * stride + ix4 - 7));
+
+            // Calculate filtered results
+            __m128i src_a = _mm_unpacklo_epi8(src, zero);
+            __m128i res_a = _mm_madd_epi16(src_a, coeff_a);
+            __m128i src_b = _mm_unpacklo_epi8(_mm_srli_si128(src, 1), zero);
+            __m128i res_b = _mm_madd_epi16(src_b, coeff_b);
+            __m128i src_c = _mm_unpacklo_epi8(_mm_srli_si128(src, 2), zero);
+            __m128i res_c = _mm_madd_epi16(src_c, coeff_c);
+            __m128i src_d = _mm_unpacklo_epi8(_mm_srli_si128(src, 3), zero);
+            __m128i res_d = _mm_madd_epi16(src_d, coeff_d);
+            __m128i src_e = _mm_unpacklo_epi8(_mm_srli_si128(src, 4), zero);
+            __m128i res_e = _mm_madd_epi16(src_e, coeff_e);
+            __m128i src_f = _mm_unpacklo_epi8(_mm_srli_si128(src, 5), zero);
+            __m128i res_f = _mm_madd_epi16(src_f, coeff_f);
+            __m128i src_g = _mm_unpacklo_epi8(_mm_srli_si128(src, 6), zero);
+            __m128i res_g = _mm_madd_epi16(src_g, coeff_g);
+            __m128i src_h = _mm_unpacklo_epi8(_mm_srli_si128(src, 7), zero);
+            __m128i res_h = _mm_madd_epi16(src_h, coeff_h);
+
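+            // Each res_* above holds, for four same-parity output pixels, the
+            // partial sum from one pair of adjacent taps; summing four of them
+            // gives the full 8-tap horizontal result for those pixels.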
+            __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_a, res_e),
+                                             _mm_add_epi32(res_c, res_g));
+            __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_b, res_f),
+                                            _mm_add_epi32(res_d, res_h));
+
+            // Combine results into one register.
+            // We store the columns in the order 0, 2, 4, 6, 1, 3, 5, 7
+            // as this order helps with the vertical filter.
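+            // (The vertical filter can then split even/odd columns and pair
+            // consecutive rows with a single unpacklo/unpackhi_epi16.)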
+            tmp[k + 7] = _mm_packs_epi32(res_even, res_odd);
+          }
+        }
+      }
+
+      // Vertical filter
+      for (k = -4; k < AOMMIN(4, p_row + p_height - i - 4); ++k) {
+        int sy = sy4 + gamma * (-4) + delta * k;
+
+        // Load per-pixel coefficients
+        __m128i coeffs[8];
+        for (l = 0; l < 8; ++l) {
+          coeffs[l] = ((__m128i *)warped_filter)[ROUND_POWER_OF_TWO_SIGNED(
+                                                     sy, WARPEDDIFF_PREC_BITS) +
+                                                 WARPEDPIXEL_PREC_SHIFTS];
+          sy += gamma;
+        }
+
+        {
+          // Rearrange coefficients in the same way as for the horizontal filter
+          __m128i tmp_0 = _mm_unpacklo_epi32(coeffs[0], coeffs[2]);
+          __m128i tmp_1 = _mm_unpacklo_epi32(coeffs[1], coeffs[3]);
+          __m128i tmp_2 = _mm_unpacklo_epi32(coeffs[4], coeffs[6]);
+          __m128i tmp_3 = _mm_unpacklo_epi32(coeffs[5], coeffs[7]);
+          __m128i tmp_4 = _mm_unpackhi_epi32(coeffs[0], coeffs[2]);
+          __m128i tmp_5 = _mm_unpackhi_epi32(coeffs[1], coeffs[3]);
+          __m128i tmp_6 = _mm_unpackhi_epi32(coeffs[4], coeffs[6]);
+          __m128i tmp_7 = _mm_unpackhi_epi32(coeffs[5], coeffs[7]);
+
+          __m128i coeff_a = _mm_unpacklo_epi64(tmp_0, tmp_2);
+          __m128i coeff_b = _mm_unpacklo_epi64(tmp_1, tmp_3);
+          __m128i coeff_c = _mm_unpackhi_epi64(tmp_0, tmp_2);
+          __m128i coeff_d = _mm_unpackhi_epi64(tmp_1, tmp_3);
+          __m128i coeff_e = _mm_unpacklo_epi64(tmp_4, tmp_6);
+          __m128i coeff_f = _mm_unpacklo_epi64(tmp_5, tmp_7);
+          __m128i coeff_g = _mm_unpackhi_epi64(tmp_4, tmp_6);
+          __m128i coeff_h = _mm_unpackhi_epi64(tmp_5, tmp_7);
+
+          // Load from tmp and rearrange pairs of consecutive rows into the
+          // column order 0 0 2 2 4 4 6 6; 1 1 3 3 5 5 7 7
+          __m128i *src = tmp + (k + 4);
+          __m128i src_a = _mm_unpacklo_epi16(src[0], src[1]);
+          __m128i src_b = _mm_unpackhi_epi16(src[0], src[1]);
+          __m128i src_c = _mm_unpacklo_epi16(src[2], src[3]);
+          __m128i src_d = _mm_unpackhi_epi16(src[2], src[3]);
+          __m128i src_e = _mm_unpacklo_epi16(src[4], src[5]);
+          __m128i src_f = _mm_unpackhi_epi16(src[4], src[5]);
+          __m128i src_g = _mm_unpacklo_epi16(src[6], src[7]);
+          __m128i src_h = _mm_unpackhi_epi16(src[6], src[7]);
+
+          // Calculate the filtered pixels
+          __m128i res_a = _mm_madd_epi16(src_a, coeff_a);
+          __m128i res_b = _mm_madd_epi16(src_b, coeff_b);
+          __m128i res_c = _mm_madd_epi16(src_c, coeff_c);
+          __m128i res_d = _mm_madd_epi16(src_d, coeff_d);
+          __m128i res_e = _mm_madd_epi16(src_e, coeff_e);
+          __m128i res_f = _mm_madd_epi16(src_f, coeff_f);
+          __m128i res_g = _mm_madd_epi16(src_g, coeff_g);
+          __m128i res_h = _mm_madd_epi16(src_h, coeff_h);
+
+          __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_a, res_c),
+                                           _mm_add_epi32(res_e, res_g));
+          __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_b, res_d),
+                                          _mm_add_epi32(res_f, res_h));
+
+          // Round and pack down to pixels
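+          // (Adding round_const before the arithmetic shift implements
+          // ROUND_POWER_OF_TWO(sum, 2 * WARPEDPIXEL_FILTER_BITS); the
+          // saturating packs below play the role of clip_pixel() in C.)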
+          __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
+          __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
+
+          __m128i round_const =
+              _mm_set1_epi32((1 << (2 * WARPEDPIXEL_FILTER_BITS)) >> 1);
+
+          __m128i res_lo_round = _mm_srai_epi32(
+              _mm_add_epi32(res_lo, round_const), 2 * WARPEDPIXEL_FILTER_BITS);
+          __m128i res_hi_round = _mm_srai_epi32(
+              _mm_add_epi32(res_hi, round_const), 2 * WARPEDPIXEL_FILTER_BITS);
+
+          __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
+          __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);
+
+          // Store, blending with 'pred' if needed
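+          // (_mm_avg_epu8 computes (a + b + 1) >> 1, matching the
+          // ROUND_POWER_OF_TWO(*p + sum, 1) blend in the C version.)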
+          __m128i *p =
+              (__m128i *)&pred[(i - p_row + k + 4) * p_stride + (j - p_col)];
+
+          if (ref_frm) {
+            __m128i orig = _mm_loadl_epi64(p);
+            _mm_storel_epi64(p, _mm_avg_epu8(res_8bit, orig));
+          } else {
+            _mm_storel_epi64(p, res_8bit);
+          }
+        }
+      }
+    }
+  }
+}