SIMD implementation of horz superres

SSE4.1 implementations of av1_convolve_horiz_rs and
av1_highbd_convolve_horiz_rs have been added, along
with the corresponding speed and correctness tests.

The interp_taps argument was redundant (the callers in
resize.c always passed UPSCALE_NORMATIVE_TAPS), so it has
been removed in favour of using the macro directly.

Code associated with values of UPSCALE_NORMATIVE_TAPS
that are no longer used has been removed.

Change-Id: Ie74d8ca479a70c8d473ac12883cfe4f10b37a66d
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 5066ce2..c26aa3e 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -308,6 +308,12 @@
       ${AOM_AV1_COMMON_INTRIN_SSE2}
       "${AOM_ROOT}/av1/common/x86/convolve_sse2.c")
 
+if (CONFIG_HORZONLY_FRAME_SUPERRES)
+  set(AOM_AV1_COMMON_INTRIN_SSE4_1
+      ${AOM_AV1_COMMON_INTRIN_SSE4_1}
+      "${AOM_ROOT}/av1/common/x86/av1_convolve_horiz_rs_sse4.c")
+endif ()
+
 set(AOM_AV1_COMMON_INTRIN_AVX2
     ${AOM_AV1_COMMON_INTRIN_AVX2}
     "${AOM_ROOT}/av1/common/x86/convolve_avx2.c")
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 7f40213..edd5fcb 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -70,9 +70,11 @@
 specialize qw/av1_convolve_vert ssse3/;
 
 if (aom_config("CONFIG_HORZONLY_FRAME_SUPERRES") eq "yes") {
-  add_proto qw/void av1_convolve_horiz_rs/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int interp_taps, const int x0_qn, const int x_step_qn";
+  add_proto qw/void av1_convolve_horiz_rs/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, const int x0_qn, const int x_step_qn";
+  specialize qw/av1_convolve_horiz_rs sse4_1/;
 
-    add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, int interp_taps, const int x0_qn, const int x_step_qn, int bd";
+  add_proto qw/void av1_highbd_convolve_horiz_rs/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const int16_t *x_filters, const int x0_qn, const int x_step_qn, int bd";
+  specialize qw/av1_highbd_convolve_horiz_rs sse4_1/;
 }
 
   add_proto qw/void av1_highbd_convolve_horiz/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index 00c87c9..7de1074 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -18,6 +18,7 @@
 #include "av1/common/convolve.h"
 #include "av1/common/filter.h"
 #include "av1/common/onyxc_int.h"
+#include "av1/common/resize.h"
 #include "aom_dsp/aom_dsp_common.h"
 #include "aom_ports/mem.h"
 
@@ -32,9 +33,9 @@
 
 void av1_convolve_horiz_rs_c(const uint8_t *src, int src_stride, uint8_t *dst,
                              int dst_stride, int w, int h,
-                             const int16_t *x_filters, int interp_taps,
-                             const int x0_qn, const int x_step_qn) {
-  src -= interp_taps / 2 - 1;
+                             const int16_t *x_filters, const int x0_qn,
+                             const int x_step_qn) {
+  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
   for (int y = 0; y < h; ++y) {
     int x_qn = x0_qn;
     for (int x = 0; x < w; ++x) {
@@ -42,9 +43,11 @@
       const int x_filter_idx =
           (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
       assert(x_filter_idx <= RS_SUBPEL_MASK);
-      const int16_t *const x_filter = &x_filters[x_filter_idx * interp_taps];
+      const int16_t *const x_filter =
+          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
       int sum = 0;
-      for (int k = 0; k < interp_taps; ++k) sum += src_x[k] * x_filter[k];
+      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
+        sum += src_x[k] * x_filter[k];
       dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
       x_qn += x_step_qn;
     }
@@ -55,10 +58,9 @@
 // TODO(yaowu: remove "const" from pass-by-value params in this and other funcs)
 void av1_highbd_convolve_horiz_rs_c(const uint16_t *src, int src_stride,
                                     uint16_t *dst, int dst_stride, int w, int h,
-                                    const int16_t *x_filters, int interp_taps,
-                                    const int x0_qn, const int x_step_qn,
-                                    int bd) {
-  src -= interp_taps / 2 - 1;
+                                    const int16_t *x_filters, const int x0_qn,
+                                    const int x_step_qn, int bd) {
+  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
   for (int y = 0; y < h; ++y) {
     int x_qn = x0_qn;
     for (int x = 0; x < w; ++x) {
@@ -66,9 +68,11 @@
       const int x_filter_idx =
           (x_qn & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
       assert(x_filter_idx <= RS_SUBPEL_MASK);
-      const int16_t *const x_filter = &x_filters[x_filter_idx * interp_taps];
+      const int16_t *const x_filter =
+          &x_filters[x_filter_idx * UPSCALE_NORMATIVE_TAPS];
       int sum = 0;
-      for (int k = 0; k < interp_taps; ++k) sum += src_x[k] * x_filter[k];
+      for (int k = 0; k < UPSCALE_NORMATIVE_TAPS; ++k)
+        sum += src_x[k] * x_filter[k];
       dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
       x_qn += x_step_qn;
     }
diff --git a/av1/common/resize.c b/av1/common/resize.c
index 4b2a4bd..0bbecbc 100644
--- a/av1/common/resize.c
+++ b/av1/common/resize.c
@@ -208,75 +208,7 @@
 #if CONFIG_HORZONLY_FRAME_SUPERRES
 const int16_t av1_resize_filter_normative[(
     1 << RS_SUBPEL_BITS)][UPSCALE_NORMATIVE_TAPS] = {
-#if UPSCALE_NORMATIVE_TAPS == 2
-  { 128, 0 },  { 126, 2 },  { 124, 4 },  { 122, 6 },  { 120, 8 },  { 118, 10 },
-  { 116, 12 }, { 114, 14 }, { 112, 16 }, { 110, 18 }, { 108, 20 }, { 106, 22 },
-  { 104, 24 }, { 102, 26 }, { 100, 28 }, { 98, 30 },  { 96, 32 },  { 94, 34 },
-  { 92, 36 },  { 90, 38 },  { 88, 40 },  { 86, 42 },  { 84, 44 },  { 82, 46 },
-  { 80, 48 },  { 78, 50 },  { 76, 52 },  { 74, 54 },  { 72, 56 },  { 70, 58 },
-  { 68, 60 },  { 66, 62 },  { 64, 64 },  { 62, 66 },  { 60, 68 },  { 58, 70 },
-  { 56, 72 },  { 54, 74 },  { 52, 76 },  { 50, 78 },  { 48, 80 },  { 46, 82 },
-  { 44, 84 },  { 42, 86 },  { 40, 88 },  { 38, 90 },  { 36, 92 },  { 34, 94 },
-  { 32, 96 },  { 30, 98 },  { 28, 100 }, { 26, 102 }, { 24, 104 }, { 22, 106 },
-  { 20, 108 }, { 18, 110 }, { 16, 112 }, { 14, 114 }, { 12, 116 }, { 10, 118 },
-  { 8, 120 },  { 6, 122 },  { 4, 124 },  { 2, 126 },
-#elif UPSCALE_NORMATIVE_TAPS == 4
-  { 0, 128, 0, 0 },      { -1, 128, 2, -1 },    { -2, 127, 4, -1 },
-  { -3, 126, 7, -2 },    { -4, 125, 9, -2 },    { -5, 125, 11, -3 },
-  { -6, 124, 13, -3 },   { -7, 123, 16, -4 },   { -7, 122, 18, -5 },
-  { -8, 121, 20, -5 },   { -9, 120, 23, -6 },   { -9, 118, 25, -6 },
-  { -10, 117, 28, -7 },  { -11, 116, 30, -7 },  { -11, 114, 33, -8 },
-  { -12, 113, 35, -8 },  { -12, 111, 38, -9 },  { -13, 109, 41, -9 },
-  { -13, 108, 43, -10 }, { -13, 106, 45, -10 }, { -13, 104, 48, -11 },
-  { -14, 102, 51, -11 }, { -14, 100, 53, -11 }, { -14, 98, 56, -12 },
-  { -14, 96, 58, -12 },  { -14, 94, 61, -13 },  { -15, 92, 64, -13 },
-  { -15, 90, 66, -13 },  { -15, 87, 69, -13 },  { -14, 85, 71, -14 },
-  { -14, 83, 73, -14 },  { -14, 80, 76, -14 },  { -14, 78, 78, -14 },
-  { -14, 76, 80, -14 },  { -14, 73, 83, -14 },  { -14, 71, 85, -14 },
-  { -13, 69, 87, -15 },  { -13, 66, 90, -15 },  { -13, 64, 92, -15 },
-  { -13, 61, 94, -14 },  { -12, 58, 96, -14 },  { -12, 56, 98, -14 },
-  { -11, 53, 100, -14 }, { -11, 51, 102, -14 }, { -11, 48, 104, -13 },
-  { -10, 45, 106, -13 }, { -10, 43, 108, -13 }, { -9, 41, 109, -13 },
-  { -9, 38, 111, -12 },  { -8, 35, 113, -12 },  { -8, 33, 114, -11 },
-  { -7, 30, 116, -11 },  { -7, 28, 117, -10 },  { -6, 25, 118, -9 },
-  { -6, 23, 120, -9 },   { -5, 20, 121, -8 },   { -5, 18, 122, -7 },
-  { -4, 16, 123, -7 },   { -3, 13, 124, -6 },   { -3, 11, 125, -5 },
-  { -2, 9, 125, -4 },    { -2, 7, 126, -3 },    { -1, 4, 127, -2 },
-  { -1, 2, 128, -1 },
-#elif UPSCALE_NORMATIVE_TAPS == 6
-  { 0, 0, 128, 0, 0, 0 },      { 0, -1, 128, 2, -1, 0 },
-  { 1, -3, 127, 4, -2, 1 },    { 1, -4, 127, 6, -3, 1 },
-  { 2, -6, 126, 8, -3, 1 },    { 2, -7, 125, 11, -4, 1 },
-  { 2, -9, 125, 13, -5, 2 },   { 3, -10, 124, 15, -6, 2 },
-  { 3, -11, 123, 18, -7, 2 },  { 3, -12, 122, 20, -8, 3 },
-  { 4, -13, 121, 22, -9, 3 },  { 4, -14, 119, 25, -9, 3 },
-  { 4, -15, 118, 27, -10, 4 }, { 4, -16, 117, 30, -11, 4 },
-  { 5, -17, 116, 32, -12, 4 }, { 5, -17, 114, 35, -13, 4 },
-  { 5, -18, 112, 37, -13, 5 }, { 5, -19, 111, 40, -14, 5 },
-  { 6, -19, 109, 42, -15, 5 }, { 6, -20, 107, 45, -15, 5 },
-  { 6, -20, 105, 48, -16, 5 }, { 6, -21, 103, 51, -17, 6 },
-  { 6, -21, 101, 53, -17, 6 }, { 6, -21, 99, 56, -18, 6 },
-  { 7, -22, 97, 58, -18, 6 },  { 7, -22, 95, 61, -19, 6 },
-  { 7, -22, 93, 63, -19, 6 },  { 7, -22, 91, 66, -20, 6 },
-  { 7, -22, 88, 69, -20, 6 },  { 7, -22, 86, 71, -21, 7 },
-  { 7, -22, 83, 74, -21, 7 },  { 7, -22, 81, 76, -21, 7 },
-  { 7, -22, 79, 79, -22, 7 },  { 7, -21, 76, 81, -22, 7 },
-  { 7, -21, 74, 83, -22, 7 },  { 7, -21, 71, 86, -22, 7 },
-  { 6, -20, 69, 88, -22, 7 },  { 6, -20, 66, 91, -22, 7 },
-  { 6, -19, 63, 93, -22, 7 },  { 6, -19, 61, 95, -22, 7 },
-  { 6, -18, 58, 97, -22, 7 },  { 6, -18, 56, 99, -21, 6 },
-  { 6, -17, 53, 101, -21, 6 }, { 6, -17, 51, 103, -21, 6 },
-  { 5, -16, 48, 105, -20, 6 }, { 5, -15, 45, 107, -20, 6 },
-  { 5, -15, 42, 109, -19, 6 }, { 5, -14, 40, 111, -19, 5 },
-  { 5, -13, 37, 112, -18, 5 }, { 4, -13, 35, 114, -17, 5 },
-  { 4, -12, 32, 116, -17, 5 }, { 4, -11, 30, 117, -16, 4 },
-  { 4, -10, 27, 118, -15, 4 }, { 3, -9, 25, 119, -14, 4 },
-  { 3, -9, 22, 121, -13, 4 },  { 3, -8, 20, 122, -12, 3 },
-  { 2, -7, 18, 123, -11, 3 },  { 2, -6, 15, 124, -10, 3 },
-  { 2, -5, 13, 125, -9, 2 },   { 1, -4, 11, 125, -7, 2 },
-  { 1, -3, 8, 126, -6, 2 },    { 1, -3, 6, 127, -4, 1 },
-  { 1, -2, 4, 127, -3, 1 },    { 0, -1, 2, 128, -1, 0 },
-#elif UPSCALE_NORMATIVE_TAPS == 8
+#if UPSCALE_NORMATIVE_TAPS == 8
   { 0, 0, 0, 128, 0, 0, 0, 0 },        { 0, 0, -1, 128, 2, -1, 0, 0 },
   { 0, 1, -3, 127, 4, -2, 1, 0 },      { 0, 1, -4, 127, 6, -3, 1, 0 },
   { 0, 2, -6, 126, 8, -3, 1, 0 },      { 0, 2, -7, 125, 11, -4, 1, 0 },
@@ -311,7 +243,7 @@
   { 0, 1, -2, 4, 127, -3, 1, 0 },      { 0, 0, -1, 2, 128, -1, 0, 0 },
 #else
 #error "Invalid value of UPSCALE_NORMATIVE_TAPS"
-#endif  // UPSCALE_NORMATIVE_TAPS == 2
+#endif  // UPSCALE_NORMATIVE_TAPS == 8
 };
 #endif  // CONFIG_HORZONLY_FRAME_SUPERRES
 
@@ -699,8 +631,8 @@
   }
 
   av1_convolve_horiz_rs(input - 1, in_stride, output, out_stride, width2,
-                        height2, &av1_resize_filter_normative[0][0],
-                        UPSCALE_NORMATIVE_TAPS, x0_qn, x_step_qn);
+                        height2, &av1_resize_filter_normative[0][0], x0_qn,
+                        x_step_qn);
 
   // Restore the left/right border pixels
   if (pad_left) {
@@ -1053,7 +985,7 @@
   av1_highbd_convolve_horiz_rs(CONVERT_TO_SHORTPTR(input - 1), in_stride,
                                CONVERT_TO_SHORTPTR(output), out_stride, width2,
                                height2, &av1_resize_filter_normative[0][0],
-                               UPSCALE_NORMATIVE_TAPS, x0_qn, x_step_qn, bd);
+                               x0_qn, x_step_qn, bd);
 
   // Restore the left/right border pixels
   if (pad_left) {
diff --git a/av1/common/x86/av1_convolve_horiz_rs_sse4.c b/av1/common/x86/av1_convolve_horiz_rs_sse4.c
new file mode 100644
index 0000000..82b65ed
--- /dev/null
+++ b/av1/common/x86/av1_convolve_horiz_rs_sse4.c
@@ -0,0 +1,228 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./av1_rtcd.h"
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "aom_dsp/x86/synonyms.h"
+
+// Note: Output is written 4 pixels at a time, so if w is not a multiple of 4
+// this function, unlike the C version, overwrites up to 3 pixels of padding to
+// the right of each row. This padding appears to be trashed anyway, so this
+// should not affect the running of the decoder.
+void av1_convolve_horiz_rs_sse4_1(const uint8_t *src, int src_stride,
+                                  uint8_t *dst, int dst_stride, int w, int h,
+                                  const int16_t *x_filters, const int x0_qn,
+                                  const int x_step_qn) {
+  assert(UPSCALE_NORMATIVE_TAPS == 8);
+
+  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+  const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+  const __m128i zero = _mm_setzero_si128();
+
+  const uint8_t *src_y;
+  uint8_t *dst_y;
+  int x_qn = x0_qn;
+  for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
+    const int x_filter_idx0 =
+        ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int x_filter_idx1 =
+        ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int x_filter_idx2 =
+        ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int x_filter_idx3 =
+        ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+    assert(x_filter_idx0 <= RS_SUBPEL_MASK);
+    assert(x_filter_idx1 <= RS_SUBPEL_MASK);
+    assert(x_filter_idx2 <= RS_SUBPEL_MASK);
+    assert(x_filter_idx3 <= RS_SUBPEL_MASK);
+
+    const int16_t *const x_filter0 =
+        &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
+    const int16_t *const x_filter1 =
+        &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
+    const int16_t *const x_filter2 =
+        &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
+    const int16_t *const x_filter3 =
+        &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
+
+    const __m128i fil0_16 = xx_loadu_128(x_filter0);
+    const __m128i fil1_16 = xx_loadu_128(x_filter1);
+    const __m128i fil2_16 = xx_loadu_128(x_filter2);
+    const __m128i fil3_16 = xx_loadu_128(x_filter3);
+
+    src_y = src;
+    dst_y = dst;
+    for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
+      const uint8_t *const src_x0 =
+          &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+      const uint8_t *const src_x1 =
+          &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+      const uint8_t *const src_x2 =
+          &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+      const uint8_t *const src_x3 =
+          &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+      // Load up the source data. This is 8-bit input data, so each load
+      // gets 8 pixels.
+      const __m128i src0_8 = xx_loadl_64(src_x0);
+      const __m128i src1_8 = xx_loadl_64(src_x1);
+      const __m128i src2_8 = xx_loadl_64(src_x2);
+      const __m128i src3_8 = xx_loadl_64(src_x3);
+
+      // Now zero-extend up to 16-bit precision, i.e.
+      // [ 00 00 00 00 hg fe dc ba ] -> [ 0h 0g 0f 0e 0d 0c 0b 0a ]
+      const __m128i src0_16 = _mm_cvtepu8_epi16(src0_8);
+      const __m128i src1_16 = _mm_cvtepu8_epi16(src1_8);
+      const __m128i src2_16 = _mm_cvtepu8_epi16(src2_8);
+      const __m128i src3_16 = _mm_cvtepu8_epi16(src3_8);
+
+      // Multiply by filter coefficients (results in a 32-bit value),
+      // and add adjacent pairs, i.e.
+      // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
+      // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
+      const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
+      const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
+      const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
+      const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
+
+      // Reduce horizontally and add, i.e.
+      // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
+      const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
+      const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
+
+      const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
+
+      // Divide down by (1 << FILTER_BITS), rounding to nearest.
+      const __m128i shifted_32 =
+          _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
+
+      // Pack 32-bit values into 16-bit values, i.e.
+      // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
+      const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
+
+      // Pack 16-bit values into 8-bit values, i.e.
+      // ([ 0 0 0 0 D C B A ], [ 0 0 0 0 0 0 0 0 ])
+      // -> [ 0 0 0 0 0 0 DC BA ]
+      const __m128i shifted_8 = _mm_packus_epi16(shifted_16, zero);
+
+      // Write to the output
+      xx_storel_32(&dst_y[x], shifted_8);
+    }
+  }
+}
+
+// Note: Output is written 4 pixels at a time, so if w is not a multiple of 4
+// this function, unlike the C version, overwrites up to 3 pixels of padding to
+// the right of each row. This padding appears to be trashed anyway, so this
+// should not affect the running of the decoder.
+void av1_highbd_convolve_horiz_rs_sse4_1(const uint16_t *src, int src_stride,
+                                         uint16_t *dst, int dst_stride, int w,
+                                         int h, const int16_t *x_filters,
+                                         const int x0_qn, const int x_step_qn,
+                                         int bd) {
+  assert(UPSCALE_NORMATIVE_TAPS == 8);
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  src -= UPSCALE_NORMATIVE_TAPS / 2 - 1;
+
+  const __m128i round_add = _mm_set1_epi32((1 << FILTER_BITS) >> 1);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i clip_maximum = _mm_set1_epi16((1 << bd) - 1);
+
+  const uint16_t *src_y;
+  uint16_t *dst_y;
+  int x_qn = x0_qn;
+  for (int x = 0; x < w; x += 4, x_qn += 4 * x_step_qn) {
+    const int x_filter_idx0 =
+        ((x_qn + 0 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int x_filter_idx1 =
+        ((x_qn + 1 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int x_filter_idx2 =
+        ((x_qn + 2 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+    const int x_filter_idx3 =
+        ((x_qn + 3 * x_step_qn) & RS_SCALE_SUBPEL_MASK) >> RS_SCALE_EXTRA_BITS;
+
+    assert(x_filter_idx0 <= RS_SUBPEL_MASK);
+    assert(x_filter_idx1 <= RS_SUBPEL_MASK);
+    assert(x_filter_idx2 <= RS_SUBPEL_MASK);
+    assert(x_filter_idx3 <= RS_SUBPEL_MASK);
+
+    const int16_t *const x_filter0 =
+        &x_filters[x_filter_idx0 * UPSCALE_NORMATIVE_TAPS];
+    const int16_t *const x_filter1 =
+        &x_filters[x_filter_idx1 * UPSCALE_NORMATIVE_TAPS];
+    const int16_t *const x_filter2 =
+        &x_filters[x_filter_idx2 * UPSCALE_NORMATIVE_TAPS];
+    const int16_t *const x_filter3 =
+        &x_filters[x_filter_idx3 * UPSCALE_NORMATIVE_TAPS];
+
+    const __m128i fil0_16 = xx_loadu_128(x_filter0);
+    const __m128i fil1_16 = xx_loadu_128(x_filter1);
+    const __m128i fil2_16 = xx_loadu_128(x_filter2);
+    const __m128i fil3_16 = xx_loadu_128(x_filter3);
+
+    src_y = src;
+    dst_y = dst;
+    for (int y = 0; y < h; y++, src_y += src_stride, dst_y += dst_stride) {
+      const uint16_t *const src_x0 =
+          &src_y[(x_qn + 0 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+      const uint16_t *const src_x1 =
+          &src_y[(x_qn + 1 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+      const uint16_t *const src_x2 =
+          &src_y[(x_qn + 2 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+      const uint16_t *const src_x3 =
+          &src_y[(x_qn + 3 * x_step_qn) >> RS_SCALE_SUBPEL_BITS];
+
+      // Load up the source data. This is 16-bit input data, so each load
+      // gets 8 pixels.
+      const __m128i src0_16 = xx_loadu_128(src_x0);
+      const __m128i src1_16 = xx_loadu_128(src_x1);
+      const __m128i src2_16 = xx_loadu_128(src_x2);
+      const __m128i src3_16 = xx_loadu_128(src_x3);
+
+      // Multiply by filter coefficients (results in a 32-bit value),
+      // and add adjacent pairs, i.e.
+      // ([ s7 s6 s5 s4 s3 s2 s1 s0], [ f7 f6 f5 f4 f3 f2 f1 f0 ])
+      // -> [ {s7*f7+s6*f6} {s5*f5+s4*f4} {s3*f3+s2*f2} {s1*f1+s0*f0} ]
+      const __m128i conv0_32 = _mm_madd_epi16(src0_16, fil0_16);
+      const __m128i conv1_32 = _mm_madd_epi16(src1_16, fil1_16);
+      const __m128i conv2_32 = _mm_madd_epi16(src2_16, fil2_16);
+      const __m128i conv3_32 = _mm_madd_epi16(src3_16, fil3_16);
+
+      // Reduce horizontally and add, i.e.
+      // ([ D C B A ], [ S R Q P ]) -> [ S+R Q+P D+C B+A ]
+      const __m128i conv01_32 = _mm_hadd_epi32(conv0_32, conv1_32);
+      const __m128i conv23_32 = _mm_hadd_epi32(conv2_32, conv3_32);
+
+      const __m128i conv0123_32 = _mm_hadd_epi32(conv01_32, conv23_32);
+
+      // Divide down by (1 << FILTER_BITS), rounding to nearest.
+      const __m128i shifted_32 =
+          _mm_srai_epi32(_mm_add_epi32(conv0123_32, round_add), FILTER_BITS);
+
+      // Pack 32-bit values into 16-bit values, i.e.
+      // ([ D C B A ], [ 0 0 0 0 ]) -> [ 0 0 0 0 D C B A ]
+      const __m128i shifted_16 = _mm_packus_epi32(shifted_32, zero);
+
+      // Clip the values at (1 << bd) - 1
+      const __m128i clipped_16 = _mm_min_epi16(shifted_16, clip_maximum);
+
+      // Write to the output
+      xx_storel_64(&dst_y[x], clipped_16);
+    }
+  }
+}
diff --git a/test/av1_horz_only_frame_superres_test.cc b/test/av1_horz_only_frame_superres_test.cc
new file mode 100644
index 0000000..7ef2a27
--- /dev/null
+++ b/test/av1_horz_only_frame_superres_test.cc
@@ -0,0 +1,361 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <vector>
+
+#include "third_party/googletest/src/googletest/include/gtest/gtest.h"
+
+#include "./av1_rtcd.h"
+#include "aom_ports/aom_timer.h"
+#include "av1/common/convolve.h"
+#include "av1/common/resize.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+const int kTestIters = 10;
+const int kPerfIters = 1000;
+
+const int kVPad = 32;
+const int kHPad = 32;
+
+using std::tr1::tuple;
+using std::tr1::make_tuple;
+using libaom_test::ACMRandom;
+
+template <typename Pixel>
+class TestImage {
+ public:
+  TestImage(int w_src, int h, int superres_denom, int x0, int bd)
+      : w_src_(w_src), h_(h), superres_denom_(superres_denom), x0_(x0),
+        bd_(bd) {
+    assert(bd < 16);
+    assert(bd <= 8 * static_cast<int>(sizeof(Pixel)));
+    assert(9 <= superres_denom && superres_denom <= 16);
+    assert(SCALE_NUMERATOR == 8);
+    assert(0 <= x0_ && x0_ <= RS_SCALE_SUBPEL_MASK);
+
+    w_dst_ = w_src_;
+    av1_calculate_unscaled_superres_size(&w_dst_, NULL, superres_denom);
+
+    src_stride_ = ALIGN_POWER_OF_TWO(w_src_ + 2 * kHPad, 4);
+    dst_stride_ = ALIGN_POWER_OF_TWO(w_dst_ + 2 * kHPad, 4);
+
+    // Allocate image data
+    src_data_.resize(2 * src_block_size());
+    dst_data_.resize(2 * dst_block_size());
+  }
+
+  void Initialize(ACMRandom *rnd);
+  void Check() const;
+
+  int src_stride() const { return src_stride_; }
+  int dst_stride() const { return dst_stride_; }
+
+  int src_block_size() const { return (h_ + 2 * kVPad) * src_stride(); }
+  int dst_block_size() const { return (h_ + 2 * kVPad) * dst_stride(); }
+
+  int src_width() const { return w_src_; }
+  int dst_width() const { return w_dst_; }
+  int height() const { return h_; }
+  int x0() const { return x0_; }
+
+  const Pixel *GetSrcData(bool ref, bool borders) const {
+    const Pixel *block = &src_data_[ref ? 0 : src_block_size()];
+    return borders ? block : block + kHPad + src_stride_ * kVPad;
+  }
+
+  Pixel *GetDstData(bool ref, bool borders) {
+    Pixel *block = &dst_data_[ref ? 0 : dst_block_size()];
+    return borders ? block : block + kHPad + dst_stride_ * kVPad;
+  }
+
+ private:
+  int w_src_, w_dst_, h_, superres_denom_, x0_, bd_;
+  int src_stride_, dst_stride_;
+
+  std::vector<Pixel> src_data_;
+  std::vector<Pixel> dst_data_;
+};
+
+template <typename Pixel>
+void FillEdge(ACMRandom *rnd, int num_pixels, int bd, bool trash, Pixel *data) {
+  if (!trash) {
+    memset(data, 0, sizeof(*data) * num_pixels);
+    return;
+  }
+  const Pixel mask = (1 << bd) - 1;
+  for (int i = 0; i < num_pixels; ++i) data[i] = rnd->Rand16() & mask;
+}
+
+template <typename Pixel>
+void PrepBuffers(ACMRandom *rnd, int w, int h, int stride, int bd,
+                 bool trash_edges, Pixel *data) {
+  assert(rnd);
+  const Pixel mask = (1 << bd) - 1;
+
+  // Fill in the first buffer with random data
+  // Top border
+  FillEdge(rnd, stride * kVPad, bd, trash_edges, data);
+  for (int r = 0; r < h; ++r) {
+    Pixel *row_data = data + (kVPad + r) * stride;
+    // Left border, contents, right border
+    FillEdge(rnd, kHPad, bd, trash_edges, row_data);
+    for (int c = 0; c < w; ++c) row_data[kHPad + c] = rnd->Rand16() & mask;
+    FillEdge(rnd, kHPad, bd, trash_edges, row_data + kHPad + w);
+  }
+  // Bottom border
+  FillEdge(rnd, stride * kVPad, bd, trash_edges, data + stride * (kVPad + h));
+
+  const int bpp = sizeof(*data);
+  const int block_elts = stride * (h + 2 * kVPad);
+  const int block_size = bpp * block_elts;
+
+  // Now copy that to the second buffer
+  memcpy(data + block_elts, data, block_size);
+}
+
+template <typename Pixel>
+void TestImage<Pixel>::Initialize(ACMRandom *rnd) {
+  PrepBuffers(rnd, w_src_, h_, src_stride_, bd_, false, &src_data_[0]);
+  PrepBuffers(rnd, w_dst_, h_, dst_stride_, bd_, true, &dst_data_[0]);
+}
+
+// Checks that the test dst block matches the reference dst block, reporting an
+// EXPECT_EQ failure for every differing pixel inside the image area.
+template <typename Pixel>
+void TestImage<Pixel>::Check() const {
+  const int num_pixels = dst_block_size();
+  const Pixel *ref_dst = &dst_data_[0];
+  const Pixel *tst_dst = &dst_data_[num_pixels];
+
+  // Fast path: the two blocks (borders included) are byte-identical.
+  if (0 == memcmp(ref_dst, tst_dst, sizeof(*ref_dst) * num_pixels)) return;
+
+  // Otherwise compare pixel by pixel, *ignoring the borders*: rows are offset
+  // by kVPad and columns by kHPad.
+  for (int r = kVPad; r < h_ + kVPad; ++r) {
+    for (int c = kHPad; c < w_dst_ + kHPad; ++c) {
+      const int32_t ref_value = ref_dst[r * dst_stride_ + c];
+      const int32_t tst_value = tst_dst[r * dst_stride_ + c];
+      EXPECT_EQ(tst_value, ref_value)
+          << "Error at row: " << (r - kVPad) << ", col: " << (c - kHPad)
+          << ", superres_denom: " << superres_denom_ << ", height: " << h_
+          << ", src_width: " << w_src_ << ", dst_width: " << w_dst_
+          << ", x0: " << x0_;
+    }
+  }
+}
+
+// Common fixture: runs the C reference and the tested function on one image.
+template <typename Pixel>
+class ConvolveHorizRSTestBase : public ::testing::Test {
+ public:
+  ConvolveHorizRSTestBase() : bd_(0), image_(NULL) {}
+  virtual ~ConvolveHorizRSTestBase() {}
+  virtual void TearDown() { libaom_test::ClearSystemState(); }
+
+  // Implemented by subclasses (SetUp depends on the parameters passed
+  // in and RunOne depends on the function to be tested. These can't
+  // be templated for low/high bit depths because they have different
+  // numbers of parameters)
+  virtual void SetUp() = 0;
+  virtual void RunOne(bool ref) = 0;
+
+ protected:
+  void SetBitDepth(int bd) { bd_ = bd; }
+
+  // Checks test output against the C reference on randomly sized images.
+  void CorrectnessTest() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    for (int i = 0; i < kTestIters; ++i) {
+      for (int superres_denom = 9; superres_denom <= 16; superres_denom++) {
+        // Get a random height between 512 and 767
+        int height = rnd.Rand8() + 512;
+
+        // Get a random src width between 128 and 383
+        int width_src = rnd.Rand8() + 128;
+
+        // x0 is normally calculated by get_upscale_convolve_x0 in
+        // av1/common/resize.c. However, this test should work for
+        // any value of x0 between 0 and RS_SCALE_SUBPEL_MASK
+        // (inclusive), so we choose one at random.
+        int x0 = rnd.Rand16() % (RS_SCALE_SUBPEL_MASK + 1);
+
+        image_ =
+            new TestImage<Pixel>(width_src, height, superres_denom, x0, bd_);
+        Prep(&rnd);
+        RunOne(true);
+        RunOne(false);
+        image_->Check();
+        delete image_;
+      }
+    }
+  }
+
+  // Times kPerfIters runs of C vs. test code; expects test code to be faster.
+  void SpeedTest() {
+    int height = 767;
+    int width_src = 129;
+    int superres_denom = 13;
+    int x0 = RS_SCALE_SUBPEL_MASK >> 1;
+
+    image_ = new TestImage<Pixel>(width_src, height, superres_denom, x0, bd_);
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    Prep(&rnd);
+
+    aom_usec_timer ref_timer;
+    aom_usec_timer_start(&ref_timer);
+    for (int i = 0; i < kPerfIters; ++i) RunOne(true);
+    aom_usec_timer_mark(&ref_timer);
+    const int64_t ref_time = aom_usec_timer_elapsed(&ref_timer);
+
+    aom_usec_timer tst_timer;
+    aom_usec_timer_start(&tst_timer);
+    for (int i = 0; i < kPerfIters; ++i) RunOne(false);
+    aom_usec_timer_mark(&tst_timer);
+    const int64_t tst_time = aom_usec_timer_elapsed(&tst_timer);
+
+    std::cout << "[          ] C time = " << ref_time / 1000
+              << " ms, SIMD time = " << tst_time / 1000 << " ms\n";
+
+    EXPECT_GT(ref_time, tst_time)
+        << "Error: ConvolveHorizRS SpeedTest, SIMD slower than C.\n"
+        << "C time: " << ref_time << " us\n"
+        << "SIMD time: " << tst_time << " us\n";
+    delete image_;
+  }
+
+  void Prep(ACMRandom *rnd) {
+    assert(rnd);
+    image_->Initialize(rnd);
+  }
+
+  int bd_;
+  TestImage<Pixel> *image_;
+};
+
+typedef void (*LowBDConvolveHorizRsFunc)(const uint8_t *src, int src_stride,
+                                         uint8_t *dst, int dst_stride, int w,
+                                         int h, const int16_t *x_filters,
+                                         const int x0_qn, const int x_step_qn);
+
+// Test parameter list:
+//  <tst_fun_>
+typedef tuple<LowBDConvolveHorizRsFunc> LowBDParams;
+
+class LowBDConvolveHorizRSTest
+    : public ConvolveHorizRSTestBase<uint8_t>,
+      public ::testing::WithParamInterface<LowBDParams> {
+ public:
+  virtual ~LowBDConvolveHorizRSTest() {}
+
+  void SetUp() {
+    tst_fun_ = GET_PARAM(0);
+    const int bd = 8;
+    SetBitDepth(bd);
+  }
+
+  void RunOne(bool ref) {
+    const uint8_t *src = image_->GetSrcData(ref, false);
+    uint8_t *dst = image_->GetDstData(ref, false);
+    const int src_stride = image_->src_stride();
+    const int dst_stride = image_->dst_stride();
+    const int width_src = image_->src_width();
+    const int width_dst = image_->dst_width();
+    const int height = image_->height();
+    const int x0_qn = image_->x0();
+
+    const int32_t x_step_qn =
+        av1_get_upscale_convolve_step(width_src, width_dst);
+
+    if (ref) {
+      av1_convolve_horiz_rs_c(src, src_stride, dst, dst_stride, width_dst,
+                              height, &av1_resize_filter_normative[0][0], x0_qn,
+                              x_step_qn);
+    } else {
+      tst_fun_(src, src_stride, dst, dst_stride, width_dst, height,
+               &av1_resize_filter_normative[0][0], x0_qn, x_step_qn);
+    }
+  }
+
+ private:
+  LowBDConvolveHorizRsFunc tst_fun_;
+};
+
+TEST_P(LowBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
+TEST_P(LowBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_CASE_P(SSE4_1, LowBDConvolveHorizRSTest,
+                        ::testing::Values(av1_convolve_horiz_rs_sse4_1));
+
+typedef void (*HighBDConvolveHorizRsFunc)(const uint16_t *src, int src_stride,
+                                          uint16_t *dst, int dst_stride, int w,
+                                          int h, const int16_t *x_filters,
+                                          const int x0_qn, const int x_step_qn,
+                                          int bd);
+
+// Test parameter list:
+//  <tst_fun_, bd_>
+typedef tuple<HighBDConvolveHorizRsFunc, int> HighBDParams;
+
+class HighBDConvolveHorizRSTest
+    : public ConvolveHorizRSTestBase<uint16_t>,
+      public ::testing::WithParamInterface<HighBDParams> {
+ public:
+  virtual ~HighBDConvolveHorizRSTest() {}
+
+  void SetUp() {
+    tst_fun_ = GET_PARAM(0);
+    const int bd = GET_PARAM(1);
+    SetBitDepth(bd);
+  }
+
+  void RunOne(bool ref) {
+    const uint16_t *src = image_->GetSrcData(ref, false);
+    uint16_t *dst = image_->GetDstData(ref, false);
+    const int src_stride = image_->src_stride();
+    const int dst_stride = image_->dst_stride();
+    const int width_src = image_->src_width();
+    const int width_dst = image_->dst_width();
+    const int height = image_->height();
+    const int x0_qn = image_->x0();
+
+    const int32_t x_step_qn =
+        av1_get_upscale_convolve_step(width_src, width_dst);
+
+    if (ref) {
+      av1_highbd_convolve_horiz_rs_c(
+          src, src_stride, dst, dst_stride, width_dst, height,
+          &av1_resize_filter_normative[0][0], x0_qn, x_step_qn, bd_);
+    } else {
+      tst_fun_(src, src_stride, dst, dst_stride, width_dst, height,
+               &av1_resize_filter_normative[0][0], x0_qn, x_step_qn, bd_);
+    }
+  }
+
+ private:
+  HighBDConvolveHorizRsFunc tst_fun_;
+};
+
+const int kBDs[] = { 8, 10, 12 };
+
+TEST_P(HighBDConvolveHorizRSTest, Correctness) { CorrectnessTest(); }
+TEST_P(HighBDConvolveHorizRSTest, DISABLED_Speed) { SpeedTest(); }
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, HighBDConvolveHorizRSTest,
+    ::testing::Combine(::testing::Values(av1_highbd_convolve_horiz_rs_sse4_1),
+                       ::testing::ValuesIn(kBDs)));
+
+}  // namespace
diff --git a/test/test.cmake b/test/test.cmake
index 1522436..7722242f 100644
--- a/test/test.cmake
+++ b/test/test.cmake
@@ -253,6 +253,12 @@
           "${AOM_ROOT}/test/av1_convolve_scale_test.cc")
     endif ()
 
+    if (HAVE_SSE4_1)
+      set(AOM_UNIT_TEST_ENCODER_SOURCES
+          ${AOM_UNIT_TEST_ENCODER_SOURCES}
+          "${AOM_ROOT}/test/av1_horz_only_frame_superres_test.cc")
+    endif ()
+
     set(AOM_UNIT_TEST_ENCODER_SOURCES
         ${AOM_UNIT_TEST_ENCODER_SOURCES}
         "${AOM_ROOT}/test/av1_fht16x32_test.cc"