SSSE3 optimization of motion compensation functions

SSSE3 optimizations of av1_highbd_convolve_x_sr_c and
av1_highbd_convolve_y_sr_c have been added.

av1_highbd_convolve_x_sr_ssse3 is ~8.3 times faster than its
C implementation.

av1_highbd_convolve_y_sr_ssse3 is ~11.3 times faster than its
C implementation.

Clean up RunSpeedTest for AV1HighbdConvolve2DSrTest.

Change-Id: Ic6386f91c649d5d5513909f2dc2352eec338fc08
diff --git a/aom_dsp/aom_dsp.cmake b/aom_dsp/aom_dsp.cmake
index b813736..1949b25 100644
--- a/aom_dsp/aom_dsp.cmake
+++ b/aom_dsp/aom_dsp.cmake
@@ -180,6 +180,10 @@
       "${AOM_ROOT}/aom_dsp/x86/highbd_intrapred_sse2.c"
       "${AOM_ROOT}/aom_dsp/x86/highbd_loopfilter_sse2.c")
 
+  set(AOM_DSP_COMMON_INTRIN_SSSE3
+      ${AOM_DSP_COMMON_INTRIN_SSSE3}
+      "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_ssse3.c")
+
   set(AOM_DSP_COMMON_INTRIN_AVX2
       ${AOM_DSP_COMMON_INTRIN_AVX2}
       "${AOM_ROOT}/aom_dsp/x86/highbd_convolve_avx2.c"
diff --git a/aom_dsp/x86/highbd_convolve_ssse3.c b/aom_dsp/x86/highbd_convolve_ssse3.c
new file mode 100644
index 0000000..41f4d5f
--- /dev/null
+++ b/aom_dsp/x86/highbd_convolve_ssse3.c
@@ -0,0 +1,250 @@
+/*
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include <tmmintrin.h>
+#include <assert.h>
+
+#include "./aom_dsp_rtcd.h"
+#include "aom_dsp/x86/convolve_sse2.h"
+
+void av1_highbd_convolve_y_sr_ssse3(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    InterpFilterParams *filter_params_x,
+                                    InterpFilterParams *filter_params_y,
+                                    const int subpel_x_q4,
+                                    const int subpel_y_q4,
+                                    ConvolveParams *conv_params, int bd) {
+  int i, j;
+  const int fo_vert = filter_params_y->taps / 2 - 1;  // rows above src
+  const uint16_t *const src_ptr = src - fo_vert * src_stride;
+  (void)filter_params_x;
+  (void)subpel_x_q4;
+  (void)conv_params;  // only read by the asserts below
+
+  assert(conv_params->round_0 <= FILTER_BITS);
+  assert(((conv_params->round_0 + conv_params->round_1) <= (FILTER_BITS + 1)) ||
+         ((conv_params->round_0 + conv_params->round_1) == (2 * FILTER_BITS)));
+
+  __m128i s[16], coeffs_y[4];
+
+  const int bits = FILTER_BITS;  // single rounding shift after the filter
+
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i clip_pixel =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i zero = _mm_setzero_si128();
+
+  prepare_coeffs(filter_params_y, subpel_y_q4, coeffs_y);
+
+  for (j = 0; j < w; j += 8) {  // 8 columns per pass
+    const uint16_t *data = &src_ptr[j];
+    /* Vertical filter: prime rows 0..6, then emit two rows per step */
+    {
+      __m128i s0 = _mm_loadu_si128((__m128i *)(data + 0 * src_stride));
+      __m128i s1 = _mm_loadu_si128((__m128i *)(data + 1 * src_stride));
+      __m128i s2 = _mm_loadu_si128((__m128i *)(data + 2 * src_stride));
+      __m128i s3 = _mm_loadu_si128((__m128i *)(data + 3 * src_stride));
+      __m128i s4 = _mm_loadu_si128((__m128i *)(data + 4 * src_stride));
+      __m128i s5 = _mm_loadu_si128((__m128i *)(data + 5 * src_stride));
+      __m128i s6 = _mm_loadu_si128((__m128i *)(data + 6 * src_stride));
+
+      s[0] = _mm_unpacklo_epi16(s0, s1);  // first output row, low 4 columns
+      s[1] = _mm_unpacklo_epi16(s2, s3);
+      s[2] = _mm_unpacklo_epi16(s4, s5);
+
+      s[4] = _mm_unpackhi_epi16(s0, s1);  // first output row, high 4 columns
+      s[5] = _mm_unpackhi_epi16(s2, s3);
+      s[6] = _mm_unpackhi_epi16(s4, s5);
+
+      s[0 + 8] = _mm_unpacklo_epi16(s1, s2);  // second output row, low 4
+      s[1 + 8] = _mm_unpacklo_epi16(s3, s4);
+      s[2 + 8] = _mm_unpacklo_epi16(s5, s6);
+
+      s[4 + 8] = _mm_unpackhi_epi16(s1, s2);  // second output row, high 4
+      s[5 + 8] = _mm_unpackhi_epi16(s3, s4);
+      s[6 + 8] = _mm_unpackhi_epi16(s5, s6);
+
+      for (i = 0; i < h; i += 2) {  // two output rows per iteration
+        data = &src_ptr[i * src_stride + j];
+
+        __m128i s7 = _mm_loadu_si128((__m128i *)(data + 7 * src_stride));
+        __m128i s8 = _mm_loadu_si128((__m128i *)(data + 8 * src_stride));
+
+        s[3] = _mm_unpacklo_epi16(s6, s7);
+        s[7] = _mm_unpackhi_epi16(s6, s7);
+
+        s[3 + 8] = _mm_unpacklo_epi16(s7, s8);
+        s[7 + 8] = _mm_unpackhi_epi16(s7, s8);
+
+        const __m128i res_a0 = convolve(s, coeffs_y);
+        __m128i res_a_round0 = _mm_sra_epi32(
+            _mm_add_epi32(res_a0, round_const_bits), round_shift_bits);
+
+        const __m128i res_a1 = convolve(s + 8, coeffs_y);
+        __m128i res_a_round1 = _mm_sra_epi32(
+            _mm_add_epi32(res_a1, round_const_bits), round_shift_bits);
+
+        if (w - j > 4) {  // full 8-pixel store for both rows
+          const __m128i res_b0 = convolve(s + 4, coeffs_y);
+          __m128i res_b_round0 = _mm_sra_epi32(
+              _mm_add_epi32(res_b0, round_const_bits), round_shift_bits);
+
+          const __m128i res_b1 = convolve(s + 4 + 8, coeffs_y);
+          __m128i res_b_round1 = _mm_sra_epi32(
+              _mm_add_epi32(res_b1, round_const_bits), round_shift_bits);
+
+          __m128i res_16bit0 = _mm_packs_epi32(res_a_round0, res_b_round0);
+          res_16bit0 = _mm_min_epi16(res_16bit0, clip_pixel);
+          res_16bit0 = _mm_max_epi16(res_16bit0, zero);
+
+          __m128i res_16bit1 = _mm_packs_epi32(res_a_round1, res_b_round1);
+          res_16bit1 = _mm_min_epi16(res_16bit1, clip_pixel);
+          res_16bit1 = _mm_max_epi16(res_16bit1, zero);
+
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res_16bit0);
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_16bit1);
+        } else if (w == 4) {  // 4 pixels per row: 64-bit stores
+          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+          res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+          res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res_a_round0);
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j + dst_stride],
+                           res_a_round1);
+        } else {  // remaining case: 2 pixels per row via 32-bit stores
+          res_a_round0 = _mm_packs_epi32(res_a_round0, res_a_round0);
+          res_a_round0 = _mm_min_epi16(res_a_round0, clip_pixel);
+          res_a_round0 = _mm_max_epi16(res_a_round0, zero);
+
+          res_a_round1 = _mm_packs_epi32(res_a_round1, res_a_round1);
+          res_a_round1 = _mm_min_epi16(res_a_round1, clip_pixel);
+          res_a_round1 = _mm_max_epi16(res_a_round1, zero);
+
+          *((uint32_t *)(&dst[i * dst_stride + j])) =
+              _mm_cvtsi128_si32(res_a_round0);
+
+          *((uint32_t *)(&dst[i * dst_stride + j + dst_stride])) =
+              _mm_cvtsi128_si32(res_a_round1);
+        }
+
+        s[0] = s[1];  // slide every row-pair window down by two rows
+        s[1] = s[2];
+        s[2] = s[3];
+
+        s[4] = s[5];
+        s[5] = s[6];
+        s[6] = s[7];
+
+        s[0 + 8] = s[1 + 8];
+        s[1 + 8] = s[2 + 8];
+        s[2 + 8] = s[3 + 8];
+
+        s[4 + 8] = s[5 + 8];
+        s[5 + 8] = s[6 + 8];
+        s[6 + 8] = s[7 + 8];
+
+        s6 = s8;  // row i + 8 pairs with the next iteration's first load
+      }
+    }
+  }
+}
+
+void av1_highbd_convolve_x_sr_ssse3(const uint16_t *src, int src_stride,
+                                    uint16_t *dst, int dst_stride, int w, int h,
+                                    InterpFilterParams *filter_params_x,
+                                    InterpFilterParams *filter_params_y,
+                                    const int subpel_x_q4,
+                                    const int subpel_y_q4,
+                                    ConvolveParams *conv_params, int bd) {
+  int i, j;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;  // pixels left of src
+  const uint16_t *const src_ptr = src - fo_horiz;
+  (void)subpel_y_q4;
+  (void)filter_params_y;
+
+  // Sanity check carried by the assert: bd + FILTER_BITS + 2 - round_0 must
+  // not exceed 16 bits (no intermediate array is used in this function).
+  assert(bd + FILTER_BITS + 2 - conv_params->round_0 <= 16);
+
+  __m128i s[4], coeffs_x[4];
+
+  const __m128i round_const_x =
+      _mm_set1_epi32(((1 << conv_params->round_0) >> 1));
+  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+
+  const int bits = FILTER_BITS - conv_params->round_0;  // second-stage shift
+
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+  const __m128i round_const_bits = _mm_set1_epi32((1 << bits) >> 1);
+  const __m128i clip_pixel =
+      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));
+  const __m128i zero = _mm_setzero_si128();
+
+  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+  for (j = 0; j < w; j += 8) {  // 8 columns per pass
+    /* Horizontal filter: one output row per iteration */
+    {
+      for (i = 0; i < h; i += 1) {
+        const __m128i row00 =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
+        const __m128i row01 =
+            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + (j + 8)]);
+
+        // even pixels: windows start at byte offsets 0, 4, 8, 12
+        s[0] = _mm_alignr_epi8(row01, row00, 0);
+        s[1] = _mm_alignr_epi8(row01, row00, 4);
+        s[2] = _mm_alignr_epi8(row01, row00, 8);
+        s[3] = _mm_alignr_epi8(row01, row00, 12);
+
+        __m128i res_even = convolve(s, coeffs_x);
+        res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_x),
+                                 round_shift_x);
+
+        // odd pixels: same windows shifted by one pixel (bytes 2, 6, 10, 14)
+        s[0] = _mm_alignr_epi8(row01, row00, 2);
+        s[1] = _mm_alignr_epi8(row01, row00, 6);
+        s[2] = _mm_alignr_epi8(row01, row00, 10);
+        s[3] = _mm_alignr_epi8(row01, row00, 14);
+
+        __m128i res_odd = convolve(s, coeffs_x);
+        res_odd =
+            _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_x), round_shift_x);
+
+        res_even = _mm_sra_epi32(_mm_add_epi32(res_even, round_const_bits),
+                                 round_shift_bits);
+        res_odd = _mm_sra_epi32(_mm_add_epi32(res_odd, round_const_bits),
+                                round_shift_bits);
+
+        __m128i res_even1 = _mm_packs_epi32(res_even, res_even);
+        __m128i res_odd1 = _mm_packs_epi32(res_odd, res_odd);
+        __m128i res = _mm_unpacklo_epi16(res_even1, res_odd1);  // interleave
+
+        res = _mm_min_epi16(res, clip_pixel);
+        res = _mm_max_epi16(res, zero);
+
+        if (w - j > 4) {  // full 8-pixel store
+          _mm_storeu_si128((__m128i *)&dst[i * dst_stride + j], res);
+        } else if (w == 4) {  // 4 pixels: 64-bit store
+          _mm_storel_epi64((__m128i *)&dst[i * dst_stride + j], res);
+        } else {  // remaining case: 2 pixels via a 32-bit store
+          *((uint32_t *)(&dst[i * dst_stride + j])) = _mm_cvtsi128_si32(res);
+        }
+      }
+    }
+  }
+}
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 37c3a28..3a2f879 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -524,12 +524,12 @@
 add_proto qw/void av1_highbd_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
 
 add_proto qw/void av1_highbd_convolve_x_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-specialize qw/av1_highbd_convolve_x_sr c avx2/;
+specialize qw/av1_highbd_convolve_x_sr c ssse3 avx2/;
 
 add_proto qw/void av1_highbd_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
 
 add_proto qw/void av1_highbd_convolve_y_sr/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
-specialize qw/av1_highbd_convolve_y_sr c avx2/;
+specialize qw/av1_highbd_convolve_y_sr c ssse3 avx2/;
 
 add_proto qw/void av1_highbd_convolve_rounding/, "const int32_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, int bits, int bd";
 specialize qw/av1_highbd_convolve_rounding avx2/;
diff --git a/test/av1_convolve_2d_test.cc b/test/av1_convolve_2d_test.cc
index 927b8ff..d18840f 100644
--- a/test/av1_convolve_2d_test.cc
+++ b/test/av1_convolve_2d_test.cc
@@ -220,6 +220,12 @@
 INSTANTIATE_TEST_CASE_P(SSSE3, AV1HighbdConvolve2DSrTest,
                         libaom_test::AV1HighbdConvolve2D::BuildParams(
                             av1_highbd_convolve_2d_sr_ssse3, 1, 1, 0));
+INSTANTIATE_TEST_CASE_P(SSSE3_X, AV1HighbdConvolve2DSrTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_convolve_x_sr_ssse3, 1, 0, 0));
+INSTANTIATE_TEST_CASE_P(SSSE3_Y, AV1HighbdConvolve2DSrTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_convolve_y_sr_ssse3, 0, 1, 0));
 #endif
 #if HAVE_AVX2
 INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdConvolve2DSrTest,
diff --git a/test/av1_convolve_2d_test_util.cc b/test/av1_convolve_2d_test_util.cc
index 1c22525..d87b041 100644
--- a/test/av1_convolve_2d_test_util.cc
+++ b/test/av1_convolve_2d_test_util.cc
@@ -489,8 +489,8 @@
     highbd_convolve_2d_func test_impl) {
   const int w = kMaxSize, h = kMaxSize;
   const int bd = GET_PARAM(0);
-  // const int has_subx = GET_PARAM(2);
-  // const int has_suby = GET_PARAM(3);
+  const int has_subx = GET_PARAM(2);
+  const int has_suby = GET_PARAM(3);
   const int is_compound = GET_PARAM(4);
   int hfilter, vfilter, subx, suby;
   uint16_t input[kMaxSize * kMaxSize];
@@ -500,42 +500,48 @@
   for (int i = 0; i < h; ++i)
     for (int j = 0; j < w; ++j)
       input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
-  for (int i = 0; i < MAX_SB_SQUARE; ++i) output[i] = rnd_.Rand31();
 
   hfilter = EIGHTTAP_REGULAR;
   vfilter = EIGHTTAP_REGULAR;
   int do_average = 0;
 
+  const int offset_r = 3;
+  const int offset_c = 3;
+  subx = 0;
+  suby = 0;
+
+  InterpFilterParams filter_params_x =
+      av1_get_interp_filter_params((InterpFilter)hfilter);
+  InterpFilterParams filter_params_y =
+      av1_get_interp_filter_params((InterpFilter)vfilter);
+
+  ConvolveParams conv_params =
+      get_conv_params_no_round(0, do_average, 0, NULL, 0, 0, bd);
+
   for (int block_idx = BLOCK_4X4; block_idx < BLOCK_SIZES_ALL; ++block_idx) {
     // Make sure that sizes 2xN and Nx2 are also tested for chroma.
+    const int num_sizes =
+        (block_size_wide[block_idx] == 4 || block_size_high[block_idx] == 4)
+            ? 2
+            : 1;
 
-    const int out_w = block_size_wide[block_idx];
-    const int out_h = block_size_high[block_idx];
+    for (int shift = 0; shift < num_sizes; ++shift) {  // luma and chroma
+      const int out_w = block_size_wide[block_idx] >> shift;
+      const int out_h = block_size_high[block_idx] >> shift;
+      const int num_loops = 1000000000 / (out_w + out_h);
 
-    InterpFilterParams filter_params_x =
-        av1_get_interp_filter_params((InterpFilter)hfilter);
-    InterpFilterParams filter_params_y =
-        av1_get_interp_filter_params((InterpFilter)vfilter);
+      aom_usec_timer timer;
+      aom_usec_timer_start(&timer);
+      for (int i = 0; i < num_loops; ++i)
+        test_impl(input + offset_r * w + offset_c, w, output, MAX_SB_SIZE,
+                  out_w, out_h, &filter_params_x, &filter_params_y, subx, suby,
+                  &conv_params, bd);
 
-    ConvolveParams conv_params =
-        get_conv_params_no_round(0, do_average, 0, NULL, 0, 0, bd);
-
-    const int offset_r = 3;
-    const int offset_c = 3;
-    subx = 0;
-    suby = 0;
-    const int num_loops = 1000000000 / (out_w + out_h);
-    aom_usec_timer timer;
-    aom_usec_timer_start(&timer);
-    for (int i = 0; i < num_loops; ++i)
-      test_impl(input + offset_r * w + offset_c, w, output, MAX_SB_SIZE, out_w,
-                out_h, &filter_params_x, &filter_params_y, subx, suby,
-                &conv_params, bd);
-
-    aom_usec_timer_mark(&timer);
-    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-    printf("convolve %3dx%-3d: %7.2f ns\n", out_w, out_h,
-           1000.0 * elapsed_time / num_loops);
+      aom_usec_timer_mark(&timer);
+      const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+      printf("%d,%d convolve %3dx%-3d: %7.2f ns\n", has_subx, has_suby, out_w,
+             out_h, 1000.0 * elapsed_time / num_loops);
+    }
   }
 }