AVX2 optimization of motion compensation function

AVX2 implementation of av1_highbd_jnt_convolve_x_c has
been added.

ASSERT for (FILTER_BITS - conv_params->round_1) >=0 has
been included in function av1_highbd_jnt_convolve_x_c.

RunSpeedTest for AV1HighbdJntConvolve2DTest has been
added. It is disabled by default.

av1_highbd_jnt_convolve_x_avx2 - ~13.8 times faster than
C implementation.

Change-Id: I12c15b5a9894f2b95bf959123578916b6303f7b6
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 9f46879..4f13269 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -544,6 +544,7 @@
   specialize qw/av1_highbd_jnt_convolve_2d sse4_1 avx2/;
 
   add_proto qw/void av1_highbd_jnt_convolve_x/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
+  specialize qw/av1_highbd_jnt_convolve_x avx2/;
   
   add_proto qw/void av1_highbd_jnt_convolve_y/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, InterpFilterParams *filter_params_x, InterpFilterParams *filter_params_y, const int subpel_x_q4, const int subpel_y_q4, ConvolveParams *conv_params, int bd";
 
diff --git a/av1/common/convolve.c b/av1/common/convolve.c
index a8fac2a..c2992b5 100644
--- a/av1/common/convolve.c
+++ b/av1/common/convolve.c
@@ -1407,6 +1407,7 @@
   (void)dst_stride0;
   (void)bd;
 
+  assert(bits >= 0);
   // horizontal filter
   const int16_t *x_filter = av1_get_interp_filter_subpel_kernel(
       *filter_params_x, subpel_x_q4 & SUBPEL_MASK);
diff --git a/av1/common/x86/highbd_jnt_convolve_avx2.c b/av1/common/x86/highbd_jnt_convolve_avx2.c
index 54b557f..847e2d2 100644
--- a/av1/common/x86/highbd_jnt_convolve_avx2.c
+++ b/av1/common/x86/highbd_jnt_convolve_avx2.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ * Copyright (c) 2018, Alliance for Open Media. All rights reserved
  *
  * This source code is subject to the terms of the BSD 2 Clause License and
  * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
@@ -275,3 +275,113 @@
   }
 }
 #endif
+
+#if CONFIG_JNT_COMP
+void av1_highbd_jnt_convolve_x_avx2(const uint16_t *src, int src_stride,
+                                    uint16_t *dst0, int dst_stride0, int w,
+                                    int h, InterpFilterParams *filter_params_x,
+                                    InterpFilterParams *filter_params_y,
+                                    const int subpel_x_q4,
+                                    const int subpel_y_q4,
+                                    ConvolveParams *conv_params, int bd) {
+  CONV_BUF_TYPE *dst = conv_params->dst;
+  int dst_stride = conv_params->dst_stride;
+  const int fo_horiz = filter_params_x->taps / 2 - 1;
+  const uint16_t *const src_ptr = src - fo_horiz;
+  const int bits = FILTER_BITS - conv_params->round_1;
+  (void)filter_params_y;
+  (void)subpel_y_q4;
+  (void)dst0;
+  (void)dst_stride0;
+  (void)bd;
+
+  int i, j;
+  __m256i s[4], coeffs_x[4];
+
+  const int do_average = conv_params->do_average;
+  const int w0 = conv_params->fwd_offset;
+  const int w1 = conv_params->bck_offset;
+  const __m256i wt0 = _mm256_set1_epi32(w0);
+  const __m256i wt1 = _mm256_set1_epi32(w1);
+  const __m128i wt0_128 = _mm256_castsi256_si128(wt0);
+  const __m128i wt1_128 = _mm256_castsi256_si128(wt1);
+
+  const __m256i round_const_x =
+      _mm256_set1_epi32(((1 << conv_params->round_0) >> 1));
+  const __m128i round_shift_x = _mm_cvtsi32_si128(conv_params->round_0);
+  const __m128i round_shift_bits = _mm_cvtsi32_si128(bits);
+
+  assert(bits >= 0);
+  prepare_coeffs(filter_params_x, subpel_x_q4, coeffs_x);
+
+  for (j = 0; j < w; j += 8) {
+    /* Horizontal filter */
+    for (i = 0; i < h; i += 2) {
+      const __m256i row0 =
+          _mm256_loadu_si256((__m256i *)&src_ptr[i * src_stride + j]);
+      __m256i row1 =
+          _mm256_loadu_si256((__m256i *)&src_ptr[(i + 1) * src_stride + j]);
+
+      const __m256i r0 = _mm256_permute2x128_si256(row0, row1, 0x20);
+      const __m256i r1 = _mm256_permute2x128_si256(row0, row1, 0x31);
+
+      // even pixels
+      s[0] = _mm256_alignr_epi8(r1, r0, 0);
+      s[1] = _mm256_alignr_epi8(r1, r0, 4);
+      s[2] = _mm256_alignr_epi8(r1, r0, 8);
+      s[3] = _mm256_alignr_epi8(r1, r0, 12);
+
+      __m256i res_even = convolve(s, coeffs_x);
+      res_even = _mm256_sra_epi32(_mm256_add_epi32(res_even, round_const_x),
+                                  round_shift_x);
+
+      // odd pixels
+      s[0] = _mm256_alignr_epi8(r1, r0, 2);
+      s[1] = _mm256_alignr_epi8(r1, r0, 6);
+      s[2] = _mm256_alignr_epi8(r1, r0, 10);
+      s[3] = _mm256_alignr_epi8(r1, r0, 14);
+
+      __m256i res_odd = convolve(s, coeffs_x);
+      res_odd = _mm256_sra_epi32(_mm256_add_epi32(res_odd, round_const_x),
+                                 round_shift_x);
+
+      res_even = _mm256_sll_epi32(res_even, round_shift_bits);
+      res_odd = _mm256_sll_epi32(res_odd, round_shift_bits);
+
+      __m256i res1 = _mm256_unpacklo_epi32(res_even, res_odd);
+
+      if (w - j > 4) {
+        __m256i res2 = _mm256_unpackhi_epi32(res_even, res_odd);
+
+        const __m256i res_ax = _mm256_permute2x128_si256(res1, res2, 0x20);
+        const __m256i res_bx = _mm256_permute2x128_si256(res1, res2, 0x31);
+
+        if (conv_params->use_jnt_comp_avg) {
+          mult_add_store_aligned_256(&dst[i * dst_stride + j], &res_ax, &wt0,
+                                     &wt1, do_average);
+          mult_add_store_aligned_256(&dst[i * dst_stride + j + dst_stride],
+                                     &res_bx, &wt0, &wt1, do_average);
+        } else {
+          add_store_aligned_256(&dst[i * dst_stride + j], &res_ax, do_average);
+          add_store_aligned_256(&dst[i * dst_stride + j + dst_stride], &res_bx,
+                                do_average);
+        }
+      } else {
+        const __m128i res_ax = _mm256_castsi256_si128(res1);
+        const __m128i res_bx = _mm256_extracti128_si256(res1, 1);
+
+        if (conv_params->use_jnt_comp_avg) {
+          mult_add_store(&dst[i * dst_stride + j], &res_ax, &wt0_128, &wt1_128,
+                         do_average);
+          mult_add_store(&dst[i * dst_stride + j + dst_stride], &res_bx,
+                         &wt0_128, &wt1_128, do_average);
+        } else {
+          add_store(&dst[i * dst_stride + j], &res_ax, do_average);
+          add_store(&dst[i * dst_stride + j + dst_stride], &res_bx, do_average);
+        }
+      }
+    }
+  }
+}
+
+#endif
diff --git a/test/av1_convolve_2d_test.cc b/test/av1_convolve_2d_test.cc
index 90512b0..735c1ae 100644
--- a/test/av1_convolve_2d_test.cc
+++ b/test/av1_convolve_2d_test.cc
@@ -230,6 +230,10 @@
   RunCheckOutput(GET_PARAM(1));
 }
 
+TEST_P(AV1HighbdJntConvolve2DTest, DISABLED_Speed) {
+  RunSpeedTest(GET_PARAM(1));
+}
+
 INSTANTIATE_TEST_CASE_P(SSE4_1, AV1HighbdJntConvolve2DTest,
                         libaom_test::AV1HighbdConvolve2D::BuildParams(
                             av1_highbd_jnt_convolve_2d_sse4_1, 1, 1, 1));
@@ -252,6 +256,9 @@
 INSTANTIATE_TEST_CASE_P(AVX2, AV1HighbdJntConvolve2DTest,
                         libaom_test::AV1HighbdConvolve2D::BuildParams(
                             av1_highbd_jnt_convolve_2d_avx2, 1, 1, 1));
+INSTANTIATE_TEST_CASE_P(AVX2_X, AV1HighbdJntConvolve2DTest,
+                        libaom_test::AV1HighbdConvolve2D::BuildParams(
+                            av1_highbd_jnt_convolve_x_avx2, 1, 0, 1));
 #endif
 #endif  // CONFIG_JNT_COMP
 #endif
diff --git a/test/av1_convolve_2d_test_util.cc b/test/av1_convolve_2d_test_util.cc
index b166630..81df68d 100644
--- a/test/av1_convolve_2d_test_util.cc
+++ b/test/av1_convolve_2d_test_util.cc
@@ -573,6 +573,59 @@
 
 void AV1HighbdJntConvolve2DTest::TearDown() { libaom_test::ClearSystemState(); }
 
+void AV1HighbdJntConvolve2DTest::RunSpeedTest(
+    highbd_convolve_2d_func test_impl) {
+  const int w = kMaxSize, h = kMaxSize;
+  const int bd = GET_PARAM(0);
+  // const int has_subx = GET_PARAM(2);
+  // const int has_suby = GET_PARAM(3);
+  int hfilter, vfilter, subx, suby;
+  uint16_t input[kMaxSize * kMaxSize];
+  DECLARE_ALIGNED(32, CONV_BUF_TYPE, output[MAX_SB_SQUARE]);
+
+  for (int i = 0; i < h; ++i)
+    for (int j = 0; j < w; ++j)
+      input[i * w + j] = rnd_.Rand16() & ((1 << bd) - 1);
+  for (int i = 0; i < MAX_SB_SQUARE; ++i) output[i] = rnd_.Rand31();
+  hfilter = EIGHTTAP_REGULAR;
+  vfilter = EIGHTTAP_REGULAR;
+  int do_average = 0;
+  for (int block_idx = BLOCK_4X4; block_idx < BLOCK_SIZES_ALL; ++block_idx) {
+    const int out_w = block_size_wide[block_idx];
+    const int out_h = block_size_high[block_idx];
+
+    InterpFilterParams filter_params_x =
+        av1_get_interp_filter_params((InterpFilter)hfilter);
+    InterpFilterParams filter_params_y =
+        av1_get_interp_filter_params((InterpFilter)vfilter);
+
+    ConvolveParams conv_params =
+        get_conv_params_no_round(0, do_average, 0, output, MAX_SB_SIZE, 1, bd);
+
+    // Test special case where jnt_comp_avg is not used
+    conv_params.use_jnt_comp_avg = 0;
+
+    subx = 0;
+    suby = 0;
+    // Choose random locations within the source block
+    const int offset_r = 3;
+    const int offset_c = 3;
+
+    const int num_loops = 1000000000 / (out_w + out_h);
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    for (int i = 0; i < num_loops; ++i)
+      test_impl(input + offset_r * w + offset_c, w, NULL, 0, out_w, out_h,
+                &filter_params_x, &filter_params_y, subx, suby, &conv_params,
+                bd);
+
+    aom_usec_timer_mark(&timer);
+    const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
+    printf("convolve %3dx%-3d: %7.2f ns\n", out_w, out_h,
+           1000.0 * elapsed_time / num_loops);
+  }
+}
+
 void AV1HighbdJntConvolve2DTest::RunCheckOutput(
     highbd_convolve_2d_func test_impl) {
   const int w = kMaxSize, h = kMaxSize;
diff --git a/test/av1_convolve_2d_test_util.h b/test/av1_convolve_2d_test_util.h
index 7f4c919..9b43938 100644
--- a/test/av1_convolve_2d_test_util.h
+++ b/test/av1_convolve_2d_test_util.h
@@ -134,6 +134,7 @@
 
  protected:
   void RunCheckOutput(highbd_convolve_2d_func test_impl);
+  void RunSpeedTest(highbd_convolve_2d_func test_impl);
 
   libaom_test::ACMRandom rnd_;
 };