Add AVX2 variant for sub_pixel_variance width 16

Added AVX2 variants of the sub_pixel_variance function for the
16x64, 16x32, 16x16, 16x8 and 16x4 block sizes.

Test bench measurements show an average scaling gain of 28% over the
SSSE3 module.
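
For reference, the core trick in the width-16 kernels is to pack two
16-byte rows into a single 256-bit register before averaging or
filtering. Below is a minimal standalone sketch of that load step and
of the vertical half-pel average (the y_offset == 4 path); the helper
names are illustrative only and are not part of this patch:

    #include <immintrin.h>
    #include <stdint.h>

    /* Illustrative helper (not in the patch): load rows r and r+1 of a
     * width-16 block into one 256-bit register, with row r in the low
     * lane and row r+1 in the high lane. */
    static __m256i load_two_rows_16(const uint8_t *src, int src_stride) {
      const __m128i row0 = _mm_loadu_si128((const __m128i *)src);
      const __m128i row1 =
          _mm_loadu_si128((const __m128i *)(src + src_stride));
      return _mm256_inserti128_si256(_mm256_castsi128_si256(row0), row1, 1);
    }

    /* Vertical half-pel average for two rows at once: each row is
     * averaged with the row one stride below it. */
    static __m256i half_pel_avg_two_rows_16(const uint8_t *src,
                                            int src_stride) {
      const __m256i cur = load_two_rows_16(src, src_stride);
      const __m256i nxt = load_two_rows_16(src + src_stride, src_stride);
      return _mm256_avg_epu8(cur, nxt);
    }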

Change-Id: Iaf685ec819f0acd4a190e16c9f29319788aab899
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 904044d..b7d5a41 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1161,9 +1161,9 @@
   specialize qw/aom_sub_pixel_variance32x64     avx2      msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance32x32     avx2 neon msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance32x16     avx2      msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x32               msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x16          neon msa sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x8                msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x32     avx2      msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x16     avx2 neon msa sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x8      avx2      msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance8x16                msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance8x8            neon msa sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance8x4                 msa sse2 ssse3/;
@@ -1195,10 +1195,10 @@
   specialize qw/aom_variance64x16 sse2 avx2/;
 
   specialize qw/aom_sub_pixel_variance4x16 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x4 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x4 avx2 sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance8x32 sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance32x8 sse2 ssse3/;
-  specialize qw/aom_sub_pixel_variance16x64 sse2 ssse3/;
+  specialize qw/aom_sub_pixel_variance16x64 avx2 sse2 ssse3/;
   specialize qw/aom_sub_pixel_variance64x16 sse2 ssse3/;
   specialize qw/aom_sub_pixel_avg_variance4x16 sse2 ssse3/;
   specialize qw/aom_sub_pixel_avg_variance16x4 sse2 ssse3/;
diff --git a/aom_dsp/x86/variance_avx2.c b/aom_dsp/x86/variance_avx2.c
index 6371e2e..c4919ba 100644
--- a/aom_dsp/x86/variance_avx2.c
+++ b/aom_dsp/x86/variance_avx2.c
@@ -234,6 +234,10 @@
                                              int x_offset, int y_offset,
                                              const uint8_t *dst, int dst_stride,
                                              int height, unsigned int *sse);
+unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride,
+                                             int x_offset, int y_offset,
+                                             const uint8_t *dst, int dst_stride,
+                                             int height, unsigned int *sse);
 
 unsigned int aom_sub_pixel_avg_variance32xh_avx2(
     const uint8_t *src, int src_stride, int x_offset, int y_offset,
@@ -276,6 +280,11 @@
 AOM_SUB_PIXEL_VAR_AVX2(32, 64, 32, 5, 6);
 AOM_SUB_PIXEL_VAR_AVX2(32, 32, 32, 5, 5);
 AOM_SUB_PIXEL_VAR_AVX2(32, 16, 32, 5, 4);
+AOM_SUB_PIXEL_VAR_AVX2(16, 64, 16, 4, 6);
+AOM_SUB_PIXEL_VAR_AVX2(16, 32, 16, 4, 5);
+AOM_SUB_PIXEL_VAR_AVX2(16, 16, 16, 4, 4);
+AOM_SUB_PIXEL_VAR_AVX2(16, 8, 16, 4, 3);
+AOM_SUB_PIXEL_VAR_AVX2(16, 4, 16, 4, 2);
 
 #define AOM_SUB_PIXEL_AVG_VAR_AVX2(w, h, wf, wlog2, hlog2)                \
   unsigned int aom_sub_pixel_avg_variance##w##x##h##_avx2(                \
diff --git a/aom_dsp/x86/variance_impl_avx2.c b/aom_dsp/x86/variance_impl_avx2.c
index aef012b..f779270 100644
--- a/aom_dsp/x86/variance_impl_avx2.c
+++ b/aom_dsp/x86/variance_impl_avx2.c
@@ -104,6 +104,65 @@
   sum = _mm_cvtsi128_si32(_mm256_castsi256_si128(sum_reg)) +               \
         _mm_cvtsi128_si32(_mm256_extractf128_si256(sum_reg, 1));
 
+// Macros used by the width-16 sub pixel variance functions
+#define LOAD_SRC_DST_INSERT(src_stride, dst_stride)              \
+  /* load source and destination of 2 rows and insert*/          \
+  src_reg = _mm256_inserti128_si256(                             \
+      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))), \
+      _mm_loadu_si128((__m128i *)(src + src_stride)), 1);        \
+  dst_reg = _mm256_inserti128_si256(                             \
+      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
+      _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);
+
+#define AVG_NEXT_SRC_INSERT(src_reg, size_stride)                              \
+  src_next_reg = _mm256_inserti128_si256(                                      \
+      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
+      _mm_loadu_si128((__m128i *)(src + (size_stride << 1))), 1);              \
+  /* average between current and next stride source */                         \
+  src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+
+#define MERGE_NEXT_SRC_INSERT(src_reg, size_stride)                            \
+  src_next_reg = _mm256_inserti128_si256(                                      \
+      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + size_stride))), \
+      _mm_loadu_si128((__m128i *)(src + (src_stride + size_stride))), 1);      \
+  MERGE_WITH_SRC(src_reg, src_next_reg)
+
+#define LOAD_SRC_NEXT_BYTE_INSERT                                    \
+  /* load source and another source from next row   */               \
+  src_reg = _mm256_inserti128_si256(                                 \
+      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src))),     \
+      _mm_loadu_si128((__m128i *)(src + src_stride)), 1);            \
+  /* load source and next row source from 1 byte onwards   */        \
+  src_next_reg = _mm256_inserti128_si256(                            \
+      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(src + 1))), \
+      _mm_loadu_si128((__m128i *)(src + src_stride + 1)), 1);
+
+#define LOAD_DST_INSERT                                          \
+  dst_reg = _mm256_inserti128_si256(                             \
+      _mm256_castsi128_si256(_mm_loadu_si128((__m128i *)(dst))), \
+      _mm_loadu_si128((__m128i *)(dst + dst_stride)), 1);
+
+#define LOAD_SRC_MERGE_128BIT(filter)                        \
+  __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));     \
+  __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1)); \
+  __m128i src_lo = _mm_unpacklo_epi8(src_reg_0, src_reg_1);  \
+  __m128i src_hi = _mm_unpackhi_epi8(src_reg_0, src_reg_1);  \
+  __m128i filter_128bit = _mm256_castsi256_si128(filter);    \
+  __m128i pw8_128bit = _mm256_castsi256_si128(pw8);
+
+#define FILTER_SRC_128BIT(filter)             \
+  /* filter the source */                     \
+  src_lo = _mm_maddubs_epi16(src_lo, filter); \
+  src_hi = _mm_maddubs_epi16(src_hi, filter); \
+                                              \
+  /* add 8 to source */                       \
+  src_lo = _mm_add_epi16(src_lo, pw8_128bit); \
+  src_hi = _mm_add_epi16(src_hi, pw8_128bit); \
+                                              \
+  /* divide source by 16 */                   \
+  src_lo = _mm_srai_epi16(src_lo, 4);         \
+  src_hi = _mm_srai_epi16(src_hi, 4);
+
 unsigned int aom_sub_pixel_variance32xh_avx2(const uint8_t *src, int src_stride,
                                              int x_offset, int y_offset,
                                              const uint8_t *dst, int dst_stride,
@@ -292,6 +351,244 @@
   return sum;
 }
 
+unsigned int aom_sub_pixel_variance16xh_avx2(const uint8_t *src, int src_stride,
+                                             int x_offset, int y_offset,
+                                             const uint8_t *dst, int dst_stride,
+                                             int height, unsigned int *sse) {
+  __m256i src_reg, dst_reg, exp_src_lo, exp_src_hi, exp_dst_lo, exp_dst_hi;
+  __m256i sse_reg, sum_reg, sse_reg_hi, res_cmp, sum_reg_lo, sum_reg_hi;
+  __m256i zero_reg;
+  int i, sum;
+  sum_reg = _mm256_set1_epi16(0);
+  sse_reg = _mm256_set1_epi16(0);
+  zero_reg = _mm256_set1_epi16(0);
+
+  // x_offset = 0 and y_offset = 0
+  if (x_offset == 0) {
+    if (y_offset == 0) {
+      for (i = 0; i < height; i += 2) {
+        LOAD_SRC_DST_INSERT(src_stride, dst_stride)
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src += (src_stride << 1);
+        dst += (dst_stride << 1);
+      }
+      // x_offset = 0 and y_offset = 4
+    } else if (y_offset == 4) {
+      __m256i src_next_reg;
+      for (i = 0; i < height; i += 2) {
+        LOAD_SRC_DST_INSERT(src_stride, dst_stride)
+        AVG_NEXT_SRC_INSERT(src_reg, src_stride)
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src += (src_stride << 1);
+        dst += (dst_stride << 1);
+      }
+      // x_offset = 0 and y_offset = bilin interpolation
+    } else {
+      __m256i filter, pw8, src_next_reg;
+      y_offset <<= 5;
+      filter = _mm256_load_si256(
+          (__m256i const *)(bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height; i += 2) {
+        LOAD_SRC_DST_INSERT(src_stride, dst_stride)
+        MERGE_NEXT_SRC_INSERT(src_reg, src_stride)
+        FILTER_SRC(filter)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src += (src_stride << 1);
+        dst += (dst_stride << 1);
+      }
+    }
+    // x_offset = 4  and y_offset = 0
+  } else if (x_offset == 4) {
+    if (y_offset == 0) {
+      __m256i src_next_reg;
+      for (i = 0; i < height; i += 2) {
+        LOAD_SRC_NEXT_BYTE_INSERT
+        LOAD_DST_INSERT
+        /* horizontal half-pel: average source with source shifted by 1 byte */
+        src_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_reg, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src += (src_stride << 1);
+        dst += (dst_stride << 1);
+      }
+      // x_offset = 4  and y_offset = 4
+    } else if (y_offset == 4) {
+      __m256i src_next_reg, src_avg, src_temp;
+      // load and insert source and next row source
+      LOAD_SRC_NEXT_BYTE_INSERT
+      src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
+      src += src_stride << 1;
+      for (i = 0; i < height - 2; i += 2) {
+        LOAD_SRC_NEXT_BYTE_INSERT
+        src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+        src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);
+        src_temp = _mm256_avg_epu8(src_avg, src_temp);
+        LOAD_DST_INSERT
+        // expand each byte to 2 bytes
+        MERGE_WITH_SRC(src_temp, zero_reg)
+        // save current source average
+        src_avg = src_next_reg;
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst += dst_stride << 1;
+        src += src_stride << 1;
+      }
+      // last 2 rows processing happens here
+      __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));
+      __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));
+      src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);
+      src_next_reg = _mm256_permute2x128_si256(
+          src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);
+      LOAD_DST_INSERT
+      src_avg = _mm256_avg_epu8(src_avg, src_next_reg);
+      MERGE_WITH_SRC(src_avg, zero_reg)
+      CALC_SUM_SSE_INSIDE_LOOP
+    } else {
+      // x_offset = 4  and y_offset = bilin interpolation
+      __m256i filter, pw8, src_next_reg, src_avg, src_temp;
+      y_offset <<= 5;
+      filter = _mm256_load_si256(
+          (__m256i const *)(bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load and insert source and next row source
+      LOAD_SRC_NEXT_BYTE_INSERT
+      src_avg = _mm256_avg_epu8(src_reg, src_next_reg);
+      src += src_stride << 1;
+      for (i = 0; i < height - 2; i += 2) {
+        LOAD_SRC_NEXT_BYTE_INSERT
+        src_next_reg = _mm256_avg_epu8(src_reg, src_next_reg);
+        src_temp = _mm256_permute2x128_si256(src_avg, src_next_reg, 0x21);
+        LOAD_DST_INSERT
+        MERGE_WITH_SRC(src_avg, src_temp)
+        // save current source average
+        src_avg = src_next_reg;
+        FILTER_SRC(filter)
+        CALC_SUM_SSE_INSIDE_LOOP
+        dst += dst_stride << 1;
+        src += src_stride << 1;
+      }
+      // last 2 rows processing happens here
+      __m128i src_reg_0 = _mm_loadu_si128((__m128i *)(src));
+      __m128i src_reg_1 = _mm_loadu_si128((__m128i *)(src + 1));
+      src_reg_0 = _mm_avg_epu8(src_reg_0, src_reg_1);
+      src_next_reg = _mm256_permute2x128_si256(
+          src_avg, _mm256_castsi128_si256(src_reg_0), 0x21);
+      LOAD_DST_INSERT
+      MERGE_WITH_SRC(src_avg, src_next_reg)
+      FILTER_SRC(filter)
+      CALC_SUM_SSE_INSIDE_LOOP
+    }
+    // x_offset = bilin interpolation and y_offset = 0
+  } else {
+    if (y_offset == 0) {
+      __m256i filter, pw8, src_next_reg;
+      x_offset <<= 5;
+      filter = _mm256_load_si256(
+          (__m256i const *)(bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      for (i = 0; i < height; i += 2) {
+        LOAD_SRC_DST_INSERT(src_stride, dst_stride)
+        MERGE_NEXT_SRC_INSERT(src_reg, 1)
+        FILTER_SRC(filter)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src += (src_stride << 1);
+        dst += (dst_stride << 1);
+      }
+      // x_offset = bilin interpolation and y_offset = 4
+    } else if (y_offset == 4) {
+      __m256i filter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      filter = _mm256_load_si256(
+          (__m256i const *)(bilinear_filters_avx2 + x_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load and insert source and next row source
+      LOAD_SRC_NEXT_BYTE_INSERT
+      MERGE_WITH_SRC(src_reg, src_next_reg)
+      FILTER_SRC(filter)
+      // pack the 16-bit filter results back to 8-bit in each lane
+      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      src += src_stride << 1;
+      for (i = 0; i < height - 2; i += 2) {
+        LOAD_SRC_NEXT_BYTE_INSERT
+        LOAD_DST_INSERT
+        MERGE_WITH_SRC(src_reg, src_next_reg)
+        FILTER_SRC(filter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);
+        // average the previous pack with the current one
+        src_pack = _mm256_avg_epu8(src_pack, src_next_reg);
+        MERGE_WITH_SRC(src_pack, zero_reg)
+        CALC_SUM_SSE_INSIDE_LOOP
+        src_pack = src_reg;
+        src += src_stride << 1;
+        dst += dst_stride << 1;
+      }
+      // last 2 rows processing happens here
+      LOAD_SRC_MERGE_128BIT(filter)
+      LOAD_DST_INSERT
+      FILTER_SRC_128BIT(filter_128bit)
+      src_reg_0 = _mm_packus_epi16(src_lo, src_hi);
+      src_next_reg = _mm256_permute2x128_si256(
+          src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);
+      // average the previous pack with the current one
+      src_pack = _mm256_avg_epu8(src_pack, src_next_reg);
+      MERGE_WITH_SRC(src_pack, zero_reg)
+      CALC_SUM_SSE_INSIDE_LOOP
+    } else {
+      // x_offset = bilin interpolation and y_offset = bilin interpolation
+      __m256i xfilter, yfilter, pw8, src_next_reg, src_pack;
+      x_offset <<= 5;
+      xfilter = _mm256_load_si256(
+          (__m256i const *)(bilinear_filters_avx2 + x_offset));
+      y_offset <<= 5;
+      yfilter = _mm256_load_si256(
+          (__m256i const *)(bilinear_filters_avx2 + y_offset));
+      pw8 = _mm256_set1_epi16(8);
+      // load and insert source and next row source
+      LOAD_SRC_NEXT_BYTE_INSERT
+      MERGE_WITH_SRC(src_reg, src_next_reg)
+      FILTER_SRC(xfilter)
+      // pack the 16-bit filter results back to 8-bit in each lane
+      src_pack = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+      src += src_stride << 1;
+      for (i = 0; i < height - 2; i += 2) {
+        LOAD_SRC_NEXT_BYTE_INSERT
+        LOAD_DST_INSERT
+        MERGE_WITH_SRC(src_reg, src_next_reg)
+        FILTER_SRC(xfilter)
+        src_reg = _mm256_packus_epi16(exp_src_lo, exp_src_hi);
+        src_next_reg = _mm256_permute2x128_si256(src_pack, src_reg, 0x21);
+        // merge the previous pack with the current one for y filtering
+        MERGE_WITH_SRC(src_pack, src_next_reg)
+        // filter the source
+        FILTER_SRC(yfilter)
+        src_pack = src_reg;
+        CALC_SUM_SSE_INSIDE_LOOP
+        src += src_stride << 1;
+        dst += dst_stride << 1;
+      }
+      // last 2 rows processing happens here
+      LOAD_SRC_MERGE_128BIT(xfilter)
+      LOAD_DST_INSERT
+      FILTER_SRC_128BIT(filter_128bit)
+      src_reg_0 = _mm_packus_epi16(src_lo, src_hi);
+      src_next_reg = _mm256_permute2x128_si256(
+          src_pack, _mm256_castsi128_si256(src_reg_0), 0x21);
+      MERGE_WITH_SRC(src_pack, src_next_reg)
+      FILTER_SRC(yfilter)
+      CALC_SUM_SSE_INSIDE_LOOP
+    }
+  }
+  CALC_SUM_AND_SSE
+  _mm256_zeroupper();
+  return sum;
+}
+
 unsigned int aom_sub_pixel_avg_variance32xh_avx2(
     const uint8_t *src, int src_stride, int x_offset, int y_offset,
     const uint8_t *dst, int dst_stride, const uint8_t *sec, int sec_stride,
diff --git a/test/variance_test.cc b/test/variance_test.cc
index d39cd5d..1458ece 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -805,7 +805,7 @@
     }
   }
 
-  unsigned int sse1;
+  unsigned int sse1, sse2;
   int run_time = 1000000000 / block_size();
   aom_usec_timer timer;
 
@@ -818,8 +818,24 @@
   aom_usec_timer_mark(&timer);
 
   const int elapsed_time = static_cast<int>(aom_usec_timer_elapsed(&timer));
-  printf("sub_pixel_variance_%dx%d_%d: %d us\n", width(), height(),
-         params_.bit_depth, elapsed_time);
+
+  aom_usec_timer timer_c;
+
+  aom_usec_timer_start(&timer_c);
+  for (int i = 0; i < run_time; ++i) {
+    int x = rnd_(8);
+    int y = rnd_(8);
+    subpel_variance_ref(ref_, src_, params_.log2width, params_.log2height, x, y,
+                        &sse2, use_high_bit_depth(), params_.bit_depth);
+  }
+  aom_usec_timer_mark(&timer_c);
+
+  const int elapsed_time_c = static_cast<int>(aom_usec_timer_elapsed(&timer_c));
+
+  printf(
+      "sub_pixel_variance_%dx%d_%d: ref_time=%d us opt_time=%d us gain=%d \n",
+      width(), height(), params_.bit_depth, elapsed_time_c, elapsed_time,
+      elapsed_time_c / elapsed_time);
 }
 
 template <>
@@ -1076,6 +1092,7 @@
 TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
 TEST_P(AvxSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(AvxSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(AvxSubpelAvgVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxDistWtdSubpelAvgVarianceTest, Ref) { RefTest(); }
 TEST_P(AvxObmcSubpelVarianceTest, Ref) { RefTest(); }
@@ -2272,7 +2289,12 @@
         SubpelVarianceParams(6, 5, &aom_sub_pixel_variance64x32_avx2, 0),
         SubpelVarianceParams(5, 6, &aom_sub_pixel_variance32x64_avx2, 0),
         SubpelVarianceParams(5, 5, &aom_sub_pixel_variance32x32_avx2, 0),
-        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_avx2, 0)));
+        SubpelVarianceParams(5, 4, &aom_sub_pixel_variance32x16_avx2, 0),
+        SubpelVarianceParams(4, 6, &aom_sub_pixel_variance16x64_avx2, 0),
+        SubpelVarianceParams(4, 5, &aom_sub_pixel_variance16x32_avx2, 0),
+        SubpelVarianceParams(4, 4, &aom_sub_pixel_variance16x16_avx2, 0),
+        SubpelVarianceParams(4, 3, &aom_sub_pixel_variance16x8_avx2, 0),
+        SubpelVarianceParams(4, 2, &aom_sub_pixel_variance16x4_avx2, 0)));
 
 INSTANTIATE_TEST_SUITE_P(
     AVX2, AvxSubpelAvgVarianceTest,