AVX2: Add optimization for sad_16xhx4d

Around 5~10% improvement on functional level.

Performance from this series of commits:
RTC:
| SPD_SET |  TESTSET   | AVG_PSNR | OVR_PSNR |  SSIM   | ENC_T |
|---------|------------|----------|----------|---------|-------|
|    5    |    rtc     | +0.000%  | +0.000%  | +0.000% | -0.7% |
|    5    | rtc_1080p  | +0.000%  | +0.000%  | +0.000% | -0.4% |
|    5    |  rtc_derf  | +0.000%  | +0.000%  | +0.000% | -0.8% |
|    5    | rtc_screen | +0.000%  | +0.000%  | +0.000% | -0.2% |
|---------|------------|----------|----------|---------|-------|
|    6    |    rtc     | +0.000%  | +0.000%  | +0.000% | -0.8% |
|    6    | rtc_1080p  | +0.000%  | +0.000%  | +0.000% | -0.5% |
|    6    |  rtc_derf  | +0.000%  | +0.000%  | +0.000% | -0.9% |
|    6    | rtc_screen | +0.000%  | +0.000%  | +0.000% | -0.3% |
|---------|------------|----------|----------|---------|-------|
|    7    |    rtc     | +0.000%  | +0.000%  | +0.000% | -0.5% |
|    7    | rtc_1080p  | +0.000%  | +0.000%  | +0.000% | -0.5% |
|    7    |  rtc_derf  | +0.000%  | +0.000%  | +0.000% | -0.6% |
|    7    | rtc_screen | +0.000%  | +0.000%  | +0.000% | -0.2% |
|---------|------------|----------|----------|---------|-------|
|    8    |    rtc     | +0.000%  | +0.000%  | +0.000% | -0.5% |
|    8    | rtc_1080p  | +0.000%  | +0.000%  | +0.000% | -0.3% |
|    8    |  rtc_derf  | +0.000%  | +0.000%  | +0.000% | -0.5% |
|    8    | rtc_screen | +0.000%  | +0.000%  | +0.000% | -0.2% |
|---------|------------|----------|----------|---------|-------|
|    9    |    rtc     | +0.000%  | +0.000%  | +0.000% | -0.6% |
|    9    | rtc_1080p  | +0.000%  | +0.000%  | +0.000% | -0.3% |
|    9    |  rtc_derf  | +0.000%  | +0.000%  | +0.000% | -0.5% |
|    9    | rtc_screen | +0.000%  | +0.000%  | +0.000% | -0.5% |
|---------|------------|----------|----------|---------|-------|
|   10    |    rtc     | +0.000%  | +0.000%  | +0.000% | -0.5% |
|   10    | rtc_1080p  | +0.000%  | +0.000%  | +0.000% | -0.3% |
|   10    |  rtc_derf  | +0.000%  | +0.000%  | +0.000% | -0.5% |
|   10    | rtc_screen | +0.000%  | +0.000%  | +0.000% | -0.5% |

VOD:
| SPD_SET | TESTSET | AVG_PSNR | OVR_PSNR |  SSIM   | ENC_T |
|---------|---------|----------|----------|---------|-------|
|    1    | hdres2  | +0.000%  | +0.000%  | +0.000% | -0.5% |
|    1    | lowres2 | +0.000%  | +0.000%  | +0.000% | -0.1% |
|    1    | midres2 | +0.000%  | +0.000%  | +0.000% | -0.2% |
|---------|---------|----------|----------|---------|-------|
|    2    | hdres2  | +0.000%  | +0.000%  | +0.000% | -0.4% |
|    2    | lowres2 | +0.000%  | +0.000%  | +0.000% | -0.1% |
|    2    | midres2 | +0.000%  | +0.000%  | +0.000% | -0.2% |
|---------|---------|----------|----------|---------|-------|
|    3    | hdres2  | +0.000%  | +0.000%  | +0.000% | -0.8% |
|    3    | lowres2 | +0.000%  | +0.000%  | +0.000% | -0.3% |
|    3    | midres2 | +0.000%  | +0.000%  | +0.000% | -0.5% |
|---------|---------|----------|----------|---------|-------|
|    4    | hdres2  | +0.000%  | +0.000%  | +0.000% | -1.1% |
|    4    | lowres2 | +0.000%  | +0.000%  | +0.000% | -0.4% |
|    4    | midres2 | +0.000%  | +0.000%  | +0.000% | -0.7% |
|---------|---------|----------|----------|---------|-------|
|    5    | hdres2  | +0.000%  | +0.000%  | +0.000% | -1.5% |
|    5    | lowres2 | +0.000%  | +0.000%  | +0.000% | -0.6% |
|    5    | midres2 | +0.000%  | +0.000%  | +0.000% | -0.9% |
|---------|---------|----------|----------|---------|-------|
|    6    | hdres2  | +0.000%  | +0.000%  | +0.000% | -1.6% |
|    6    | lowres2 | +0.000%  | +0.000%  | +0.000% | -0.7% |
|    6    | midres2 | +0.000%  | +0.000%  | +0.000% | -1.1% |

BUG=aomedia:3358

Change-Id: I63dd9113e07314e0238268b1f04a8d4b2e3397c1
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 427a3dc..4c22060 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1013,9 +1013,9 @@
   specialize qw/aom_sad32x64x4d   avx2 neon sse2/;
   specialize qw/aom_sad32x32x4d   avx2 neon sse2/;
   specialize qw/aom_sad32x16x4d   avx2 neon sse2/;
-  specialize qw/aom_sad16x32x4d        neon sse2/;
-  specialize qw/aom_sad16x16x4d        neon sse2/;
-  specialize qw/aom_sad16x8x4d         neon sse2/;
+  specialize qw/aom_sad16x32x4d   avx2 neon sse2/;
+  specialize qw/aom_sad16x16x4d   avx2 neon sse2/;
+  specialize qw/aom_sad16x8x4d    avx2 neon sse2/;
 
   specialize qw/aom_sad8x16x4d         neon sse2/;
   specialize qw/aom_sad8x8x4d          neon sse2/;
@@ -1026,8 +1026,8 @@
 
   specialize qw/aom_sad64x16x4d   avx2 neon sse2/;
   specialize qw/aom_sad32x8x4d    avx2 neon sse2/;
-  specialize qw/aom_sad16x64x4d        neon sse2/;
-  specialize qw/aom_sad16x4x4d         neon sse2/;
+  specialize qw/aom_sad16x64x4d   avx2 neon sse2/;
+  specialize qw/aom_sad16x4x4d    avx2 neon sse2/;
   specialize qw/aom_sad8x32x4d         neon sse2/;
   specialize qw/aom_sad4x16x4d         neon sse2/;
 
@@ -1042,10 +1042,10 @@
   specialize qw/aom_sad_skip_32x16x4d   avx2 sse2 neon/;
   specialize qw/aom_sad_skip_32x8x4d    avx2 sse2 neon/;
 
-  specialize qw/aom_sad_skip_16x64x4d        sse2 neon/;
-  specialize qw/aom_sad_skip_16x32x4d        sse2 neon/;
-  specialize qw/aom_sad_skip_16x16x4d        sse2 neon/;
-  specialize qw/aom_sad_skip_16x8x4d         sse2 neon/;
+  specialize qw/aom_sad_skip_16x64x4d   avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_16x32x4d   avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_16x16x4d   avx2 sse2 neon/;
+  specialize qw/aom_sad_skip_16x8x4d    avx2 sse2 neon/;
   specialize qw/aom_sad_skip_8x32x4d         sse2 neon/;
   specialize qw/aom_sad_skip_8x16x4d         sse2 neon/;
   specialize qw/aom_sad_skip_8x8x4d          sse2 neon/;
diff --git a/aom_dsp/x86/sad4d_avx2.c b/aom_dsp/x86/sad4d_avx2.c
index 7629cf4..adfbd43 100644
--- a/aom_dsp/x86/sad4d_avx2.c
+++ b/aom_dsp/x86/sad4d_avx2.c
@@ -225,12 +225,72 @@
                           _mm256_setzero_si256());
 }
 
+static AOM_FORCE_INLINE void aom_sad16xNx4d_avx2(int N, const uint8_t *src,
+                                                 int src_stride,
+                                                 const uint8_t *const ref[4],
+                                                 int ref_stride,
+                                                 uint32_t res[4]) {
+  __m256i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
+  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
+  const uint8_t *ref0, *ref1, *ref2, *ref3;
+  assert(N % 2 == 0);
+
+  ref0 = ref[0];
+  ref1 = ref[1];
+  ref2 = ref[2];
+  ref3 = ref[3];
+
+  sum_ref0 = _mm256_setzero_si256();
+  sum_ref2 = _mm256_setzero_si256();
+  sum_ref1 = _mm256_setzero_si256();
+  sum_ref3 = _mm256_setzero_si256();
+
+  for (int i = 0; i < N; i += 2) {
+    // load src and all refs
+    src_reg = yy_loadu2_128(src + src_stride, src);
+    ref0_reg = yy_loadu2_128(ref0 + ref_stride, ref0);
+    ref1_reg = yy_loadu2_128(ref1 + ref_stride, ref1);
+    ref2_reg = yy_loadu2_128(ref2 + ref_stride, ref2);
+    ref3_reg = yy_loadu2_128(ref3 + ref_stride, ref3);
+
+    // sum of the absolute differences between every ref-i to src
+    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
+    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
+    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
+    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
+
+    // sum every ref-i
+    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
+    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
+    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
+    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
+
+    src += 2 * src_stride;
+    ref0 += 2 * ref_stride;
+    ref1 += 2 * ref_stride;
+    ref2 += 2 * ref_stride;
+    ref3 += 2 * ref_stride;
+  }
+
+  aggregate_and_store_sum(res, sum_ref0, sum_ref1, sum_ref2, sum_ref3);
+}
+
 #define SAD16XNX3_AVX2(n)                                                   \
   void aom_sad16x##n##x3d_avx2(const uint8_t *src, int src_stride,          \
                                const uint8_t *const ref[4], int ref_stride, \
                                uint32_t res[4]) {                           \
     aom_sad16xNx3d_avx2(n, src, src_stride, ref, ref_stride, res);          \
   }
+#define SAD16XNX4_AVX2(n)                                                   \
+  void aom_sad16x##n##x4d_avx2(const uint8_t *src, int src_stride,          \
+                               const uint8_t *const ref[4], int ref_stride, \
+                               uint32_t res[4]) {                           \
+    aom_sad16xNx4d_avx2(n, src, src_stride, ref, ref_stride, res);          \
+  }
+
+SAD16XNX4_AVX2(32)
+SAD16XNX4_AVX2(16)
+SAD16XNX4_AVX2(8)
 
 SAD16XNX3_AVX2(32)
 SAD16XNX3_AVX2(16)
@@ -239,4 +299,29 @@
 #if !CONFIG_REALTIME_ONLY
 SAD16XNX3_AVX2(64)
 SAD16XNX3_AVX2(4)
+
+SAD16XNX4_AVX2(64)
+SAD16XNX4_AVX2(4)
+
+#endif  // !CONFIG_REALTIME_ONLY
+
+#define SAD_SKIP_16XN_AVX2(n)                                                 \
+  void aom_sad_skip_16x##n##x4d_avx2(const uint8_t *src, int src_stride,      \
+                                     const uint8_t *const ref[4],             \
+                                     int ref_stride, uint32_t res[4]) {       \
+    aom_sad16xNx4d_avx2(((n) >> 1), src, 2 * src_stride, ref, 2 * ref_stride, \
+                        res);                                                 \
+    res[0] <<= 1;                                                             \
+    res[1] <<= 1;                                                             \
+    res[2] <<= 1;                                                             \
+    res[3] <<= 1;                                                             \
+  }
+
+SAD_SKIP_16XN_AVX2(32)
+SAD_SKIP_16XN_AVX2(16)
+SAD_SKIP_16XN_AVX2(8)
+
+#if !CONFIG_REALTIME_ONLY
+SAD_SKIP_16XN_AVX2(64)
+SAD_SKIP_16XN_AVX2(4)
 #endif  // !CONFIG_REALTIME_ONLY
diff --git a/test/sad_test.cc b/test/sad_test.cc
index 9dae336..98c8f51 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -337,6 +337,30 @@
     }
   }
 
+  virtual void SADForSpeedTest(unsigned int *results,
+                               const uint8_t *const *references) {
+    (void)results;
+    (void)references;
+  }
+
+  void SpeedSAD() {
+    int test_count = 20000000;
+    unsigned int exp_sad[4];
+    const uint8_t *references[] = { GetReference(0), GetReference(1),
+                                    GetReference(2), GetReference(3) };
+    aom_usec_timer timer;
+    aom_usec_timer_start(&timer);
+    while (test_count > 0) {
+      SADForSpeedTest(exp_sad, references);
+      test_count -= 1;
+    }
+    aom_usec_timer_mark(&timer);
+    const int64_t time = aom_usec_timer_elapsed(&timer) / 1000;
+    std::cout << "BLOCK_" << width_ << "X" << height_
+              << ", bit_depth:" << bit_depth_ << ",Time: " << time << "ms"
+              << std::endl;
+  }
+
   int width_, height_, mask_, bd_;
   aom_bit_depth_t bit_depth_;
   static uint8_t *source_data_;
@@ -376,9 +400,14 @@
         source_data_, source_stride_, references, reference_stride_, results));
   }
 
+  void SADForSpeedTest(unsigned int *results,
+                       const uint8_t *const *references) {
+    GET_PARAM(2)
+    (source_data_, source_stride_, references, reference_stride_, results);
+  }
+
   void CheckSADs() {
     unsigned int reference_sad, exp_sad[4];
-
     SADs(exp_sad);
     for (int block = 0; block < 4; ++block) {
       reference_sad = ReferenceSAD(block);
@@ -386,15 +415,6 @@
       EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
     }
   }
-
-  void SpeedSAD() {
-    int test_count = 2000000;
-    unsigned int exp_sad[4];
-    while (test_count > 0) {
-      SADs(exp_sad);
-      test_count -= 1;
-    }
-  }
 };
 
 class SADx3Test : public ::testing::WithParamInterface<SadMxNx4Param>,
@@ -407,8 +427,14 @@
     const uint8_t *references[] = { GetReference(0), GetReference(1),
                                     GetReference(2), GetReference(3) };
 
-    API_REGISTER_STATE_CHECK(GET_PARAM(2)(
-        source_data_, source_stride_, references, reference_stride_, results));
+    GET_PARAM(2)
+    (source_data_, source_stride_, references, reference_stride_, results);
+  }
+
+  void SADForSpeedTest(unsigned int *results,
+                       const uint8_t *const *references) {
+    GET_PARAM(2)
+    (source_data_, source_stride_, references, reference_stride_, results);
   }
 
   void CheckSADs() {
@@ -421,15 +447,6 @@
       EXPECT_EQ(reference_sad, exp_sad[block]) << "block " << block;
     }
   }
-
-  void SpeedSAD() {
-    int test_count = 2000000;
-    unsigned int exp_sad[4];
-    while (test_count > 0) {
-      SADs(exp_sad);
-      test_count -= 1;
-    }
-  }
 };
 
 class SADSkipx4Test : public ::testing::WithParamInterface<SadMxNx4Param>,
@@ -457,13 +474,10 @@
     }
   }
 
-  void SpeedSAD() {
-    int test_count = 2000000;
-    unsigned int exp_sad[4];
-    while (test_count > 0) {
-      SADs(exp_sad);
-      test_count -= 1;
-    }
+  void SADForSpeedTest(unsigned int *results,
+                       const uint8_t *const *references) {
+    GET_PARAM(2)
+    (source_data_, source_stride_, references, reference_stride_, results);
   }
 };
 
@@ -494,13 +508,11 @@
     }
   }
 
-  void SpeedSAD() {
-    int test_count = 200000;
-    unsigned int exp_sad[4];
-    while (test_count > 0) {
-      SADs(exp_sad);
-      test_count -= 1;
-    }
+  void SADForSpeedTest(unsigned int *results,
+                       const uint8_t *const *references) {
+    GET_PARAM(2)
+    (source_data_, source_stride_, references, reference_stride_, second_pred_,
+     results);
   }
 };
 #endif  // !CONFIG_REALTIME_ONLY
@@ -527,12 +539,11 @@
     ASSERT_EQ(reference_sad, exp_sad);
   }
 
-  void SpeedSAD() {
-    int test_count = 20000000;
-    while (test_count > 0) {
-      SAD(0);
-      test_count -= 1;
-    }
+  void SADForSpeedTest(unsigned int *results,
+                       const uint8_t *const *references) {
+    GET_PARAM(2)
+    (source_data_, source_stride_, references[0], reference_stride_);
+    (void)results;
   }
 };
 
@@ -558,12 +569,11 @@
     ASSERT_EQ(reference_sad, exp_sad);
   }
 
-  void SpeedSAD() {
-    int test_count = 20000000;
-    while (test_count > 0) {
-      SAD(0);
-      test_count -= 1;
-    }
+  void SADForSpeedTest(unsigned int *results,
+                       const uint8_t *const *references) {
+    GET_PARAM(2)
+    (source_data_, source_stride_, references[0], reference_stride_);
+    (void)results;
   }
 };
 
@@ -648,12 +658,12 @@
     ASSERT_EQ(reference_sad, exp_sad);
   }
 
-  void SpeedSAD() {
-    int test_count = 20000000;
-    while (test_count > 0) {
-      SAD(0);
-      test_count -= 1;
-    }
+  void SADForSpeedTest(unsigned int *results,
+                       const uint8_t *const *references) {
+    GET_PARAM(2)
+    (source_data_, source_stride_, references[0], reference_stride_, width_,
+     height_);
+    (void)results;
   }
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(DistWtdSADTest);
@@ -2928,6 +2938,9 @@
   make_tuple(32, 64, &aom_sad_skip_32x64x4d_avx2, -1),
   make_tuple(32, 32, &aom_sad_skip_32x32x4d_avx2, -1),
   make_tuple(32, 16, &aom_sad_skip_32x16x4d_avx2, -1),
+  make_tuple(16, 32, &aom_sad_skip_16x32x4d_avx2, -1),
+  make_tuple(16, 16, &aom_sad_skip_16x16x4d_avx2, -1),
+  make_tuple(16, 8, &aom_sad_skip_16x8x4d_avx2, -1),
 
 #if CONFIG_AV1_HIGHBITDEPTH
   make_tuple(128, 128, &aom_highbd_sad_skip_128x128x4d_avx2, 8),
@@ -2984,6 +2997,8 @@
 #if !CONFIG_REALTIME_ONLY
   make_tuple(64, 16, &aom_sad_skip_64x16x4d_avx2, -1),
   make_tuple(32, 8, &aom_sad_skip_32x8x4d_avx2, -1),
+
+  make_tuple(16, 64, &aom_sad_skip_16x64x4d_avx2, -1),
 #endif
 };
 
@@ -2991,6 +3006,9 @@
                          ::testing::ValuesIn(skip_x4d_avx2_tests));
 
 const SadMxNx4Param x4d_avx2_tests[] = {
+  make_tuple(16, 32, &aom_sad16x32x4d_avx2, -1),
+  make_tuple(16, 16, &aom_sad16x16x4d_avx2, -1),
+  make_tuple(16, 8, &aom_sad16x8x4d_avx2, -1),
   make_tuple(32, 64, &aom_sad32x64x4d_avx2, -1),
   make_tuple(32, 32, &aom_sad32x32x4d_avx2, -1),
   make_tuple(32, 16, &aom_sad32x16x4d_avx2, -1),
@@ -3001,6 +3019,8 @@
   make_tuple(128, 64, &aom_sad128x64x4d_avx2, -1),
 
 #if !CONFIG_REALTIME_ONLY
+  make_tuple(16, 64, &aom_sad16x64x4d_avx2, -1),
+  make_tuple(16, 4, &aom_sad16x4x4d_avx2, -1),
   make_tuple(32, 8, &aom_sad32x8x4d_avx2, -1),
   make_tuple(64, 16, &aom_sad64x16x4d_avx2, -1),
 #endif