Optimize avg_8x8 sse2 and avx2 intrinsics

The averages are now computed with psadbw (_mm_sad_epu8 /
_mm256_sad_epu8) row sums instead of widening each row to 16-bit
lanes, and the quad variants process two 8x8 sub-blocks per 128-bit
lane.

Microbenchmark results (speedup over the existing implementations):
+--------------+-----------+-------+-------+
|              | Microarch |  AVX2 |  SSE2 |
+--------------+-----------+-------+-------+
|              |   Zen 2   | x1.22 | x2.66 |
| avg_8x8_quad +-----------+-------+-------+
|              |  Skylake  | x1.38 | x2.89 |
+--------------+-----------+-------+-------+
|              |   Zen 2   |  N/A  | x1.03 |
|    avg_8x8   +-----------+-------+-------+
|              |  Skylake  |  N/A  | x1.03 |
+--------------+-----------+-------+-------+
|              |   Zen 2   |  N/A  | x1.00 |
|    avg_4x4   +-----------+-------+-------+
|              |  Skylake  |  N/A  | x1.03 |
+--------------+-----------+-------+-------+

Also improve test coverage by testing different offsets.

Change-Id: I290385a19d3ac2cc470a97b4c335e5b2abb9b966
diff --git a/aom_dsp/x86/avg_intrin_avx2.c b/aom_dsp/x86/avg_intrin_avx2.c
index 6f374ee..6f4436f 100644
--- a/aom_dsp/x86/avg_intrin_avx2.c
+++ b/aom_dsp/x86/avg_intrin_avx2.c
@@ -505,52 +505,42 @@
   }
 }
 
-static INLINE __m256i calc_avg_8x8_dual_avx2(const uint8_t *s, int p) {
-  const __m256i s0 =
-      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s)));
-  const __m256i s1 =
-      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + p)));
-  const __m256i s2 =
-      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 2 * p)));
-  const __m256i s3 =
-      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 3 * p)));
-  const __m256i sum0 =
-      _mm256_add_epi16(_mm256_add_epi16(s0, s1), _mm256_add_epi16(s2, s3));
-  const __m256i s4 =
-      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 4 * p)));
-  const __m256i s5 =
-      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 5 * p)));
-  const __m256i s6 =
-      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 6 * p)));
-  const __m256i s7 =
-      _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(s + 7 * p)));
-  const __m256i sum1 =
-      _mm256_add_epi16(_mm256_add_epi16(s4, s5), _mm256_add_epi16(s6, s7));
-
-  // The result of two 8x8 sub-blocks in 16x16 block.
-  return _mm256_add_epi16(sum0, sum1);
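+// Load two unaligned 128-bit values into the low and high 128-bit lanes of a
+// 256-bit register.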
+static INLINE __m256i xx_loadu2_mi128(const void *hi, const void *lo) {
+  __m256i a = _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(lo)));
+  a = _mm256_inserti128_si256(a, _mm_loadu_si128((const __m128i *)(hi)), 1);
+  return a;
 }
 
 void aom_avg_8x8_quad_avx2(const uint8_t *s, int p, int x16_idx, int y16_idx,
                            int *avg) {
-  // Process 1st and 2nd 8x8 sub-blocks in a 16x16 block.
-  const uint8_t *s_tmp = s + y16_idx * p + x16_idx;
-  __m256i result_0 = calc_avg_8x8_dual_avx2(s_tmp, p);
+  const uint8_t *s_y0 = s + y16_idx * p + x16_idx;
+  const uint8_t *s_y1 = s_y0 + 8 * p;
+  __m256i sum0, sum1, s0, s1, s2, s3, u0;
+  u0 = _mm256_setzero_si256();
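+  // Each 256-bit load holds one row of all four 8x8 sub-blocks: the low
+  // 128-bit lane covers the two upper sub-blocks and the high lane the two
+  // lower ones. psadbw against zero then leaves one 64-bit partial row sum
+  // per sub-block.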
+  s0 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1, s_y0), u0);
+  s1 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + p, s_y0 + p), u0);
+  s2 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 2 * p, s_y0 + 2 * p), u0);
+  s3 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 3 * p, s_y0 + 3 * p), u0);
+  sum0 = _mm256_add_epi16(s0, s1);
+  sum1 = _mm256_add_epi16(s2, s3);
+  s0 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 4 * p, s_y0 + 4 * p), u0);
+  s1 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 5 * p, s_y0 + 5 * p), u0);
+  s2 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 6 * p, s_y0 + 6 * p), u0);
+  s3 = _mm256_sad_epu8(xx_loadu2_mi128(s_y1 + 7 * p, s_y0 + 7 * p), u0);
+  sum0 = _mm256_add_epi16(sum0, _mm256_add_epi16(s0, s1));
+  sum1 = _mm256_add_epi16(sum1, _mm256_add_epi16(s2, s3));
+  sum0 = _mm256_add_epi16(sum0, sum1);
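+  // Each 64-bit lane of sum0 now holds the pixel sum of one 8x8 sub-block.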
 
-  // Process 3rd and 4th 8x8 sub-blocks in a 16x16 block.
-  s_tmp = s + ((y16_idx + 8) * p) + x16_idx;
-  __m256i result_1 = calc_avg_8x8_dual_avx2(s_tmp, p);
-
-  const __m256i constant_32 = _mm256_set1_epi16(32);
-  result_0 = _mm256_hadd_epi16(result_0, result_1);
-  result_1 = _mm256_adds_epu16(result_0, _mm256_srli_si256(result_0, 4));
-  result_0 = _mm256_adds_epu16(result_1, _mm256_srli_si256(result_1, 2));
-  result_0 = _mm256_adds_epu16(result_0, constant_32);
-  result_0 = _mm256_srli_epi16(result_0, 6);
-  avg[0] = _mm_extract_epi16(_mm256_castsi256_si128(result_0), 0);
-  avg[1] = _mm_extract_epi16(_mm256_extracti128_si256(result_0, 1), 0);
-  avg[2] = _mm_extract_epi16(_mm256_castsi256_si128(result_0), 4);
-  avg[3] = _mm_extract_epi16(_mm256_extracti128_si256(result_0, 1), 4);
+  // (avg + 32) >> 6
+  __m256i rounding = _mm256_set1_epi32(32);
+  sum0 = _mm256_add_epi32(sum0, rounding);
+  sum0 = _mm256_srli_epi32(sum0, 6);
+  __m128i lo = _mm256_castsi256_si128(sum0);
+  __m128i hi = _mm256_extracti128_si256(sum0, 1);
+  avg[0] = _mm_cvtsi128_si32(lo);
+  avg[1] = _mm_extract_epi32(lo, 2);
+  avg[2] = _mm_cvtsi128_si32(hi);
+  avg[3] = _mm_extract_epi32(hi, 2);
 }
 
 void aom_int_pro_row_avx2(int16_t *hbuf, const uint8_t *ref,
diff --git a/aom_dsp/x86/avg_intrin_sse2.c b/aom_dsp/x86/avg_intrin_sse2.c
index 9657ecc..c48da15 100644
--- a/aom_dsp/x86/avg_intrin_sse2.c
+++ b/aom_dsp/x86/avg_intrin_sse2.c
@@ -14,6 +14,7 @@
 #include "config/aom_dsp_rtcd.h"
 #include "aom/aom_integer.h"
 #include "aom_dsp/x86/bitdepth_conversion_sse2.h"
+#include "aom_dsp/x86/mem_sse2.h"
 #include "aom_ports/mem.h"
 
 void aom_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
@@ -95,39 +96,61 @@
 }
 
 unsigned int aom_avg_8x8_sse2(const uint8_t *s, int p) {
-  __m128i s0, s1, u0;
+  __m128i sum0, sum1, s0, s1, s2, s3, u0;
   unsigned int avg = 0;
   u0 = _mm_setzero_si128();
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
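+  // Pack pairs of 8-byte rows into 128-bit registers and use psadbw
+  // (_mm_sad_epu8) to compute the horizontal byte sums.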
+  s0 = loadh_epi64((const __m128i *)(s + p),
+                   _mm_loadl_epi64((const __m128i *)(s)));
+  s1 = loadh_epi64((const __m128i *)(s + 3 * p),
+                   _mm_loadl_epi64((const __m128i *)(s + 2 * p)));
+  s2 = loadh_epi64((const __m128i *)(s + 5 * p),
+                   _mm_loadl_epi64((const __m128i *)(s + 4 * p)));
+  s3 = loadh_epi64((const __m128i *)(s + 7 * p),
+                   _mm_loadl_epi64((const __m128i *)(s + 6 * p)));
+  s0 = _mm_sad_epu8(s0, u0);
+  s1 = _mm_sad_epu8(s1, u0);
+  s2 = _mm_sad_epu8(s2, u0);
+  s3 = _mm_sad_epu8(s3, u0);
 
-  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
-  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
-  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
-  avg = _mm_extract_epi16(s0, 0);
+  sum0 = _mm_add_epi16(s0, s1);
+  sum1 = _mm_add_epi16(s2, s3);
+  sum0 = _mm_add_epi16(sum0, sum1);
+  sum0 = _mm_add_epi16(sum0, _mm_srli_si128(sum0, 8));
+  avg = _mm_cvtsi128_si32(sum0);
   return (avg + 32) >> 6;
 }
 
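+// Computes the rounded averages of two horizontally adjacent 8x8 blocks.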
+static void calc_avg_8x8_dual_sse2(const uint8_t *s, int p, int *avg) {
+  __m128i sum0, sum1, s0, s1, s2, s3, u0;
+  u0 = _mm_setzero_si128();
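+  // Each 16-byte row covers both 8x8 blocks; psadbw yields one 64-bit
+  // partial row sum per block.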
+  s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s)), u0);
+  s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + p)), u0);
+  s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 2 * p)), u0);
+  s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 3 * p)), u0);
+  sum0 = _mm_add_epi16(s0, s1);
+  sum1 = _mm_add_epi16(s2, s3);
+  s0 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 4 * p)), u0);
+  s1 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 5 * p)), u0);
+  s2 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 6 * p)), u0);
+  s3 = _mm_sad_epu8(_mm_loadu_si128((const __m128i *)(s + 7 * p)), u0);
+  sum0 = _mm_add_epi16(sum0, _mm_add_epi16(s0, s1));
+  sum1 = _mm_add_epi16(sum1, _mm_add_epi16(s2, s3));
+  sum0 = _mm_add_epi16(sum0, sum1);
+
+  // (avg + 32) >> 6
+  __m128i rounding = _mm_set1_epi32(32);
+  sum0 = _mm_add_epi32(sum0, rounding);
+  sum0 = _mm_srli_epi32(sum0, 6);
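+  // The shifted sums fit in 16 bits, so movd / pextrw are enough (SSE2 has
+  // no 32-bit extract).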
+  avg[0] = _mm_cvtsi128_si32(sum0);
+  avg[1] = _mm_extract_epi16(sum0, 4);
+}
+
 void aom_avg_8x8_quad_sse2(const uint8_t *s, int p, int x16_idx, int y16_idx,
                            int *avg) {
-  for (int k = 0; k < 4; k++) {
-    const int x8_idx = x16_idx + ((k & 1) << 3);
-    const int y8_idx = y16_idx + ((k >> 1) << 3);
-    const uint8_t *s_tmp = s + y8_idx * p + x8_idx;
-    avg[k] = aom_avg_8x8_sse2(s_tmp, p);
+  const uint8_t *s_ptr = s + y16_idx * p + x16_idx;
+  for (int k = 0; k < 2; k++) {
+    calc_avg_8x8_dual_sse2(s_ptr, p, avg + k * 2);
+    s_ptr += 8 * p;
   }
 }
 
@@ -135,17 +158,14 @@
   __m128i s0, s1, u0;
   unsigned int avg = 0;
   u0 = _mm_setzero_si128();
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
-  s0 = _mm_adds_epu16(s0, s1);
-
-  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
-  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
-  avg = _mm_extract_epi16(s0, 0);
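+  // Pack the four 4-byte rows into the low halves of two registers and use
+  // psadbw to sum the 16 pixels.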
+  s0 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s)),
+                          _mm_cvtsi32_si128(*(const int *)(s + p)));
+  s1 = _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)(s + p * 2)),
+                          _mm_cvtsi32_si128(*(const int *)(s + p * 3)));
+  s0 = _mm_sad_epu8(s0, u0);
+  s1 = _mm_sad_epu8(s1, u0);
+  s0 = _mm_add_epi16(s0, s1);
+  avg = _mm_cvtsi128_si32(s0);
   return (avg + 8) >> 4;
 }
 
diff --git a/test/avg_test.cc b/test/avg_test.cc
index 8922f34..4e86f06 100644
--- a/test/avg_test.cc
+++ b/test/avg_test.cc
@@ -43,7 +43,9 @@
  protected:
   // Handle blocks up to 4 blocks 64x64 with stride up to 128
   static const int kDataAlignment = 16;
-  static const int kDataBlockSize = 128 * 128;
+  static const int kDataBlockWidth = 128;
+  static const int kDataBlockHeight = 128;
+  static const int kDataBlockSize = kDataBlockWidth * kDataBlockHeight;
 
   virtual void SetUp() {
     const testing::TestInfo *const test_info =
@@ -236,13 +238,11 @@
   using AverageTestBase<Pixel>::FillConstant;
   using AverageTestBase<Pixel>::FillRandom;
 
-  void CheckAverages(int iterations) {
+  void CheckAveragesAt(int iterations, int x16_idx, int y16_idx) {
     ASSERT_EQ(sizeof(Pixel), 1u);
     const int block_size = GET_PARAM(4);
     (void)block_size;
     int expected[4] = { 0 };
-    int x16_idx = 0;
-    int y16_idx = 0;
 
     // The reference frame, but not the source frame, may be unaligned for
     // certain types of searches.
@@ -285,19 +285,25 @@
     }
   }
 
+  void CheckAverages() {
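+    // Exercise a range of 16x16 block offsets within the source buffer.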
+    for (int x16_idx = 0; x16_idx < this->kDataBlockWidth / 8; x16_idx += 2)
+      for (int y16_idx = 0; y16_idx < this->kDataBlockHeight / 8; y16_idx += 2)
+        CheckAveragesAt(1, x16_idx, y16_idx);
+  }
+
   void TestConstantValue(Pixel value) {
     FillConstant(value);
-    CheckAverages(1);
+    CheckAverages();
   }
 
   void TestRandom() {
     FillRandom();
-    CheckAverages(1);
+    CheckAverages();
   }
 
   void TestSpeed() {
     FillRandom();
-    CheckAverages(1000000);
+    CheckAveragesAt(1000000, 0, 0);
   }
 
   int64_t ref_elapsed_time_ = 0;