rtc: Optimize AVX2 intrinsic of aom_int_pro_col()

This CL optimizes aom_int_pro_col_avx2() by fully utilizing the
registers after the SAD operation. At the module level, scaling gains
improve by 30% on average across the different block sizes.
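
For illustration only (not part of the patch): _mm256_sad_epu8() leaves
each 8-byte SAD in the lowest 16 bits of a 64-bit lane with the remaining
bits zero, and that slack is what this CL packs additional rows into. A
minimal sketch of the packing idea, with hypothetical helper names:

  #include <immintrin.h>
  #include <stdint.h>

  // SAD of one 32-pixel row against zero: each 64-bit lane gets a 16-bit
  // partial sum in its lowest word; the upper three words stay zero.
  static inline __m256i row_sad_32(const uint8_t *row) {
    const __m256i zero = _mm256_setzero_si256();
    return _mm256_sad_epu8(_mm256_loadu_si256((const __m256i *)row), zero);
  }

  // Shift a second row's sums into the zero words above the first row's:
  // each 64-bit lane then holds { r0 partial, r1 partial, 0, 0 } as 16-bit
  // values, so two rows share one register before the final reduction.
  static inline __m256i pack_two_rows(__m256i r0, __m256i r1) {
    return _mm256_add_epi16(_mm256_slli_si256(r1, 2), r0);
  }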

Encoder speed-up for the RT preset:
                Instruction Count
cpu   Testset     Reduction(%)
 7         rtc      0.403
 7    rtc_derf      0.516
 8         rtc      0.448
 8    rtc_derf      0.289

Change-Id: Ia366a173b6d0e3834c8e0b2bb62a4523ef9c85f4
diff --git a/aom_dsp/x86/avg_intrin_avx2.c b/aom_dsp/x86/avg_intrin_avx2.c
index 6f4436f..c85d8c5 100644
--- a/aom_dsp/x86/avg_intrin_avx2.c
+++ b/aom_dsp/x86/avg_intrin_avx2.c
@@ -588,75 +588,183 @@
   }
 }
 
+static INLINE void load_from_src_buf(const uint8_t *ref1, __m256i *src,
+                                     const int stride) {
+  src[0] = _mm256_loadu_si256((const __m256i *)ref1);
+  src[1] = _mm256_loadu_si256((const __m256i *)(ref1 + stride));
+  src[2] = _mm256_loadu_si256((const __m256i *)(ref1 + (2 * stride)));
+  src[3] = _mm256_loadu_si256((const __m256i *)(ref1 + (3 * stride)));
+}
+
+#define CALC_TOT_SAD_AND_STORE                                                \
+  /* r00 r10 x x r01 r11 x x | r02 r12 x x r03 r13 x x */                     \
+  const __m256i r01 = _mm256_add_epi16(_mm256_slli_si256(r1, 2), r0);         \
+  /* r00 r10 r20 x r01 r11 r21 x | r02 r12 r22 x r03 r13 r23 x */             \
+  const __m256i r012 = _mm256_add_epi16(_mm256_slli_si256(r2, 4), r01);       \
+  /* r00 r10 r20 r30 r01 r11 r21 r31 | r02 r12 r22 r32 r03 r13 r23 r33 */     \
+  const __m256i result0 = _mm256_add_epi16(_mm256_slli_si256(r3, 6), r012);   \
+                                                                              \
+  const __m128i results0 = _mm_add_epi16(                                     \
+      _mm256_castsi256_si128(result0), _mm256_extractf128_si256(result0, 1)); \
+  const __m128i results1 =                                                    \
+      _mm_add_epi16(results0, _mm_srli_si128(results0, 8));                   \
+  _mm_storel_epi64((__m128i *)vbuf, _mm_srli_epi16(results1, norm_factor));
+
+static INLINE void aom_int_pro_col_16wd_avx2(int16_t *vbuf, const uint8_t *ref,
+                                             const int ref_stride,
+                                             const int height,
+                                             int norm_factor) {
+  const __m256i zero = _mm256_setzero_si256();
+  int ht = 0;
+  // Post the SAD operation, each 64-bit lane holds its sum in the lower 16
+  // bits and the remaining bits are zero. Hence, 8 rows are processed at a
+  // time so the results can be packed into those zero bits before storing.
+  do {
+    __m256i src_00 =
+        _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)(ref)));
+    src_00 = _mm256_inserti128_si256(
+        src_00, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 4)), 1);
+    __m256i src_01 = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(ref + ref_stride * 1)));
+    src_01 = _mm256_inserti128_si256(
+        src_01, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 5)), 1);
+    __m256i src_10 = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(ref + ref_stride * 2)));
+    src_10 = _mm256_inserti128_si256(
+        src_10, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 6)), 1);
+    __m256i src_11 = _mm256_castsi128_si256(
+        _mm_loadu_si128((const __m128i *)(ref + ref_stride * 3)));
+    src_11 = _mm256_inserti128_si256(
+        src_11, _mm_loadu_si128((const __m128i *)(ref + ref_stride * 7)), 1);
+
+    // s00 x x x s01 x x x | s40 x x x s41 x x x
+    const __m256i s0 = _mm256_sad_epu8(src_00, zero);
+    // s10 x x x s11 x x x | s50 x x x s51 x x x
+    const __m256i s1 = _mm256_sad_epu8(src_01, zero);
+    // s20 x x x s21 x x x | s60 x x x s61 x x x
+    const __m256i s2 = _mm256_sad_epu8(src_10, zero);
+    // s30 x x x s31 x x x | s70 x x x s71 x x x
+    const __m256i s3 = _mm256_sad_epu8(src_11, zero);
+
+    // s00 s10 x x x x x x | s40 s50 x x x x x x
+    const __m256i s0_lo = _mm256_unpacklo_epi16(s0, s1);
+    // s01 s11 x x x x x x | s41 s51 x x x x x x
+    const __m256i s0_hi = _mm256_unpackhi_epi16(s0, s1);
+    // s20 s30 x x x x x x | s60 s70 x x x x x x
+    const __m256i s1_lo = _mm256_unpacklo_epi16(s2, s3);
+    // s21 s31 x x x x x x | s61 s71 x x x x x x
+    const __m256i s1_hi = _mm256_unpackhi_epi16(s2, s3);
+
+    // s0 s1 x x x x x x | s4 s5 x x x x x x
+    const __m256i s0_add = _mm256_add_epi16(s0_lo, s0_hi);
+    // s2 s3 x x x x x x | s6 s7 x x x x x x
+    const __m256i s1_add = _mm256_add_epi16(s1_lo, s1_hi);
+
+    // s0 s1 s2 s3 s4 s5 s6 s7
+    const __m128i results = _mm256_castsi256_si128(
+        _mm256_permute4x64_epi64(_mm256_unpacklo_epi32(s0_add, s1_add), 0x08));
+    _mm_storeu_si128((__m128i *)vbuf, _mm_srli_epi16(results, norm_factor));
+    vbuf += 8;
+    ref += (ref_stride << 3);
+    ht += 8;
+  } while (ht < height);
+}
+
 void aom_int_pro_col_avx2(int16_t *vbuf, const uint8_t *ref,
                           const int ref_stride, const int width,
                           const int height, int norm_factor) {
-  // SIMD implementation assumes width to be multiple of 16. For any odd width,
-  // SIMD support needs to be added.
   assert(width % 16 == 0);
-
   if (width == 128) {
     const __m256i zero = _mm256_setzero_si256();
-    for (int ht = 0; ht < height; ++ht) {
-      const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)ref);
-      const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
-      const __m256i src_line2 = _mm256_loadu_si256((const __m256i *)(ref + 64));
-      const __m256i src_line3 = _mm256_loadu_si256((const __m256i *)(ref + 96));
-      const __m256i s0 = _mm256_sad_epu8(src_line0, zero);
-      const __m256i s1 = _mm256_sad_epu8(src_line1, zero);
-      const __m256i s2 = _mm256_sad_epu8(src_line2, zero);
-      const __m256i s3 = _mm256_sad_epu8(src_line3, zero);
-      const __m256i result0_256bit = _mm256_add_epi16(s0, s1);
-      const __m256i result1_256bit = _mm256_add_epi16(s2, s3);
-      const __m256i result_256bit =
-          _mm256_add_epi16(result0_256bit, result1_256bit);
+    for (int ht = 0; ht < height; ht += 4) {
+      __m256i src[16];
+      // Load source data.
+      load_from_src_buf(ref, &src[0], ref_stride);
+      load_from_src_buf(ref + 32, &src[4], ref_stride);
+      load_from_src_buf(ref + 64, &src[8], ref_stride);
+      load_from_src_buf(ref + 96, &src[12], ref_stride);
 
-      const __m128i result =
-          _mm_add_epi16(_mm256_castsi256_si128(result_256bit),
-                        _mm256_extractf128_si256(result_256bit, 1));
-      __m128i result1 = _mm_add_epi16(result, _mm_srli_si128(result, 8));
-      vbuf[ht] = _mm_cvtsi128_si32(result1) >> norm_factor;
-      ref += ref_stride;
+      // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x
+      const __m256i s0 = _mm256_add_epi16(_mm256_sad_epu8(src[0], zero),
+                                          _mm256_sad_epu8(src[4], zero));
+      const __m256i s1 = _mm256_add_epi16(_mm256_sad_epu8(src[8], zero),
+                                          _mm256_sad_epu8(src[12], zero));
+      const __m256i r0 = _mm256_add_epi16(s0, s1);
+      // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x
+      const __m256i s2 = _mm256_add_epi16(_mm256_sad_epu8(src[1], zero),
+                                          _mm256_sad_epu8(src[5], zero));
+      const __m256i s3 = _mm256_add_epi16(_mm256_sad_epu8(src[9], zero),
+                                          _mm256_sad_epu8(src[13], zero));
+      const __m256i r1 = _mm256_add_epi16(s2, s3);
+      // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x
+      const __m256i s4 = _mm256_add_epi16(_mm256_sad_epu8(src[2], zero),
+                                          _mm256_sad_epu8(src[6], zero));
+      const __m256i s5 = _mm256_add_epi16(_mm256_sad_epu8(src[10], zero),
+                                          _mm256_sad_epu8(src[14], zero));
+      const __m256i r2 = _mm256_add_epi16(s4, s5);
+      // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x
+      const __m256i s6 = _mm256_add_epi16(_mm256_sad_epu8(src[3], zero),
+                                          _mm256_sad_epu8(src[7], zero));
+      const __m256i s7 = _mm256_add_epi16(_mm256_sad_epu8(src[11], zero),
+                                          _mm256_sad_epu8(src[15], zero));
+      const __m256i r3 = _mm256_add_epi16(s6, s7);
+
+      CALC_TOT_SAD_AND_STORE
+      vbuf += 4;
+      ref += ref_stride << 2;
     }
   } else if (width == 64) {
     const __m256i zero = _mm256_setzero_si256();
-    for (int ht = 0; ht < height; ++ht) {
-      const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)ref);
-      const __m256i src_line1 = _mm256_loadu_si256((const __m256i *)(ref + 32));
-      const __m256i s1 = _mm256_sad_epu8(src_line0, zero);
-      const __m256i s2 = _mm256_sad_epu8(src_line1, zero);
-      const __m256i result_256bit = _mm256_add_epi16(s1, s2);
+    for (int ht = 0; ht < height; ht += 4) {
+      __m256i src[8];
+      // Load source data.
+      load_from_src_buf(ref, &src[0], ref_stride);
+      load_from_src_buf(ref + 32, &src[4], ref_stride);
 
-      const __m128i result =
-          _mm_add_epi16(_mm256_castsi256_si128(result_256bit),
-                        _mm256_extractf128_si256(result_256bit, 1));
-      __m128i result1 = _mm_add_epi16(result, _mm_srli_si128(result, 8));
-      vbuf[ht] = _mm_cvtsi128_si32(result1) >> norm_factor;
-      ref += ref_stride;
+      // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x
+      const __m256i s0 = _mm256_sad_epu8(src[0], zero);
+      const __m256i s1 = _mm256_sad_epu8(src[4], zero);
+      const __m256i r0 = _mm256_add_epi16(s0, s1);
+      // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x
+      const __m256i s2 = _mm256_sad_epu8(src[1], zero);
+      const __m256i s3 = _mm256_sad_epu8(src[5], zero);
+      const __m256i r1 = _mm256_add_epi16(s2, s3);
+      // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x
+      const __m256i s4 = _mm256_sad_epu8(src[2], zero);
+      const __m256i s5 = _mm256_sad_epu8(src[6], zero);
+      const __m256i r2 = _mm256_add_epi16(s4, s5);
+      // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x
+      const __m256i s6 = _mm256_sad_epu8(src[3], zero);
+      const __m256i s7 = _mm256_sad_epu8(src[7], zero);
+      const __m256i r3 = _mm256_add_epi16(s6, s7);
+
+      CALC_TOT_SAD_AND_STORE
+      vbuf += 4;
+      ref += ref_stride << 2;
     }
   } else if (width == 32) {
     assert(height % 2 == 0);
     const __m256i zero = _mm256_setzero_si256();
-    for (int ht = 0; ht < height; ht += 2) {
-      const __m256i src_line0 = _mm256_loadu_si256((const __m256i *)ref);
-      const __m256i src_line1 =
-          _mm256_loadu_si256((const __m256i *)(ref + ref_stride));
-      const __m256i s0 = _mm256_sad_epu8(src_line0, zero);
-      const __m256i s1 = _mm256_sad_epu8(src_line1, zero);
+    for (int ht = 0; ht < height; ht += 4) {
+      __m256i src[4];
+      // Load source data.
+      load_from_src_buf(ref, &src[0], ref_stride);
 
-      __m128i result0 = _mm_add_epi16(_mm256_castsi256_si128(s0),
-                                      _mm256_extractf128_si256(s0, 1));
-      __m128i result1 = _mm_add_epi16(_mm256_castsi256_si128(s1),
-                                      _mm256_extractf128_si256(s1, 1));
-      __m128i result2 = _mm_add_epi16(result0, _mm_srli_si128(result0, 8));
-      __m128i result3 = _mm_add_epi16(result1, _mm_srli_si128(result1, 8));
+      // Row0 output: r00 x x x r01 x x x | r02 x x x r03 x x x
+      const __m256i r0 = _mm256_sad_epu8(src[0], zero);
+      // Row1 output: r10 x x x r11 x x x | r12 x x x r13 x x x
+      const __m256i r1 = _mm256_sad_epu8(src[1], zero);
+      // Row2 output: r20 x x x r21 x x x | r22 x x x r23 x x x
+      const __m256i r2 = _mm256_sad_epu8(src[2], zero);
+      // Row3 output: r30 x x x r31 x x x | r32 x x x r33 x x x
+      const __m256i r3 = _mm256_sad_epu8(src[3], zero);
 
-      vbuf[ht] = _mm_cvtsi128_si32(result2) >> norm_factor;
-      vbuf[ht + 1] = _mm_cvtsi128_si32(result3) >> norm_factor;
-      ref += (2 * ref_stride);
+      CALC_TOT_SAD_AND_STORE
+      vbuf += 4;
+      ref += ref_stride << 2;
     }
   } else if (width == 16) {
-    aom_int_pro_col_sse2(vbuf, ref, ref_stride, width, height, norm_factor);
+    aom_int_pro_col_16wd_avx2(vbuf, ref, ref_stride, height, norm_factor);
   }
 }
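
For reference, a scalar sketch of the behaviour the SIMD paths above
implement (one normalized row sum per vbuf entry); this mirrors the
aom_dsp C reference in intent, but the body below is illustrative rather
than copied from it:

  #include <stdint.h>

  // One output entry per row: sum the row's pixels, then normalize.
  static void int_pro_col_ref(int16_t *vbuf, const uint8_t *ref,
                              int ref_stride, int width, int height,
                              int norm_factor) {
    for (int ht = 0; ht < height; ++ht) {
      int sum = 0;
      for (int w = 0; w < width; ++w) sum += ref[w];
      vbuf[ht] = (int16_t)(sum >> norm_factor);
      ref += ref_stride;
    }
  }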