[CFL] AVX2/SSSE3 faster luma_subsampling_420_lbd

Speed up the 4:2:0 low bit depth luma subsampling
SIMD by taking advantage of constant propagation of
width and height. The new implementation is about
twice as fast as the previous version.
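
The generic kernel stays static INLINE and is
instantiated once per block size through the
CFL_SUBSAMPLE macro, letting the compiler fold the
width/height branches away. The macro body is not part
of this diff; it presumably expands to a thin wrapper
along these lines (sketch only):

  void subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride,
                                   int16_t *output_q3) {
    // width and height become compile-time constants in the inlined kernel.
    cfl_luma_subsampling_420_lbd_ssse3(input, input_stride, output_q3, 8, 8);
  }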

Based on the observation that the AVX2 code tends to
be slower than its SSSE3 counterpart for smaller
widths, the AVX2 version now falls back to the SSSE3
functions when width < 32.
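
The fallback is implemented by the dispatcher added in
cfl_avx2.c, which returns the SSSE3 specialization for
the sub-32 widths. A hypothetical call site (tx_size
and the buffer names are illustrative, not part of
this patch):

  const cfl_subsample_lbd_fn fn =
      cfl_get_luma_subsampling_420_lbd_avx2(tx_size);
  fn(luma_input, luma_stride, pred_buf_q3);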

SSSE3/CFLSubsampleTest
4x4: C time = 120 us, SIMD time = 80 us (~1.5x)
8x8: C time = 346 us, SIMD time = 96 us (~3.6x)
16x16: C time = 1327 us, SIMD time = 155 us (~8.6x)
32x32: C time = 5721 us, SIMD time = 530 us (~11x)

AVX2/CFLSubsampleTest
4x4: C time = 132 us, SIMD time = 89 us (~1.5x)
8x8: C time = 353 us, SIMD time = 111 us (~3.2x)
16x16: C time = 1290 us, SIMD time = 145 us (~8.9x)
32x32: C time = 5665 us, SIMD time = 524 us (~11x)
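
For reference, both kernels compute the same quantity:
maddubs with a vector of twos multiplies each byte by
2 and adds adjacent pairs, so summing the top and
bottom rows yields the average of each 2x2 luma block
in Q3 fixed point. A scalar model of one output sample
(hypothetical helper, not in the patch):

  static int16_t subsample_420_q3_scalar(const uint8_t *input,
                                         int input_stride, int x) {
    const uint8_t t0 = input[2 * x + 0], t1 = input[2 * x + 1];
    const uint8_t b0 = input[2 * x + 0 + input_stride];
    const uint8_t b1 = input[2 * x + 1 + input_stride];
    // 2*(t0 + t1) + 2*(b0 + b1) == 8 * avg, i.e. avg << 3 (Q3).
    return (int16_t)(2 * (t0 + t1 + b0 + b1));
  }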

Change-Id: I9b2516ff5fc8addb2064904b6b362c8d89909933
diff --git a/av1/common/x86/cfl_avx2.c b/av1/common/x86/cfl_avx2.c
index 3afa55c..d7ecad1 100644
--- a/av1/common/x86/cfl_avx2.c
+++ b/av1/common/x86/cfl_avx2.c
@@ -32,33 +32,53 @@
                                               int input_stride,
                                               int16_t *pred_buf_q3, int width,
                                               int height) {
-  (void)width;  // Max chroma width is 16, so all widths fit in one __m256i
-
+  (void)width;                               // Width is always 32
   const __m256i twos = _mm256_set1_epi8(2);  // Thirty two twos
   const int luma_stride = input_stride << 1;
-  const int16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+  __m256i *row = (__m256i *)pred_buf_q3;
+  const __m256i *row_end = row + (height >> 1) * CFL_BUF_LINE_I256;
   do {
-    // Load 32 values for the top and bottom rows.
-    // t_0, t_1, ... t_31
-    __m256i top = _mm256_loadu_si256((__m256i *)(input));
-    // b_0, b_1, ... b_31
+    __m256i top = _mm256_loadu_si256((__m256i *)input);
     __m256i bot = _mm256_loadu_si256((__m256i *)(input + input_stride));
 
-    // Horizontal add of the 32 values into 16 values that are multiplied by 2
-    // (t_0 + t_1) * 2, (t_2 + t_3) * 2, ... (t_30 + t_31) *2
-    top = _mm256_maddubs_epi16(top, twos);
-    // (b_0 + b_1) * 2, (b_2 + b_3) * 2, ... (b_30 + b_31) *2
-    bot = _mm256_maddubs_epi16(bot, twos);
+    __m256i top_16x16 = _mm256_maddubs_epi16(top, twos);
+    __m256i bot_16x16 = _mm256_maddubs_epi16(bot, twos);
+    __m256i sum_16x16 = _mm256_add_epi16(top_16x16, bot_16x16);
 
-    // Add the 16 values in top with the 16 values in bottom
-    _mm256_storeu_si256((__m256i *)pred_buf_q3, _mm256_add_epi16(top, bot));
+    _mm256_storeu_si256(row, sum_16x16);
 
     input += luma_stride;
-    pred_buf_q3 += CFL_BUF_LINE;
-  } while (pred_buf_q3 < end);
+  } while ((row += CFL_BUF_LINE_I256) < row_end);
 }
 
-CFL_GET_SUBSAMPLE_FUNCTION(avx2)
+CFL_SUBSAMPLE(avx2, 420, lbd, 32, 32)
+CFL_SUBSAMPLE(avx2, 420, lbd, 32, 16)
+CFL_SUBSAMPLE(avx2, 420, lbd, 32, 8)
+
+cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_avx2(TX_SIZE tx_size) {
+  static const cfl_subsample_lbd_fn subfn_420[TX_SIZES_ALL] = {
+    subsample_lbd_420_4x4_ssse3,   /* 4x4 */
+    subsample_lbd_420_8x8_ssse3,   /* 8x8 */
+    subsample_lbd_420_16x16_ssse3, /* 16x16 */
+    subsample_lbd_420_32x32_avx2,  /* 32x32 */
+    cfl_subsample_lbd_null,        /* 64x64 (invalid CFL size) */
+    subsample_lbd_420_4x8_ssse3,   /* 4x8 */
+    subsample_lbd_420_8x4_ssse3,   /* 8x4 */
+    subsample_lbd_420_8x16_ssse3,  /* 8x16 */
+    subsample_lbd_420_16x8_ssse3,  /* 16x8 */
+    subsample_lbd_420_16x32_ssse3, /* 16x32 */
+    subsample_lbd_420_32x16_avx2,  /* 32x16 */
+    cfl_subsample_lbd_null,        /* 32x64 (invalid CFL size) */
+    cfl_subsample_lbd_null,        /* 64x32 (invalid CFL size) */
+    subsample_lbd_420_4x16_ssse3,  /* 4x16  */
+    subsample_lbd_420_16x4_ssse3,  /* 16x4  */
+    subsample_lbd_420_8x32_ssse3,  /* 8x32  */
+    subsample_lbd_420_32x8_avx2,   /* 32x8  */
+    cfl_subsample_lbd_null,        /* 16x64 (invalid CFL size) */
+    cfl_subsample_lbd_null,        /* 64x16 (invalid CFL size) */
+  };
+  return subfn_420[tx_size];
+}
 
 static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
                                         __m256i alpha_sign, __m256i dc_q0) {
diff --git a/av1/common/x86/cfl_simd.h b/av1/common/x86/cfl_simd.h
index f796a74..17aaf15 100644
--- a/av1/common/x86/cfl_simd.h
+++ b/av1/common/x86/cfl_simd.h
@@ -9,6 +9,8 @@
  * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
  */
 
+#include "av1/common/blockd.h"
+
 // SSSE3 version is optimal for width == 4; we reuse it in AVX2
 void cfl_predict_lbd_4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
                              int dst_stride, TX_SIZE tx_size, int alpha_q3);
@@ -30,3 +32,31 @@
 void cfl_predict_hbd_8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
                              int dst_stride, TX_SIZE tx_size, int alpha_q3,
                              int bd);
+
+// SSSE3 versions are optimal for width == 4; we reuse them in AVX2
+void subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride,
+                                 int16_t *output_q3);
+void subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride,
+                                 int16_t *output_q3);
+void subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride,
+                                  int16_t *output_q3);
+
+// SSSE3 versions are optimal for width == 8; we reuse them in AVX2
+void subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride,
+                                 int16_t *output_q3);
+void subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride,
+                                 int16_t *output_q3);
+void subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride,
+                                  int16_t *output_q3);
+void subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride,
+                                  int16_t *output_q3);
+
+// SSSE3 versions are optimal for width == 16; we reuse them in AVX2
+void subsample_lbd_420_16x4_ssse3(const uint8_t *input, int input_stride,
+                                  int16_t *output_q3);
+void subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride,
+                                  int16_t *output_q3);
+void subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride,
+                                   int16_t *output_q3);
+void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride,
+                                   int16_t *output_q3);
diff --git a/av1/common/x86/cfl_ssse3.c b/av1/common/x86/cfl_ssse3.c
index ff92811..f7b6c6f 100644
--- a/av1/common/x86/cfl_ssse3.c
+++ b/av1/common/x86/cfl_ssse3.c
@@ -26,57 +26,55 @@
  *
  * Note: We don't need to worry about going over the active area, as long as we
  * stay inside the CfL prediction buffer.
- *
- * Note: For 4:2:0 luma subsampling, the width will never be greater than 16.
  */
-static void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input,
-                                               int input_stride,
-                                               int16_t *pred_buf_q3, int width,
-                                               int height) {
-  const __m128i twos = _mm_set1_epi8(2);  // Sixteen twos
-
-  // Sixteen int8 values fit in one __m128i register. If this is enough to do
-  // the entire row, the next value is two rows down, otherwise we move to the
-  // next sixteen values.
-  const int next = (width == 32) ? 16 : input_stride << 1;
-
-  // Values in the prediction buffer are subsampled, so we only need to move
-  // down one row or forward by eight values.
-  const int next_chroma = (width == 32) ? 8 : CFL_BUF_LINE;
-
-  // When the width is less than 16, we double the stride, because we process
-  // four lines by iteration (instead of two).
-  const int luma_stride = input_stride << (1 + (width < 32));
-  const int chroma_stride = CFL_BUF_LINE << (width < 32);
-
+static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input,
+                                                      int input_stride,
+                                                      int16_t *pred_buf_q3,
+                                                      int width, int height) {
+  const __m128i twos = _mm_set1_epi8(2);
   const int16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
+  const int luma_stride = input_stride << 1;
+
+  __m128i top, bot, next_top, next_bot, top_16x8, bot_16x8, next_top_16x8,
+      next_bot_16x8, sum_16x8, next_sum_16x8;
   do {
-    // Load 16 values for the top and bottom rows.
-    // t_0, t_1, ... t_15
-    __m128i top = _mm_loadu_si128((__m128i *)(input));
-    // b_0, b_1, ... b_15
-    __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
+    if (width == 4) {
+      top = _mm_cvtsi32_si128(*((int *)input));
+      bot = _mm_cvtsi32_si128(*((int *)(input + input_stride)));
+    } else if (width == 8) {
+      top = _mm_loadl_epi64((__m128i *)input);
+      bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
+    } else {
+      top = _mm_loadu_si128((__m128i *)input);
+      bot = _mm_loadu_si128((__m128i *)(input + input_stride));
+      if (width == 32) {
+        next_top = _mm_loadu_si128((__m128i *)(input + 16));
+        next_bot = _mm_loadu_si128((__m128i *)(input + 16 + input_stride));
+      }
+    }
 
-    // Load either the next line or the next 16 values
-    __m128i next_top = _mm_loadu_si128((__m128i *)(input + next));
-    __m128i next_bot =
-        _mm_loadu_si128((__m128i *)(input + next + input_stride));
+    top_16x8 = _mm_maddubs_epi16(top, twos);
+    bot_16x8 = _mm_maddubs_epi16(bot, twos);
+    sum_16x8 = _mm_add_epi16(top_16x8, bot_16x8);
+    if (width == 32) {
+      next_top_16x8 = _mm_maddubs_epi16(next_top, twos);
+      next_bot_16x8 = _mm_maddubs_epi16(next_bot, twos);
+      next_sum_16x8 = _mm_add_epi16(next_top_16x8, next_bot_16x8);
+    }
 
-    // Horizontal add of the 16 values into 8 values that are multiplied by 2
-    // (t_0 + t_1) * 2, (t_2 + t_3) * 2, ... (t_14 + t_15) *2
-    top = _mm_maddubs_epi16(top, twos);
-    next_top = _mm_maddubs_epi16(next_top, twos);
-    // (b_0 + b_1) * 2, (b_2 + b_3) * 2, ... (b_14 + b_15) *2
-    bot = _mm_maddubs_epi16(bot, twos);
-    next_bot = _mm_maddubs_epi16(next_bot, twos);
-
-    // Add the 8 values in top with the 8 values in bottom
-    _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(top, bot));
-    _mm_storeu_si128((__m128i *)(pred_buf_q3 + next_chroma),
-                     _mm_add_epi16(next_top, next_bot));
+    if (width == 4) {
+      *((int *)pred_buf_q3) = _mm_cvtsi128_si32(sum_16x8);
+    } else if (width == 8) {
+      _mm_storel_epi64((__m128i *)pred_buf_q3, sum_16x8);
+    } else {
+      _mm_storeu_si128((__m128i *)pred_buf_q3, sum_16x8);
+      if (width == 32) {
+        _mm_storeu_si128((__m128i *)(pred_buf_q3 + 8), next_sum_16x8);
+      }
+    }
 
     input += luma_stride;
-    pred_buf_q3 += chroma_stride;
+    pred_buf_q3 += CFL_BUF_LINE;
   } while (pred_buf_q3 < end);
 }