[CFL] Reuse Prediction SSSE3 in AVX2
For smaller blocks, AVX2 SIMD offers no advantage over SSSE3. To avoid
duplicate code, the AVX2 version is composed of the SSSE3 functions for
small blocks and AVX2 functions for bigger blocks. Slight code changes
were made to unify both versions.
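For reference, the resulting AVX2 high-bit-depth dispatch looks like the
following sketch (names and indexing taken from this change; it assumes
the declarations in av1/common/cfl.h and the new
av1/common/x86/cfl_simd.h):
  cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
    // Widths 4 and 8 keep the SSSE3 kernels; 16 and 32 use AVX2.
    static const cfl_predict_hbd_fn predict_hbd[4] = {
      cfl_predict_hbd_4_ssse3, cfl_predict_hbd_8_ssse3,
      cfl_predict_hbd_16_avx2, cfl_predict_hbd_32_avx2
    };
    // The table index 0..3 maps to widths 4/8/16/32.
    return predict_hbd[(tx_size_wide_log2[tx_size] - tx_size_wide_log2[0]) & 3];
  }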
SSSE3/CFLPredictTest
4x4: C time = 802 us, SIMD time = 160 us (~5x)
8x8: C time = 2626 us, SIMD time = 248 us (~11x)
16x16: C time = 9409 us, SIMD time = 696 us (~14x)
32x32: C time = 25375 us, SIMD time = 2249 us (~11x)
AVX2/CFLPredictTest
4x4: C time = 823 us, SIMD time = 160 us (~5.1x)
8x8: C time = 2580 us, SIMD time = 249 us (~10x)
16x16: C time = 9630 us, SIMD time = 671 us (~14x)
32x32: C time = 25061 us, SIMD time = 1713 us (~15x)
SSSE3/CFLPredictHBDTest
4x4: C time = 782 us, SIMD time = 195 us (~4x)
8x8: C time = 2633 us, SIMD time = 289 us (~9.1x)
16x16: C time = 10416 us, SIMD time = 833 us (~13x)
32x32: C time = 41339 us, SIMD time = 3226 us (~13x)
AVX2/CFLPredictHBDTest
4x4: C time = 760 us, SIMD time = 186 us (~4.1x)
8x8: C time = 2687 us, SIMD time = 309 us (~8.7x)
16x16: C time = 10003 us, SIMD time = 634 us (~16x)
32x32: C time = 40958 us, SIMD time = 2119 us (~19x)
By unifying the behavior of SSSE3 and AVX2, we also resolve:
BUG=aomedia:1419
Change-Id: I89c1a17d37e94178f6c19a5ac938bce1c645e9fb
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index f8bd4ef..e26c0d0 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -136,4 +136,18 @@
return sub_avg[tx_size % TX_SIZES_ALL]; \
}
+#define CFL_PREDICT_LBD_X(width, arch) \
+ void cfl_predict_lbd_##width##_##arch(const int16_t *pred_buf_q3, \
+ uint8_t *dst, int dst_stride, \
+ TX_SIZE tx_size, int alpha_q3) { \
+ cfl_predict_lbd_x(pred_buf_q3, dst, dst_stride, tx_size, alpha_q3, width); \
+ }
+
+#define CFL_PREDICT_HBD_X(width, arch) \
+ void cfl_predict_hbd_##width##_##arch( \
+ const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, \
+ TX_SIZE tx_size, int alpha_q3, int bd) { \
+ cfl_predict_hbd_x(pred_buf_q3, dst, dst_stride, tx_size, alpha_q3, bd, \
+ width); \
+ }
#endif // AV1_COMMON_CFL_H_
diff --git a/av1/common/x86/cfl_avx2.c b/av1/common/x86/cfl_avx2.c
index 730bffd..3afa55c 100644
--- a/av1/common/x86/cfl_avx2.c
+++ b/av1/common/x86/cfl_avx2.c
@@ -14,6 +14,8 @@
#include "av1/common/cfl.h"
+#include "av1/common/x86/cfl_simd.h"
+
/**
* Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
* precise version of a box filter 4:2:0 pixel subsampling in Q3.
@@ -68,39 +70,23 @@
return _mm256_add_epi16(scaled_luma_q0, dc_q0);
}
-static INLINE void cfl_predict_lbd_x(const int16_t *pred_buf_q3, uint8_t *dst,
- int dst_stride, TX_SIZE tx_size,
- int alpha_q3, int width) {
- const int16_t *row_end = pred_buf_q3 + tx_size_high[tx_size] * CFL_BUF_LINE;
+static INLINE void cfl_predict_lbd_32_avx2(const int16_t *pred_buf_q3,
+ uint8_t *dst, int dst_stride,
+ TX_SIZE tx_size, int alpha_q3) {
const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
const __m256i dc_q0 = _mm256_set1_epi16(*dst);
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + tx_size_high[tx_size] * CFL_BUF_LINE_I256;
+
do {
- __m256i res =
- predict_unclipped((__m256i *)pred_buf_q3, alpha_q12, alpha_sign, dc_q0);
- __m256i next = res;
- if (width == 32)
- next = predict_unclipped((__m256i *)(pred_buf_q3 + 16), alpha_q12,
- alpha_sign, dc_q0);
+ __m256i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
+ __m256i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
res = _mm256_packus_epi16(res, next);
- if (width == 4) {
- *(int32_t *)dst = _mm256_extract_epi32(res, 0);
- } else if (width == 8) {
-#ifdef __x86_64__
- *(int64_t *)dst = _mm256_extract_epi64(res, 0);
-#else
- _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(res));
-#endif
- } else {
- res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0));
- if (width == 16)
- _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(res));
- else
- _mm256_storeu_si256((__m256i *)dst, res);
- }
+ res = _mm256_permute4x64_epi64(res, _MM_SHUFFLE(3, 1, 2, 0));
+ _mm256_storeu_si256((__m256i *)dst, res);
dst += dst_stride;
- pred_buf_q3 += CFL_BUF_LINE;
- } while (pred_buf_q3 < row_end);
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
}
static __m256i highbd_max_epi16(int bd) {
@@ -113,76 +99,53 @@
return _mm256_max_epi16(_mm256_min_epi16(u, max), zero);
}
+static INLINE void cfl_predict_hbd(__m256i *dst, __m256i *src,
+ __m256i alpha_q12, __m256i alpha_sign,
+ __m256i dc_q0, __m256i max) {
+ __m256i res = predict_unclipped(src, alpha_q12, alpha_sign, dc_q0);
+ _mm256_storeu_si256(dst,
+ highbd_clamp_epi16(res, _mm256_setzero_si256(), max));
+}
+
static INLINE void cfl_predict_hbd_x(const int16_t *pred_buf_q3, uint16_t *dst,
int dst_stride, TX_SIZE tx_size,
int alpha_q3, int bd, int width) {
- const int16_t *row_end = pred_buf_q3 + tx_size_high[tx_size] * CFL_BUF_LINE;
+ // Use SSSE3 version for smaller widths
+ assert(width == 16 || width == 32);
const __m256i alpha_sign = _mm256_set1_epi16(alpha_q3);
const __m256i alpha_q12 = _mm256_slli_epi16(_mm256_abs_epi16(alpha_sign), 9);
const __m256i dc_q0 = _mm256_loadu_si256((__m256i *)dst);
const __m256i max = highbd_max_epi16(bd);
- const __m256i zero = _mm256_setzero_si256();
+
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + tx_size_high[tx_size] * CFL_BUF_LINE_I256;
do {
- __m256i res =
- predict_unclipped((__m256i *)pred_buf_q3, alpha_q12, alpha_sign, dc_q0);
- res = highbd_clamp_epi16(res, zero, max);
- if (width == 4)
-#ifdef __x86_64__
- *(int64_t *)dst = _mm256_extract_epi64(res, 0);
-#else
- _mm_storel_epi64((__m128i *)dst, _mm256_castsi256_si128(res));
-#endif
- else if (width == 8)
- _mm_storeu_si128((__m128i *)dst, _mm256_castsi256_si128(res));
- else
- _mm256_storeu_si256((__m256i *)dst, res);
+ cfl_predict_hbd((__m256i *)dst, row, alpha_q12, alpha_sign, dc_q0, max);
if (width == 32) {
- res = predict_unclipped((__m256i *)(pred_buf_q3 + 16), alpha_q12,
- alpha_sign, dc_q0);
- res = highbd_clamp_epi16(res, zero, max);
- _mm256_storeu_si256((__m256i *)(dst + 16), res);
+ cfl_predict_hbd((__m256i *)(dst + 16), row + 1, alpha_q12, alpha_sign,
+ dc_q0, max);
}
dst += dst_stride;
- pred_buf_q3 += CFL_BUF_LINE;
- } while (pred_buf_q3 < row_end);
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
}
-#define CFL_PREDICT_LBD_X(width) \
- static void cfl_predict_lbd_##width(const int16_t *pred_buf_q3, \
- uint8_t *dst, int dst_stride, \
- TX_SIZE tx_size, int alpha_q3) { \
- cfl_predict_lbd_x(pred_buf_q3, dst, dst_stride, tx_size, alpha_q3, width); \
- }
-
-CFL_PREDICT_LBD_X(4)
-CFL_PREDICT_LBD_X(8)
-CFL_PREDICT_LBD_X(16)
-CFL_PREDICT_LBD_X(32)
-
-#define CFL_PREDICT_HBD_X(width) \
- static void cfl_predict_hbd_##width(const int16_t *pred_buf_q3, \
- uint16_t *dst, int dst_stride, \
- TX_SIZE tx_size, int alpha_q3, int bd) { \
- cfl_predict_hbd_x(pred_buf_q3, dst, dst_stride, tx_size, alpha_q3, bd, \
- width); \
- }
-
-CFL_PREDICT_HBD_X(4)
-CFL_PREDICT_HBD_X(8)
-CFL_PREDICT_HBD_X(16)
-CFL_PREDICT_HBD_X(32)
+CFL_PREDICT_HBD_X(16, avx2)
+CFL_PREDICT_HBD_X(32, avx2)
cfl_predict_lbd_fn get_predict_lbd_fn_avx2(TX_SIZE tx_size) {
- static const cfl_predict_lbd_fn predict_lbd[4] = {
- cfl_predict_lbd_4, cfl_predict_lbd_8, cfl_predict_lbd_16, cfl_predict_lbd_32
- };
+ // Sizes 4, 8 and 16 reuse the SSSE3 version
+ static const cfl_predict_lbd_fn predict_lbd[4] = { cfl_predict_lbd_4_ssse3,
+ cfl_predict_lbd_8_ssse3,
+ cfl_predict_lbd_16_ssse3,
+ cfl_predict_lbd_32_avx2 };
return predict_lbd[(tx_size_wide_log2[tx_size] - tx_size_wide_log2[0]) & 3];
}
cfl_predict_hbd_fn get_predict_hbd_fn_avx2(TX_SIZE tx_size) {
- static const cfl_predict_hbd_fn predict_hbd[4] = {
- cfl_predict_hbd_4, cfl_predict_hbd_8, cfl_predict_hbd_16, cfl_predict_hbd_32
- };
+ static const cfl_predict_hbd_fn predict_hbd[4] = { cfl_predict_hbd_4_ssse3,
+ cfl_predict_hbd_8_ssse3,
+ cfl_predict_hbd_16_avx2,
+ cfl_predict_hbd_32_avx2 };
return predict_hbd[(tx_size_wide_log2[tx_size] - tx_size_wide_log2[0]) & 3];
}
diff --git a/av1/common/x86/cfl_simd.h b/av1/common/x86/cfl_simd.h
new file mode 100644
index 0000000..f796a74
--- /dev/null
+++ b/av1/common/x86/cfl_simd.h
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+// The SSSE3 version is optimal for width == 4; we reuse it in AVX2.
+void cfl_predict_lbd_4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, TX_SIZE tx_size, int alpha_q3);
+
+// The SSSE3 version is optimal for width == 8; we reuse it in AVX2.
+void cfl_predict_lbd_8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, TX_SIZE tx_size, int alpha_q3);
+
+// The SSSE3 version is optimal for width == 16; we reuse it in AVX2.
+void cfl_predict_lbd_16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst,
+ int dst_stride, TX_SIZE tx_size, int alpha_q3);
+
+// The SSSE3 version is optimal for width == 4; we reuse it in AVX2.
+void cfl_predict_hbd_4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, TX_SIZE tx_size, int alpha_q3,
+ int bd);
+
+// The SSSE3 version is optimal for width == 8; we reuse it in AVX2.
+void cfl_predict_hbd_8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst,
+ int dst_stride, TX_SIZE tx_size, int alpha_q3,
+ int bd);
diff --git a/av1/common/x86/cfl_ssse3.c b/av1/common/x86/cfl_ssse3.c
index 913131d..ff92811 100644
--- a/av1/common/x86/cfl_ssse3.c
+++ b/av1/common/x86/cfl_ssse3.c
@@ -15,6 +15,8 @@
#include "av1/common/cfl.h"
+#include "av1/common/x86/cfl_simd.h"
+
/**
* Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
* precise version of a box filter 4:2:0 pixel subsampling in Q3.
@@ -136,11 +138,9 @@
static INLINE void cfl_predict_hbd(__m128i *dst, __m128i *src,
__m128i alpha_q12, __m128i alpha_sign,
- __m128i dc_q0, int bd) {
- const __m128i max = highbd_max_epi16(bd);
- const __m128i zero = _mm_setzero_si128();
+ __m128i dc_q0, __m128i max) {
__m128i res = predict_unclipped(src, alpha_q12, alpha_sign, dc_q0);
- _mm_storeu_si128(dst, highbd_clamp_epi16(res, zero, max));
+ _mm_storeu_si128(dst, highbd_clamp_epi16(res, _mm_setzero_si128(), max));
}
static INLINE void cfl_predict_hbd_x(const int16_t *pred_buf_q3, uint16_t *dst,
@@ -149,68 +149,54 @@
uint16_t *row_end = dst + tx_size_high[tx_size] * dst_stride;
const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
- const __m128i dc_q0 = width == 4 ? _mm_loadl_epi64((__m128i *)dst)
- : _mm_load_si128((__m128i *)dst);
+ const __m128i dc_q0 = _mm_set1_epi16(*dst);
+ const __m128i max = highbd_max_epi16(bd);
do {
if (width == 4) {
- const __m128i max = highbd_max_epi16(bd);
- const __m128i zero = _mm_setzero_si128();
__m128i res = predict_unclipped((__m128i *)(pred_buf_q3), alpha_q12,
alpha_sign, dc_q0);
- _mm_storel_epi64((__m128i *)dst, highbd_clamp_epi16(res, zero, max));
+ _mm_storel_epi64((__m128i *)dst,
+ highbd_clamp_epi16(res, _mm_setzero_si128(), max));
} else {
cfl_predict_hbd((__m128i *)dst, (__m128i *)pred_buf_q3, alpha_q12,
- alpha_sign, dc_q0, bd);
+ alpha_sign, dc_q0, max);
}
if (width >= 16)
cfl_predict_hbd((__m128i *)(dst + 8), (__m128i *)(pred_buf_q3 + 8),
- alpha_q12, alpha_sign, dc_q0, bd);
+ alpha_q12, alpha_sign, dc_q0, max);
if (width == 32) {
cfl_predict_hbd((__m128i *)(dst + 16), (__m128i *)(pred_buf_q3 + 16),
- alpha_q12, alpha_sign, dc_q0, bd);
+ alpha_q12, alpha_sign, dc_q0, max);
cfl_predict_hbd((__m128i *)(dst + 24), (__m128i *)(pred_buf_q3 + 24),
- alpha_q12, alpha_sign, dc_q0, bd);
+ alpha_q12, alpha_sign, dc_q0, max);
}
dst += dst_stride;
pred_buf_q3 += CFL_BUF_LINE;
} while (dst < row_end);
}
-#define CFL_PREDICT_LBD_X(width) \
- static void cfl_predict_lbd_##width(const int16_t *pred_buf_q3, \
- uint8_t *dst, int dst_stride, \
- TX_SIZE tx_size, int alpha_q3) { \
- cfl_predict_lbd_x(pred_buf_q3, dst, dst_stride, tx_size, alpha_q3, width); \
- }
+CFL_PREDICT_LBD_X(4, ssse3)
+CFL_PREDICT_LBD_X(8, ssse3)
+CFL_PREDICT_LBD_X(16, ssse3)
+CFL_PREDICT_LBD_X(32, ssse3)
-CFL_PREDICT_LBD_X(4)
-CFL_PREDICT_LBD_X(8)
-CFL_PREDICT_LBD_X(16)
-CFL_PREDICT_LBD_X(32)
-
-#define CFL_PREDICT_HBD_X(width) \
- static void cfl_predict_hbd_##width(const int16_t *pred_buf_q3, \
- uint16_t *dst, int dst_stride, \
- TX_SIZE tx_size, int alpha_q3, int bd) { \
- cfl_predict_hbd_x(pred_buf_q3, dst, dst_stride, tx_size, alpha_q3, bd, \
- width); \
- }
-
-CFL_PREDICT_HBD_X(4)
-CFL_PREDICT_HBD_X(8)
-CFL_PREDICT_HBD_X(16)
-CFL_PREDICT_HBD_X(32)
+CFL_PREDICT_HBD_X(4, ssse3)
+CFL_PREDICT_HBD_X(8, ssse3)
+CFL_PREDICT_HBD_X(16, ssse3)
+CFL_PREDICT_HBD_X(32, ssse3)
cfl_predict_lbd_fn get_predict_lbd_fn_ssse3(TX_SIZE tx_size) {
- static const cfl_predict_lbd_fn predict_lbd[4] = {
- cfl_predict_lbd_4, cfl_predict_lbd_8, cfl_predict_lbd_16, cfl_predict_lbd_32
- };
+ static const cfl_predict_lbd_fn predict_lbd[4] = { cfl_predict_lbd_4_ssse3,
+ cfl_predict_lbd_8_ssse3,
+ cfl_predict_lbd_16_ssse3,
+ cfl_predict_lbd_32_ssse3 };
return predict_lbd[(tx_size_wide_log2[tx_size] - tx_size_wide_log2[0]) & 3];
}
cfl_predict_hbd_fn get_predict_hbd_fn_ssse3(TX_SIZE tx_size) {
- static const cfl_predict_hbd_fn predict_hbd[4] = {
- cfl_predict_hbd_4, cfl_predict_hbd_8, cfl_predict_hbd_16, cfl_predict_hbd_32
- };
+ static const cfl_predict_hbd_fn predict_hbd[4] = { cfl_predict_hbd_4_ssse3,
+ cfl_predict_hbd_8_ssse3,
+ cfl_predict_hbd_16_ssse3,
+ cfl_predict_hbd_32_ssse3 };
return predict_hbd[(tx_size_wide_log2[tx_size] - tx_size_wide_log2[0]) & 3];
}