[CFL] SSSE3/AVX2/NEON 4:2:2 Subsampling
SSSE3/Subsample422SpeedTest
4x4: C time = 104 us, SIMD time = 48 us (~2.2x)
8x8: C time = 351 us, SIMD time = 80 us (~4.4x)
16x16: C time = 1454 us, SIMD time = 163 us (~8.9x)
32x32: C time = 5345 us, SIMD time = 680 us (~7.9x)
AVX2/Subsample422SpeedTest
32x32: C time = 5382 us, SIMD time = 782 us (~6.9x)
NEON/Subsample422SpeedTest
4x4: C time = 1131 us, SIMD time = 768 us (~1.5x)
8x8: C time = 4214 us, SIMD time = 1451 us (~2.9x)
16x16: C time = 16898 us, SIMD time = 3529 us (~4.8x)
32x32: C time = 66097 us, SIMD time = 13776 us (~4.8x)
Change-Id: Iaa0153222bd214d48d7661b20ac06d1ddbd48997
diff --git a/av1/common/arm/cfl_neon.c b/av1/common/arm/cfl_neon.c
index beaaa13..8bc8489 100644
--- a/av1/common/arm/cfl_neon.c
+++ b/av1/common/arm/cfl_neon.c
@@ -63,6 +63,31 @@
} while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}
+static void cfl_luma_subsampling_422_lbd_neon(const uint8_t *input,
+ int input_stride,
+ int16_t *pred_buf_q3, int width,
+ int height) {
+ const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+ do {
+ if (width == 4) {
+ const uint16x4_t top = vpaddl_u8(vldh_dup_u8(input));
+ vsth_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(top), 2));
+ } else if (width == 8) {
+ const uint16x4_t top = vpaddl_u8(vld1_u8(input));
+ vst1_s16(pred_buf_q3, vshl_n_s16(vreinterpret_s16_u16(top), 2));
+ } else {
+ const uint16x8_t top = vpaddlq_u8(vld1q_u8(input));
+ vst1q_s16(pred_buf_q3, vshlq_n_s16(vreinterpretq_s16_u16(top), 2));
+ if (width == 32) {
+ const uint16x8_t next_top = vpaddlq_u8(vld1q_u8(input + 16));
+ vst1q_s16(pred_buf_q3 + 8,
+ vshlq_n_s16(vreinterpretq_s16_u16(next_top), 2));
+ }
+ }
+ input += input_stride;
+ } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
+}
+
CFL_GET_SUBSAMPLE_FUNCTION(neon)
static INLINE void subtract_average_neon(int16_t *pred_buf, int width,
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index cc2e016..9b3a83b 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -353,6 +353,9 @@
add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd/, "TX_SIZE tx_size";
specialize qw/cfl_get_luma_subsampling_420_lbd ssse3 avx2 neon/;
+add_proto qw/cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd/, "TX_SIZE tx_size";
+specialize qw/cfl_get_luma_subsampling_422_lbd ssse3 avx2 neon/;
+
add_proto qw/cfl_predict_lbd_fn get_predict_lbd_fn/, "TX_SIZE tx_size";
specialize qw/get_predict_lbd_fn ssse3 avx2/;
diff --git a/av1/common/cfl.c b/av1/common/cfl.c
index ac16786..41433b0 100644
--- a/av1/common/cfl.c
+++ b/av1/common/cfl.c
@@ -335,10 +335,6 @@
}
}
-// TODO(ltrudeau) Move into the CFL_GET_SUBSAMPLE_FUNCTION when LBD 422 SIMD
-// will be implemented
-CFL_SUBSAMPLE_FUNCTIONS(c, 422, lbd)
-
// TODO(ltrudeau) Move into the CFL_GET_SUBSAMPLE_FUNCTION when LBD 444 SIMD
// will be implemented
CFL_SUBSAMPLE_FUNCTIONS(c, 444, lbd)
@@ -357,41 +353,6 @@
CFL_GET_SUBSAMPLE_FUNCTION(c)
-// TODO(ltrudeau) Move into the CFL_GET_SUBSAMPLE_FUNCTION when LBD 444 SIMD
-// will be implemented
-cfl_subsample_hbd_fn cfl_get_luma_subsampling_420_hbd_c(TX_SIZE tx_size) {
- CFL_SUBSAMPLE_FUNCTION_ARRAY(c, 420, hbd)
- return subfn_420[tx_size];
-}
-
-// TODO(ltrudeau) Move into the CFL_GET_SUBSAMPLE_FUNCTION when LBD 444 SIMD
-// will be implemented
-cfl_subsample_hbd_fn cfl_get_luma_subsampling_422_hbd_c(TX_SIZE tx_size) {
- CFL_SUBSAMPLE_FUNCTION_ARRAY(c, 422, hbd)
- return subfn_422[tx_size];
-}
-
-// TODO(ltrudeau) Move into the CFL_GET_SUBSAMPLE_FUNCTION when LBD 444 SIMD
-// will be implemented
-cfl_subsample_hbd_fn cfl_get_luma_subsampling_444_hbd_c(TX_SIZE tx_size) {
- CFL_SUBSAMPLE_FUNCTION_ARRAY(c, 444, hbd)
- return subfn_444[tx_size];
-}
-
-// TODO(ltrudeau) Move into the CFL_GET_SUBSAMPLE_FUNCTION when LBD 422 SIMD
-// will be implemented
-cfl_subsample_lbd_fn cfl_get_luma_subsampling_422_lbd_c(TX_SIZE tx_size) {
- CFL_SUBSAMPLE_FUNCTION_ARRAY(c, 422, lbd)
- return subfn_422[tx_size];
-}
-
-// TODO(ltrudeau) Move into the CFL_GET_SUBSAMPLE_FUNCTION when LBD 444 SIMD
-// will be implemented
-cfl_subsample_lbd_fn cfl_get_luma_subsampling_444_lbd_c(TX_SIZE tx_size) {
- CFL_SUBSAMPLE_FUNCTION_ARRAY(c, 444, lbd)
- return subfn_444[tx_size];
-}
-
static INLINE cfl_subsample_hbd_fn cfl_subsampling_hbd(TX_SIZE tx_size,
int sub_x, int sub_y) {
if (sub_x == 1) {
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index cb76b8d..2850c02 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -78,21 +78,26 @@
}
// Declare size-specific wrappers for all valid CfL sizes.
-#define CFL_SUBSAMPLE_FUNCTIONS(arch, sub, bd) \
- CFL_SUBSAMPLE(arch, sub, bd, 4, 4) \
- CFL_SUBSAMPLE(arch, sub, bd, 8, 8) \
- CFL_SUBSAMPLE(arch, sub, bd, 16, 16) \
- CFL_SUBSAMPLE(arch, sub, bd, 32, 32) \
- CFL_SUBSAMPLE(arch, sub, bd, 4, 8) \
- CFL_SUBSAMPLE(arch, sub, bd, 8, 4) \
- CFL_SUBSAMPLE(arch, sub, bd, 8, 16) \
- CFL_SUBSAMPLE(arch, sub, bd, 16, 8) \
- CFL_SUBSAMPLE(arch, sub, bd, 16, 32) \
- CFL_SUBSAMPLE(arch, sub, bd, 32, 16) \
- CFL_SUBSAMPLE(arch, sub, bd, 4, 16) \
- CFL_SUBSAMPLE(arch, sub, bd, 16, 4) \
- CFL_SUBSAMPLE(arch, sub, bd, 8, 32) \
- CFL_SUBSAMPLE(arch, sub, bd, 32, 8)
+#define CFL_SUBSAMPLE_FUNCTIONS(arch, sub, bd) \
+ CFL_SUBSAMPLE(arch, sub, bd, 4, 4) \
+ CFL_SUBSAMPLE(arch, sub, bd, 8, 8) \
+ CFL_SUBSAMPLE(arch, sub, bd, 16, 16) \
+ CFL_SUBSAMPLE(arch, sub, bd, 32, 32) \
+ CFL_SUBSAMPLE(arch, sub, bd, 4, 8) \
+ CFL_SUBSAMPLE(arch, sub, bd, 8, 4) \
+ CFL_SUBSAMPLE(arch, sub, bd, 8, 16) \
+ CFL_SUBSAMPLE(arch, sub, bd, 16, 8) \
+ CFL_SUBSAMPLE(arch, sub, bd, 16, 32) \
+ CFL_SUBSAMPLE(arch, sub, bd, 32, 16) \
+ CFL_SUBSAMPLE(arch, sub, bd, 4, 16) \
+ CFL_SUBSAMPLE(arch, sub, bd, 16, 4) \
+ CFL_SUBSAMPLE(arch, sub, bd, 8, 32) \
+ CFL_SUBSAMPLE(arch, sub, bd, 32, 8) \
+ cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_##arch( \
+ TX_SIZE tx_size) { \
+ CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \
+ return subfn_##sub[tx_size]; \
+ }
// Declare an architecture-specific array of function pointers for size-specific
// wrappers.
@@ -121,13 +126,9 @@
// The RTCD script does not support passing in the an array, so we wrap it in
// this function.
-#define CFL_GET_SUBSAMPLE_FUNCTION(arch) \
- CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
- cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_##arch( \
- TX_SIZE tx_size) { \
- CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, 420, lbd) \
- return subfn_420[tx_size]; \
- }
+#define CFL_GET_SUBSAMPLE_FUNCTION(arch) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 420, lbd) \
+ CFL_SUBSAMPLE_FUNCTIONS(arch, 422, lbd)
// Null function used for invalid tx_sizes
static INLINE void cfl_subtract_average_null(int16_t *pred_buf_q3) {
diff --git a/av1/common/x86/cfl_avx2.c b/av1/common/x86/cfl_avx2.c
index 5d3a141..ae096e7 100644
--- a/av1/common/x86/cfl_avx2.c
+++ b/av1/common/x86/cfl_avx2.c
@@ -16,6 +16,36 @@
#include "av1/common/x86/cfl_simd.h"
+#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \
+ CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \
+ cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \
+ TX_SIZE tx_size) { \
+ static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \
+ subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \
+ subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \
+ subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \
+ subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \
+ cfl_subsample_##bd##_null, /* 64x64 (invalid CFL size) */ \
+ subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \
+ subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \
+ subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \
+ subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \
+ subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \
+ subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \
+ cfl_subsample_##bd##_null, /* 32x64 (invalid CFL size) */ \
+ cfl_subsample_##bd##_null, /* 64x32 (invalid CFL size) */ \
+ subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \
+ subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \
+ subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \
+ subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \
+ cfl_subsample_##bd##_null, /* 16x64 (invalid CFL size) */ \
+ cfl_subsample_##bd##_null, /* 64x16 (invalid CFL size) */ \
+ }; \
+ return subfn_##sub[tx_size]; \
+ }
+
/**
* Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
* precise version of a box filter 4:2:0 pixel subsampling in Q3.
@@ -51,35 +81,36 @@
} while ((row += CFL_BUF_LINE_I256) < row_end);
}
-CFL_SUBSAMPLE(avx2, 420, lbd, 32, 32)
-CFL_SUBSAMPLE(avx2, 420, lbd, 32, 16)
-CFL_SUBSAMPLE(avx2, 420, lbd, 32, 8)
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(420, lbd)
-cfl_subsample_lbd_fn cfl_get_luma_subsampling_420_lbd_avx2(TX_SIZE tx_size) {
- static const cfl_subsample_lbd_fn subfn_420[TX_SIZES_ALL] = {
- subsample_lbd_420_4x4_ssse3, /* 4x4 */
- subsample_lbd_420_8x8_ssse3, /* 8x8 */
- subsample_lbd_420_16x16_ssse3, /* 16x16 */
- subsample_lbd_420_32x32_avx2, /* 32x32 */
- cfl_subsample_lbd_null, /* 64x64 (invalid CFL size) */
- subsample_lbd_420_4x8_ssse3, /* 4x8 */
- subsample_lbd_420_8x4_ssse3, /* 8x4 */
- subsample_lbd_420_8x16_ssse3, /* 8x16 */
- subsample_lbd_420_16x8_ssse3, /* 16x8 */
- subsample_lbd_420_16x32_ssse3, /* 16x32 */
- subsample_lbd_420_32x16_avx2, /* 32x16 */
- cfl_subsample_lbd_null, /* 32x64 (invalid CFL size) */
- cfl_subsample_lbd_null, /* 64x32 (invalid CFL size) */
- subsample_lbd_420_4x16_ssse3, /* 4x16 */
- subsample_lbd_420_16x4_ssse3, /* 16x4 */
- subsample_lbd_420_8x32_ssse3, /* 8x32 */
- subsample_lbd_420_32x8_avx2, /* 32x8 */
- cfl_subsample_lbd_null, /* 16x64 (invalid CFL size) */
- cfl_subsample_lbd_null, /* 64x16 (invalid CFL size) */
- };
- return subfn_420[tx_size];
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static void cfl_luma_subsampling_422_lbd_avx2(const uint8_t *input,
+ int input_stride,
+ int16_t *pred_buf_q3, int width,
+ int height) {
+ (void)width; // Forever 32
+ const __m256i fours = _mm256_set1_epi8(4); // Thirty two fours
+ __m256i *row = (__m256i *)pred_buf_q3;
+ const __m256i *row_end = row + height * CFL_BUF_LINE_I256;
+ do {
+ __m256i top = _mm256_loadu_si256((__m256i *)input);
+ __m256i top_16x16 = _mm256_maddubs_epi16(top, fours);
+ _mm256_storeu_si256(row, top_16x16);
+ input += input_stride;
+ } while ((row += CFL_BUF_LINE_I256) < row_end);
}
+CFL_GET_SUBSAMPLE_FUNCTION_AVX2(422, lbd)
+
static INLINE __m256i predict_unclipped(const __m256i *input, __m256i alpha_q12,
__m256i alpha_sign, __m256i dc_q0) {
__m256i ac_q3 = _mm256_loadu_si256(input);
@@ -136,8 +167,8 @@
cfl_predict_lbd_null, /* 16x64 (invalid CFL size) */
cfl_predict_lbd_null, /* 64x16 (invalid CFL size) */
};
- /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
- */ /* index the function pointer array out of bounds. */
+ // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the
+ // function pointer array out of bounds.
return pred[tx_size % TX_SIZES_ALL];
}
diff --git a/av1/common/x86/cfl_simd.h b/av1/common/x86/cfl_simd.h
index 058d170..88968e0 100644
--- a/av1/common/x86/cfl_simd.h
+++ b/av1/common/x86/cfl_simd.h
@@ -39,6 +39,34 @@
void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride,
int16_t *output_q3);
+// SSSE3 version is optimal for width == 4, we reuse them in AVX2
+void subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride,
+ int16_t *output_q3);
+void subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride,
+ int16_t *output_q3);
+void subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride,
+ int16_t *output_q3);
+
+// SSSE3 version is optimal for width == 8, we reuse it in AVX2
+void subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride,
+ int16_t *output_q3);
+void subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride,
+ int16_t *output_q3);
+void subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride,
+ int16_t *output_q3);
+void subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride,
+ int16_t *output_q3);
+
+// SSSE3 version is optimal for width == 16, we reuse it in AVX2
+void subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride,
+ int16_t *output_q3);
+void subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride,
+ int16_t *output_q3);
+void subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride,
+ int16_t *output_q3);
+void subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride,
+ int16_t *output_q3);
+
// SSE2 version is optimal for with == 4, we reuse them in AVX2
void subtract_average_4x4_sse2(int16_t *pred_buf_q3);
void subtract_average_4x8_sse2(int16_t *pred_buf_q3);
diff --git a/av1/common/x86/cfl_ssse3.c b/av1/common/x86/cfl_ssse3.c
index 735b806..2e9eb35 100644
--- a/av1/common/x86/cfl_ssse3.c
+++ b/av1/common/x86/cfl_ssse3.c
@@ -78,6 +78,54 @@
} while (pred_buf_q3 < end);
}
+/**
+ * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
+ * precise version of a box filter 4:2:2 pixel subsampling in Q3.
+ *
+ * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
+ * active area is specified using width and height.
+ *
+ * Note: We don't need to worry about going over the active area, as long as we
+ * stay inside the CfL prediction buffer.
+ */
+static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input,
+ int input_stride,
+ int16_t *pred_buf_q3,
+ int width, int height) {
+ const __m128i fours = _mm_set1_epi8(4);
+ const int16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
+ const int luma_stride = input_stride;
+ __m128i top, next_top, top_16x8, next_top_16x8;
+ do {
+ switch (width) {
+ case 4: top = _mm_cvtsi32_si128(*((int *)input)); break;
+ case 8: top = _mm_loadl_epi64((__m128i *)input); break;
+ case 16: top = _mm_loadu_si128((__m128i *)input); break;
+ case 32:
+ top = _mm_loadu_si128((__m128i *)input);
+ next_top = _mm_loadu_si128((__m128i *)(input + 16));
+ break;
+ default: assert(0);
+ }
+ top_16x8 = _mm_maddubs_epi16(top, fours);
+ if (width == 32) {
+ next_top_16x8 = _mm_maddubs_epi16(next_top, fours);
+ }
+ switch (width) {
+ case 4: *((int *)pred_buf_q3) = _mm_cvtsi128_si32(top_16x8); break;
+ case 8: _mm_storel_epi64((__m128i *)pred_buf_q3, top_16x8); break;
+ case 16: _mm_storeu_si128((__m128i *)pred_buf_q3, top_16x8); break;
+ case 32:
+ _mm_storeu_si128((__m128i *)pred_buf_q3, top_16x8);
+ _mm_storeu_si128((__m128i *)(pred_buf_q3 + 8), next_top_16x8);
+ break;
+ default: assert(0);
+ }
+ input += luma_stride;
+ pred_buf_q3 += CFL_BUF_LINE;
+ } while (pred_buf_q3 < end);
+}
+
CFL_GET_SUBSAMPLE_FUNCTION(ssse3)
static INLINE __m128i predict_unclipped(const __m128i *input, __m128i alpha_q12,
diff --git a/test/cfl_test.cc b/test/cfl_test.cc
index dadbfdc..41b0057 100644
--- a/test/cfl_test.cc
+++ b/test/cfl_test.cc
@@ -31,6 +31,21 @@
make_tuple(TX_16X32, &function), make_tuple(TX_32X8, &function), \
make_tuple(TX_32X16, &function), make_tuple(TX_32X32, &function)
+#define ALL_CFL_TX_SIZES_SUBSAMPLE(fun420, fun422) \
+ make_tuple(TX_4X4, &fun420, &fun422), make_tuple(TX_4X8, &fun420, &fun422), \
+ make_tuple(TX_4X16, &fun420, &fun422), \
+ make_tuple(TX_8X4, &fun420, &fun422), \
+ make_tuple(TX_8X8, &fun420, &fun422), \
+ make_tuple(TX_8X16, &fun420, &fun422), \
+ make_tuple(TX_8X32, &fun420, &fun422), \
+ make_tuple(TX_16X4, &fun420, &fun422), \
+ make_tuple(TX_16X8, &fun420, &fun422), \
+ make_tuple(TX_16X16, &fun420, &fun422), \
+ make_tuple(TX_16X32, &fun420, &fun422), \
+ make_tuple(TX_32X8, &fun420, &fun422), \
+ make_tuple(TX_32X16, &fun420, &fun422), \
+ make_tuple(TX_32X32, &fun420, &fun422)
+
namespace {
template <typename A>
@@ -187,7 +202,8 @@
}
typedef cfl_subsample_lbd_fn (*get_subsample_fn)(TX_SIZE tx_size);
-typedef ::testing::tuple<TX_SIZE, get_subsample_fn> subsample_param;
+typedef ::testing::tuple<TX_SIZE, get_subsample_fn, get_subsample_fn>
+ subsample_param;
class CFLSubsampleTest : public ::testing::TestWithParam<subsample_param>,
public CFLTestWithData<uint8_t> {
public:
@@ -196,12 +212,16 @@
virtual void SetUp() {
CFLTest::init(::testing::get<0>(this->GetParam()));
fun_420 = ::testing::get<1>(this->GetParam())(tx_size);
+ fun_422 = ::testing::get<2>(this->GetParam())(tx_size);
fun_420_ref = cfl_get_luma_subsampling_420_lbd_c(tx_size);
+ fun_422_ref = cfl_get_luma_subsampling_422_lbd_c(tx_size);
}
protected:
cfl_subsample_lbd_fn fun_420;
+ cfl_subsample_lbd_fn fun_422;
cfl_subsample_lbd_fn fun_420_ref;
+ cfl_subsample_lbd_fn fun_422_ref;
void subsampleTest(cfl_subsample_lbd_fn fun, cfl_subsample_lbd_fn fun_ref,
int sub_width, int sub_height) {
@@ -251,6 +271,14 @@
subsampleSpeedTest(fun_420, fun_420_ref);
}
+TEST_P(CFLSubsampleTest, Subsample422Test) {
+ subsampleTest(fun_422, fun_422_ref, width >> 1, height);
+}
+
+TEST_P(CFLSubsampleTest, DISABLED_Subsample422SpeedTest) {
+ subsampleSpeedTest(fun_422, fun_422_ref);
+}
+
typedef cfl_predict_lbd_fn (*get_predict_fn)(TX_SIZE tx_size);
typedef ::testing::tuple<TX_SIZE, get_predict_fn> predict_param;
class CFLPredictTest : public ::testing::TestWithParam<predict_param>,
@@ -360,8 +388,9 @@
#if HAVE_SSSE3
-const subsample_param subsample_sizes_ssse3[] = { ALL_CFL_TX_SIZES(
- cfl_get_luma_subsampling_420_lbd_ssse3) };
+const subsample_param subsample_sizes_ssse3[] = { ALL_CFL_TX_SIZES_SUBSAMPLE(
+ cfl_get_luma_subsampling_420_lbd_ssse3,
+ cfl_get_luma_subsampling_422_lbd_ssse3) };
const predict_param predict_sizes_ssse3[] = { ALL_CFL_TX_SIZES(
get_predict_lbd_fn_ssse3) };
@@ -383,8 +412,9 @@
const sub_avg_param sub_avg_sizes_avx2[] = { ALL_CFL_TX_SIZES(
get_subtract_average_fn_avx2) };
-const subsample_param subsample_sizes_avx2[] = { ALL_CFL_TX_SIZES(
- cfl_get_luma_subsampling_420_lbd_avx2) };
+const subsample_param subsample_sizes_avx2[] = { ALL_CFL_TX_SIZES_SUBSAMPLE(
+ cfl_get_luma_subsampling_420_lbd_avx2,
+ cfl_get_luma_subsampling_422_lbd_avx2) };
const predict_param predict_sizes_avx2[] = { ALL_CFL_TX_SIZES(
get_predict_lbd_fn_avx2) };
@@ -409,8 +439,9 @@
const sub_avg_param sub_avg_sizes_neon[] = { ALL_CFL_TX_SIZES(
get_subtract_average_fn_neon) };
-const subsample_param subsample_sizes_neon[] = { ALL_CFL_TX_SIZES(
- cfl_get_luma_subsampling_420_lbd_neon) };
+const subsample_param subsample_sizes_neon[] = { ALL_CFL_TX_SIZES_SUBSAMPLE(
+ cfl_get_luma_subsampling_420_lbd_neon,
+ cfl_get_luma_subsampling_422_lbd_neon) };
INSTANTIATE_TEST_CASE_P(NEON, CFLSubAvgTest,
::testing::ValuesIn(sub_avg_sizes_neon));