Prefix a lot of global cfl functions with cfl_

BUG=aomedia:1540

Change-Id: If4cc3eef4f6ae35d8f270f99503a8f5d7e715049
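Illustration only, not part of the patch: with the rename, the size-specific
wrappers stamped out by CFL_SUBSAMPLE() pick up the cfl_ prefix, so
CFL_SUBSAMPLE(ssse3, 420, lbd, 4, 4) now emits a symbol along these lines
(assuming CFL_lbd_TYPE expands to a uint8_t *cfl_type parameter, as the lbd
declarations in cfl_simd.h suggest):

    /* Hypothetical expansion of CFL_SUBSAMPLE(ssse3, 420, lbd, 4, 4) after
     * this change; the previous symbol name was subsample_lbd_420_4x4_ssse3. */
    void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *cfl_type,
                                         int input_stride,
                                         uint16_t *output_q3) {
      cfl_luma_subsampling_420_lbd_ssse3(cfl_type, input_stride, output_q3,
                                         4, 4);
    }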
diff --git a/av1/common/cfl.h b/av1/common/cfl.h
index 052bf88..f6965ee 100644
--- a/av1/common/cfl.h
+++ b/av1/common/cfl.h
@@ -89,7 +89,7 @@ // will be constant allowing for loop unrolling and other constant propagated // goodness. #define CFL_SUBSAMPLE(arch, sub, bd, width, height) \ - void subsample_##bd##_##sub##_##width##x##height##_##arch( \ + void cfl_subsample_##bd##_##sub##_##width##x##height##_##arch( \ const CFL_##bd##_TYPE, int input_stride, uint16_t *output_q3) { \ cfl_luma_subsampling_##sub##_##bd##_##arch(cfl_type, input_stride, \ output_q3, width, height); \ @@ -119,27 +119,27 @@ // Declare an architecture-specific array of function pointers for size-specific // wrappers. -#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \ - static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ - subsample_##bd##_##sub##_4x4_##arch, /* 4x4 */ \ - subsample_##bd##_##sub##_8x8_##arch, /* 8x8 */ \ - subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */ \ - subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */ \ - NULL, /* 64x64 (invalid CFL size) */ \ - subsample_##bd##_##sub##_4x8_##arch, /* 4x8 */ \ - subsample_##bd##_##sub##_8x4_##arch, /* 8x4 */ \ - subsample_##bd##_##sub##_8x16_##arch, /* 8x16 */ \ - subsample_##bd##_##sub##_16x8_##arch, /* 16x8 */ \ - subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */ \ - subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */ \ - NULL, /* 32x64 (invalid CFL size) */ \ - NULL, /* 64x32 (invalid CFL size) */ \ - subsample_##bd##_##sub##_4x16_##arch, /* 4x16 */ \ - subsample_##bd##_##sub##_16x4_##arch, /* 16x4 */ \ - subsample_##bd##_##sub##_8x32_##arch, /* 8x32 */ \ - subsample_##bd##_##sub##_32x8_##arch, /* 32x8 */ \ - NULL, /* 16x64 (invalid CFL size) */ \ - NULL, /* 64x16 (invalid CFL size) */ \ +#define CFL_SUBSAMPLE_FUNCTION_ARRAY(arch, sub, bd) \ + static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ + cfl_subsample_##bd##_##sub##_4x4_##arch, /* 4x4 */ \ + cfl_subsample_##bd##_##sub##_8x8_##arch, /* 8x8 */ \ + cfl_subsample_##bd##_##sub##_16x16_##arch, /* 16x16 */ \ + cfl_subsample_##bd##_##sub##_32x32_##arch, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x8_##arch, /* 4x8 */ \ + cfl_subsample_##bd##_##sub##_8x4_##arch, /* 8x4 */ \ + cfl_subsample_##bd##_##sub##_8x16_##arch, /* 8x16 */ \ + cfl_subsample_##bd##_##sub##_16x8_##arch, /* 16x8 */ \ + cfl_subsample_##bd##_##sub##_16x32_##arch, /* 16x32 */ \ + cfl_subsample_##bd##_##sub##_32x16_##arch, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x16_##arch, /* 4x16 */ \ + cfl_subsample_##bd##_##sub##_16x4_##arch, /* 16x4 */ \ + cfl_subsample_##bd##_##sub##_8x32_##arch, /* 8x32 */ \ + cfl_subsample_##bd##_##sub##_32x8_##arch, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ }; // The RTCD script does not support passing in an array, so we wrap it in this @@ -156,11 +156,11 @@ // will inline the size generic function in here, the advantage is that the size // will be constant allowing for loop unrolling and other constant propagated // goodness. 
-#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \ - void subtract_average_##width##x##height##_##arch(const uint16_t *src, \ - int16_t *dst) { \ - subtract_average_##arch(src, dst, width, height, round_offset, \ - num_pel_log2); \ +#define CFL_SUB_AVG_X(arch, width, height, round_offset, num_pel_log2) \ + void cfl_subtract_average_##width##x##height##_##arch(const uint16_t *src, \ + int16_t *dst) { \ + subtract_average_##arch(src, dst, width, height, round_offset, \ + num_pel_log2); \ } // Declare size-specific wrappers for all valid CfL sizes. @@ -182,25 +182,25 @@ cfl_subtract_average_fn cfl_get_subtract_average_fn_##arch( \ TX_SIZE tx_size) { \ static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { \ - subtract_average_4x4_##arch, /* 4x4 */ \ - subtract_average_8x8_##arch, /* 8x8 */ \ - subtract_average_16x16_##arch, /* 16x16 */ \ - subtract_average_32x32_##arch, /* 32x32 */ \ - NULL, /* 64x64 (invalid CFL size) */ \ - subtract_average_4x8_##arch, /* 4x8 */ \ - subtract_average_8x4_##arch, /* 8x4 */ \ - subtract_average_8x16_##arch, /* 8x16 */ \ - subtract_average_16x8_##arch, /* 16x8 */ \ - subtract_average_16x32_##arch, /* 16x32 */ \ - subtract_average_32x16_##arch, /* 32x16 */ \ - NULL, /* 32x64 (invalid CFL size) */ \ - NULL, /* 64x32 (invalid CFL size) */ \ - subtract_average_4x16_##arch, /* 4x16 (invalid CFL size) */ \ - subtract_average_16x4_##arch, /* 16x4 (invalid CFL size) */ \ - subtract_average_8x32_##arch, /* 8x32 (invalid CFL size) */ \ - subtract_average_32x8_##arch, /* 32x8 (invalid CFL size) */ \ - NULL, /* 16x64 (invalid CFL size) */ \ - NULL, /* 64x16 (invalid CFL size) */ \ + cfl_subtract_average_4x4_##arch, /* 4x4 */ \ + cfl_subtract_average_8x8_##arch, /* 8x8 */ \ + cfl_subtract_average_16x16_##arch, /* 16x16 */ \ + cfl_subtract_average_32x32_##arch, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subtract_average_4x8_##arch, /* 4x8 */ \ + cfl_subtract_average_8x4_##arch, /* 8x4 */ \ + cfl_subtract_average_8x16_##arch, /* 8x16 */ \ + cfl_subtract_average_16x8_##arch, /* 16x8 */ \ + cfl_subtract_average_16x32_##arch, /* 16x32 */ \ + cfl_subtract_average_32x16_##arch, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_subtract_average_4x16_##arch, /* 4x16 (invalid CFL size) */ \ + cfl_subtract_average_16x4_##arch, /* 16x4 (invalid CFL size) */ \ + cfl_subtract_average_8x32_##arch, /* 8x32 (invalid CFL size) */ \ + cfl_subtract_average_32x8_##arch, /* 32x8 (invalid CFL size) */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ }; \ /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \ /* index the function pointer array out of bounds. */ \ @@ -209,24 +209,24 @@ // For VSX SIMD optimization, the C versions of width == 4 subtract are // faster than the VSX. As such, the VSX code calls the C versions. 
-void subtract_average_4x4_c(const uint16_t *src, int16_t *dst); -void subtract_average_4x8_c(const uint16_t *src, int16_t *dst); -void subtract_average_4x16_c(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x4_c(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x8_c(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x16_c(const uint16_t *src, int16_t *dst); -#define CFL_PREDICT_lbd(arch, width, height) \ - void predict_lbd_##width##x##height##_##arch(const int16_t *pred_buf_q3, \ - uint8_t *dst, int dst_stride, \ - int alpha_q3) { \ - cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \ - height); \ +#define CFL_PREDICT_lbd(arch, width, height) \ + void cfl_predict_lbd_##width##x##height##_##arch( \ + const int16_t *pred_buf_q3, uint8_t *dst, int dst_stride, \ + int alpha_q3) { \ + cfl_predict_lbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, width, \ + height); \ } -#define CFL_PREDICT_hbd(arch, width, height) \ - void predict_hbd_##width##x##height##_##arch(const int16_t *pred_buf_q3, \ - uint16_t *dst, int dst_stride, \ - int alpha_q3, int bd) { \ - cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \ - height); \ +#define CFL_PREDICT_hbd(arch, width, height) \ + void cfl_predict_hbd_##width##x##height##_##arch( \ + const int16_t *pred_buf_q3, uint16_t *dst, int dst_stride, int alpha_q3, \ + int bd) { \ + cfl_predict_hbd_##arch(pred_buf_q3, dst, dst_stride, alpha_q3, bd, width, \ + height); \ } // This wrapper exists because clang format does not like calling macros with @@ -251,25 +251,25 @@ CFL_PREDICT_X(arch, 32, 32, bd) \ cfl_predict_##bd##_fn cfl_get_predict_##bd##_fn_##arch(TX_SIZE tx_size) { \ static const cfl_predict_##bd##_fn pred[TX_SIZES_ALL] = { \ - predict_##bd##_4x4_##arch, /* 4x4 */ \ - predict_##bd##_8x8_##arch, /* 8x8 */ \ - predict_##bd##_16x16_##arch, /* 16x16 */ \ - predict_##bd##_32x32_##arch, /* 32x32 */ \ - NULL, /* 64x64 (invalid CFL size) */ \ - predict_##bd##_4x8_##arch, /* 4x8 */ \ - predict_##bd##_8x4_##arch, /* 8x4 */ \ - predict_##bd##_8x16_##arch, /* 8x16 */ \ - predict_##bd##_16x8_##arch, /* 16x8 */ \ - predict_##bd##_16x32_##arch, /* 16x32 */ \ - predict_##bd##_32x16_##arch, /* 32x16 */ \ - NULL, /* 32x64 (invalid CFL size) */ \ - NULL, /* 64x32 (invalid CFL size) */ \ - predict_##bd##_4x16_##arch, /* 4x16 */ \ - predict_##bd##_16x4_##arch, /* 16x4 */ \ - predict_##bd##_8x32_##arch, /* 8x32 */ \ - predict_##bd##_32x8_##arch, /* 32x8 */ \ - NULL, /* 16x64 (invalid CFL size) */ \ - NULL, /* 64x16 (invalid CFL size) */ \ + cfl_predict_##bd##_4x4_##arch, /* 4x4 */ \ + cfl_predict_##bd##_8x8_##arch, /* 8x8 */ \ + cfl_predict_##bd##_16x16_##arch, /* 16x16 */ \ + cfl_predict_##bd##_32x32_##arch, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_predict_##bd##_4x8_##arch, /* 4x8 */ \ + cfl_predict_##bd##_8x4_##arch, /* 8x4 */ \ + cfl_predict_##bd##_8x16_##arch, /* 8x16 */ \ + cfl_predict_##bd##_16x8_##arch, /* 16x8 */ \ + cfl_predict_##bd##_16x32_##arch, /* 16x32 */ \ + cfl_predict_##bd##_32x16_##arch, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_predict_##bd##_4x16_##arch, /* 4x16 */ \ + cfl_predict_##bd##_16x4_##arch, /* 16x4 */ \ + cfl_predict_##bd##_8x32_##arch, /* 8x32 */ \ + cfl_predict_##bd##_32x8_##arch, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ }; \ /* Modulo TX_SIZES_ALL to ensure that an attacker won't be able to */ \ /* index the function pointer 
array out of bounds. */ \
diff --git a/av1/common/ppc/cfl_ppc.c b/av1/common/ppc/cfl_ppc.c
index dca860b..6f88768 100644
--- a/av1/common/ppc/cfl_ppc.c
+++ b/av1/common/ppc/cfl_ppc.c
@@ -126,25 +126,25 @@
 // load and store intrinsics). So we call the C code for block widths 4.
 cfl_subtract_average_fn cfl_get_subtract_average_fn_vsx(TX_SIZE tx_size) {
   static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = {
-    subtract_average_4x4_c,     /* 4x4 */
-    subtract_average_8x8_vsx,   /* 8x8 */
-    subtract_average_16x16_vsx, /* 16x16 */
-    subtract_average_32x32_vsx, /* 32x32 */
-    NULL,                       /* 64x64 (invalid CFL size) */
-    subtract_average_4x8_c,     /* 4x8 */
-    subtract_average_8x4_vsx,   /* 8x4 */
-    subtract_average_8x16_vsx,  /* 8x16 */
-    subtract_average_16x8_vsx,  /* 16x8 */
-    subtract_average_16x32_vsx, /* 16x32 */
-    subtract_average_32x16_vsx, /* 32x16 */
-    NULL,                       /* 32x64 (invalid CFL size) */
-    NULL,                       /* 64x32 (invalid CFL size) */
-    subtract_average_4x16_c,    /* 4x16 */
-    subtract_average_16x4_vsx,  /* 16x4 */
-    subtract_average_8x32_vsx,  /* 8x32 */
-    subtract_average_32x8_vsx,  /* 32x8 */
-    NULL,                       /* 16x64 (invalid CFL size) */
-    NULL,                       /* 64x16 (invalid CFL size) */
+    cfl_subtract_average_4x4_c,     /* 4x4 */
+    cfl_subtract_average_8x8_vsx,   /* 8x8 */
+    cfl_subtract_average_16x16_vsx, /* 16x16 */
+    cfl_subtract_average_32x32_vsx, /* 32x32 */
+    NULL,                           /* 64x64 (invalid CFL size) */
+    cfl_subtract_average_4x8_c,     /* 4x8 */
+    cfl_subtract_average_8x4_vsx,   /* 8x4 */
+    cfl_subtract_average_8x16_vsx,  /* 8x16 */
+    cfl_subtract_average_16x8_vsx,  /* 16x8 */
+    cfl_subtract_average_16x32_vsx, /* 16x32 */
+    cfl_subtract_average_32x16_vsx, /* 32x16 */
+    NULL,                           /* 32x64 (invalid CFL size) */
+    NULL,                           /* 64x32 (invalid CFL size) */
+    cfl_subtract_average_4x16_c,    /* 4x16 */
+    cfl_subtract_average_16x4_vsx,  /* 16x4 */
+    cfl_subtract_average_8x32_vsx,  /* 8x32 */
+    cfl_subtract_average_32x8_vsx,  /* 32x8 */
+    NULL,                           /* 16x64 (invalid CFL size) */
+    NULL,                           /* 64x16 (invalid CFL size) */
   };
   // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to
   // index the function pointer array out of bounds.
diff --git a/av1/common/x86/cfl_avx2.c b/av1/common/x86/cfl_avx2.c
index 3d183b4..3356984 100644
--- a/av1/common/x86/cfl_avx2.c
+++ b/av1/common/x86/cfl_avx2.c
@@ -16,34 +16,34 @@ #include "av1/common/x86/cfl_simd.h" -#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \ - CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \ - CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \ - CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \ - cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \ - TX_SIZE tx_size) { \ - static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ - subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \ - subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \ - subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \ - subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \ - NULL, /* 64x64 (invalid CFL size) */ \ - subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \ - subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \ - subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \ - subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \ - subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \ - subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \ - NULL, /* 32x64 (invalid CFL size) */ \ - NULL, /* 64x32 (invalid CFL size) */ \ - subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \ - subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \ - subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \ - subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \ - NULL, /* 16x64 (invalid CFL size) */ \ - NULL, /* 64x16 (invalid CFL size) */ \ - }; \ - return subfn_##sub[tx_size]; \ +#define CFL_GET_SUBSAMPLE_FUNCTION_AVX2(sub, bd) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 32) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 16) \ + CFL_SUBSAMPLE(avx2, sub, bd, 32, 8) \ + cfl_subsample_##bd##_fn cfl_get_luma_subsampling_##sub##_##bd##_avx2( \ + TX_SIZE tx_size) { \ + static const cfl_subsample_##bd##_fn subfn_##sub[TX_SIZES_ALL] = { \ + cfl_subsample_##bd##_##sub##_4x4_ssse3, /* 4x4 */ \ + cfl_subsample_##bd##_##sub##_8x8_ssse3, /* 8x8 */ \ + cfl_subsample_##bd##_##sub##_16x16_ssse3, /* 16x16 */ \ + cfl_subsample_##bd##_##sub##_32x32_avx2, /* 32x32 */ \ + NULL, /* 64x64 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x8_ssse3, /* 4x8 */ \ + cfl_subsample_##bd##_##sub##_8x4_ssse3, /* 8x4 */ \ + cfl_subsample_##bd##_##sub##_8x16_ssse3, /* 8x16 */ \ + cfl_subsample_##bd##_##sub##_16x8_ssse3, /* 16x8 */ \ + cfl_subsample_##bd##_##sub##_16x32_ssse3, /* 16x32 */ \ + cfl_subsample_##bd##_##sub##_32x16_avx2, /* 32x16 */ \ + NULL, /* 32x64 (invalid CFL size) */ \ + NULL, /* 64x32 (invalid CFL size) */ \ + cfl_subsample_##bd##_##sub##_4x16_ssse3, /* 4x16 */ \ + cfl_subsample_##bd##_##sub##_16x4_ssse3, /* 16x4 */ \ + cfl_subsample_##bd##_##sub##_8x32_ssse3, /* 8x32 */ \ + cfl_subsample_##bd##_##sub##_32x8_avx2, /* 32x8 */ \ + NULL, /* 16x64 (invalid CFL size) */ \ + NULL, /* 64x16 (invalid CFL size) */ \ + }; \ + return subfn_##sub[tx_size]; \ } /** @@ -275,25 +275,25 @@ cfl_predict_lbd_fn cfl_get_predict_lbd_fn_avx2(TX_SIZE tx_size) { static const cfl_predict_lbd_fn pred[TX_SIZES_ALL] = { - predict_lbd_4x4_ssse3, /* 4x4 */ - predict_lbd_8x8_ssse3, /* 8x8 */ - predict_lbd_16x16_ssse3, /* 16x16 */ - predict_lbd_32x32_avx2, /* 32x32 */ - NULL, /* 64x64 (invalid CFL size) */ - predict_lbd_4x8_ssse3, /* 4x8 */ - predict_lbd_8x4_ssse3, /* 8x4 */ - predict_lbd_8x16_ssse3, /* 8x16 */ - predict_lbd_16x8_ssse3, /* 16x8 */ - predict_lbd_16x32_ssse3, /* 16x32 */ - predict_lbd_32x16_avx2, /* 32x16 */ - NULL, /* 32x64 (invalid CFL size) */ - NULL, /* 64x32 (invalid CFL size) */ - predict_lbd_4x16_ssse3, /* 4x16 */ - predict_lbd_16x4_ssse3, /* 16x4 */ - predict_lbd_8x32_ssse3, /* 8x32 */ - predict_lbd_32x8_avx2, /* 32x8 */ - NULL, /* 16x64 (invalid CFL size) 
*/ - NULL, /* 64x16 (invalid CFL size) */ + cfl_predict_lbd_4x4_ssse3, /* 4x4 */ + cfl_predict_lbd_8x8_ssse3, /* 8x8 */ + cfl_predict_lbd_16x16_ssse3, /* 16x16 */ + cfl_predict_lbd_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_predict_lbd_4x8_ssse3, /* 4x8 */ + cfl_predict_lbd_8x4_ssse3, /* 8x4 */ + cfl_predict_lbd_8x16_ssse3, /* 8x16 */ + cfl_predict_lbd_16x8_ssse3, /* 16x8 */ + cfl_predict_lbd_16x32_ssse3, /* 16x32 */ + cfl_predict_lbd_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_predict_lbd_4x16_ssse3, /* 4x16 */ + cfl_predict_lbd_16x4_ssse3, /* 16x4 */ + cfl_predict_lbd_8x32_ssse3, /* 8x32 */ + cfl_predict_lbd_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ }; // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the // function pointer array out of bounds. @@ -348,25 +348,25 @@ cfl_predict_hbd_fn cfl_get_predict_hbd_fn_avx2(TX_SIZE tx_size) { static const cfl_predict_hbd_fn pred[TX_SIZES_ALL] = { - predict_hbd_4x4_ssse3, /* 4x4 */ - predict_hbd_8x8_ssse3, /* 8x8 */ - predict_hbd_16x16_avx2, /* 16x16 */ - predict_hbd_32x32_avx2, /* 32x32 */ - NULL, /* 64x64 (invalid CFL size) */ - predict_hbd_4x8_ssse3, /* 4x8 */ - predict_hbd_8x4_ssse3, /* 8x4 */ - predict_hbd_8x16_ssse3, /* 8x16 */ - predict_hbd_16x8_avx2, /* 16x8 */ - predict_hbd_16x32_avx2, /* 16x32 */ - predict_hbd_32x16_avx2, /* 32x16 */ - NULL, /* 32x64 (invalid CFL size) */ - NULL, /* 64x32 (invalid CFL size) */ - predict_hbd_4x16_ssse3, /* 4x16 */ - predict_hbd_16x4_avx2, /* 16x4 */ - predict_hbd_8x32_ssse3, /* 8x32 */ - predict_hbd_32x8_avx2, /* 32x8 */ - NULL, /* 16x64 (invalid CFL size) */ - NULL, /* 64x16 (invalid CFL size) */ + cfl_predict_hbd_4x4_ssse3, /* 4x4 */ + cfl_predict_hbd_8x8_ssse3, /* 8x8 */ + cfl_predict_hbd_16x16_avx2, /* 16x16 */ + cfl_predict_hbd_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_predict_hbd_4x8_ssse3, /* 4x8 */ + cfl_predict_hbd_8x4_ssse3, /* 8x4 */ + cfl_predict_hbd_8x16_ssse3, /* 8x16 */ + cfl_predict_hbd_16x8_avx2, /* 16x8 */ + cfl_predict_hbd_16x32_avx2, /* 16x32 */ + cfl_predict_hbd_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_predict_hbd_4x16_ssse3, /* 4x16 */ + cfl_predict_hbd_16x4_avx2, /* 16x4 */ + cfl_predict_hbd_8x32_ssse3, /* 8x32 */ + cfl_predict_hbd_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ }; // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to index the // function pointer array out of bounds. @@ -465,25 +465,25 @@ // SSE2, we call the SSE2 code for block widths 4 and 8. 
cfl_subtract_average_fn cfl_get_subtract_average_fn_avx2(TX_SIZE tx_size) { static const cfl_subtract_average_fn sub_avg[TX_SIZES_ALL] = { - subtract_average_4x4_sse2, /* 4x4 */ - subtract_average_8x8_sse2, /* 8x8 */ - subtract_average_16x16_avx2, /* 16x16 */ - subtract_average_32x32_avx2, /* 32x32 */ - NULL, /* 64x64 (invalid CFL size) */ - subtract_average_4x8_sse2, /* 4x8 */ - subtract_average_8x4_sse2, /* 8x4 */ - subtract_average_8x16_sse2, /* 8x16 */ - subtract_average_16x8_avx2, /* 16x8 */ - subtract_average_16x32_avx2, /* 16x32 */ - subtract_average_32x16_avx2, /* 32x16 */ - NULL, /* 32x64 (invalid CFL size) */ - NULL, /* 64x32 (invalid CFL size) */ - subtract_average_4x16_sse2, /* 4x16 */ - subtract_average_16x4_avx2, /* 16x4 */ - subtract_average_8x32_sse2, /* 8x32 */ - subtract_average_32x8_avx2, /* 32x8 */ - NULL, /* 16x64 (invalid CFL size) */ - NULL, /* 64x16 (invalid CFL size) */ + cfl_subtract_average_4x4_sse2, /* 4x4 */ + cfl_subtract_average_8x8_sse2, /* 8x8 */ + cfl_subtract_average_16x16_avx2, /* 16x16 */ + cfl_subtract_average_32x32_avx2, /* 32x32 */ + NULL, /* 64x64 (invalid CFL size) */ + cfl_subtract_average_4x8_sse2, /* 4x8 */ + cfl_subtract_average_8x4_sse2, /* 8x4 */ + cfl_subtract_average_8x16_sse2, /* 8x16 */ + cfl_subtract_average_16x8_avx2, /* 16x8 */ + cfl_subtract_average_16x32_avx2, /* 16x32 */ + cfl_subtract_average_32x16_avx2, /* 32x16 */ + NULL, /* 32x64 (invalid CFL size) */ + NULL, /* 64x32 (invalid CFL size) */ + cfl_subtract_average_4x16_sse2, /* 4x16 */ + cfl_subtract_average_16x4_avx2, /* 16x4 */ + cfl_subtract_average_8x32_sse2, /* 8x32 */ + cfl_subtract_average_32x8_avx2, /* 32x8 */ + NULL, /* 16x64 (invalid CFL size) */ + NULL, /* 64x16 (invalid CFL size) */ }; // Modulo TX_SIZES_ALL to ensure that an attacker won't be able to // index the function pointer array out of bounds.
diff --git a/av1/common/x86/cfl_simd.h b/av1/common/x86/cfl_simd.h
index 3b342cd..c1ad182 100644
--- a/av1/common/x86/cfl_simd.h
+++ b/av1/common/x86/cfl_simd.h
@@ -15,229 +15,229 @@ #include "av1/common/blockd.h" // SSSE3 version is optimal for with == 4, we reuse them in AVX2 -void subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_420_4x4_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_4x8_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_4x16_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_420_8x4_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x8_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x16_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_8x32_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 16, we reuse it in AVX2 -void subsample_lbd_420_16x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_420_16x4_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_16x8_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_16x16_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_420_16x32_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 4, we reuse them in AVX2 -void subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_422_4x4_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_4x8_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_4x16_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_8x32_ssse3(const uint8_t *input, int 
input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_422_8x4_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x8_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x16_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_8x32_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 16, we reuse it in AVX2 -void subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_422_16x4_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_16x8_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_16x16_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_422_16x32_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 4, we reuse them in AVX2 -void subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_444_4x4_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_4x8_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_4x16_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_lbd_444_8x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_444_8x4_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x8_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x16_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_8x32_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 16, we reuse it in AVX2 -void subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); -void subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_lbd_444_16x4_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_16x8_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void 
cfl_subsample_lbd_444_16x16_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_lbd_444_16x32_ssse3(const uint8_t *input, int input_stride, + uint16_t *output_q3); -void subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_420_4x4_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_4x8_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_4x16_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_420_8x4_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_8x8_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_8x16_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_8x32_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); // SSSE3 version is faster for with == 16, we reuse it in AVX2 -void subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_420_16x4_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_16x8_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_16x16_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_420_16x32_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); -void subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_422_4x4_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_4x8_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_4x16_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void 
subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_422_8x4_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_8x8_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_8x16_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_8x32_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); // SSSE3 version is faster for with == 16, we reuse it in AVX2 -void subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_422_16x4_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_16x8_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_16x16_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_422_16x32_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); -void subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_4x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_444_4x4_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_4x8_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_4x16_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); // SSSE3 version is optimal for with == 8, we reuse it in AVX2 -void subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_444_8x4_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_8x8_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_8x16_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_8x32_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); // SSSE3 version is faster for with == 16, we reuse it in AVX2 -void subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); -void subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride, - uint16_t *output_q3); +void cfl_subsample_hbd_444_16x4_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_16x8_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void 
cfl_subsample_hbd_444_16x16_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); +void cfl_subsample_hbd_444_16x32_ssse3(const uint16_t *input, int input_stride, + uint16_t *output_q3); // SSE2 version is optimal for with == 4, we reuse them in AVX2 -void subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x4_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x8_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_4x16_sse2(const uint16_t *src, int16_t *dst); // SSE2 version is optimal for with == 8, we reuse them in AVX2 -void subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst); -void subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x4_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x8_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x16_sse2(const uint16_t *src, int16_t *dst); +void cfl_subtract_average_8x32_sse2(const uint16_t *src, int16_t *dst); -void predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); +void cfl_predict_lbd_4x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_4x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_4x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); -void predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_8x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); -void predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); -void predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, - int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x4_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x8_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void cfl_predict_lbd_16x16_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); +void 
cfl_predict_lbd_16x32_ssse3(const int16_t *pred_buf_q3, uint8_t *dst, + int dst_stride, int alpha_q3); -void predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_4x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_4x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_4x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); -void predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_8x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); -void predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); -void predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, - int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x4_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x8_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x16_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); +void cfl_predict_hbd_16x32_ssse3(const int16_t *pred_buf_q3, uint16_t *dst, + int dst_stride, int alpha_q3, int bd); #endif // AOM_AV1_COMMON_X86_CFL_SIMD_H_