Make the requirement on sizeof(tran_low_t) explicit. Here, we're testing CONFIG_HIGHBITDEPTH, but what we really depend upon is the actual size of the coefficients, so branch on sizeof(tran_low_t) == 4 instead of on the config flag. Change-Id: I33d71e4b38b4b83bb4232346f4d449f20bcf740e
diff --git a/aom_dsp/x86/fwd_txfm_avx2.h b/aom_dsp/x86/fwd_txfm_avx2.h index d3aceae..86df4a6 100644 --- a/aom_dsp/x86/fwd_txfm_avx2.h +++ b/aom_dsp/x86/fwd_txfm_avx2.h
@@ -15,21 +15,21 @@ #include "./aom_config.h" static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) { -#if CONFIG_HIGHBITDEPTH - const __m256i zero = _mm256_setzero_si256(); - const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff); + if (sizeof(tran_low_t) == 4) { + const __m256i zero = _mm256_setzero_si256(); + const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff); - __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign); - __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign); + __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign); + __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign); - __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20); - __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31); + __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20); + __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31); - _mm256_storeu_si256((__m256i *)out, y0); - _mm256_storeu_si256((__m256i *)(out + 8), y1); -#else - _mm256_storeu_si256((__m256i *)out, *coeff); -#endif + _mm256_storeu_si256((__m256i *)out, y0); + _mm256_storeu_si256((__m256i *)(out + 8), y1); + } else { + _mm256_storeu_si256((__m256i *)out, *coeff); + } } #endif // AOM_DSP_X86_FWD_TXFM_AVX2_H
diff --git a/aom_dsp/x86/fwd_txfm_sse2.h b/aom_dsp/x86/fwd_txfm_sse2.h index 26b2db2..58e8971 100644 --- a/aom_dsp/x86/fwd_txfm_sse2.h +++ b/aom_dsp/x86/fwd_txfm_sse2.h
@@ -247,16 +247,16 @@ } static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) { -#if CONFIG_HIGHBITDEPTH - const __m128i zero = _mm_setzero_si128(); - const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); - __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); - __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); - _mm_store_si128((__m128i *)(dst_ptr), out0); - _mm_store_si128((__m128i *)(dst_ptr + 4), out1); -#else - _mm_store_si128((__m128i *)(dst_ptr), *poutput); -#endif // CONFIG_HIGHBITDEPTH + if (sizeof(tran_low_t) == 4) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_store_si128((__m128i *)(dst_ptr), out0); + _mm_store_si128((__m128i *)(dst_ptr + 4), out1); + } else { + _mm_store_si128((__m128i *)(dst_ptr), *poutput); + } } static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
diff --git a/aom_dsp/x86/inv_txfm_common_avx2.h b/aom_dsp/x86/inv_txfm_common_avx2.h index 4238e65..26c5cfe 100644 --- a/aom_dsp/x86/inv_txfm_common_avx2.h +++ b/aom_dsp/x86/inv_txfm_common_avx2.h
@@ -18,17 +18,17 @@ #include "aom_dsp/x86/txfm_common_avx2.h" static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) { -#if CONFIG_HIGHBITDEPTH - *in = _mm256_setr_epi16( - (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2], - (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5], - (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8], - (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11], - (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14], - (int16_t)coeff[15]); -#else - *in = _mm256_loadu_si256((const __m256i *)coeff); -#endif + if (sizeof(tran_low_t) == 4) { + *in = _mm256_setr_epi16( + (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2], + (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5], + (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8], + (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11], + (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14], + (int16_t)coeff[15]); + } else { + *in = _mm256_loadu_si256((const __m256i *)coeff); + } } static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
diff --git a/aom_dsp/x86/inv_txfm_sse2.h b/aom_dsp/x86/inv_txfm_sse2.h index 95d246c..3428169 100644 --- a/aom_dsp/x86/inv_txfm_sse2.h +++ b/aom_dsp/x86/inv_txfm_sse2.h
@@ -133,12 +133,12 @@ // Function to allow 8 bit optimisations to be used when profile 0 is used with // highbitdepth enabled static INLINE __m128i load_input_data(const tran_low_t *data) { -#if CONFIG_HIGHBITDEPTH - return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], - data[6], data[7]); -#else - return _mm_load_si128((const __m128i *)data); -#endif + if (sizeof(tran_low_t) == 4) { + return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5], + data[6], data[7]); + } else { + return _mm_load_si128((const __m128i *)data); + } } static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
diff --git a/aom_dsp/x86/quantize_sse2.c b/aom_dsp/x86/quantize_sse2.c index 890c1f0..0e7f679 100644 --- a/aom_dsp/x86/quantize_sse2.c +++ b/aom_dsp/x86/quantize_sse2.c
@@ -16,29 +16,29 @@ #include "aom/aom_integer.h" static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) { -#if CONFIG_HIGHBITDEPTH - return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], - (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], - (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], - (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); -#else - return _mm_load_si128((const __m128i *)coeff_ptr); -#endif + if (sizeof(tran_low_t) == 4) { + return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1], + (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3], + (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5], + (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]); + } else { + return _mm_load_si128((const __m128i *)coeff_ptr); + } } static INLINE void store_coefficients(__m128i coeff_vals, tran_low_t *coeff_ptr) { -#if CONFIG_HIGHBITDEPTH - __m128i one = _mm_set1_epi16(1); - __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); - __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); - __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); - __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); - _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); -#else - _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); -#endif + if (sizeof(tran_low_t) == 4) { + __m128i one = _mm_set1_epi16(1); + __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one); + __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one); + __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi); + __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi); + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1); + _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2); + } else { + _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals); + } } void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
diff --git a/aom_dsp/x86/txfm_common_intrin.h b/aom_dsp/x86/txfm_common_intrin.h index e4ac563..4e6eecd 100644 --- a/aom_dsp/x86/txfm_common_intrin.h +++ b/aom_dsp/x86/txfm_common_intrin.h
@@ -16,16 +16,16 @@ // This header file should be put below any x86 intrinsics head file static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) { -#if CONFIG_HIGHBITDEPTH - const __m128i zero = _mm_setzero_si128(); - const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); - __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); - __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); - _mm_storeu_si128((__m128i *)(dst_ptr), out0); - _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); -#else - _mm_storeu_si128((__m128i *)(dst_ptr), *poutput); -#endif // CONFIG_HIGHBITDEPTH + if (sizeof(tran_low_t) == 4) { + const __m128i zero = _mm_setzero_si128(); + const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero); + __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits); + __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits); + _mm_storeu_si128((__m128i *)(dst_ptr), out0); + _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1); + } else { + _mm_storeu_si128((__m128i *)(dst_ptr), *poutput); + } } #endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
diff --git a/av1/encoder/x86/av1_quantize_avx2.c b/av1/encoder/x86/av1_quantize_avx2.c index 1c0a120..078a675 100644 --- a/av1/encoder/x86/av1_quantize_avx2.c +++ b/av1/encoder/x86/av1_quantize_avx2.c
@@ -16,24 +16,24 @@ #include "aom_dsp/aom_dsp_common.h" static INLINE void read_coeff(const tran_low_t *coeff, __m256i *c) { -#if CONFIG_HIGHBITDEPTH - const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff); - const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1); - *c = _mm256_packs_epi32(x0, x1); - *c = _mm256_permute4x64_epi64(*c, 0xD8); -#else - *c = _mm256_loadu_si256((const __m256i *)coeff); -#endif + if (sizeof(tran_low_t) == 4) { + const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1); + *c = _mm256_packs_epi32(x0, x1); + *c = _mm256_permute4x64_epi64(*c, 0xD8); + } else { + *c = _mm256_loadu_si256((const __m256i *)coeff); + } } static INLINE void write_zero(tran_low_t *qcoeff) { const __m256i zero = _mm256_setzero_si256(); -#if CONFIG_HIGHBITDEPTH - _mm256_storeu_si256((__m256i *)qcoeff, zero); - _mm256_storeu_si256((__m256i *)qcoeff + 1, zero); -#else - _mm256_storeu_si256((__m256i *)qcoeff, zero); -#endif + if (sizeof(tran_low_t) == 4) { + _mm256_storeu_si256((__m256i *)qcoeff, zero); + _mm256_storeu_si256((__m256i *)qcoeff + 1, zero); + } else { + _mm256_storeu_si256((__m256i *)qcoeff, zero); + } } static INLINE void init_one_qp(const __m128i *p, __m256i *qp) { @@ -83,19 +83,16 @@ _mm256_storeu_si256((__m256i *)addr + 1, x1); \ } while (0) -#if CONFIG_HIGHBITDEPTH -#define store_two_quan(q, addr1, dq, addr2) \ - do { \ - store_quan(q, addr1); \ - store_quan(dq, addr2); \ +#define store_two_quan(q, addr1, dq, addr2) \ + do { \ + if (sizeof(tran_low_t) == 4) { \ + store_quan(q, addr1); \ + store_quan(dq, addr2); \ + } else { \ + _mm256_storeu_si256((__m256i *)addr1, q); \ + _mm256_storeu_si256((__m256i *)addr2, dq); \ + } \ } while (0) -#else -#define store_two_quan(q, addr1, dq, addr2) \ - do { \ - _mm256_storeu_si256((__m256i *)addr1, q); \ - _mm256_storeu_si256((__m256i *)addr2, dq); \ - } while (0) -#endif static INLINE void quantize(const __m256i *thr, 
const __m256i *qp, __m256i *c, const int16_t *iscan_ptr, tran_low_t *qcoeff,
diff --git a/av1/encoder/x86/av1_quantize_sse2.c b/av1/encoder/x86/av1_quantize_sse2.c index 1903173..4f7c095 100644 --- a/av1/encoder/x86/av1_quantize_sse2.c +++ b/av1/encoder/x86/av1_quantize_sse2.c
@@ -18,53 +18,53 @@ static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, __m128i *c0, __m128i *c1) { const tran_low_t *addr = coeff + offset; -#if CONFIG_HIGHBITDEPTH - const __m128i x0 = _mm_load_si128((const __m128i *)addr); - const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1); - const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2); - const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3); - *c0 = _mm_packs_epi32(x0, x1); - *c1 = _mm_packs_epi32(x2, x3); -#else - *c0 = _mm_load_si128((const __m128i *)addr); - *c1 = _mm_load_si128((const __m128i *)addr + 1); -#endif + if (sizeof(tran_low_t) == 4) { + const __m128i x0 = _mm_load_si128((const __m128i *)addr); + const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1); + const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2); + const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3); + *c0 = _mm_packs_epi32(x0, x1); + *c1 = _mm_packs_epi32(x2, x3); + } else { + *c0 = _mm_load_si128((const __m128i *)addr); + *c1 = _mm_load_si128((const __m128i *)addr + 1); + } } static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1, tran_low_t *qcoeff, intptr_t offset) { tran_low_t *addr = qcoeff + offset; -#if CONFIG_HIGHBITDEPTH - const __m128i zero = _mm_setzero_si128(); - __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero); - __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits); - __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits); - _mm_store_si128((__m128i *)addr, y0); - _mm_store_si128((__m128i *)addr + 1, y1); + if (sizeof(tran_low_t) == 4) { + const __m128i zero = _mm_setzero_si128(); + __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero); + __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits); + __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits); + _mm_store_si128((__m128i *)addr, y0); + _mm_store_si128((__m128i *)addr + 1, y1); - sign_bits = _mm_cmplt_epi16(*qc1, zero); - y0 = _mm_unpacklo_epi16(*qc1, sign_bits); - y1 = _mm_unpackhi_epi16(*qc1, sign_bits); - 
_mm_store_si128((__m128i *)addr + 2, y0); - _mm_store_si128((__m128i *)addr + 3, y1); -#else - _mm_store_si128((__m128i *)addr, *qc0); - _mm_store_si128((__m128i *)addr + 1, *qc1); -#endif + sign_bits = _mm_cmplt_epi16(*qc1, zero); + y0 = _mm_unpacklo_epi16(*qc1, sign_bits); + y1 = _mm_unpackhi_epi16(*qc1, sign_bits); + _mm_store_si128((__m128i *)addr + 2, y0); + _mm_store_si128((__m128i *)addr + 3, y1); + } else { + _mm_store_si128((__m128i *)addr, *qc0); + _mm_store_si128((__m128i *)addr + 1, *qc1); + } } static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) { const __m128i zero = _mm_setzero_si128(); tran_low_t *addr = qcoeff + offset; -#if CONFIG_HIGHBITDEPTH - _mm_store_si128((__m128i *)addr, zero); - _mm_store_si128((__m128i *)addr + 1, zero); - _mm_store_si128((__m128i *)addr + 2, zero); - _mm_store_si128((__m128i *)addr + 3, zero); -#else - _mm_store_si128((__m128i *)addr, zero); - _mm_store_si128((__m128i *)addr + 1, zero); -#endif + if (sizeof(tran_low_t) == 4) { + _mm_store_si128((__m128i *)addr, zero); + _mm_store_si128((__m128i *)addr + 1, zero); + _mm_store_si128((__m128i *)addr + 2, zero); + _mm_store_si128((__m128i *)addr + 3, zero); + } else { + _mm_store_si128((__m128i *)addr, zero); + _mm_store_si128((__m128i *)addr + 1, zero); + } } void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
diff --git a/av1/encoder/x86/error_intrin_avx2.c b/av1/encoder/x86/error_intrin_avx2.c index 20ba414..6599630 100644 --- a/av1/encoder/x86/error_intrin_avx2.c +++ b/av1/encoder/x86/error_intrin_avx2.c
@@ -17,14 +17,15 @@ static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset, __m256i *c) { const tran_low_t *addr = coeff + offset; -#if CONFIG_HIGHBITDEPTH - const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr); - const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1); - const __m256i y = _mm256_packs_epi32(x0, x1); - *c = _mm256_permute4x64_epi64(y, 0xD8); -#else - *c = _mm256_loadu_si256((const __m256i *)addr); -#endif + + if (sizeof(tran_low_t) == 4) { + const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr); + const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1); + const __m256i y = _mm256_packs_epi32(x0, x1); + *c = _mm256_permute4x64_epi64(y, 0xD8); + } else { + *c = _mm256_loadu_si256((const __m256i *)addr); + } } int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,