Explicit requirement about sizeof(tran_low_t)
Here, we're testing CONFIG_HIGHBITDEPTH but what we really depend upon
is the actual size of the coefficients.
Change-Id: I33d71e4b38b4b83bb4232346f4d449f20bcf740e
diff --git a/aom_dsp/x86/fwd_txfm_avx2.h b/aom_dsp/x86/fwd_txfm_avx2.h
index d3aceae..86df4a6 100644
--- a/aom_dsp/x86/fwd_txfm_avx2.h
+++ b/aom_dsp/x86/fwd_txfm_avx2.h
@@ -15,21 +15,21 @@
#include "./aom_config.h"
static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
-#if CONFIG_HIGHBITDEPTH
- const __m256i zero = _mm256_setzero_si256();
- const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);
+ if (sizeof(tran_low_t) == 4) {
+ const __m256i zero = _mm256_setzero_si256();
+ const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);
- __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
- __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);
+ __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
+ __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);
- __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
- __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);
+ __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
+ __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);
- _mm256_storeu_si256((__m256i *)out, y0);
- _mm256_storeu_si256((__m256i *)(out + 8), y1);
-#else
- _mm256_storeu_si256((__m256i *)out, *coeff);
-#endif
+ _mm256_storeu_si256((__m256i *)out, y0);
+ _mm256_storeu_si256((__m256i *)(out + 8), y1);
+ } else {
+ _mm256_storeu_si256((__m256i *)out, *coeff);
+ }
}
#endif // AOM_DSP_X86_FWD_TXFM_AVX2_H
diff --git a/aom_dsp/x86/fwd_txfm_sse2.h b/aom_dsp/x86/fwd_txfm_sse2.h
index 26b2db2..58e8971 100644
--- a/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/aom_dsp/x86/fwd_txfm_sse2.h
@@ -247,16 +247,16 @@
}
static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-#if CONFIG_HIGHBITDEPTH
- const __m128i zero = _mm_setzero_si128();
- const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
- __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
- __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
- _mm_store_si128((__m128i *)(dst_ptr), out0);
- _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
-#else
- _mm_store_si128((__m128i *)(dst_ptr), *poutput);
-#endif // CONFIG_HIGHBITDEPTH
+ if (sizeof(tran_low_t) == 4) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_store_si128((__m128i *)(dst_ptr), out0);
+ _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+ } else {
+ _mm_store_si128((__m128i *)(dst_ptr), *poutput);
+ }
}
static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
diff --git a/aom_dsp/x86/inv_txfm_common_avx2.h b/aom_dsp/x86/inv_txfm_common_avx2.h
index 4238e65..26c5cfe 100644
--- a/aom_dsp/x86/inv_txfm_common_avx2.h
+++ b/aom_dsp/x86/inv_txfm_common_avx2.h
@@ -18,17 +18,17 @@
#include "aom_dsp/x86/txfm_common_avx2.h"
static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
-#if CONFIG_HIGHBITDEPTH
- *in = _mm256_setr_epi16(
- (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
- (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
- (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
- (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
- (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
- (int16_t)coeff[15]);
-#else
- *in = _mm256_loadu_si256((const __m256i *)coeff);
-#endif
+ if (sizeof(tran_low_t) == 4) {
+ *in = _mm256_setr_epi16(
+ (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
+ (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
+ (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
+ (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
+ (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
+ (int16_t)coeff[15]);
+ } else {
+ *in = _mm256_loadu_si256((const __m256i *)coeff);
+ }
}
static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
diff --git a/aom_dsp/x86/inv_txfm_sse2.h b/aom_dsp/x86/inv_txfm_sse2.h
index 95d246c..3428169 100644
--- a/aom_dsp/x86/inv_txfm_sse2.h
+++ b/aom_dsp/x86/inv_txfm_sse2.h
@@ -133,12 +133,12 @@
// Function to allow 8 bit optimisations to be used when profile 0 is used with
// highbitdepth enabled
static INLINE __m128i load_input_data(const tran_low_t *data) {
-#if CONFIG_HIGHBITDEPTH
- return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
- data[6], data[7]);
-#else
- return _mm_load_si128((const __m128i *)data);
-#endif
+ if (sizeof(tran_low_t) == 4) {
+ return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
+ data[6], data[7]);
+ } else {
+ return _mm_load_si128((const __m128i *)data);
+ }
}
static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
diff --git a/aom_dsp/x86/quantize_sse2.c b/aom_dsp/x86/quantize_sse2.c
index 890c1f0..0e7f679 100644
--- a/aom_dsp/x86/quantize_sse2.c
+++ b/aom_dsp/x86/quantize_sse2.c
@@ -16,29 +16,29 @@
#include "aom/aom_integer.h"
static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
-#if CONFIG_HIGHBITDEPTH
- return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
- (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
- (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
- (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
-#else
- return _mm_load_si128((const __m128i *)coeff_ptr);
-#endif
+ if (sizeof(tran_low_t) == 4) {
+ return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
+ (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
+ (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
+ (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
+ } else {
+ return _mm_load_si128((const __m128i *)coeff_ptr);
+ }
}
static INLINE void store_coefficients(__m128i coeff_vals,
tran_low_t *coeff_ptr) {
-#if CONFIG_HIGHBITDEPTH
- __m128i one = _mm_set1_epi16(1);
- __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
- __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
- __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
- __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
- _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
- _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
-#else
- _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals);
-#endif
+ if (sizeof(tran_low_t) == 4) {
+ __m128i one = _mm_set1_epi16(1);
+ __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
+ __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
+ __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
+ __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
+ _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
+ _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
+ } else {
+ _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals);
+ }
}
void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
diff --git a/aom_dsp/x86/txfm_common_intrin.h b/aom_dsp/x86/txfm_common_intrin.h
index e4ac563..4e6eecd 100644
--- a/aom_dsp/x86/txfm_common_intrin.h
+++ b/aom_dsp/x86/txfm_common_intrin.h
@@ -16,16 +16,16 @@
// This header file should be put below any x86 intrinsics head file
static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-#if CONFIG_HIGHBITDEPTH
- const __m128i zero = _mm_setzero_si128();
- const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
- __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
- __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
- _mm_storeu_si128((__m128i *)(dst_ptr), out0);
- _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
-#else
- _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
-#endif // CONFIG_HIGHBITDEPTH
+ if (sizeof(tran_low_t) == 4) {
+ const __m128i zero = _mm_setzero_si128();
+ const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+ __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+ __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+ _mm_storeu_si128((__m128i *)(dst_ptr), out0);
+ _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
+ } else {
+ _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
+ }
}
#endif // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
diff --git a/av1/encoder/x86/av1_quantize_avx2.c b/av1/encoder/x86/av1_quantize_avx2.c
index 1c0a120..078a675 100644
--- a/av1/encoder/x86/av1_quantize_avx2.c
+++ b/av1/encoder/x86/av1_quantize_avx2.c
@@ -16,24 +16,24 @@
#include "aom_dsp/aom_dsp_common.h"
static INLINE void read_coeff(const tran_low_t *coeff, __m256i *c) {
-#if CONFIG_HIGHBITDEPTH
- const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff);
- const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1);
- *c = _mm256_packs_epi32(x0, x1);
- *c = _mm256_permute4x64_epi64(*c, 0xD8);
-#else
- *c = _mm256_loadu_si256((const __m256i *)coeff);
-#endif
+ if (sizeof(tran_low_t) == 4) {
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1);
+ *c = _mm256_packs_epi32(x0, x1);
+ *c = _mm256_permute4x64_epi64(*c, 0xD8);
+ } else {
+ *c = _mm256_loadu_si256((const __m256i *)coeff);
+ }
}
static INLINE void write_zero(tran_low_t *qcoeff) {
const __m256i zero = _mm256_setzero_si256();
-#if CONFIG_HIGHBITDEPTH
- _mm256_storeu_si256((__m256i *)qcoeff, zero);
- _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
-#else
- _mm256_storeu_si256((__m256i *)qcoeff, zero);
-#endif
+ if (sizeof(tran_low_t) == 4) {
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
+ } else {
+ _mm256_storeu_si256((__m256i *)qcoeff, zero);
+ }
}
static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
@@ -83,19 +83,16 @@
_mm256_storeu_si256((__m256i *)addr + 1, x1); \
} while (0)
-#if CONFIG_HIGHBITDEPTH
-#define store_two_quan(q, addr1, dq, addr2) \
- do { \
- store_quan(q, addr1); \
- store_quan(dq, addr2); \
+#define store_two_quan(q, addr1, dq, addr2) \
+ do { \
+ if (sizeof(tran_low_t) == 4) { \
+ store_quan(q, addr1); \
+ store_quan(dq, addr2); \
+ } else { \
+ _mm256_storeu_si256((__m256i *)addr1, q); \
+ _mm256_storeu_si256((__m256i *)addr2, dq); \
+ } \
} while (0)
-#else
-#define store_two_quan(q, addr1, dq, addr2) \
- do { \
- _mm256_storeu_si256((__m256i *)addr1, q); \
- _mm256_storeu_si256((__m256i *)addr2, dq); \
- } while (0)
-#endif
static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
const int16_t *iscan_ptr, tran_low_t *qcoeff,
diff --git a/av1/encoder/x86/av1_quantize_sse2.c b/av1/encoder/x86/av1_quantize_sse2.c
index 1903173..4f7c095 100644
--- a/av1/encoder/x86/av1_quantize_sse2.c
+++ b/av1/encoder/x86/av1_quantize_sse2.c
@@ -18,53 +18,53 @@
static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
__m128i *c0, __m128i *c1) {
const tran_low_t *addr = coeff + offset;
-#if CONFIG_HIGHBITDEPTH
- const __m128i x0 = _mm_load_si128((const __m128i *)addr);
- const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1);
- const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2);
- const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3);
- *c0 = _mm_packs_epi32(x0, x1);
- *c1 = _mm_packs_epi32(x2, x3);
-#else
- *c0 = _mm_load_si128((const __m128i *)addr);
- *c1 = _mm_load_si128((const __m128i *)addr + 1);
-#endif
+ if (sizeof(tran_low_t) == 4) {
+ const __m128i x0 = _mm_load_si128((const __m128i *)addr);
+ const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1);
+ const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2);
+ const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3);
+ *c0 = _mm_packs_epi32(x0, x1);
+ *c1 = _mm_packs_epi32(x2, x3);
+ } else {
+ *c0 = _mm_load_si128((const __m128i *)addr);
+ *c1 = _mm_load_si128((const __m128i *)addr + 1);
+ }
}
static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1,
tran_low_t *qcoeff, intptr_t offset) {
tran_low_t *addr = qcoeff + offset;
-#if CONFIG_HIGHBITDEPTH
- const __m128i zero = _mm_setzero_si128();
- __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero);
- __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits);
- __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits);
- _mm_store_si128((__m128i *)addr, y0);
- _mm_store_si128((__m128i *)addr + 1, y1);
+ if (sizeof(tran_low_t) == 4) {
+ const __m128i zero = _mm_setzero_si128();
+ __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero);
+ __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits);
+ __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits);
+ _mm_store_si128((__m128i *)addr, y0);
+ _mm_store_si128((__m128i *)addr + 1, y1);
- sign_bits = _mm_cmplt_epi16(*qc1, zero);
- y0 = _mm_unpacklo_epi16(*qc1, sign_bits);
- y1 = _mm_unpackhi_epi16(*qc1, sign_bits);
- _mm_store_si128((__m128i *)addr + 2, y0);
- _mm_store_si128((__m128i *)addr + 3, y1);
-#else
- _mm_store_si128((__m128i *)addr, *qc0);
- _mm_store_si128((__m128i *)addr + 1, *qc1);
-#endif
+ sign_bits = _mm_cmplt_epi16(*qc1, zero);
+ y0 = _mm_unpacklo_epi16(*qc1, sign_bits);
+ y1 = _mm_unpackhi_epi16(*qc1, sign_bits);
+ _mm_store_si128((__m128i *)addr + 2, y0);
+ _mm_store_si128((__m128i *)addr + 3, y1);
+ } else {
+ _mm_store_si128((__m128i *)addr, *qc0);
+ _mm_store_si128((__m128i *)addr + 1, *qc1);
+ }
}
static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) {
const __m128i zero = _mm_setzero_si128();
tran_low_t *addr = qcoeff + offset;
-#if CONFIG_HIGHBITDEPTH
- _mm_store_si128((__m128i *)addr, zero);
- _mm_store_si128((__m128i *)addr + 1, zero);
- _mm_store_si128((__m128i *)addr + 2, zero);
- _mm_store_si128((__m128i *)addr + 3, zero);
-#else
- _mm_store_si128((__m128i *)addr, zero);
- _mm_store_si128((__m128i *)addr + 1, zero);
-#endif
+ if (sizeof(tran_low_t) == 4) {
+ _mm_store_si128((__m128i *)addr, zero);
+ _mm_store_si128((__m128i *)addr + 1, zero);
+ _mm_store_si128((__m128i *)addr + 2, zero);
+ _mm_store_si128((__m128i *)addr + 3, zero);
+ } else {
+ _mm_store_si128((__m128i *)addr, zero);
+ _mm_store_si128((__m128i *)addr + 1, zero);
+ }
}
void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
diff --git a/av1/encoder/x86/error_intrin_avx2.c b/av1/encoder/x86/error_intrin_avx2.c
index 20ba414..6599630 100644
--- a/av1/encoder/x86/error_intrin_avx2.c
+++ b/av1/encoder/x86/error_intrin_avx2.c
@@ -17,14 +17,15 @@
static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
__m256i *c) {
const tran_low_t *addr = coeff + offset;
-#if CONFIG_HIGHBITDEPTH
- const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
- const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
- const __m256i y = _mm256_packs_epi32(x0, x1);
- *c = _mm256_permute4x64_epi64(y, 0xD8);
-#else
- *c = _mm256_loadu_si256((const __m256i *)addr);
-#endif
+
+ if (sizeof(tran_low_t) == 4) {
+ const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
+ const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
+ const __m256i y = _mm256_packs_epi32(x0, x1);
+ *c = _mm256_permute4x64_epi64(y, 0xD8);
+ } else {
+ *c = _mm256_loadu_si256((const __m256i *)addr);
+ }
}
int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,