Explicit requirement about sizeof(tran_low_t)

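The x86 load/store helpers touched here were testing CONFIG_HIGHBITDEPTH,
but what they really depend upon is the actual size of the coefficients.
Test sizeof(tran_low_t) == 4 directly instead of the preprocessor flag.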

Change-Id: I33d71e4b38b4b83bb4232346f4d449f20bcf740e
diff --git a/aom_dsp/x86/fwd_txfm_avx2.h b/aom_dsp/x86/fwd_txfm_avx2.h
index d3aceae..86df4a6 100644
--- a/aom_dsp/x86/fwd_txfm_avx2.h
+++ b/aom_dsp/x86/fwd_txfm_avx2.h
@@ -15,21 +15,21 @@
 #include "./aom_config.h"
 
 static INLINE void storeu_output_avx2(const __m256i *coeff, tran_low_t *out) {
-#if CONFIG_HIGHBITDEPTH
-  const __m256i zero = _mm256_setzero_si256();
-  const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);
+  if (sizeof(tran_low_t) == 4) {
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i sign = _mm256_cmpgt_epi16(zero, *coeff);
 
-  __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
-  __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);
+    __m256i x0 = _mm256_unpacklo_epi16(*coeff, sign);
+    __m256i x1 = _mm256_unpackhi_epi16(*coeff, sign);
 
-  __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
-  __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);
+    __m256i y0 = _mm256_permute2x128_si256(x0, x1, 0x20);
+    __m256i y1 = _mm256_permute2x128_si256(x0, x1, 0x31);
 
-  _mm256_storeu_si256((__m256i *)out, y0);
-  _mm256_storeu_si256((__m256i *)(out + 8), y1);
-#else
-  _mm256_storeu_si256((__m256i *)out, *coeff);
-#endif
+    _mm256_storeu_si256((__m256i *)out, y0);
+    _mm256_storeu_si256((__m256i *)(out + 8), y1);
+  } else {
+    _mm256_storeu_si256((__m256i *)out, *coeff);
+  }
 }
 
 #endif  // AOM_DSP_X86_FWD_TXFM_AVX2_H
diff --git a/aom_dsp/x86/fwd_txfm_sse2.h b/aom_dsp/x86/fwd_txfm_sse2.h
index 26b2db2..58e8971 100644
--- a/aom_dsp/x86/fwd_txfm_sse2.h
+++ b/aom_dsp/x86/fwd_txfm_sse2.h
@@ -247,16 +247,16 @@
 }
 
 static INLINE void store_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-#if CONFIG_HIGHBITDEPTH
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
-  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
-  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
-  _mm_store_si128((__m128i *)(dst_ptr), out0);
-  _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
-#else
-  _mm_store_si128((__m128i *)(dst_ptr), *poutput);
-#endif  // CONFIG_HIGHBITDEPTH
+  if (sizeof(tran_low_t) == 4) {
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+    __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+    __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+    _mm_store_si128((__m128i *)(dst_ptr), out0);
+    _mm_store_si128((__m128i *)(dst_ptr + 4), out1);
+  } else {
+    _mm_store_si128((__m128i *)(dst_ptr), *poutput);
+  }
 }
 
 static INLINE __m128i mult_round_shift(const __m128i *pin0, const __m128i *pin1,
diff --git a/aom_dsp/x86/inv_txfm_common_avx2.h b/aom_dsp/x86/inv_txfm_common_avx2.h
index 4238e65..26c5cfe 100644
--- a/aom_dsp/x86/inv_txfm_common_avx2.h
+++ b/aom_dsp/x86/inv_txfm_common_avx2.h
@@ -18,17 +18,17 @@
 #include "aom_dsp/x86/txfm_common_avx2.h"
 
 static INLINE void load_coeff(const tran_low_t *coeff, __m256i *in) {
-#if CONFIG_HIGHBITDEPTH
-  *in = _mm256_setr_epi16(
-      (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
-      (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
-      (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
-      (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
-      (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
-      (int16_t)coeff[15]);
-#else
-  *in = _mm256_loadu_si256((const __m256i *)coeff);
-#endif
+  if (sizeof(tran_low_t) == 4) {
+    *in = _mm256_setr_epi16(
+        (int16_t)coeff[0], (int16_t)coeff[1], (int16_t)coeff[2],
+        (int16_t)coeff[3], (int16_t)coeff[4], (int16_t)coeff[5],
+        (int16_t)coeff[6], (int16_t)coeff[7], (int16_t)coeff[8],
+        (int16_t)coeff[9], (int16_t)coeff[10], (int16_t)coeff[11],
+        (int16_t)coeff[12], (int16_t)coeff[13], (int16_t)coeff[14],
+        (int16_t)coeff[15]);
+  } else {
+    *in = _mm256_loadu_si256((const __m256i *)coeff);
+  }
 }
 
 static INLINE void load_buffer_16x16(const tran_low_t *coeff, __m256i *in) {
diff --git a/aom_dsp/x86/inv_txfm_sse2.h b/aom_dsp/x86/inv_txfm_sse2.h
index 95d246c..3428169 100644
--- a/aom_dsp/x86/inv_txfm_sse2.h
+++ b/aom_dsp/x86/inv_txfm_sse2.h
@@ -133,12 +133,12 @@
 // Function to allow 8 bit optimisations to be used when profile 0 is used with
 // highbitdepth enabled
 static INLINE __m128i load_input_data(const tran_low_t *data) {
-#if CONFIG_HIGHBITDEPTH
-  return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
-                        data[6], data[7]);
-#else
-  return _mm_load_si128((const __m128i *)data);
-#endif
+  if (sizeof(tran_low_t) == 4) {
+    return octa_set_epi16(data[0], data[1], data[2], data[3], data[4], data[5],
+                          data[6], data[7]);
+  } else {
+    return _mm_load_si128((const __m128i *)data);
+  }
 }
 
 static INLINE void load_buffer_8x16(const tran_low_t *input, __m128i *in) {
diff --git a/aom_dsp/x86/quantize_sse2.c b/aom_dsp/x86/quantize_sse2.c
index 890c1f0..0e7f679 100644
--- a/aom_dsp/x86/quantize_sse2.c
+++ b/aom_dsp/x86/quantize_sse2.c
@@ -16,29 +16,29 @@
 #include "aom/aom_integer.h"
 
 static INLINE __m128i load_coefficients(const tran_low_t *coeff_ptr) {
-#if CONFIG_HIGHBITDEPTH
-  return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
-                        (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
-                        (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
-                        (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
-#else
-  return _mm_load_si128((const __m128i *)coeff_ptr);
-#endif
+  if (sizeof(tran_low_t) == 4) {
+    return _mm_setr_epi16((int16_t)coeff_ptr[0], (int16_t)coeff_ptr[1],
+                          (int16_t)coeff_ptr[2], (int16_t)coeff_ptr[3],
+                          (int16_t)coeff_ptr[4], (int16_t)coeff_ptr[5],
+                          (int16_t)coeff_ptr[6], (int16_t)coeff_ptr[7]);
+  } else {
+    return _mm_load_si128((const __m128i *)coeff_ptr);
+  }
 }
 
 static INLINE void store_coefficients(__m128i coeff_vals,
                                       tran_low_t *coeff_ptr) {
-#if CONFIG_HIGHBITDEPTH
-  __m128i one = _mm_set1_epi16(1);
-  __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
-  __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
-  __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
-  __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
-  _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
-  _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
-#else
-  _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals);
-#endif
+  if (sizeof(tran_low_t) == 4) {
+    __m128i one = _mm_set1_epi16(1);
+    __m128i coeff_vals_hi = _mm_mulhi_epi16(coeff_vals, one);
+    __m128i coeff_vals_lo = _mm_mullo_epi16(coeff_vals, one);
+    __m128i coeff_vals_1 = _mm_unpacklo_epi16(coeff_vals_lo, coeff_vals_hi);
+    __m128i coeff_vals_2 = _mm_unpackhi_epi16(coeff_vals_lo, coeff_vals_hi);
+    _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals_1);
+    _mm_store_si128((__m128i *)(coeff_ptr + 4), coeff_vals_2);
+  } else {
+    _mm_store_si128((__m128i *)(coeff_ptr), coeff_vals);
+  }
 }
 
 void aom_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
diff --git a/aom_dsp/x86/txfm_common_intrin.h b/aom_dsp/x86/txfm_common_intrin.h
index e4ac563..4e6eecd 100644
--- a/aom_dsp/x86/txfm_common_intrin.h
+++ b/aom_dsp/x86/txfm_common_intrin.h
@@ -16,16 +16,16 @@
 //  This header file should be put below any x86 intrinsics head file
 
 static INLINE void storeu_output(const __m128i *poutput, tran_low_t *dst_ptr) {
-#if CONFIG_HIGHBITDEPTH
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
-  __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
-  __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
-  _mm_storeu_si128((__m128i *)(dst_ptr), out0);
-  _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
-#else
-  _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
-#endif  // CONFIG_HIGHBITDEPTH
+  if (sizeof(tran_low_t) == 4) {
+    const __m128i zero = _mm_setzero_si128();
+    const __m128i sign_bits = _mm_cmplt_epi16(*poutput, zero);
+    __m128i out0 = _mm_unpacklo_epi16(*poutput, sign_bits);
+    __m128i out1 = _mm_unpackhi_epi16(*poutput, sign_bits);
+    _mm_storeu_si128((__m128i *)(dst_ptr), out0);
+    _mm_storeu_si128((__m128i *)(dst_ptr + 4), out1);
+  } else {
+    _mm_storeu_si128((__m128i *)(dst_ptr), *poutput);
+  }
 }
 
 #endif  // _AOM_DSP_X86_TXFM_COMMON_INTRIN_H_
diff --git a/av1/encoder/x86/av1_quantize_avx2.c b/av1/encoder/x86/av1_quantize_avx2.c
index 1c0a120..078a675 100644
--- a/av1/encoder/x86/av1_quantize_avx2.c
+++ b/av1/encoder/x86/av1_quantize_avx2.c
@@ -16,24 +16,24 @@
 #include "aom_dsp/aom_dsp_common.h"
 
 static INLINE void read_coeff(const tran_low_t *coeff, __m256i *c) {
-#if CONFIG_HIGHBITDEPTH
-  const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff);
-  const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1);
-  *c = _mm256_packs_epi32(x0, x1);
-  *c = _mm256_permute4x64_epi64(*c, 0xD8);
-#else
-  *c = _mm256_loadu_si256((const __m256i *)coeff);
-#endif
+  if (sizeof(tran_low_t) == 4) {
+    const __m256i x0 = _mm256_loadu_si256((const __m256i *)coeff);
+    const __m256i x1 = _mm256_loadu_si256((const __m256i *)coeff + 1);
+    *c = _mm256_packs_epi32(x0, x1);
+    *c = _mm256_permute4x64_epi64(*c, 0xD8);
+  } else {
+    *c = _mm256_loadu_si256((const __m256i *)coeff);
+  }
 }
 
 static INLINE void write_zero(tran_low_t *qcoeff) {
   const __m256i zero = _mm256_setzero_si256();
-#if CONFIG_HIGHBITDEPTH
-  _mm256_storeu_si256((__m256i *)qcoeff, zero);
-  _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
-#else
-  _mm256_storeu_si256((__m256i *)qcoeff, zero);
-#endif
+  if (sizeof(tran_low_t) == 4) {
+    _mm256_storeu_si256((__m256i *)qcoeff, zero);
+    _mm256_storeu_si256((__m256i *)qcoeff + 1, zero);
+  } else {
+    _mm256_storeu_si256((__m256i *)qcoeff, zero);
+  }
 }
 
 static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
@@ -83,19 +83,16 @@
     _mm256_storeu_si256((__m256i *)addr + 1, x1);         \
   } while (0)
 
-#if CONFIG_HIGHBITDEPTH
-#define store_two_quan(q, addr1, dq, addr2) \
-  do {                                      \
-    store_quan(q, addr1);                   \
-    store_quan(dq, addr2);                  \
+#define store_two_quan(q, addr1, dq, addr2)      \
+  do {                                           \
+    if (sizeof(tran_low_t) == 4) {               \
+      store_quan(q, addr1);                      \
+      store_quan(dq, addr2);                     \
+    } else {                                     \
+      _mm256_storeu_si256((__m256i *)addr1, q);  \
+      _mm256_storeu_si256((__m256i *)addr2, dq); \
+    }                                            \
   } while (0)
-#else
-#define store_two_quan(q, addr1, dq, addr2)    \
-  do {                                         \
-    _mm256_storeu_si256((__m256i *)addr1, q);  \
-    _mm256_storeu_si256((__m256i *)addr2, dq); \
-  } while (0)
-#endif
 
 static INLINE void quantize(const __m256i *thr, const __m256i *qp, __m256i *c,
                             const int16_t *iscan_ptr, tran_low_t *qcoeff,
diff --git a/av1/encoder/x86/av1_quantize_sse2.c b/av1/encoder/x86/av1_quantize_sse2.c
index 1903173..4f7c095 100644
--- a/av1/encoder/x86/av1_quantize_sse2.c
+++ b/av1/encoder/x86/av1_quantize_sse2.c
@@ -18,53 +18,53 @@
 static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
                               __m128i *c0, __m128i *c1) {
   const tran_low_t *addr = coeff + offset;
-#if CONFIG_HIGHBITDEPTH
-  const __m128i x0 = _mm_load_si128((const __m128i *)addr);
-  const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1);
-  const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2);
-  const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3);
-  *c0 = _mm_packs_epi32(x0, x1);
-  *c1 = _mm_packs_epi32(x2, x3);
-#else
-  *c0 = _mm_load_si128((const __m128i *)addr);
-  *c1 = _mm_load_si128((const __m128i *)addr + 1);
-#endif
+  if (sizeof(tran_low_t) == 4) {
+    const __m128i x0 = _mm_load_si128((const __m128i *)addr);
+    const __m128i x1 = _mm_load_si128((const __m128i *)addr + 1);
+    const __m128i x2 = _mm_load_si128((const __m128i *)addr + 2);
+    const __m128i x3 = _mm_load_si128((const __m128i *)addr + 3);
+    *c0 = _mm_packs_epi32(x0, x1);
+    *c1 = _mm_packs_epi32(x2, x3);
+  } else {
+    *c0 = _mm_load_si128((const __m128i *)addr);
+    *c1 = _mm_load_si128((const __m128i *)addr + 1);
+  }
 }
 
 static INLINE void write_qcoeff(const __m128i *qc0, const __m128i *qc1,
                                 tran_low_t *qcoeff, intptr_t offset) {
   tran_low_t *addr = qcoeff + offset;
-#if CONFIG_HIGHBITDEPTH
-  const __m128i zero = _mm_setzero_si128();
-  __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero);
-  __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits);
-  __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits);
-  _mm_store_si128((__m128i *)addr, y0);
-  _mm_store_si128((__m128i *)addr + 1, y1);
+  if (sizeof(tran_low_t) == 4) {
+    const __m128i zero = _mm_setzero_si128();
+    __m128i sign_bits = _mm_cmplt_epi16(*qc0, zero);
+    __m128i y0 = _mm_unpacklo_epi16(*qc0, sign_bits);
+    __m128i y1 = _mm_unpackhi_epi16(*qc0, sign_bits);
+    _mm_store_si128((__m128i *)addr, y0);
+    _mm_store_si128((__m128i *)addr + 1, y1);
 
-  sign_bits = _mm_cmplt_epi16(*qc1, zero);
-  y0 = _mm_unpacklo_epi16(*qc1, sign_bits);
-  y1 = _mm_unpackhi_epi16(*qc1, sign_bits);
-  _mm_store_si128((__m128i *)addr + 2, y0);
-  _mm_store_si128((__m128i *)addr + 3, y1);
-#else
-  _mm_store_si128((__m128i *)addr, *qc0);
-  _mm_store_si128((__m128i *)addr + 1, *qc1);
-#endif
+    sign_bits = _mm_cmplt_epi16(*qc1, zero);
+    y0 = _mm_unpacklo_epi16(*qc1, sign_bits);
+    y1 = _mm_unpackhi_epi16(*qc1, sign_bits);
+    _mm_store_si128((__m128i *)addr + 2, y0);
+    _mm_store_si128((__m128i *)addr + 3, y1);
+  } else {
+    _mm_store_si128((__m128i *)addr, *qc0);
+    _mm_store_si128((__m128i *)addr + 1, *qc1);
+  }
 }
 
 static INLINE void write_zero(tran_low_t *qcoeff, intptr_t offset) {
   const __m128i zero = _mm_setzero_si128();
   tran_low_t *addr = qcoeff + offset;
-#if CONFIG_HIGHBITDEPTH
-  _mm_store_si128((__m128i *)addr, zero);
-  _mm_store_si128((__m128i *)addr + 1, zero);
-  _mm_store_si128((__m128i *)addr + 2, zero);
-  _mm_store_si128((__m128i *)addr + 3, zero);
-#else
-  _mm_store_si128((__m128i *)addr, zero);
-  _mm_store_si128((__m128i *)addr + 1, zero);
-#endif
+  if (sizeof(tran_low_t) == 4) {
+    _mm_store_si128((__m128i *)addr, zero);
+    _mm_store_si128((__m128i *)addr + 1, zero);
+    _mm_store_si128((__m128i *)addr + 2, zero);
+    _mm_store_si128((__m128i *)addr + 3, zero);
+  } else {
+    _mm_store_si128((__m128i *)addr, zero);
+    _mm_store_si128((__m128i *)addr + 1, zero);
+  }
 }
 
 void av1_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
diff --git a/av1/encoder/x86/error_intrin_avx2.c b/av1/encoder/x86/error_intrin_avx2.c
index 20ba414..6599630 100644
--- a/av1/encoder/x86/error_intrin_avx2.c
+++ b/av1/encoder/x86/error_intrin_avx2.c
@@ -17,14 +17,15 @@
 static INLINE void read_coeff(const tran_low_t *coeff, intptr_t offset,
                               __m256i *c) {
   const tran_low_t *addr = coeff + offset;
-#if CONFIG_HIGHBITDEPTH
-  const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
-  const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
-  const __m256i y = _mm256_packs_epi32(x0, x1);
-  *c = _mm256_permute4x64_epi64(y, 0xD8);
-#else
-  *c = _mm256_loadu_si256((const __m256i *)addr);
-#endif
+
+  if (sizeof(tran_low_t) == 4) {
+    const __m256i x0 = _mm256_loadu_si256((const __m256i *)addr);
+    const __m256i x1 = _mm256_loadu_si256((const __m256i *)addr + 1);
+    const __m256i y = _mm256_packs_epi32(x0, x1);
+    *c = _mm256_permute4x64_epi64(y, 0xD8);
+  } else {
+    *c = _mm256_loadu_si256((const __m256i *)addr);
+  }
 }
 
 int64_t av1_block_error_avx2(const tran_low_t *coeff, const tran_low_t *dqcoeff,