Apply the rect fwd tx changes to the SSE2 optimization

- Apply the changes to tx_sizes 4x8, 8x4, 8x16, and 16x8.
- Enable the corresponding unit tests on SSE2.
- Partially fix aomedia:113.
Change-Id: I29d15540ab8e9e3681e9caa54e5162bcbbd7af11
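For reference (not part of the patch), here is a scalar sketch of the rounding the SSE2 helpers below implement for the rectangular transforms: the sqrt(2) pre-scale uses plain add-half-and-shift rounding, the down-shift between the column and row transforms uses signed, symmetric rounding, and the final shift in the write_buffer_* helpers divides by 2 rounding toward zero. (The 4x8/8x4 paths also drop the input up-shift from 3 bits to 2 and the output down-shift from 2 bits to 1.) The macros are modeled on the aom_dsp ROUND_POWER_OF_TWO* definitions; Sqrt2, DCT_CONST_BITS, and the helper names here are illustrative placeholders, not code from this patch.

    #include <stdint.h>

    /* Modeled on the aom_dsp ROUND_POWER_OF_TWO* macros. */
    #define ROUND_POWER_OF_TWO(value, n) (((value) + ((1 << (n)) >> 1)) >> (n))
    #define ROUND_POWER_OF_TWO_SIGNED(value, n)          \
      (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
                     : ROUND_POWER_OF_TWO((value), (n)))

    /* sqrt(2) pre-scale with plain rounding (cf. scale_sqrt2_8x8_unsigned). */
    static int16_t scale_sqrt2_scalar(int16_t x, int32_t sqrt2, int bits) {
      return (int16_t)ROUND_POWER_OF_TWO((int32_t)x * sqrt2, bits);
    }

    /* Rounding between the two 1-D transforms, symmetric about zero
     * (cf. round_power_of_two_signed / row_8x16_rounding). */
    static int16_t mid_round_scalar(int16_t x, int bits) {
      return (int16_t)ROUND_POWER_OF_TWO_SIGNED(x, bits);
    }

    /* Final down-shift in write_buffer_4x8 / write_buffer_8x4: divide by 2,
     * rounding toward zero, like C integer division. */
    static int16_t final_shift_scalar(int16_t x) { return (int16_t)(x / 2); }

Matching these rounding rules is presumably what lets the SSE2 output bit-match the C reference, so the corresponding unit tests can be re-enabled below.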
diff --git a/av1/encoder/x86/dct_intrin_sse2.c b/av1/encoder/x86/dct_intrin_sse2.c
index 727ff19..768e9fe 100644
--- a/av1/encoder/x86/dct_intrin_sse2.c
+++ b/av1/encoder/x86/dct_intrin_sse2.c
@@ -2660,7 +2660,7 @@
xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
}
-static INLINE void scale_sqrt2_8x8_signed(__m128i *in) {
+static INLINE void scale_sqrt2_8x8_unsigned(__m128i *in) {
-// Implements 'ROUND_POWER_OF_TWO_SIGNED(input * Sqrt2, DCT_CONST_BITS)'
+// Implements 'ROUND_POWER_OF_TWO(input * Sqrt2, DCT_CONST_BITS)'
// for each element
const __m128i v_scale_w = _mm_set1_epi16((int16_t)Sqrt2);
@@ -2699,22 +2699,22 @@
const __m128i v_p7a_d = _mm_unpacklo_epi16(v_p7l_w, v_p7h_w);
const __m128i v_p7b_d = _mm_unpackhi_epi16(v_p7l_w, v_p7h_w);
- in[0] = _mm_packs_epi32(xx_roundn_epi32(v_p0a_d, DCT_CONST_BITS),
- xx_roundn_epi32(v_p0b_d, DCT_CONST_BITS));
- in[1] = _mm_packs_epi32(xx_roundn_epi32(v_p1a_d, DCT_CONST_BITS),
- xx_roundn_epi32(v_p1b_d, DCT_CONST_BITS));
- in[2] = _mm_packs_epi32(xx_roundn_epi32(v_p2a_d, DCT_CONST_BITS),
- xx_roundn_epi32(v_p2b_d, DCT_CONST_BITS));
- in[3] = _mm_packs_epi32(xx_roundn_epi32(v_p3a_d, DCT_CONST_BITS),
- xx_roundn_epi32(v_p3b_d, DCT_CONST_BITS));
- in[4] = _mm_packs_epi32(xx_roundn_epi32(v_p4a_d, DCT_CONST_BITS),
- xx_roundn_epi32(v_p4b_d, DCT_CONST_BITS));
- in[5] = _mm_packs_epi32(xx_roundn_epi32(v_p5a_d, DCT_CONST_BITS),
- xx_roundn_epi32(v_p5b_d, DCT_CONST_BITS));
- in[6] = _mm_packs_epi32(xx_roundn_epi32(v_p6a_d, DCT_CONST_BITS),
- xx_roundn_epi32(v_p6b_d, DCT_CONST_BITS));
- in[7] = _mm_packs_epi32(xx_roundn_epi32(v_p7a_d, DCT_CONST_BITS),
- xx_roundn_epi32(v_p7b_d, DCT_CONST_BITS));
+ in[0] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p0a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p0b_d, DCT_CONST_BITS));
+ in[1] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p1a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p1b_d, DCT_CONST_BITS));
+ in[2] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p2a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p2b_d, DCT_CONST_BITS));
+ in[3] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p3a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p3b_d, DCT_CONST_BITS));
+ in[4] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p4a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p4b_d, DCT_CONST_BITS));
+ in[5] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p5a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p5b_d, DCT_CONST_BITS));
+ in[6] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p6a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p6b_d, DCT_CONST_BITS));
+ in[7] = _mm_packs_epi32(xx_roundn_epi32_unsigned(v_p7a_d, DCT_CONST_BITS),
+ xx_roundn_epi32_unsigned(v_p7b_d, DCT_CONST_BITS));
}
static INLINE void scale_sqrt2_8x16(__m128i *in) {
@@ -2724,11 +2724,21 @@
scale_sqrt2_8x4(in + 12);
}
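+// Repack the 4x8 input into two 4x4 tiles (two registers each) and transpose
+// them so the 4-point row transform can be applied first.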
+static INLINE void prepare_4x8_row_first(__m128i *in) {
+ in[0] = _mm_unpacklo_epi64(in[0], in[2]);
+ in[1] = _mm_unpacklo_epi64(in[1], in[3]);
+ transpose_4x4(in);
+ in[4] = _mm_unpacklo_epi64(in[4], in[6]);
+ in[5] = _mm_unpacklo_epi64(in[5], in[7]);
+ transpose_4x4(in + 4);
+}
+
// Load input into the left-hand half of in (ie, into lanes 0..3 of
// each element of in). The right hand half (lanes 4..7) should be
// treated as being filled with "don't care" values.
static INLINE void load_buffer_4x8(const int16_t *input, __m128i *in,
int stride, int flipud, int fliplr) {
+ const int shift = 2;
if (!flipud) {
in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
@@ -2760,29 +2770,46 @@
in[7] = _mm_shufflelo_epi16(in[7], 0x1b);
}
- in[0] = _mm_slli_epi16(in[0], 3);
- in[1] = _mm_slli_epi16(in[1], 3);
- in[2] = _mm_slli_epi16(in[2], 3);
- in[3] = _mm_slli_epi16(in[3], 3);
- in[4] = _mm_slli_epi16(in[4], 3);
- in[5] = _mm_slli_epi16(in[5], 3);
- in[6] = _mm_slli_epi16(in[6], 3);
- in[7] = _mm_slli_epi16(in[7], 3);
+ in[0] = _mm_slli_epi16(in[0], shift);
+ in[1] = _mm_slli_epi16(in[1], shift);
+ in[2] = _mm_slli_epi16(in[2], shift);
+ in[3] = _mm_slli_epi16(in[3], shift);
+ in[4] = _mm_slli_epi16(in[4], shift);
+ in[5] = _mm_slli_epi16(in[5], shift);
+ in[6] = _mm_slli_epi16(in[6], shift);
+ in[7] = _mm_slli_epi16(in[7], shift);
scale_sqrt2_8x4(in);
scale_sqrt2_8x4(in + 4);
+ prepare_4x8_row_first(in);
}
static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) {
- __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
- __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
- __m128i in45 = _mm_unpacklo_epi64(res[4], res[5]);
- __m128i in67 = _mm_unpacklo_epi64(res[6], res[7]);
+ __m128i in01, in23, in45, in67, sign01, sign23, sign45, sign67;
+ const int shift = 1;
- in01 = _mm_srai_epi16(in01, 2);
- in23 = _mm_srai_epi16(in23, 2);
- in45 = _mm_srai_epi16(in45, 2);
- in67 = _mm_srai_epi16(in67, 2);
+ // revert the 8x8 txfm's transpose
+ array_transpose_8x8(res, res);
+
+ in01 = _mm_unpacklo_epi64(res[0], res[1]);
+ in23 = _mm_unpacklo_epi64(res[2], res[3]);
+ in45 = _mm_unpacklo_epi64(res[4], res[5]);
+ in67 = _mm_unpacklo_epi64(res[6], res[7]);
+
+ sign01 = _mm_srai_epi16(in01, 15);
+ sign23 = _mm_srai_epi16(in23, 15);
+ sign45 = _mm_srai_epi16(in45, 15);
+ sign67 = _mm_srai_epi16(in67, 15);
+
+ in01 = _mm_sub_epi16(in01, sign01);
+ in23 = _mm_sub_epi16(in23, sign23);
+ in45 = _mm_sub_epi16(in45, sign45);
+ in67 = _mm_sub_epi16(in67, sign67);
+
+ in01 = _mm_srai_epi16(in01, shift);
+ in23 = _mm_srai_epi16(in23, shift);
+ in45 = _mm_srai_epi16(in45, shift);
+ in67 = _mm_srai_epi16(in67, shift);
store_output(&in01, (output + 0 * 8));
store_output(&in23, (output + 1 * 8));
@@ -2794,166 +2821,103 @@
int tx_type) {
__m128i in[8];
- load_buffer_4x8(input, in, stride, 0, 0);
switch (tx_type) {
case DCT_DCT:
- fdct8_sse2(in);
- // Repack data into two 4x4 blocks so we can reuse the 4x4 transforms
- // The other cases (and the 8x4 transforms) all behave similarly
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ load_buffer_4x8(input, in, stride, 0, 0);
fdct4_sse2(in);
fdct4_sse2(in + 4);
+ fdct8_sse2(in);
break;
case ADST_DCT:
- fadst8_sse2(in);
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ load_buffer_4x8(input, in, stride, 0, 0);
fdct4_sse2(in);
fdct4_sse2(in + 4);
+ fadst8_sse2(in);
break;
case DCT_ADST:
- fdct8_sse2(in);
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ load_buffer_4x8(input, in, stride, 0, 0);
fadst4_sse2(in);
fadst4_sse2(in + 4);
+ fdct8_sse2(in);
break;
case ADST_ADST:
- fadst8_sse2(in);
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
+ load_buffer_4x8(input, in, stride, 0, 0);
fadst4_sse2(in);
fadst4_sse2(in + 4);
+ fadst8_sse2(in);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
load_buffer_4x8(input, in, stride, 1, 0);
- fadst8_sse2(in);
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
fdct4_sse2(in);
fdct4_sse2(in + 4);
+ fadst8_sse2(in);
break;
case DCT_FLIPADST:
load_buffer_4x8(input, in, stride, 0, 1);
- fdct8_sse2(in);
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
fadst4_sse2(in);
fadst4_sse2(in + 4);
+ fdct8_sse2(in);
break;
case FLIPADST_FLIPADST:
load_buffer_4x8(input, in, stride, 1, 1);
- fadst8_sse2(in);
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
fadst4_sse2(in);
fadst4_sse2(in + 4);
+ fadst8_sse2(in);
break;
case ADST_FLIPADST:
load_buffer_4x8(input, in, stride, 0, 1);
- fadst8_sse2(in);
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
fadst4_sse2(in);
fadst4_sse2(in + 4);
+ fadst8_sse2(in);
break;
case FLIPADST_ADST:
load_buffer_4x8(input, in, stride, 1, 0);
- fadst8_sse2(in);
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
fadst4_sse2(in);
fadst4_sse2(in + 4);
+ fadst8_sse2(in);
break;
case IDTX:
load_buffer_4x8(input, in, stride, 0, 0);
- fidtx8_sse2(in);
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
fidtx4_sse2(in);
fidtx4_sse2(in + 4);
+ fidtx8_sse2(in);
break;
case V_DCT:
load_buffer_4x8(input, in, stride, 0, 0);
- fdct8_sse2(in);
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
fidtx4_sse2(in);
fidtx4_sse2(in + 4);
+ fdct8_sse2(in);
break;
case H_DCT:
load_buffer_4x8(input, in, stride, 0, 0);
- fidtx8_sse2(in);
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
fdct4_sse2(in);
fdct4_sse2(in + 4);
+ fidtx8_sse2(in);
break;
case V_ADST:
load_buffer_4x8(input, in, stride, 0, 0);
- fadst8_sse2(in);
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
fidtx4_sse2(in);
fidtx4_sse2(in + 4);
+ fadst8_sse2(in);
break;
case H_ADST:
load_buffer_4x8(input, in, stride, 0, 0);
- fidtx8_sse2(in);
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
fadst4_sse2(in);
fadst4_sse2(in + 4);
+ fidtx8_sse2(in);
break;
case V_FLIPADST:
load_buffer_4x8(input, in, stride, 1, 0);
- fadst8_sse2(in);
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
fidtx4_sse2(in);
fidtx4_sse2(in + 4);
+ fadst8_sse2(in);
break;
case H_FLIPADST:
load_buffer_4x8(input, in, stride, 0, 1);
- fidtx8_sse2(in);
- in[4] = _mm_shuffle_epi32(in[0], 0xe);
- in[5] = _mm_shuffle_epi32(in[1], 0xe);
- in[6] = _mm_shuffle_epi32(in[2], 0xe);
- in[7] = _mm_shuffle_epi32(in[3], 0xe);
fadst4_sse2(in);
fadst4_sse2(in + 4);
+ fidtx8_sse2(in);
break;
#endif
default: assert(0); break;
@@ -2970,6 +2934,7 @@
// This is to allow us to reuse 4x4 transforms.
static INLINE void load_buffer_8x4(const int16_t *input, __m128i *in,
int stride, int flipud, int fliplr) {
+ const int shift = 2;
if (!flipud) {
in[0] = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
in[1] = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
@@ -2989,10 +2954,10 @@
in[3] = mm_reverse_epi16(in[3]);
}
- in[0] = _mm_slli_epi16(in[0], 3);
- in[1] = _mm_slli_epi16(in[1], 3);
- in[2] = _mm_slli_epi16(in[2], 3);
- in[3] = _mm_slli_epi16(in[3], 3);
+ in[0] = _mm_slli_epi16(in[0], shift);
+ in[1] = _mm_slli_epi16(in[1], shift);
+ in[2] = _mm_slli_epi16(in[2], shift);
+ in[3] = _mm_slli_epi16(in[3], shift);
scale_sqrt2_8x4(in);
@@ -3003,10 +2968,22 @@
}
static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) {
- const __m128i out0 = _mm_srai_epi16(res[0], 2);
- const __m128i out1 = _mm_srai_epi16(res[1], 2);
- const __m128i out2 = _mm_srai_epi16(res[2], 2);
- const __m128i out3 = _mm_srai_epi16(res[3], 2);
+ __m128i out0, out1, out2, out3, sign0, sign1, sign2, sign3;
+ const int shift = 1;
+ sign0 = _mm_srai_epi16(res[0], 15);
+ sign1 = _mm_srai_epi16(res[1], 15);
+ sign2 = _mm_srai_epi16(res[2], 15);
+ sign3 = _mm_srai_epi16(res[3], 15);
+
+ out0 = _mm_sub_epi16(res[0], sign0);
+ out1 = _mm_sub_epi16(res[1], sign1);
+ out2 = _mm_sub_epi16(res[2], sign2);
+ out3 = _mm_sub_epi16(res[3], sign3);
+
+ out0 = _mm_srai_epi16(out0, shift);
+ out1 = _mm_srai_epi16(out1, shift);
+ out2 = _mm_srai_epi16(out2, shift);
+ out3 = _mm_srai_epi16(out3, shift);
store_output(&out0, (output + 0 * 8));
store_output(&out1, (output + 1 * 8));
@@ -3135,9 +3112,23 @@
}
load_buffer_8x8(t, in, stride, flipud, fliplr);
- scale_sqrt2_8x8_signed(in);
+ scale_sqrt2_8x8_unsigned(in);
load_buffer_8x8(b, in + 8, stride, flipud, fliplr);
- scale_sqrt2_8x8_signed(in + 8);
+ scale_sqrt2_8x8_unsigned(in + 8);
+}
+
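+// Per-lane equivalent of ROUND_POWER_OF_TWO_SIGNED(): the added sign term
+// (-1 for negative lanes) keeps the rounding symmetric about zero.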
+static INLINE void round_power_of_two_signed(__m128i *x, int n) {
+ const __m128i rounding = _mm_set1_epi16((1 << n) >> 1);
+ const __m128i sign = _mm_srai_epi16(*x, 15);
+ const __m128i res = _mm_add_epi16(_mm_add_epi16(*x, rounding), sign);
+ *x = _mm_srai_epi16(res, n);
+}
+
+static void row_8x16_rounding(__m128i *in, int bits) {
+ int i;
+ for (i = 0; i < 16; i++) {
+ round_power_of_two_signed(&in[i], bits);
+ }
}
void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
@@ -3150,138 +3141,152 @@
switch (tx_type) {
case DCT_DCT:
load_buffer_8x16(input, in, stride, 0, 0);
- fdct16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fdct8_sse2(t);
fdct8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fdct16_8col(in);
break;
case ADST_DCT:
load_buffer_8x16(input, in, stride, 0, 0);
- fadst16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fdct8_sse2(t);
fdct8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
break;
case DCT_ADST:
load_buffer_8x16(input, in, stride, 0, 0);
- fdct16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fadst8_sse2(t);
fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fdct16_8col(in);
break;
case ADST_ADST:
load_buffer_8x16(input, in, stride, 0, 0);
- fadst16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fadst8_sse2(t);
fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
break;
#if CONFIG_EXT_TX
case FLIPADST_DCT:
load_buffer_8x16(input, in, stride, 1, 0);
- fadst16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fdct8_sse2(t);
fdct8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
break;
case DCT_FLIPADST:
load_buffer_8x16(input, in, stride, 0, 1);
- fdct16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fadst8_sse2(t);
fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fdct16_8col(in);
break;
case FLIPADST_FLIPADST:
load_buffer_8x16(input, in, stride, 1, 1);
- fadst16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fadst8_sse2(t);
fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
break;
case ADST_FLIPADST:
load_buffer_8x16(input, in, stride, 0, 1);
- fadst16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fadst8_sse2(t);
fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
break;
case FLIPADST_ADST:
load_buffer_8x16(input, in, stride, 1, 0);
- fadst16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fadst8_sse2(t);
fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
break;
case IDTX:
load_buffer_8x16(input, in, stride, 0, 0);
- fidtx16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fidtx8_sse2(t);
fidtx8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fidtx16_8col(in);
break;
case V_DCT:
load_buffer_8x16(input, in, stride, 0, 0);
- fdct16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fidtx8_sse2(t);
fidtx8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fdct16_8col(in);
break;
case H_DCT:
load_buffer_8x16(input, in, stride, 0, 0);
- fidtx16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fdct8_sse2(t);
fdct8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fidtx16_8col(in);
break;
case V_ADST:
load_buffer_8x16(input, in, stride, 0, 0);
- fadst16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fidtx8_sse2(t);
fidtx8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
break;
case H_ADST:
load_buffer_8x16(input, in, stride, 0, 0);
- fidtx16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fadst8_sse2(t);
fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fidtx16_8col(in);
break;
case V_FLIPADST:
load_buffer_8x16(input, in, stride, 1, 0);
- fadst16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fidtx8_sse2(t);
fidtx8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fadst16_8col(in);
break;
case H_FLIPADST:
load_buffer_8x16(input, in, stride, 0, 1);
- fidtx16_8col(in);
array_transpose_8x8(t, t);
array_transpose_8x8(b, b);
fadst8_sse2(t);
fadst8_sse2(b);
+ row_8x16_rounding(in, 2);
+ fidtx16_8col(in);
break;
#endif
default: assert(0); break;
}
- right_shift_8x8(t, 2);
- right_shift_8x8(b, 2);
write_buffer_8x8(output, t, 8);
write_buffer_8x8(output + 64, b, 8);
}
@@ -3300,11 +3305,13 @@
// load first 8 columns
load_buffer_8x8(l, in, stride, flipud, fliplr);
- scale_sqrt2_8x8_signed(in);
+ scale_sqrt2_8x8_unsigned(in);
load_buffer_8x8(r, in + 8, stride, flipud, fliplr);
- scale_sqrt2_8x8_signed(in + 8);
+ scale_sqrt2_8x8_unsigned(in + 8);
}
+#define col_16x8_rounding row_8x16_rounding
+
void av1_fht16x8_sse2(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
__m128i in[16];
@@ -3318,24 +3325,28 @@
load_buffer_16x8(input, in, stride, 0, 0);
fdct8_sse2(l);
fdct8_sse2(r);
+ col_16x8_rounding(in, 2);
fdct16_8col(in);
break;
case ADST_DCT:
load_buffer_16x8(input, in, stride, 0, 0);
fadst8_sse2(l);
fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
fdct16_8col(in);
break;
case DCT_ADST:
load_buffer_16x8(input, in, stride, 0, 0);
fdct8_sse2(l);
fdct8_sse2(r);
+ col_16x8_rounding(in, 2);
fadst16_8col(in);
break;
case ADST_ADST:
load_buffer_16x8(input, in, stride, 0, 0);
fadst8_sse2(l);
fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
fadst16_8col(in);
break;
#if CONFIG_EXT_TX
@@ -3343,72 +3354,84 @@
load_buffer_16x8(input, in, stride, 1, 0);
fadst8_sse2(l);
fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
fdct16_8col(in);
break;
case DCT_FLIPADST:
load_buffer_16x8(input, in, stride, 0, 1);
fdct8_sse2(l);
fdct8_sse2(r);
+ col_16x8_rounding(in, 2);
fadst16_8col(in);
break;
case FLIPADST_FLIPADST:
load_buffer_16x8(input, in, stride, 1, 1);
fadst8_sse2(l);
fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
fadst16_8col(in);
break;
case ADST_FLIPADST:
load_buffer_16x8(input, in, stride, 0, 1);
fadst8_sse2(l);
fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
fadst16_8col(in);
break;
case FLIPADST_ADST:
load_buffer_16x8(input, in, stride, 1, 0);
fadst8_sse2(l);
fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
fadst16_8col(in);
break;
case IDTX:
load_buffer_16x8(input, in, stride, 0, 0);
fidtx8_sse2(l);
fidtx8_sse2(r);
+ col_16x8_rounding(in, 2);
fidtx16_8col(in);
break;
case V_DCT:
load_buffer_16x8(input, in, stride, 0, 0);
fdct8_sse2(l);
fdct8_sse2(r);
+ col_16x8_rounding(in, 2);
fidtx16_8col(in);
break;
case H_DCT:
load_buffer_16x8(input, in, stride, 0, 0);
fidtx8_sse2(l);
fidtx8_sse2(r);
+ col_16x8_rounding(in, 2);
fdct16_8col(in);
break;
case V_ADST:
load_buffer_16x8(input, in, stride, 0, 0);
fadst8_sse2(l);
fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
fidtx16_8col(in);
break;
case H_ADST:
load_buffer_16x8(input, in, stride, 0, 0);
fidtx8_sse2(l);
fidtx8_sse2(r);
+ col_16x8_rounding(in, 2);
fadst16_8col(in);
break;
case V_FLIPADST:
load_buffer_16x8(input, in, stride, 1, 0);
fadst8_sse2(l);
fadst8_sse2(r);
+ col_16x8_rounding(in, 2);
fidtx16_8col(in);
break;
case H_FLIPADST:
load_buffer_16x8(input, in, stride, 0, 1);
fidtx8_sse2(l);
fidtx8_sse2(r);
+ col_16x8_rounding(in, 2);
fadst16_8col(in);
break;
#endif
@@ -3416,8 +3439,6 @@
}
array_transpose_8x8(l, l);
array_transpose_8x8(r, r);
- right_shift_8x8(l, 2);
- right_shift_8x8(r, 2);
write_buffer_8x8(output, l, 16);
write_buffer_8x8(output + 8, r, 16);
}
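One detail worth calling out from the write_buffer_4x8 / write_buffer_8x4 changes above: subtracting the per-lane sign mask before the arithmetic right shift makes the shift round toward zero, matching C integer division by 2 for negative coefficients; a plain _mm_srai_epi16 on its own would round them toward minus infinity. A minimal self-contained sketch of the trick (the helper name is illustrative):

    #include <emmintrin.h> /* SSE2 */

    /* Divide each signed 16-bit lane by 2, rounding toward zero:
     * x >= 0 -> x >> 1, x < 0 -> (x + 1) >> 1. */
    static __m128i div2_toward_zero_epi16(__m128i x) {
      const __m128i sign = _mm_srai_epi16(x, 15); /* -1 in negative lanes */
      return _mm_srai_epi16(_mm_sub_epi16(x, sign), 1);
    }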
diff --git a/test/av1_fht16x8_test.cc b/test/av1_fht16x8_test.cc
index 39c2713..c92eba7 100644
--- a/test/av1_fht16x8_test.cc
+++ b/test/av1_fht16x8_test.cc
@@ -122,7 +122,7 @@
make_tuple(&av1_fht16x8_sse2, &av1_iht16x8_128_add_sse2, 15, AOM_BITS_8, 128)
#endif // CONFIG_EXT_TX
};
-INSTANTIATE_TEST_CASE_P(DISABLED_SSE2, AV1Trans16x8HT,
+INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans16x8HT,
::testing::ValuesIn(kArrayHt16x8Param_sse2));
#endif // HAVE_SSE2
diff --git a/test/av1_fht4x8_test.cc b/test/av1_fht4x8_test.cc
index 1fb581a..4962ec3 100644
--- a/test/av1_fht4x8_test.cc
+++ b/test/av1_fht4x8_test.cc
@@ -122,7 +122,7 @@
make_tuple(&av1_fht4x8_sse2, &av1_iht4x8_32_add_sse2, 15, AOM_BITS_8, 32)
#endif // CONFIG_EXT_TX
};
-INSTANTIATE_TEST_CASE_P(DISABLED_SSE2, AV1Trans4x8HT,
+INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans4x8HT,
::testing::ValuesIn(kArrayHt4x8Param_sse2));
#endif // HAVE_SSE2
diff --git a/test/av1_fht8x16_test.cc b/test/av1_fht8x16_test.cc
index 294219b..88c38ac 100644
--- a/test/av1_fht8x16_test.cc
+++ b/test/av1_fht8x16_test.cc
@@ -121,7 +121,7 @@
make_tuple(&av1_fht8x16_sse2, &av1_iht8x16_128_add_sse2, 15, AOM_BITS_8, 128)
#endif // CONFIG_EXT_TX
};
-INSTANTIATE_TEST_CASE_P(DISABLED_SSE2, AV1Trans8x16HT,
+INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans8x16HT,
::testing::ValuesIn(kArrayHt8x16Param_sse2));
#endif // HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
diff --git a/test/av1_fht8x4_test.cc b/test/av1_fht8x4_test.cc
index cb71d38..83a2d0e 100644
--- a/test/av1_fht8x4_test.cc
+++ b/test/av1_fht8x4_test.cc
@@ -121,7 +121,7 @@
make_tuple(&av1_fht8x4_sse2, &av1_iht8x4_32_add_sse2, 15, AOM_BITS_8, 32)
#endif // CONFIG_EXT_TX
};
-INSTANTIATE_TEST_CASE_P(DISABLED_SSE2, AV1Trans8x4HT,
+INSTANTIATE_TEST_CASE_P(SSE2, AV1Trans8x4HT,
::testing::ValuesIn(kArrayHt8x4Param_sse2));
#endif // HAVE_SSE2