16x16 inverse 2D-DCT with DC only This commit provides special handling for the 16x16 inverse 2D-DCT when only the DC coefficient is quantized to a non-zero value. Change-Id: I7bf71be7fa13384fab453dc8742b5b50e77a277c
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c index df9ff3b..38fec3e 100644 --- a/vp9/common/vp9_idct.c +++ b/vp9/common/vp9_idct.c
@@ -864,10 +864,18 @@ } } -void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) { +void vp9_short_idct16x16_1_add_c(int16_t *input, uint8_t *dest, + int dest_stride) { + int i, j; + int a1; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); - output[0] = ROUND_POWER_OF_TWO(out, 6); + a1 = ROUND_POWER_OF_TWO(out, 6); + for (j = 0; j < 16; ++j) { + for (i = 0; i < 16; ++i) + dest[i] = clip_pixel(dest[i] + a1); + dest += dest_stride; + } } static void idct32_1d(int16_t *input, int16_t *output) {
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 18da330..f004d1c 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh
@@ -306,15 +306,15 @@ prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct10_8x8_add sse2 +prototype void vp9_short_idct16x16_1_add "int16_t *input, uint8_t *dest, int dest_stride" +specialize vp9_short_idct16x16_1_add sse2 + prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct16x16_add sse2 prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct10_16x16_add sse2 -prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output" -specialize vp9_short_idct1_16x16 - prototype void vp9_short_idct32x32_add "int16_t *input, uint8_t *dest, int dest_stride" specialize vp9_short_idct32x32_add sse2
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c index 726c83f..e175fd2 100644 --- a/vp9/common/x86/vp9_idct_intrin_sse2.c +++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -1470,6 +1470,38 @@ } } +void vp9_short_idct16x16_1_add_sse2(int16_t *input, uint8_t *dest, int stride) { + __m128i dc_value; + const __m128i zero = _mm_setzero_si128(); + int a, i; + + a = dct_const_round_shift(input[0] * cospi_16_64); + a = dct_const_round_shift(a * cospi_16_64); + a = ROUND_POWER_OF_TWO(a, 6); + + dc_value = _mm_set1_epi16(a); + + for (i = 0; i < 2; ++i) { + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + RECON_AND_STORE(dest, dc_value); + dest += 8 - (stride * 16); + } +} + static INLINE void array_transpose_16x16(__m128i *res0, __m128i *res1) { __m128i tbuf[8]; array_transpose_8x8(res0, res0);
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c index 42b805f..395e636 100644 --- a/vp9/decoder/vp9_idct_blk.c +++ b/vp9/decoder/vp9_idct_blk.c
@@ -123,14 +123,8 @@ if (eob) { if (eob == 1) { /* DC only DCT coefficient. */ - int16_t in = input[0]; - int16_t out; - /* Note: the idct1 will need to be modified accordingly whenever - * vp9_short_idct16x16() is modified. */ - vp9_short_idct1_16x16_c(&in, &out); + vp9_short_idct16x16_1_add(input, dest, stride); input[0] = 0; - - vp9_add_constant_residual_16x16(out, dest, stride); } else if (eob <= 10) { vp9_short_idct10_16x16_add(input, dest, stride); vpx_memset(input, 0, 512);
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index 3597e73..a92ecf2 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c
@@ -61,7 +61,9 @@ static void inverse_transform_b_16x16_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff, uint8_t *dest, int stride) { - if (eob <= 10) + if (eob <= 1) + vp9_short_idct16x16_1_add(dqcoeff, dest, stride); + else if (eob <= 10) vp9_short_idct10_16x16_add(dqcoeff, dest, stride); else vp9_short_idct16x16_add(dqcoeff, dest, stride);