Add eob<=10 case in idct32x32

Simplify the idct32x32 calculation when there are only 10 or fewer non-zero coefficients in a 32x32 block. This improves decoder performance.

Change-Id: If7f8893d27b64a9892b4b2621a37fdf4ac0c2a6d
diff --git a/vp9/common/vp9_idctllm.c b/vp9/common/vp9_idctllm.c index 21a597c..673abd7 100644 --- a/vp9/common/vp9_idctllm.c +++ b/vp9/common/vp9_idctllm.c
@@ -1292,3 +1292,42 @@
   out = dct_const_round_shift(out * cospi_16_64);
   output[0] = ROUND_POWER_OF_TWO(out, 6);
 }
+
+/* 32x32 inverse DCT for blocks whose non-zero coefficients all lie in the
+ * upper-left 4x4 corner of the input.
+ *
+ * input  - coefficient block; consecutive rows are (pitch >> 1) elements apart.
+ * output - 32x32 result, written with a fixed row stride of 32 elements.
+ * pitch  - input pitch; halved to obtain the per-row element step.
+ */
+void vp9_short_idct10_32x32_c(int16_t *input, int16_t *output, int pitch) {
+  int16_t row_pass[32 * 32];
+  int16_t col_in[32], col_out[32];
+  const int in_stride = pitch >> 1;
+  int row, col;
+
+  /* Rows 4..31 of the intermediate buffer are never written by the row pass
+   * below, so the buffer is cleared up front.
+   */
+  vpx_memset(row_pass, 0, sizeof(row_pass));
+
+  /* Row pass: only the first 4 input rows can hold non-zero coefficients. */
+  for (row = 0; row < 4; ++row)
+    idct32_1d(input + row * in_stride, row_pass + row * 32);
+
+  /* Column pass: gather each column, transform it, then scatter the rounded
+   * result into the matching output column.
+   */
+  for (col = 0; col < 32; ++col) {
+    const int16_t *src = row_pass + col;
+    int16_t *dst = output + col;
+
+    for (row = 0; row < 32; ++row)
+      col_in[row] = src[row * 32];
+
+    idct32_1d(col_in, col_out);
+
+    for (row = 0; row < 32; ++row)
+      dst[row * 32] = ROUND_POWER_OF_TWO(col_out[row], 6);
+  }
+}
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 2bd26c8..9cbf44c 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh
@@ -281,6 +281,9 @@ prototype void vp9_short_idct1_32x32 "int16_t *input, int16_t *output" specialize vp9_short_idct1_32x32 +prototype void vp9_short_idct10_32x32 "int16_t *input, int16_t *output, int pitch" +specialize vp9_short_idct10_32x32 + prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type" specialize vp9_short_iht8x8
diff --git a/vp9/decoder/vp9_dequantize.c b/vp9/decoder/vp9_dequantize.c index 46e5656..5a98b11 100644 --- a/vp9/decoder/vp9_dequantize.c +++ b/vp9/decoder/vp9_dequantize.c
@@ -314,14 +314,34 @@ if (eob) { input[0] = input[0] * dq[0] / 2; if (eob == 1) { - vp9_short_idct1_32x32_c(input, output); + vp9_short_idct1_32x32(input, output); add_constant_residual(output[0], pred, pitch, dest, stride, 32, 32); input[0] = 0; + } else if (eob <= 10) { + input[1] = input[1] * dq[1] / 2; + input[2] = input[2] * dq[1] / 2; + input[3] = input[3] * dq[1] / 2; + input[32] = input[32] * dq[1] / 2; + input[33] = input[33] * dq[1] / 2; + input[34] = input[34] * dq[1] / 2; + input[64] = input[64] * dq[1] / 2; + input[65] = input[65] * dq[1] / 2; + input[96] = input[96] * dq[1] / 2; + + // the idct halves ( >> 1) the pitch + vp9_short_idct10_32x32(input, output, 64); + + input[0] = input[1] = input[2] = input[3] = 0; + input[32] = input[33] = input[34] = 0; + input[64] = input[65] = 0; + input[96] = 0; + + add_residual(output, pred, pitch, dest, stride, 32, 32); } else { int i; for (i = 1; i < 1024; i++) input[i] = input[i] * dq[1] / 2; - vp9_short_idct32x32_c(input, output, 64); + vp9_short_idct32x32(input, output, 64); vpx_memset(input, 0, 2048); add_residual(output, pred, pitch, dest, stride, 32, 32); }