Use stride (# of elements) instead of pitch (bytes) in fdct32x32. This makes fdct consistent with the iht/idct/fht functions, which all take a stride (# of elements) as their input argument. Change-Id: Id623c5113262655fa50f7c9d6cec9a91fcb20bb4
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc index f456abc..5abb9b1 100644 --- a/test/dct32x32_test.cc +++ b/test/dct32x32_test.cc
@@ -113,8 +113,7 @@ test_input_block[j] = src[j] - dst[j]; } - const int pitch = 64; - REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, pitch)); + REGISTER_STATE_CHECK(fwd_txfm_(test_input_block, test_temp_block, 32)); REGISTER_STATE_CHECK(inv_txfm_(test_temp_block, dst, 32)); for (int j = 0; j < kNumCoeffs; ++j) { @@ -150,9 +149,9 @@ for (int j = 0; j < kNumCoeffs; ++j) input_block[j] = rnd.Rand8() - rnd.Rand8(); - const int pitch = 64; - vp9_short_fdct32x32_c(input_block, output_ref_block, pitch); - REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, pitch)); + const int stride = 32; + vp9_short_fdct32x32_c(input_block, output_ref_block, stride); + REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, stride)); if (version_ == 0) { for (int j = 0; j < kNumCoeffs; ++j) @@ -188,9 +187,9 @@ for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -255; - const int pitch = 64; - vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, pitch); - REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, pitch)); + const int stride = 32; + vp9_short_fdct32x32_c(input_extreme_block, output_ref_block, stride); + REGISTER_STATE_CHECK(fwd_txfm_(input_extreme_block, output_block, stride)); // The minimum quant value is 4. for (int j = 0; j < kNumCoeffs; ++j) {
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh index 526be87..af96bb3 100644 --- a/vp9/common/vp9_rtcd_defs.sh +++ b/vp9/common/vp9_rtcd_defs.sh
@@ -701,10 +701,10 @@ prototype void vp9_short_fdct4x4 "int16_t *InputData, int16_t *OutputData, int pitch" specialize vp9_short_fdct4x4 sse2 -prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int pitch" +prototype void vp9_short_fdct32x32 "int16_t *InputData, int16_t *OutputData, int stride" specialize vp9_short_fdct32x32 sse2 -prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int pitch" +prototype void vp9_short_fdct32x32_rd "int16_t *InputData, int16_t *OutputData, int stride" specialize vp9_short_fdct32x32_rd sse2 prototype void vp9_short_fdct16x16 "int16_t *InputData, int16_t *OutputData, int pitch"
diff --git a/vp9/encoder/vp9_dct.c b/vp9/encoder/vp9_dct.c index b6555bc..00a2903 100644 --- a/vp9/encoder/vp9_dct.c +++ b/vp9/encoder/vp9_dct.c
@@ -1315,8 +1315,7 @@ output[31] = dct_32_round(step[31] * cospi_31_64 + step[16] * -cospi_1_64); } -void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int pitch) { - int shortpitch = pitch >> 1; +void vp9_short_fdct32x32_c(int16_t *input, int16_t *out, int stride) { int i, j; int output[32 * 32]; @@ -1324,7 +1323,7 @@ for (i = 0; i < 32; ++i) { int temp_in[32], temp_out[32]; for (j = 0; j < 32; ++j) - temp_in[j] = input[j * shortpitch + i] * 4; + temp_in[j] = input[j * stride + i] * 4; dct32_1d(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2; @@ -1344,8 +1343,7 @@ // Note that although we use dct_32_round in dct32_1d computation flow, // this 2d fdct32x32 for rate-distortion optimization loop is operating // within 16 bits precision. -void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int pitch) { - int shortpitch = pitch >> 1; +void vp9_short_fdct32x32_rd_c(int16_t *input, int16_t *out, int stride) { int i, j; int output[32 * 32]; @@ -1353,7 +1351,7 @@ for (i = 0; i < 32; ++i) { int temp_in[32], temp_out[32]; for (j = 0; j < 32; ++j) - temp_in[j] = input[j * shortpitch + i] * 4; + temp_in[j] = input[j * stride + i] * 4; dct32_1d(temp_in, temp_out, 0); for (j = 0; j < 32; ++j) // TODO(cd): see quality impact of only doing
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c index c1e1a0d..2b5451b 100644 --- a/vp9/encoder/vp9_encodemb.c +++ b/vp9/encoder/vp9_encodemb.c
@@ -365,9 +365,9 @@ yoff = 32 * (block >> twl); src_diff = p->src_diff + 4 * bw * yoff + xoff; if (x->use_lp32x32fdct) - vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8); + vp9_short_fdct32x32_rd(src_diff, coeff, bw * 4); else - vp9_short_fdct32x32(src_diff, coeff, bw * 8); + vp9_short_fdct32x32(src_diff, coeff, bw * 4); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan); @@ -532,9 +532,9 @@ vp9_subtract_block(32, 32, src_diff, bw * 4, src, p->src.stride, dst, pd->dst.stride); if (x->use_lp32x32fdct) - vp9_short_fdct32x32_rd(src_diff, coeff, bw * 8); + vp9_short_fdct32x32_rd(src_diff, coeff, bw * 4); else - vp9_short_fdct32x32(src_diff, coeff, bw * 8); + vp9_short_fdct32x32(src_diff, coeff, bw * 4); vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant, p->zbin_extra, eob, scan, iscan);
diff --git a/vp9/encoder/x86/vp9_dct32x32_sse2.c b/vp9/encoder/x86/vp9_dct32x32_sse2.c index 11eec7f..de47a5b 100644 --- a/vp9/encoder/x86/vp9_dct32x32_sse2.c +++ b/vp9/encoder/x86/vp9_dct32x32_sse2.c
@@ -30,11 +30,11 @@ #endif void FDCT32x32_2D(int16_t *input, - int16_t *output_org, int pitch) { + int16_t *output_org, int stride) { // Calculate pre-multiplied strides - const int str1 = pitch >> 1; - const int str2 = pitch; - const int str3 = pitch + str1; + const int str1 = stride; + const int str2 = 2 * stride; + const int str3 = 2 * stride + str1; // We need an intermediate buffer between passes. DECLARE_ALIGNED(16, int16_t, intermediate[32 * 32]); // Constants