WIP: 8x8 idct/recon merge
This patch eliminates the intermediate diff buffer by combining the
8x8 short idct and the add-residual step into a single function.
The encoder can use the same combined function as well.
Change-Id: Iacfd57324fbe2b7beca5d7f3dcae25c976e67f45
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index e1b2a07..90b4ecd 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -16,6 +16,7 @@
extern "C" {
#include "vp9_rtcd.h"
+void vp9_short_idct8x8_add_c(short *input, uint8_t *output, int pitch);
}
#include "acm_random.h"
@@ -100,11 +101,15 @@
for (int i = 0; i < count_test_block; ++i) {
int16_t test_input_block[64];
int16_t test_temp_block[64];
- int16_t test_output_block[64];
+ uint8_t dst[64], src[64];
+ for (int j = 0; j < 64; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ }
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 64; ++j)
- test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+ test_input_block[j] = src[j] - dst[j];
const int pitch = 16;
vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
@@ -119,10 +124,10 @@
test_temp_block[j] *= 4;
}
}
- vp9_short_idct8x8_c(test_temp_block, test_output_block, pitch);
+ vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
for (int j = 0; j < 64; ++j) {
- const int diff = test_input_block[j] - test_output_block[j];
+ const int diff = dst[j] - src[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
@@ -145,18 +150,22 @@
for (int i = 0; i < count_test_block; ++i) {
int16_t test_input_block[64];
int16_t test_temp_block[64];
- int16_t test_output_block[64];
+ uint8_t dst[64], src[64];
- // Initialize a test block with input range {-255, 255}.
+ for (int j = 0; j < 64; ++j) {
+ src[j] = rnd.Rand8() % 2 ? 255 : 0;
+ dst[j] = src[j] > 0 ? 0 : 255;
+ }
+ // Initialize a test block with input range [-255, 255].
for (int j = 0; j < 64; ++j)
- test_input_block[j] = rnd.Rand8() % 2 ? 255 : -256;
+ test_input_block[j] = src[j] - dst[j];
const int pitch = 16;
vp9_short_fdct8x8_c(test_input_block, test_temp_block, pitch);
- vp9_short_idct8x8_c(test_temp_block, test_output_block, pitch);
+ vp9_short_idct8x8_add_c(test_temp_block, dst, 8);
for (int j = 0; j < 64; ++j) {
- const int diff = test_input_block[j] - test_output_block[j];
+ const int diff = dst[j] - src[j];
const int error = diff * diff;
if (max_error < error)
max_error = error;
diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc
index 30a1ac3..67db78b 100644
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -112,20 +112,23 @@
const int count_test_block = 10000;
for (int i = 0; i < count_test_block; ++i) {
int16_t input[64], coeff[64];
- int16_t output_c[64];
double output_r[64];
+ uint8_t dst[64], src[64];
+ for (int j = 0; j < 64; ++j) {
+ src[j] = rnd.Rand8();
+ dst[j] = rnd.Rand8();
+ }
// Initialize a test block with input range [-255, 255].
for (int j = 0; j < 64; ++j)
- input[j] = rnd.Rand8() - rnd.Rand8();
+ input[j] = src[j] - dst[j];
- const int pitch = 16;
reference_dct_2d(input, output_r);
for (int j = 0; j < 64; ++j)
coeff[j] = round(output_r[j]);
- vp9_short_idct8x8_c(coeff, output_c, pitch);
+ vp9_short_idct8x8_add_c(coeff, dst, 8);
for (int j = 0; j < 64; ++j) {
- const int diff = output_c[j] -input[j];
+ const int diff = dst[j] - src[j];
const int error = diff * diff;
EXPECT_GE(1, error)
<< "Error: 8x8 FDCT/IDCT has error " << error
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index b166fcb..2ff7696 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -219,27 +219,27 @@
output[7] = step1[0] - step1[7];
}
-void vp9_short_idct8x8_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
int16_t out[8 * 8];
int16_t *outptr = out;
- const int half_pitch = pitch >> 1;
int i, j;
int16_t temp_in[8], temp_out[8];
- // Rows
+ // First transform rows
for (i = 0; i < 8; ++i) {
idct8_1d(input, outptr);
input += 8;
outptr += 8;
}
- // Columns
+ // Then transform columns
for (i = 0; i < 8; ++i) {
for (j = 0; j < 8; ++j)
temp_in[j] = out[j * 8 + i];
idct8_1d(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
+ dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * dest_stride + i]);
}
}
@@ -400,8 +400,8 @@
{ iadst8_1d, iadst8_1d } // ADST_ADST = 3
};
-void vp9_short_iht8x8_c(int16_t *input, int16_t *output,
- int pitch, int tx_type) {
+void vp9_short_iht8x8_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+ int tx_type) {
int i, j;
int16_t out[8 * 8];
int16_t *outptr = out;
@@ -421,14 +421,14 @@
temp_in[j] = out[j * 8 + i];
ht.cols(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
- }
+ dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * dest_stride + i]); }
}
-void vp9_short_idct10_8x8_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct10_8x8_add_c(int16_t *input, uint8_t *dest,
+ int dest_stride) {
int16_t out[8 * 8];
int16_t *outptr = out;
- const int half_pitch = pitch >> 1;
int i, j;
int16_t temp_in[8], temp_out[8];
@@ -447,7 +447,8 @@
temp_in[j] = out[j * 8 + i];
idct8_1d(temp_in, temp_out);
for (j = 0; j < 8; ++j)
- output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 5);
+ dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 5)
+ + dest[j * dest_stride + i]);
}
}
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index ea60fbb..5ecb0af 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -88,9 +88,6 @@
prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride"
specialize vp9_add_residual_4x4 sse2
-prototype void vp9_add_residual_8x8 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_8x8 sse2
-
prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
specialize vp9_add_constant_residual_8x8 sse2
@@ -188,11 +185,11 @@
prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"
specialize vp9_short_idct4x4 sse2
-prototype void vp9_short_idct8x8 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct8x8 sse2
+prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct8x8_add sse2
-prototype void vp9_short_idct10_8x8 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_8x8 sse2
+prototype void vp9_short_idct10_8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_8x8_add sse2
prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
specialize vp9_short_idct1_8x8
@@ -215,8 +212,8 @@
prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
specialize vp9_short_idct10_32x32_add
-prototype void vp9_short_iht8x8 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht8x8
+prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_short_iht8x8_add
prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
specialize vp9_short_iht4x4
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 667da33..ab8604c 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -403,8 +403,18 @@
in6 = _mm_subs_epi16(stp1_1, stp1_6); \
in7 = _mm_subs_epi16(stp1_0, stp2_7);
-void vp9_short_idct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
- const int half_pitch = pitch >> 1;
+#define RECON_AND_STORE(dest, in_x) \
+ { \
+ __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+ d0 = _mm_unpacklo_epi8(d0, zero); \
+ in_x = _mm_add_epi16(in_x, d0); \
+ in_x = _mm_packus_epi16(in_x, in_x); \
+ _mm_storel_epi64((__m128i *)(dest), in_x); \
+ dest += stride; \
+ }
+
+void vp9_short_idct8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
+ const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<4);
const __m128i stg1_0 = pair_set_epi16(cospi_28_64, -cospi_4_64);
@@ -461,19 +471,17 @@
in6 = _mm_srai_epi16(in6, 5);
in7 = _mm_srai_epi16(in7, 5);
- // Store results
- _mm_store_si128((__m128i *)output, in0);
- _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
- _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
- _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
- _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
- _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
- _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
- _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
+ RECON_AND_STORE(dest, in0);
+ RECON_AND_STORE(dest, in1);
+ RECON_AND_STORE(dest, in2);
+ RECON_AND_STORE(dest, in3);
+ RECON_AND_STORE(dest, in4);
+ RECON_AND_STORE(dest, in5);
+ RECON_AND_STORE(dest, in6);
+ RECON_AND_STORE(dest, in7);
}
-void vp9_short_idct10_8x8_sse2(int16_t *input, int16_t *output, int pitch) {
- const int half_pitch = pitch >> 1;
+void vp9_short_idct10_8x8_add_sse2(int16_t *input, uint8_t *dest, int stride) {
const __m128i zero = _mm_setzero_si128();
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<4);
@@ -612,15 +620,14 @@
in6 = _mm_srai_epi16(in6, 5);
in7 = _mm_srai_epi16(in7, 5);
- // Store results
- _mm_store_si128((__m128i *)output, in0);
- _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
- _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
- _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
- _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
- _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
- _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
- _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
+ RECON_AND_STORE(dest, in0);
+ RECON_AND_STORE(dest, in1);
+ RECON_AND_STORE(dest, in2);
+ RECON_AND_STORE(dest, in3);
+ RECON_AND_STORE(dest, in4);
+ RECON_AND_STORE(dest, in5);
+ RECON_AND_STORE(dest, in6);
+ RECON_AND_STORE(dest, in7);
}
#define IDCT16x16_1D \
@@ -752,16 +759,6 @@
stp2_10, stp2_13, stp2_11, stp2_12) \
}
-#define RECON_AND_STORE(dest, in_x) \
- { \
- __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
- d0 = _mm_unpacklo_epi8(d0, zero); \
- in_x = _mm_add_epi16(in_x, d0); \
- in_x = _mm_packus_epi16(in_x, in_x); \
- _mm_storel_epi64((__m128i *)(dest), in_x); \
- dest += stride; \
- }
-
void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
const __m128i final_rounding = _mm_set1_epi16(1<<5);
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index bc943fa..10b585b 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -101,10 +101,6 @@
add_residual(diff, dest, stride, 4, 4);
}
-void vp9_add_residual_8x8_c(const int16_t *diff, uint8_t *dest, int stride) {
- add_residual(diff, dest, stride, 8, 8);
-}
-
static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
int width, int height) {
int r, c;
@@ -151,11 +147,8 @@
vp9_idct_add_8x8(input, dest, stride, eob);
} else {
if (eob > 0) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
-
- vp9_short_iht8x8(input, output, 8, tx_type);
+ vp9_short_iht8x8_add(input, dest, stride, tx_type);
vpx_memset(input, 0, 128);
- vp9_add_residual_8x8(output, dest, stride);
}
}
}
@@ -210,8 +203,6 @@
}
void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
- DECLARE_ALIGNED_ARRAY(16, int16_t, output, 64);
-
// If dc is 1, then input[0] is the reconstructed value, do not need
// dequantization. Also, when dc is 1, dc is counted in eobs, namely eobs >=1.
@@ -233,20 +224,15 @@
vp9_add_constant_residual_8x8(out, dest, stride);
#if !CONFIG_SCATTERSCAN
} else if (eob <= 10) {
- vp9_short_idct10_8x8(input, output, 16);
-
+ vp9_short_idct10_8x8_add(input, dest, stride);
input[0] = input[1] = input[2] = input[3] = 0;
input[8] = input[9] = input[10] = 0;
input[16] = input[17] = 0;
input[24] = 0;
-
- vp9_add_residual_8x8(output, dest, stride);
#endif
} else {
- // the idct halves ( >> 1) the pitch
- vp9_short_idct8x8(input, output, 8 << 1);
+ vp9_short_idct8x8_add(input, dest, stride);
vpx_memset(input, 0, 128);
- vp9_add_residual_8x8(output, dest, stride);
}
}
}
diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c
index 796fc12..72036c2 100644
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -58,70 +58,6 @@
*(int *)dest = _mm_cvtsi128_si32(p2);
}
-void vp9_add_residual_8x8_sse2(const int16_t *diff, uint8_t *dest, int stride) {
- const int width = 8;
- const __m128i zero = _mm_setzero_si128();
-
- // Diff data
- const __m128i d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
- const __m128i d1 = _mm_load_si128((const __m128i *)(diff + 1 * width));
- const __m128i d2 = _mm_load_si128((const __m128i *)(diff + 2 * width));
- const __m128i d3 = _mm_load_si128((const __m128i *)(diff + 3 * width));
- const __m128i d4 = _mm_load_si128((const __m128i *)(diff + 4 * width));
- const __m128i d5 = _mm_load_si128((const __m128i *)(diff + 5 * width));
- const __m128i d6 = _mm_load_si128((const __m128i *)(diff + 6 * width));
- const __m128i d7 = _mm_load_si128((const __m128i *)(diff + 7 * width));
-
- // Prediction data.
- __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
- __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
- __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
- __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
- __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride));
- __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride));
- __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride));
- __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride));
-
- p0 = _mm_unpacklo_epi8(p0, zero);
- p1 = _mm_unpacklo_epi8(p1, zero);
- p2 = _mm_unpacklo_epi8(p2, zero);
- p3 = _mm_unpacklo_epi8(p3, zero);
- p4 = _mm_unpacklo_epi8(p4, zero);
- p5 = _mm_unpacklo_epi8(p5, zero);
- p6 = _mm_unpacklo_epi8(p6, zero);
- p7 = _mm_unpacklo_epi8(p7, zero);
-
- p0 = _mm_add_epi16(p0, d0);
- p1 = _mm_add_epi16(p1, d1);
- p2 = _mm_add_epi16(p2, d2);
- p3 = _mm_add_epi16(p3, d3);
- p4 = _mm_add_epi16(p4, d4);
- p5 = _mm_add_epi16(p5, d5);
- p6 = _mm_add_epi16(p6, d6);
- p7 = _mm_add_epi16(p7, d7);
-
- p0 = _mm_packus_epi16(p0, p1);
- p2 = _mm_packus_epi16(p2, p3);
- p4 = _mm_packus_epi16(p4, p5);
- p6 = _mm_packus_epi16(p6, p7);
-
- _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
- p0 = _mm_srli_si128(p0, 8);
- _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);
-
- _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
- p2 = _mm_srli_si128(p2, 8);
- _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);
-
- _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
- p4 = _mm_srli_si128(p4, 8);
- _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);
-
- _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
- p6 = _mm_srli_si128(p6, 8);
- _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
-}
-
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
int stride) {
uint8_t abs_diff;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 221de74..bbc97da 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -534,11 +534,12 @@
case TX_8X8:
tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
if (tx_type == DCT_DCT) {
- vp9_short_idct8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- diff, bw * 2);
+ vp9_short_idct8x8_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+ block, 16), dst, xd->plane[plane].dst.stride);
} else {
- vp9_short_iht8x8(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
- diff, bw, tx_type);
+ vp9_short_iht8x8_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+ block, 16), dst, xd->plane[plane].dst.stride,
+ tx_type);
}
*wip_txfrm_size = 8;
break;
@@ -589,7 +590,7 @@
foreach_transformed_block_in_plane(xd, bsize, 0,
encode_block, &arg);
- if (wip_txfrm_size < 32)
+ if (wip_txfrm_size < 8)
vp9_recon_sby(xd, bsize);
}
@@ -606,7 +607,7 @@
foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
- if (wip_txfrm_size < 16)
+ if (wip_txfrm_size < 8)
vp9_recon_sbuv(xd, bsize);
}
@@ -628,13 +629,13 @@
// wip version... will use foreach_transformed_block when done
foreach_transformed_block_in_plane(xd, bsize, 0,
encode_block, &arg);
- if (wip_txfrm_size < 16)
+ if (wip_txfrm_size < 8)
vp9_recon_sby(xd, bsize);
wip_txfrm_size = 0;
foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
- if (wip_txfrm_size < 16)
+ if (wip_txfrm_size < 8)
vp9_recon_sbuv(xd, bsize);
#endif
}