Remove dead code: non-scaled DCT

These transforms are remnants of VP9 and are no longer used by
aomenc/aomdec. This drops the non-scaled forward DCTs (aom_fdct16x16,
aom_fdct32x32, aom_fdct32x32_rd, and their high-bitdepth wrappers), the
matching inverse-transform add functions (the aom_idct8x8, aom_idct16x16,
and aom_idct32x32 variants), their SSE2/SSSE3/AVX2 specializations, and
the tests that covered them.

Change-Id: Id44129d6ef44325d8867c38b8b643c5b6abc1693
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 14f1a63..a178a76 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -448,15 +448,6 @@
add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64";
- add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct16x16 sse2/;
-
- add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct32x32 sse2 avx2/;
-
- add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_fdct32x32_rd sse2 avx2/;
-
# High bit depth
add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/aom_highbd_fdct4x4 sse2/;
@@ -464,15 +455,6 @@
add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
specialize qw/aom_highbd_fdct8x8 sse2/;
- add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct16x16 sse2/;
-
- add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct32x32 sse2/;
-
- add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
- specialize qw/aom_highbd_fdct32x32_rd sse2/;
-
} # CONFIG_AV1_ENCODER
#
@@ -492,41 +474,6 @@
add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
specialize qw/aom_idct4x4_1_add sse2/;
-
- add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct8x8_64_add sse2 ssse3/;
-
- add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct8x8_12_add sse2 ssse3/;
-
- add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct8x8_1_add sse2/;
-
- add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct16x16_256_add sse2 avx2/;
-
- add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct16x16_38_add avx2/;
-
- add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct16x16_10_add sse2 avx2/;
-
- add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct16x16_1_add sse2 avx2/;
-
- add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct32x32_1024_add sse2 ssse3 avx2/;
-
- add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct32x32_135_add sse2 ssse3 avx2/;
- # Need to add 135 eob idct32x32 implementations.
- $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2;
-
- add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct32x32_34_add sse2 ssse3 avx2/;
-
- add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
- specialize qw/aom_idct32x32_1_add sse2 avx2/;
} # CONFIG_AV1
#
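
Note: the add_proto/specialize entries above drive libaom's run-time CPU
dispatch. add_proto declares a function's prototype, specialize lists its
SIMD flavours, and the generated aom_dsp_rtcd.h binds each symbol to the
best available version at startup. A simplified C stand-in for that
mechanism; the has_sse2() probe and the stub kernels here are hypothetical,
and tran_low_t is reduced to int16_t for brevity:

    #include <stdint.h>
    #include <stdio.h>

    typedef int16_t tran_low_t; /* stand-in for the library typedef */

    /* Dummy C and "SIMD" bodies standing in for the real kernels. */
    static void fdct_stub_c(const int16_t *in, tran_low_t *out, int stride) {
      (void)stride;
      out[0] = in[0]; /* placeholder work */
    }
    static void fdct_stub_sse2(const int16_t *in, tran_low_t *out, int stride) {
      fdct_stub_c(in, out, stride); /* would be intrinsics in real code */
    }

    /* With run-time dispatch enabled, the generated header exposes each
     * entry point as a function pointer... */
    static void (*fdct8x8)(const int16_t *, tran_low_t *, int);

    static int has_sse2(void) { return 1; } /* hypothetical CPUID probe */

    /* ...bound once at startup to the best specialization the CPU supports,
     * mirroring what `specialize qw/aom_fdct8x8 sse2/` requests. */
    static void setup_rtcd(void) {
      fdct8x8 = fdct_stub_c;
      if (has_sse2()) fdct8x8 = fdct_stub_sse2;
    }

    int main(void) {
      int16_t in[64] = { 7 };
      tran_low_t out[64];
      setup_rtcd();
      fdct8x8(in, out, 8);
      printf("out[0] = %d\n", out[0]);
      return 0;
    }
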
diff --git a/aom_dsp/fwd_txfm.c b/aom_dsp/fwd_txfm.c
index 935bd07..c1317d8 100644
--- a/aom_dsp/fwd_txfm.c
+++ b/aom_dsp/fwd_txfm.c
@@ -172,186 +172,6 @@
}
}
-void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
- // The 2D transform is done with two passes which are actually pretty
- // similar. In the first one, we transform the columns and transpose
- // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we transpose the columns (that
- // is the transposed rows) and transpose the results (so that it goes back
- // in normal/row positions).
- int pass;
- // We need an intermediate buffer between passes.
- tran_low_t intermediate[256];
- const tran_low_t *in_low = NULL;
- tran_low_t *out = intermediate;
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- tran_high_t step1[8]; // canbe16
- tran_high_t step2[8]; // canbe16
- tran_high_t step3[8]; // canbe16
- tran_high_t in_high[8]; // canbe16
- tran_high_t temp1, temp2; // needs32
- int i;
- for (i = 0; i < 16; i++) {
- if (0 == pass) {
- // Calculate input for the first 8 results.
- in_high[0] = (input[0 * stride] + input[15 * stride]) * 4;
- in_high[1] = (input[1 * stride] + input[14 * stride]) * 4;
- in_high[2] = (input[2 * stride] + input[13 * stride]) * 4;
- in_high[3] = (input[3 * stride] + input[12 * stride]) * 4;
- in_high[4] = (input[4 * stride] + input[11 * stride]) * 4;
- in_high[5] = (input[5 * stride] + input[10 * stride]) * 4;
- in_high[6] = (input[6 * stride] + input[9 * stride]) * 4;
- in_high[7] = (input[7 * stride] + input[8 * stride]) * 4;
- // Calculate input for the next 8 results.
- step1[0] = (input[7 * stride] - input[8 * stride]) * 4;
- step1[1] = (input[6 * stride] - input[9 * stride]) * 4;
- step1[2] = (input[5 * stride] - input[10 * stride]) * 4;
- step1[3] = (input[4 * stride] - input[11 * stride]) * 4;
- step1[4] = (input[3 * stride] - input[12 * stride]) * 4;
- step1[5] = (input[2 * stride] - input[13 * stride]) * 4;
- step1[6] = (input[1 * stride] - input[14 * stride]) * 4;
- step1[7] = (input[0 * stride] - input[15 * stride]) * 4;
- } else {
- // Calculate input for the first 8 results.
- assert(in_low != NULL);
- in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2);
- in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2);
- in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2);
- in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2);
- in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2);
- in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2);
- in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2);
- in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2);
- // Calculate input for the next 8 results.
- step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2);
- step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2);
- step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2);
- step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2);
- step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2);
- step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2);
- step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2);
- step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2);
- in_low++;
- }
- // Work on the first eight values; fdct8(input, even_results);
- {
- tran_high_t s0, s1, s2, s3, s4, s5, s6, s7; // canbe16
- tran_high_t t0, t1, t2, t3; // needs32
- tran_high_t x0, x1, x2, x3; // canbe16
-
- // stage 1
- s0 = in_high[0] + in_high[7];
- s1 = in_high[1] + in_high[6];
- s2 = in_high[2] + in_high[5];
- s3 = in_high[3] + in_high[4];
- s4 = in_high[3] - in_high[4];
- s5 = in_high[2] - in_high[5];
- s6 = in_high[1] - in_high[6];
- s7 = in_high[0] - in_high[7];
-
- // fdct4(step, step);
- x0 = s0 + s3;
- x1 = s1 + s2;
- x2 = s1 - s2;
- x3 = s0 - s3;
- t0 = (x0 + x1) * cospi_16_64;
- t1 = (x0 - x1) * cospi_16_64;
- t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
- t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
- out[0] = (tran_low_t)fdct_round_shift(t0);
- out[4] = (tran_low_t)fdct_round_shift(t2);
- out[8] = (tran_low_t)fdct_round_shift(t1);
- out[12] = (tran_low_t)fdct_round_shift(t3);
-
- // Stage 2
- t0 = (s6 - s5) * cospi_16_64;
- t1 = (s6 + s5) * cospi_16_64;
- t2 = fdct_round_shift(t0);
- t3 = fdct_round_shift(t1);
-
- // Stage 3
- x0 = s4 + t2;
- x1 = s4 - t2;
- x2 = s7 - t3;
- x3 = s7 + t3;
-
- // Stage 4
- t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
- t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
- t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
- t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
- out[2] = (tran_low_t)fdct_round_shift(t0);
- out[6] = (tran_low_t)fdct_round_shift(t2);
- out[10] = (tran_low_t)fdct_round_shift(t1);
- out[14] = (tran_low_t)fdct_round_shift(t3);
- }
- // Work on the next eight values; step1 -> odd_results
- {
- // step 2
- temp1 = (step1[5] - step1[2]) * cospi_16_64;
- temp2 = (step1[4] - step1[3]) * cospi_16_64;
- step2[2] = fdct_round_shift(temp1);
- step2[3] = fdct_round_shift(temp2);
- temp1 = (step1[4] + step1[3]) * cospi_16_64;
- temp2 = (step1[5] + step1[2]) * cospi_16_64;
- step2[4] = fdct_round_shift(temp1);
- step2[5] = fdct_round_shift(temp2);
- // step 3
- step3[0] = step1[0] + step2[3];
- step3[1] = step1[1] + step2[2];
- step3[2] = step1[1] - step2[2];
- step3[3] = step1[0] - step2[3];
- step3[4] = step1[7] - step2[4];
- step3[5] = step1[6] - step2[5];
- step3[6] = step1[6] + step2[5];
- step3[7] = step1[7] + step2[4];
- // step 4
- temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
- temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
- step2[1] = fdct_round_shift(temp1);
- step2[2] = fdct_round_shift(temp2);
- temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
- temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
- step2[5] = fdct_round_shift(temp1);
- step2[6] = fdct_round_shift(temp2);
- // step 5
- step1[0] = step3[0] + step2[1];
- step1[1] = step3[0] - step2[1];
- step1[2] = step3[3] + step2[2];
- step1[3] = step3[3] - step2[2];
- step1[4] = step3[4] - step2[5];
- step1[5] = step3[4] + step2[5];
- step1[6] = step3[7] - step2[6];
- step1[7] = step3[7] + step2[6];
- // step 6
- temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
- temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
- out[1] = (tran_low_t)fdct_round_shift(temp1);
- out[9] = (tran_low_t)fdct_round_shift(temp2);
- temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
- temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
- out[5] = (tran_low_t)fdct_round_shift(temp1);
- out[13] = (tran_low_t)fdct_round_shift(temp2);
- temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
- temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
- out[3] = (tran_low_t)fdct_round_shift(temp1);
- out[11] = (tran_low_t)fdct_round_shift(temp2);
- temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
- temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
- out[7] = (tran_low_t)fdct_round_shift(temp1);
- out[15] = (tran_low_t)fdct_round_shift(temp2);
- }
- // Do next column (which is a transposed row in second/horizontal pass)
- input++;
- out += 16;
- }
- // Setup in/out for next pass.
- in_low = intermediate;
- out = output;
- }
-}
-
static INLINE tran_high_t dct_32_round(tran_high_t input) {
tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
// TODO(debargha, peter.derivaz): Find new bounds for this assert,
@@ -712,34 +532,6 @@
}
}
-// Note that although we use dct_32_round in dct32 computation flow,
-// this 2d fdct32x32 for rate-distortion optimization loop is operating
-// within 16 bits precision.
-void aom_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
- int i, j;
- tran_high_t output[32 * 32];
-
- // Columns
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
- aom_fdct32(temp_in, temp_out, 0);
- for (j = 0; j < 32; ++j)
- // TODO(cd): see quality impact of only doing
- // output[j * 32 + i] = (temp_out[j] + 1) >> 2;
- // PS: also change code in aom_dsp/x86/aom_dct_sse2.c
- output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
- }
-
- // Rows
- for (i = 0; i < 32; ++i) {
- tran_high_t temp_in[32], temp_out[32];
- for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
- aom_fdct32(temp_in, temp_out, 1);
- for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
- }
-}
-
void aom_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
int stride) {
aom_fdct4x4_c(input, output, stride);
@@ -750,15 +542,6 @@
aom_fdct8x8_c(input, final_output, stride);
}
-void aom_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
- int stride) {
- aom_fdct16x16_c(input, output, stride);
-}
-
void aom_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
aom_fdct32x32_c(input, out, stride);
}
-void aom_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
- int stride) {
- aom_fdct32x32_rd_c(input, out, stride);
-}
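
Note: the comment block in the removed aom_fdct16x16_c() describes the
standard separable scheme, transform the columns, store the results
transposed, then run the identical pass again so the data lands back in row
order. A minimal floating-point sketch of that structure, using an
unnormalized 4-point DCT-II for clarity (the real code works in fixed point
with the cospi_*_64 constants and per-pass rounding such as (x + 1) >> 2):

    #include <math.h>
    #include <stdio.h>

    #define N 4 /* small size for illustration; the removed code used 16 */

    /* Unnormalized 1D DCT-II. */
    static void dct1d(const double *in, double *out) {
      const double kPi = 3.14159265358979323846;
      for (int k = 0; k < N; ++k) {
        double sum = 0.0;
        for (int n = 0; n < N; ++n)
          sum += in[n] * cos(kPi * (2 * n + 1) * k / (2.0 * N));
        out[k] = sum;
      }
    }

    static void dct2d(const double in[N][N], double out[N][N]) {
      double tmp[N][N], v[N], w[N];
      /* Pass 1: transform each column, store the results transposed. */
      for (int c = 0; c < N; ++c) {
        for (int r = 0; r < N; ++r) v[r] = in[r][c];
        dct1d(v, w);
        for (int k = 0; k < N; ++k) tmp[c][k] = w[k];
      }
      /* Pass 2: the transposed rows are now columns; transposing the
       * results again puts everything back in normal row order. */
      for (int c = 0; c < N; ++c) {
        for (int r = 0; r < N; ++r) v[r] = tmp[r][c];
        dct1d(v, w);
        for (int k = 0; k < N; ++k) out[c][k] = w[k];
      }
    }

    int main(void) {
      double in[N][N], out[N][N];
      for (int r = 0; r < N; ++r)
        for (int c = 0; c < N; ++c) in[r][c] = r + c;
      dct2d(in, out);
      printf("DC term: %f\n", out[0][0]); /* equals the sum of all inputs */
      return 0;
    }
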
diff --git a/aom_dsp/inv_txfm.c b/aom_dsp/inv_txfm.c
index c4efae6..546b1c1 100644
--- a/aom_dsp/inv_txfm.c
+++ b/aom_dsp/inv_txfm.c
@@ -210,43 +210,6 @@
output[7] = WRAPLOW(step1[0] - step1[7]);
}
-void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- tran_low_t out[8 * 8];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
-
- // First transform rows
- for (i = 0; i < 8; ++i) {
- aom_idct8_c(input, outptr);
- input += 8;
- outptr += 8;
- }
-
- // Then transform columns
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- aom_idct8_c(temp_in, temp_out);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 5));
- }
- }
-}
-
-void aom_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- int i, j;
- tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
- a1 = ROUND_POWER_OF_TWO(out, 5);
- if (a1 == 0) return;
- for (j = 0; j < 8; ++j) {
- for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
- dest += stride;
- }
-}
-
void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
@@ -361,31 +324,6 @@
output[7] = WRAPLOW(-x1);
}
-void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- tran_low_t out[8 * 8] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[8], temp_out[8];
-
- // First transform rows
- // only first 4 row has non-zero coefs
- for (i = 0; i < 4; ++i) {
- aom_idct8_c(input, outptr);
- input += 8;
- outptr += 8;
- }
-
- // Then transform columns
- for (i = 0; i < 8; ++i) {
- for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
- aom_idct8_c(temp_in, temp_out);
- for (j = 0; j < 8; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 5));
- }
- }
-}
-
void aom_idct16_c(const tran_low_t *input, tran_low_t *output) {
tran_low_t step1[16], step2[16];
tran_high_t temp1, temp2;
@@ -551,31 +489,6 @@
output[15] = WRAPLOW(step2[0] - step2[15]);
}
-void aom_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[16 * 16];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
-
- // First transform rows
- for (i = 0; i < 16; ++i) {
- aom_idct16_c(input, outptr);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- aom_idct16_c(temp_in, temp_out);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) {
tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
tran_high_t s9, s10, s11, s12, s13, s14, s15;
@@ -747,71 +660,6 @@
output[15] = WRAPLOW(-x1);
}
-void aom_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- int i, j;
- tran_low_t out[16 * 16] = { 0 };
- tran_low_t *outptr = out;
- tran_low_t temp_in[16], temp_out[16];
-
- // First transform rows. Since all non-zero dct coefficients are in
- // upper-left 8x8 area, we only need to calculate first 8 rows here.
- for (i = 0; i < 8; ++i) {
- aom_idct16_c(input, outptr);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- aom_idct16_c(temp_in, temp_out);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[16 * 16] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[16], temp_out[16];
-
- // First transform rows. Since all non-zero dct coefficients are in
- // upper-left 4x4 area, we only need to calculate first 4 rows here.
- for (i = 0; i < 4; ++i) {
- aom_idct16_c(input, outptr);
- input += 16;
- outptr += 16;
- }
-
- // Then transform columns
- for (i = 0; i < 16; ++i) {
- for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
- aom_idct16_c(temp_in, temp_out);
- for (j = 0; j < 16; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void aom_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- int i, j;
- tran_high_t a1;
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
- a1 = ROUND_POWER_OF_TWO(out, 6);
- if (a1 == 0) return;
- for (j = 0; j < 16; ++j) {
- for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
- dest += stride;
- }
-}
-
void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {
tran_low_t step1[32], step2[32];
tran_high_t temp1, temp2;
@@ -1178,111 +1026,6 @@
output[30] = WRAPLOW(step1[1] - step1[30]);
output[31] = WRAPLOW(step1[0] - step1[31]);
}
-
-void aom_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[32 * 32];
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
-
- // Rows
- for (i = 0; i < 32; ++i) {
- int16_t zero_coeff[16];
- for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
- for (j = 0; j < 8; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 4; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
- for (j = 0; j < 2; ++j)
- zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
- if (zero_coeff[0] | zero_coeff[1])
- aom_idct32_c(input, outptr);
- else
- memset(outptr, 0, sizeof(tran_low_t) * 32);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- aom_idct32_c(temp_in, temp_out);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void aom_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[32 * 32] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
-
- // Rows
- // only upper-left 16x16 has non-zero coeff
- for (i = 0; i < 16; ++i) {
- aom_idct32_c(input, outptr);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- aom_idct32_c(temp_in, temp_out);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void aom_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
- int stride) {
- tran_low_t out[32 * 32] = { 0 };
- tran_low_t *outptr = out;
- int i, j;
- tran_low_t temp_in[32], temp_out[32];
-
- // Rows
- // only upper-left 8x8 has non-zero coeff
- for (i = 0; i < 8; ++i) {
- aom_idct32_c(input, outptr);
- input += 32;
- outptr += 32;
- }
-
- // Columns
- for (i = 0; i < 32; ++i) {
- for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
- aom_idct32_c(temp_in, temp_out);
- for (j = 0; j < 32; ++j) {
- dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
- ROUND_POWER_OF_TWO(temp_out[j], 6));
- }
- }
-}
-
-void aom_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
- int i, j;
- tran_high_t a1;
-
- tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
- out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
- a1 = ROUND_POWER_OF_TWO(out, 6);
- if (a1 == 0) return;
-
- for (j = 0; j < 32; ++j) {
- for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
- dest += stride;
- }
-}
-
void aom_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
int stride, int bd) {
/* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
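
Note: the removed *_1_add functions all shared one DC-only (eob == 1) fast
path: scale the lone DC coefficient by cospi_16_64 once per 1D pass, round
down to the pixel domain, and add the resulting constant to every pixel. A
self-contained sketch of that pattern; the constant and the helper bodies
are restated from the library's conventions, and the idct_dc_add() name is
hypothetical:

    #include <stdint.h>
    #include <stdio.h>

    #define DCT_CONST_BITS 14
    static const int cospi_16_64 = 11585; /* round(2^14 * cos(16*pi/64)) */

    static int32_t dct_const_round_shift(int64_t x) {
      return (int32_t)((x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
    }
    static uint8_t clip_pixel(int v) {
      return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v);
    }

    /* DC-only inverse transform + add: `shift` is 5 for 8x8 and 6 for
     * 16x16/32x32, matching the removed aom_idct*_1_add_c functions. */
    static void idct_dc_add(int dc, uint8_t *dest, int stride, int n,
                            int shift) {
      int32_t out = dct_const_round_shift((int64_t)dc * cospi_16_64);
      out = dct_const_round_shift((int64_t)out * cospi_16_64);
      const int a1 = (out + (1 << (shift - 1))) >> shift; /* ROUND_POWER_OF_TWO */
      if (a1 == 0) return;
      for (int j = 0; j < n; ++j, dest += stride)
        for (int i = 0; i < n; ++i) dest[i] = clip_pixel(dest[i] + a1);
    }

    int main(void) {
      uint8_t block[8 * 8] = { 0 };
      idct_dc_add(1024, block, 8, 8, 5);
      /* dc scales by ~1/2 over the two passes, then >> 5: 1024/2/32 == 16 */
      printf("pixel delta: %d\n", block[0]);
      return 0;
    }
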
diff --git a/aom_dsp/x86/fwd_txfm_impl_sse2.h b/aom_dsp/x86/fwd_txfm_impl_sse2.h
index 20931db..c1c072d 100644
--- a/aom_dsp/x86/fwd_txfm_impl_sse2.h
+++ b/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -566,451 +566,5 @@
}
}
-void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
- // The 2D transform is done with two passes which are actually pretty
- // similar. In the first one, we transform the columns and transpose
- // the results. In the second one, we transform the rows. To achieve that,
- // as the first pass results are transposed, we transpose the columns (that
- // is the transposed rows) and transpose the results (so that it goes back
- // in normal/row positions).
- int pass;
- // We need an intermediate buffer between passes.
- DECLARE_ALIGNED(16, int16_t, intermediate[256]);
- const int16_t *in = input;
- int16_t *out0 = intermediate;
- tran_low_t *out1 = output;
- // Constants
- // When we use them, in one case, they are all the same. In all others
- // it's a pair of them that we need to repeat four times. This is done
- // by constructing the 32 bit constant corresponding to that pair.
- const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
- const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
- const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
- const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
- const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
- const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
- const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
- const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
- const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
- const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
- const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
- const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
- const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
- const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
- const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
- const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
- const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
- const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
- const __m128i kOne = _mm_set1_epi16(1);
- // Do the two transform/transpose passes
- for (pass = 0; pass < 2; ++pass) {
- // We process eight columns (transposed rows in second pass) at a time.
- int column_start;
-#if DCT_HIGH_BIT_DEPTH
- int overflow;
-#endif
- for (column_start = 0; column_start < 16; column_start += 8) {
- __m128i in00, in01, in02, in03, in04, in05, in06, in07;
- __m128i in08, in09, in10, in11, in12, in13, in14, in15;
- __m128i input0, input1, input2, input3, input4, input5, input6, input7;
- __m128i step1_0, step1_1, step1_2, step1_3;
- __m128i step1_4, step1_5, step1_6, step1_7;
- __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
- __m128i step3_0, step3_1, step3_2, step3_3;
- __m128i step3_4, step3_5, step3_6, step3_7;
- __m128i res00, res01, res02, res03, res04, res05, res06, res07;
- __m128i res08, res09, res10, res11, res12, res13, res14, res15;
- // Load and pre-condition input.
- if (0 == pass) {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
- // x = x << 2
- in00 = _mm_slli_epi16(in00, 2);
- in01 = _mm_slli_epi16(in01, 2);
- in02 = _mm_slli_epi16(in02, 2);
- in03 = _mm_slli_epi16(in03, 2);
- in04 = _mm_slli_epi16(in04, 2);
- in05 = _mm_slli_epi16(in05, 2);
- in06 = _mm_slli_epi16(in06, 2);
- in07 = _mm_slli_epi16(in07, 2);
- in08 = _mm_slli_epi16(in08, 2);
- in09 = _mm_slli_epi16(in09, 2);
- in10 = _mm_slli_epi16(in10, 2);
- in11 = _mm_slli_epi16(in11, 2);
- in12 = _mm_slli_epi16(in12, 2);
- in13 = _mm_slli_epi16(in13, 2);
- in14 = _mm_slli_epi16(in14, 2);
- in15 = _mm_slli_epi16(in15, 2);
- } else {
- in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
- in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
- in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
- in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
- in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
- in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
- in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
- in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
- in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
- in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
- in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
- in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
- in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
- in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
- in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
- in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
- // x = (x + 1) >> 2
- in00 = _mm_add_epi16(in00, kOne);
- in01 = _mm_add_epi16(in01, kOne);
- in02 = _mm_add_epi16(in02, kOne);
- in03 = _mm_add_epi16(in03, kOne);
- in04 = _mm_add_epi16(in04, kOne);
- in05 = _mm_add_epi16(in05, kOne);
- in06 = _mm_add_epi16(in06, kOne);
- in07 = _mm_add_epi16(in07, kOne);
- in08 = _mm_add_epi16(in08, kOne);
- in09 = _mm_add_epi16(in09, kOne);
- in10 = _mm_add_epi16(in10, kOne);
- in11 = _mm_add_epi16(in11, kOne);
- in12 = _mm_add_epi16(in12, kOne);
- in13 = _mm_add_epi16(in13, kOne);
- in14 = _mm_add_epi16(in14, kOne);
- in15 = _mm_add_epi16(in15, kOne);
- in00 = _mm_srai_epi16(in00, 2);
- in01 = _mm_srai_epi16(in01, 2);
- in02 = _mm_srai_epi16(in02, 2);
- in03 = _mm_srai_epi16(in03, 2);
- in04 = _mm_srai_epi16(in04, 2);
- in05 = _mm_srai_epi16(in05, 2);
- in06 = _mm_srai_epi16(in06, 2);
- in07 = _mm_srai_epi16(in07, 2);
- in08 = _mm_srai_epi16(in08, 2);
- in09 = _mm_srai_epi16(in09, 2);
- in10 = _mm_srai_epi16(in10, 2);
- in11 = _mm_srai_epi16(in11, 2);
- in12 = _mm_srai_epi16(in12, 2);
- in13 = _mm_srai_epi16(in13, 2);
- in14 = _mm_srai_epi16(in14, 2);
- in15 = _mm_srai_epi16(in15, 2);
- }
- in += 8;
- // Calculate input for the first 8 results.
- {
- input0 = ADD_EPI16(in00, in15);
- input1 = ADD_EPI16(in01, in14);
- input2 = ADD_EPI16(in02, in13);
- input3 = ADD_EPI16(in03, in12);
- input4 = ADD_EPI16(in04, in11);
- input5 = ADD_EPI16(in05, in10);
- input6 = ADD_EPI16(in06, in09);
- input7 = ADD_EPI16(in07, in08);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
- &input4, &input5, &input6, &input7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Calculate input for the next 8 results.
- {
- step1_0 = SUB_EPI16(in07, in08);
- step1_1 = SUB_EPI16(in06, in09);
- step1_2 = SUB_EPI16(in05, in10);
- step1_3 = SUB_EPI16(in04, in11);
- step1_4 = SUB_EPI16(in03, in12);
- step1_5 = SUB_EPI16(in02, in13);
- step1_6 = SUB_EPI16(in01, in14);
- step1_7 = SUB_EPI16(in00, in15);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
- &step1_4, &step1_5, &step1_6, &step1_7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // Work on the first eight values; fdct8(input, even_results);
- {
- // Add/subtract
- const __m128i q0 = ADD_EPI16(input0, input7);
- const __m128i q1 = ADD_EPI16(input1, input6);
- const __m128i q2 = ADD_EPI16(input2, input5);
- const __m128i q3 = ADD_EPI16(input3, input4);
- const __m128i q4 = SUB_EPI16(input3, input4);
- const __m128i q5 = SUB_EPI16(input2, input5);
- const __m128i q6 = SUB_EPI16(input1, input6);
- const __m128i q7 = SUB_EPI16(input0, input7);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- // Work on first four results
- {
- // Add/subtract
- const __m128i r0 = ADD_EPI16(q0, q3);
- const __m128i r1 = ADD_EPI16(q1, q2);
- const __m128i r2 = SUB_EPI16(q1, q2);
- const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
-
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- {
- const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
- const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
- const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
- const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
- res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- // Work on next four results
- {
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
- const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
- const __m128i r0 =
- mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- const __m128i r1 =
- mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x2(&r0, &r1);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- {
- // Add/subtract
- const __m128i x0 = ADD_EPI16(q4, r0);
- const __m128i x1 = SUB_EPI16(q4, r0);
- const __m128i x2 = SUB_EPI16(q7, r1);
- const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
-
- // Interleave to do the multiply by constants which gets us
- // into 32 bits.
- {
- const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
- const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
- const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
- const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
- res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&res02, &res14, &res10, &res06);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- }
- }
- // Work on the next eight values; step1 -> odd_results
- {
- // step 2
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
- const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
- const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
- const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
- step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // step 3
- {
- step3_0 = ADD_EPI16(step1_0, step2_3);
- step3_1 = ADD_EPI16(step1_1, step2_2);
- step3_2 = SUB_EPI16(step1_1, step2_2);
- step3_3 = SUB_EPI16(step1_0, step2_3);
- step3_4 = SUB_EPI16(step1_7, step2_4);
- step3_5 = SUB_EPI16(step1_6, step2_5);
- step3_6 = ADD_EPI16(step1_6, step2_5);
- step3_7 = ADD_EPI16(step1_7, step2_4);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3,
- &step3_4, &step3_5, &step3_6, &step3_7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // step 4
- {
- const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
- const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
- const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
- const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
- step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // step 5
- {
- step1_0 = ADD_EPI16(step3_0, step2_1);
- step1_1 = SUB_EPI16(step3_0, step2_1);
- step1_2 = ADD_EPI16(step3_3, step2_2);
- step1_3 = SUB_EPI16(step3_3, step2_2);
- step1_4 = SUB_EPI16(step3_4, step2_5);
- step1_5 = ADD_EPI16(step3_4, step2_5);
- step1_6 = SUB_EPI16(step3_7, step2_6);
- step1_7 = ADD_EPI16(step3_7, step2_6);
-#if DCT_HIGH_BIT_DEPTH
- overflow =
- check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
- &step1_4, &step1_5, &step1_6, &step1_7);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- // step 6
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
- const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
- const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
- const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
- res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- {
- const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
- const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
- const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
- const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
- res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
- res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
- &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
- overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
- if (overflow) {
- aom_highbd_fdct16x16_c(input, output, stride);
- return;
- }
-#endif // DCT_HIGH_BIT_DEPTH
- }
- }
- // Transpose the results, do it as two 8x8 transposes.
- transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05,
- &res06, &res07, pass, out0, out1);
- transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13,
- &res14, &res15, pass, out0 + 8, out1 + 8);
- if (pass == 0) {
- out0 += 8 * 16;
- } else {
- out1 += 8 * 16;
- }
- }
- // Setup in/out for next pass.
- in = intermediate;
- }
-}
-
#undef ADD_EPI16
#undef SUB_EPI16
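
Note: the mult_round_shift() calls removed above all follow one SSE2 idiom:
interleave two 16-bit vectors, multiply-accumulate against a packed
(cos_a, cos_b) pair with _mm_madd_epi16 to get a*cos_a + b*cos_b in 32-bit
lanes, add the DCT rounding constant, shift right, and pack back to 16
bits. A standalone sketch of the idiom; the helpers are re-derived here for
illustration, not copied from the library source:

    #include <emmintrin.h> /* SSE2 */
    #include <stdint.h>
    #include <stdio.h>

    #define DCT_CONST_BITS 14
    #define DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1))

    /* Repeat the 16-bit pair (a, b) across all four 32-bit lanes. */
    static __m128i pair_set_epi16(int a, int b) {
      return _mm_set1_epi32(
          (int)(((uint32_t)(uint16_t)b << 16) | (uint16_t)a));
    }

    /* out[i] = round((x[i]*a + y[i]*b) / 2^DCT_CONST_BITS) for 8 lanes:
     * the core of the removed mult_round_shift() helper. */
    static __m128i mult_round_shift(__m128i x, __m128i y, __m128i k_pair) {
      const __m128i lo = _mm_madd_epi16(_mm_unpacklo_epi16(x, y), k_pair);
      const __m128i hi = _mm_madd_epi16(_mm_unpackhi_epi16(x, y), k_pair);
      const __m128i k_round = _mm_set1_epi32(DCT_CONST_ROUNDING);
      const __m128i r_lo =
          _mm_srai_epi32(_mm_add_epi32(lo, k_round), DCT_CONST_BITS);
      const __m128i r_hi =
          _mm_srai_epi32(_mm_add_epi32(hi, k_round), DCT_CONST_BITS);
      return _mm_packs_epi32(r_lo, r_hi); /* saturating pack to int16 */
    }

    int main(void) {
      const __m128i k = pair_set_epi16(11585, 11585); /* cospi_16_64 pair */
      const __m128i x = _mm_set1_epi16(100), y = _mm_set1_epi16(28);
      int16_t out[8];
      _mm_storeu_si128((__m128i *)out, mult_round_shift(x, y, k));
      printf("%d\n", out[0]); /* round(128 * 11585 / 16384) == 91 */
      return 0;
    }
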
diff --git a/aom_dsp/x86/fwd_txfm_sse2.c b/aom_dsp/x86/fwd_txfm_sse2.c
index 463d71f..edee0fb 100644
--- a/aom_dsp/x86/fwd_txfm_sse2.c
+++ b/aom_dsp/x86/fwd_txfm_sse2.c
@@ -88,43 +88,14 @@
#define DCT_HIGH_BIT_DEPTH 0
#define FDCT4x4_2D aom_fdct4x4_sse2
#define FDCT8x8_2D aom_fdct8x8_sse2
-#define FDCT16x16_2D aom_fdct16x16_sse2
#include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
#undef FDCT4x4_2D
#undef FDCT8x8_2D
-#undef FDCT16x16_2D
-#define FDCT32x32_2D aom_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D aom_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
#undef DCT_HIGH_BIT_DEPTH
-
#define DCT_HIGH_BIT_DEPTH 1
#define FDCT4x4_2D aom_highbd_fdct4x4_sse2
#define FDCT8x8_2D aom_highbd_fdct8x8_sse2
-#define FDCT16x16_2D aom_highbd_fdct16x16_sse2
#include "aom_dsp/x86/fwd_txfm_impl_sse2.h" // NOLINT
#undef FDCT4x4_2D
#undef FDCT8x8_2D
-#undef FDCT16x16_2D
-
-#define FDCT32x32_2D aom_highbd_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D aom_highbd_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h" // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-#undef DCT_HIGH_BIT_DEPTH
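
Note: fwd_txfm_sse2.c instantiates one implementation header several times
by redefining the function-name and bit-depth macros before each #include;
that is how both the low and high bit-depth SSE2 kernels above were
produced from fwd_txfm_impl_sse2.h. A toy version of the pattern, with
hypothetical file and macro names:

    /* scale_impl.h -- hypothetical template body, included multiple times */
    void SCALE_NAME(int16_t *v, int n) {
      for (int i = 0; i < n; ++i)
        v[i] = SCALE_UP ? (int16_t)(v[i] << 2) : (int16_t)((v[i] + 2) >> 2);
    }

    /* scale.c -- instantiate the body twice under different names,
     * mirroring how FDCT4x4_2D/FDCT8x8_2D and DCT_HIGH_BIT_DEPTH are used */
    #include <stdint.h>

    #define SCALE_NAME scale_up
    #define SCALE_UP 1
    #include "scale_impl.h"
    #undef SCALE_NAME
    #undef SCALE_UP

    #define SCALE_NAME scale_down
    #define SCALE_UP 0
    #include "scale_impl.h"
    #undef SCALE_NAME
    #undef SCALE_UP
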
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 572feb8..9c1a70f 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -237,16 +237,6 @@
typedef std::tr1::tuple<IdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t>
Idct16x16Param;
-void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
- TxfmParam * /*txfm_param*/) {
- aom_fdct16x16_c(in, out, stride);
-}
-
-void idct16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
- const TxfmParam * /*txfm_param*/) {
- aom_idct16x16_256_add_c(in, dest, stride);
-}
-
void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride,
TxfmParam *txfm_param) {
av1_fht16x16_c(in, out, stride, txfm_param);
@@ -542,50 +532,6 @@
TxfmParam txfm_param_;
};
-class Trans16x16DCT : public Trans16x16TestBase,
- public ::testing::TestWithParam<Dct16x16Param> {
- public:
- virtual ~Trans16x16DCT() {}
-
- virtual void SetUp() {
- fwd_txfm_ = GET_PARAM(0);
- inv_txfm_ = GET_PARAM(1);
- bit_depth_ = GET_PARAM(3);
- pitch_ = 16;
- fwd_txfm_ref = fdct16x16_ref;
- inv_txfm_ref = idct16x16_ref;
- mask_ = (1 << bit_depth_) - 1;
- inv_txfm_ref = idct16x16_ref;
- txfm_param_.tx_type = GET_PARAM(2);
- }
- virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
- void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
- fwd_txfm_(in, out, stride);
- }
- void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
- inv_txfm_(out, dst, stride);
- }
-
- FdctFunc fwd_txfm_;
- IdctFunc inv_txfm_;
-};
-
-TEST_P(Trans16x16DCT, AccuracyCheck) { RunAccuracyCheck(); }
-
-TEST_P(Trans16x16DCT, CoeffCheck) { RunCoeffCheck(); }
-
-TEST_P(Trans16x16DCT, MemCheck) { RunMemCheck(); }
-
-TEST_P(Trans16x16DCT, QuantCheck) {
- // Use maximally allowed quantization step sizes for DC and AC
- // coefficients respectively.
- RunQuantCheck(1336, 1828);
-}
-
-TEST_P(Trans16x16DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); }
-
class Trans16x16HT : public Trans16x16TestBase,
public ::testing::TestWithParam<Ht16x16Param> {
public:
@@ -723,10 +669,6 @@
using std::tr1::make_tuple;
-INSTANTIATE_TEST_CASE_P(C, Trans16x16DCT,
- ::testing::Values(make_tuple(&aom_fdct16x16_c,
- &aom_idct16x16_256_add_c,
- DCT_DCT, AOM_BITS_8)));
INSTANTIATE_TEST_CASE_P(
C, Trans16x16HT,
::testing::Values(
@@ -748,10 +690,6 @@
AOM_BITS_8)));
#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, Trans16x16DCT,
- ::testing::Values(make_tuple(&aom_fdct16x16_sse2,
- &aom_idct16x16_256_add_c,
- DCT_DCT, AOM_BITS_8)));
INSTANTIATE_TEST_CASE_P(
SSE2, Trans16x16HT,
::testing::Values(make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c,
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 290d069..d89c09c 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -146,76 +146,6 @@
<< "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";
}
-TEST_P(Trans32x32Test, CoeffCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- const int count_test_block = 1000;
-
- DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
- for (int i = 0; i < count_test_block; ++i) {
- for (int j = 0; j < kNumCoeffs; ++j)
- input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
-
- const int stride = 32;
- aom_fdct32x32_c(input_block, output_ref_block, stride);
- ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, stride));
-
- if (version_ == 0) {
- for (int j = 0; j < kNumCoeffs; ++j)
- EXPECT_EQ(output_block[j], output_ref_block[j])
- << "Error: 32x32 FDCT versions have mismatched coefficients";
- } else {
- for (int j = 0; j < kNumCoeffs; ++j)
- EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
- << "Error: 32x32 FDCT rd has mismatched coefficients";
- }
- }
-}
-
-TEST_P(Trans32x32Test, MemCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- const int count_test_block = 2000;
-
- DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
- DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
- for (int i = 0; i < count_test_block; ++i) {
- // Initialize a test block with input range [-mask_, mask_].
- for (int j = 0; j < kNumCoeffs; ++j) {
- input_extreme_block[j] = rnd.Rand8() & 1 ? mask_ : -mask_;
- }
- if (i == 0) {
- for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_;
- } else if (i == 1) {
- for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_;
- }
-
- const int stride = 32;
- aom_fdct32x32_c(input_extreme_block, output_ref_block, stride);
- ASM_REGISTER_STATE_CHECK(
- fwd_txfm_(input_extreme_block, output_block, stride));
-
- // The minimum quant value is 4.
- for (int j = 0; j < kNumCoeffs; ++j) {
- if (version_ == 0) {
- EXPECT_EQ(output_block[j], output_ref_block[j])
- << "Error: 32x32 FDCT versions have mismatched coefficients";
- } else {
- EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
- << "Error: 32x32 FDCT rd has mismatched coefficients";
- }
- EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_ref_block[j]))
- << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";
- EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
- << "Error: 32x32 FDCT has coefficient larger than "
- << "4*DCT_MAX_VALUE";
- }
- }
-}
-
TEST_P(Trans32x32Test, InverseAccuracy) {
ACMRandom rnd(ACMRandom::DeterministicSeed());
const int count_test_block = 1000;
@@ -313,33 +243,4 @@
EXPECT_EQ(sum >> 3, output[0]);
}
-using std::tr1::make_tuple;
-
-INSTANTIATE_TEST_CASE_P(
- C, Trans32x32Test,
- ::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c, 0,
- AOM_BITS_8),
- make_tuple(&aom_fdct32x32_rd_c, &aom_idct32x32_1024_add_c,
- 1, AOM_BITS_8)));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, Trans32x32Test,
- ::testing::Values(make_tuple(&aom_fdct32x32_sse2,
- &aom_idct32x32_1024_add_c,
- DCT_DCT, AOM_BITS_8),
- make_tuple(&aom_fdct32x32_rd_sse2,
- &aom_idct32x32_1024_add_c,
- ADST_DCT, AOM_BITS_8)));
-#endif // HAVE_SSE2
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
- AVX2, Trans32x32Test,
- ::testing::Values(make_tuple(&aom_fdct32x32_avx2,
- &aom_idct32x32_1024_add_sse2, DCT_DCT,
- AOM_BITS_8),
- make_tuple(&aom_fdct32x32_rd_avx2,
- &aom_idct32x32_1024_add_sse2, ADST_DCT,
- AOM_BITS_8)));
-#endif // HAVE_AVX2
} // namespace
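
Note: the removed CoeffCheck and MemCheck tests follow the usual
SIMD-validation pattern: feed identical (often extreme) inputs to the C
reference and the optimized kernel, then compare outputs elementwise,
bit-exactly for version_ == 0 and within a small tolerance (the tests used
6) for the reduced-precision _rd variant. A compact C sketch of that
harness shape; the kernel names are hypothetical:

    #include <stdio.h>
    #include <stdlib.h>

    /* Hypothetical reference and optimized kernels under test. */
    static void kernel_c(const short *in, int *out, int n) {
      for (int i = 0; i < n; ++i) out[i] = in[i] * 4;
    }
    static void kernel_opt(const short *in, int *out, int n) {
      kernel_c(in, out, n); /* stands in for an SSE2/AVX2 path */
    }

    /* Compare elementwise; `tol` is 0 for bit-exact kernels and a small
     * bound for a reduced-precision variant. Returns the first mismatching
     * index, or -1 on success. */
    static int check(int n, int tol) {
      short in[1024];
      int ref[1024], got[1024];
      for (int i = 0; i < n; ++i) in[i] = (short)((rand() & 511) - 256);
      kernel_c(in, ref, n);
      kernel_opt(in, got, n);
      for (int i = 0; i < n; ++i)
        if (abs(got[i] - ref[i]) > tol) return i;
      return -1;
    }

    int main(void) {
      srand(1);
      printf(check(1024, 0) < 0 ? "match\n" : "mismatch\n");
      return 0;
    }
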
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 955b617..8178a2d 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -567,11 +567,6 @@
using std::tr1::make_tuple;
-INSTANTIATE_TEST_CASE_P(C, FwdTrans8x8DCT,
- ::testing::Values(make_tuple(&aom_fdct8x8_c,
- &aom_idct8x8_64_add_c,
- DCT_DCT, AOM_BITS_8)));
-
INSTANTIATE_TEST_CASE_P(
C, FwdTrans8x8HT,
::testing::Values(
@@ -590,10 +585,6 @@
AOM_BITS_8)));
#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, FwdTrans8x8DCT,
- ::testing::Values(make_tuple(&aom_fdct8x8_sse2,
- &aom_idct8x8_64_add_c,
- DCT_DCT, AOM_BITS_8)));
INSTANTIATE_TEST_CASE_P(
SSE2, FwdTrans8x8HT,
::testing::Values(make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c,
@@ -606,11 +597,4 @@
ADST_ADST, AOM_BITS_8)));
#endif // HAVE_SSE2
-#if HAVE_SSSE3 && ARCH_X86_64
-INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT,
- ::testing::Values(make_tuple(&aom_fdct8x8_ssse3,
- &aom_idct8x8_64_add_ssse3,
- DCT_DCT, AOM_BITS_8)));
-#endif
-
} // namespace
diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc
index fc3d04b..060c76d 100644
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -54,33 +54,4 @@
for (int i = 0; i < 64; ++i) output[i] *= 2;
}
-TEST(AV1Idct8x8Test, AccuracyCheck) {
- ACMRandom rnd(ACMRandom::DeterministicSeed());
- const int count_test_block = 10000;
- for (int i = 0; i < count_test_block; ++i) {
- int16_t input[64];
- tran_low_t coeff[64];
- double output_r[64];
- uint8_t dst[64], src[64];
-
- for (int j = 0; j < 64; ++j) {
- src[j] = rnd.Rand8();
- dst[j] = rnd.Rand8();
- }
- // Initialize a test block with input range [-255, 255].
- for (int j = 0; j < 64; ++j) input[j] = src[j] - dst[j];
-
- reference_dct_2d(input, output_r);
- for (int j = 0; j < 64; ++j)
- coeff[j] = static_cast<tran_low_t>(round(output_r[j]));
- aom_idct8x8_64_add_c(coeff, dst, 8);
- for (int j = 0; j < 64; ++j) {
- const int diff = dst[j] - src[j];
- const int error = diff * diff;
- EXPECT_GE(1, error) << "Error: 8x8 FDCT/IDCT has error " << error
- << " at index " << j;
- }
- }
-}
-
} // namespace
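
Note: the removed AccuracyCheck embodied the round-trip recipe used
throughout these tests: build a residual from random src/dst pixels,
forward-transform it with a double-precision reference, round the
coefficients to integers, inverse-transform back onto dst, and require at
most 1 of per-pixel squared error. A self-contained C sketch of the same
check, using an orthonormal 8x8 DCT pair in place of the library kernels:

    #include <math.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define N 8

    /* Orthonormal 1D DCT-II and its inverse, stand-ins for the reference
     * transform and the removed aom_idct8x8_64_add_c under test. */
    static void fdct1d(const double *x, double *X) {
      const double kPi = 3.14159265358979323846;
      for (int k = 0; k < N; ++k) {
        const double a = k == 0 ? sqrt(1.0 / N) : sqrt(2.0 / N);
        double s = 0.0;
        for (int n = 0; n < N; ++n)
          s += x[n] * cos(kPi * (2 * n + 1) * k / (2.0 * N));
        X[k] = a * s;
      }
    }
    static void idct1d(const double *X, double *x) {
      const double kPi = 3.14159265358979323846;
      for (int n = 0; n < N; ++n) {
        double s = 0.0;
        for (int k = 0; k < N; ++k) {
          const double a = k == 0 ? sqrt(1.0 / N) : sqrt(2.0 / N);
          s += a * X[k] * cos(kPi * (2 * n + 1) * k / (2.0 * N));
        }
        x[n] = s;
      }
    }

    /* Apply a 1D transform to rows, then to columns, in place. */
    static void apply2d(void (*t)(const double *, double *), double m[N][N]) {
      double v[N], w[N];
      for (int r = 0; r < N; ++r) {
        t(m[r], w);
        for (int c = 0; c < N; ++c) m[r][c] = w[c];
      }
      for (int c = 0; c < N; ++c) {
        for (int r = 0; r < N; ++r) v[r] = m[r][c];
        t(v, w);
        for (int r = 0; r < N; ++r) m[r][c] = w[r];
      }
    }

    int main(void) {
      int worst = 0;
      srand(1);
      for (int i = 0; i < 1000; ++i) {
        int src[N][N], dst[N][N];
        double m[N][N];
        for (int r = 0; r < N; ++r)
          for (int c = 0; c < N; ++c) {
            src[r][c] = rand() & 255;
            dst[r][c] = rand() & 255;
            m[r][c] = src[r][c] - dst[r][c]; /* residual in [-255, 255] */
          }
        apply2d(fdct1d, m);
        for (int r = 0; r < N; ++r) /* coefficients become integers */
          for (int c = 0; c < N; ++c) m[r][c] = round(m[r][c]);
        apply2d(idct1d, m);
        for (int r = 0; r < N; ++r)
          for (int c = 0; c < N; ++c) {
            int recon = dst[r][c] + (int)round(m[r][c]);
            if (recon < 0) recon = 0;     /* mirrors clip_pixel_add */
            if (recon > 255) recon = 255;
            const int diff = recon - src[r][c];
            if (diff * diff > worst) worst = diff * diff;
          }
      }
      /* The removed test asserted error <= 1 per pixel; report it here. */
      printf("worst per-pixel squared error: %d\n", worst);
      return 0;
    }
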