Remove dead code: non-scaled DCT

Remove the unused non-scaled transforms: the 16x16 and 32x32 forward
DCTs (including the _rd and high-bitdepth variants), the 8x8, 16x16 and
32x32 inverse DCT add functions, their SSE2/SSSE3/AVX2 specializations,
and the corresponding tests. These are remnants of VP9 and are no
longer used by aomenc/aomdec.

Change-Id: Id44129d6ef44325d8867c38b8b643c5b6abc1693
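
For context on the first hunk: aom_dsp_rtcd_defs.pl declares each DSP
function once with add_proto and lists its SIMD implementations with
specialize; the build generates the prototypes and runtime-dispatch
glue from these entries. Deleting an entry therefore drops the function
from the generated headers and dispatch tables, leaving the _c, _sse2,
etc. definitions and their tests as dead code, which is what the hunks
below remove. A minimal sketch of the pattern (aom_example_txfm is a
hypothetical name used only for illustration, not a real libaom
function):

    add_proto qw/void aom_example_txfm/, "const int16_t *input, tran_low_t *output, int stride";
    specialize qw/aom_example_txfm sse2 avx2/;  # C fallback plus SSE2/AVX2 variants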
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 14f1a63..a178a76 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -448,15 +448,6 @@
     add_proto qw/void aom_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
     specialize qw/aom_fdct8x8 sse2/, "$ssse3_x86_64";
 
-    add_proto qw/void aom_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct16x16 sse2/;
-
-    add_proto qw/void aom_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct32x32 sse2 avx2/;
-
-    add_proto qw/void aom_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_fdct32x32_rd sse2 avx2/;
-
     # High bit depth
     add_proto qw/void aom_highbd_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
     specialize qw/aom_highbd_fdct4x4 sse2/;
@@ -464,15 +455,6 @@
     add_proto qw/void aom_highbd_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
     specialize qw/aom_highbd_fdct8x8 sse2/;
 
-    add_proto qw/void aom_highbd_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_highbd_fdct16x16 sse2/;
-
-    add_proto qw/void aom_highbd_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_highbd_fdct32x32 sse2/;
-
-    add_proto qw/void aom_highbd_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-    specialize qw/aom_highbd_fdct32x32_rd sse2/;
-
 }  # CONFIG_AV1_ENCODER
 
 #
@@ -492,41 +474,6 @@
 
   add_proto qw/void aom_idct4x4_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
   specialize qw/aom_idct4x4_1_add sse2/;
-
-  add_proto qw/void aom_idct8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct8x8_64_add sse2 ssse3/;
-
-  add_proto qw/void aom_idct8x8_12_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct8x8_12_add sse2 ssse3/;
-
-  add_proto qw/void aom_idct8x8_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct8x8_1_add sse2/;
-
-  add_proto qw/void aom_idct16x16_256_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct16x16_256_add sse2 avx2/;
-
-  add_proto qw/void aom_idct16x16_38_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct16x16_38_add avx2/;
-
-  add_proto qw/void aom_idct16x16_10_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct16x16_10_add sse2 avx2/;
-
-  add_proto qw/void aom_idct16x16_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct16x16_1_add sse2 avx2/;
-
-  add_proto qw/void aom_idct32x32_1024_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct32x32_1024_add sse2 ssse3 avx2/;
-
-  add_proto qw/void aom_idct32x32_135_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct32x32_135_add sse2 ssse3 avx2/;
-  # Need to add 135 eob idct32x32 implementations.
-  $aom_idct32x32_135_add_sse2=aom_idct32x32_1024_add_sse2;
-
-  add_proto qw/void aom_idct32x32_34_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct32x32_34_add sse2 ssse3 avx2/;
-
-  add_proto qw/void aom_idct32x32_1_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride";
-  specialize qw/aom_idct32x32_1_add sse2 avx2/;
 }  # CONFIG_AV1
 
 #
diff --git a/aom_dsp/fwd_txfm.c b/aom_dsp/fwd_txfm.c
index 935bd07..c1317d8 100644
--- a/aom_dsp/fwd_txfm.c
+++ b/aom_dsp/fwd_txfm.c
@@ -172,186 +172,6 @@
   }
 }
 
-void aom_fdct16x16_c(const int16_t *input, tran_low_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  tran_low_t intermediate[256];
-  const tran_low_t *in_low = NULL;
-  tran_low_t *out = intermediate;
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    tran_high_t step1[8];      // canbe16
-    tran_high_t step2[8];      // canbe16
-    tran_high_t step3[8];      // canbe16
-    tran_high_t in_high[8];    // canbe16
-    tran_high_t temp1, temp2;  // needs32
-    int i;
-    for (i = 0; i < 16; i++) {
-      if (0 == pass) {
-        // Calculate input for the first 8 results.
-        in_high[0] = (input[0 * stride] + input[15 * stride]) * 4;
-        in_high[1] = (input[1 * stride] + input[14 * stride]) * 4;
-        in_high[2] = (input[2 * stride] + input[13 * stride]) * 4;
-        in_high[3] = (input[3 * stride] + input[12 * stride]) * 4;
-        in_high[4] = (input[4 * stride] + input[11 * stride]) * 4;
-        in_high[5] = (input[5 * stride] + input[10 * stride]) * 4;
-        in_high[6] = (input[6 * stride] + input[9 * stride]) * 4;
-        in_high[7] = (input[7 * stride] + input[8 * stride]) * 4;
-        // Calculate input for the next 8 results.
-        step1[0] = (input[7 * stride] - input[8 * stride]) * 4;
-        step1[1] = (input[6 * stride] - input[9 * stride]) * 4;
-        step1[2] = (input[5 * stride] - input[10 * stride]) * 4;
-        step1[3] = (input[4 * stride] - input[11 * stride]) * 4;
-        step1[4] = (input[3 * stride] - input[12 * stride]) * 4;
-        step1[5] = (input[2 * stride] - input[13 * stride]) * 4;
-        step1[6] = (input[1 * stride] - input[14 * stride]) * 4;
-        step1[7] = (input[0 * stride] - input[15 * stride]) * 4;
-      } else {
-        // Calculate input for the first 8 results.
-        assert(in_low != NULL);
-        in_high[0] = ((in_low[0 * 16] + 1) >> 2) + ((in_low[15 * 16] + 1) >> 2);
-        in_high[1] = ((in_low[1 * 16] + 1) >> 2) + ((in_low[14 * 16] + 1) >> 2);
-        in_high[2] = ((in_low[2 * 16] + 1) >> 2) + ((in_low[13 * 16] + 1) >> 2);
-        in_high[3] = ((in_low[3 * 16] + 1) >> 2) + ((in_low[12 * 16] + 1) >> 2);
-        in_high[4] = ((in_low[4 * 16] + 1) >> 2) + ((in_low[11 * 16] + 1) >> 2);
-        in_high[5] = ((in_low[5 * 16] + 1) >> 2) + ((in_low[10 * 16] + 1) >> 2);
-        in_high[6] = ((in_low[6 * 16] + 1) >> 2) + ((in_low[9 * 16] + 1) >> 2);
-        in_high[7] = ((in_low[7 * 16] + 1) >> 2) + ((in_low[8 * 16] + 1) >> 2);
-        // Calculate input for the next 8 results.
-        step1[0] = ((in_low[7 * 16] + 1) >> 2) - ((in_low[8 * 16] + 1) >> 2);
-        step1[1] = ((in_low[6 * 16] + 1) >> 2) - ((in_low[9 * 16] + 1) >> 2);
-        step1[2] = ((in_low[5 * 16] + 1) >> 2) - ((in_low[10 * 16] + 1) >> 2);
-        step1[3] = ((in_low[4 * 16] + 1) >> 2) - ((in_low[11 * 16] + 1) >> 2);
-        step1[4] = ((in_low[3 * 16] + 1) >> 2) - ((in_low[12 * 16] + 1) >> 2);
-        step1[5] = ((in_low[2 * 16] + 1) >> 2) - ((in_low[13 * 16] + 1) >> 2);
-        step1[6] = ((in_low[1 * 16] + 1) >> 2) - ((in_low[14 * 16] + 1) >> 2);
-        step1[7] = ((in_low[0 * 16] + 1) >> 2) - ((in_low[15 * 16] + 1) >> 2);
-        in_low++;
-      }
-      // Work on the first eight values; fdct8(input, even_results);
-      {
-        tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;  // canbe16
-        tran_high_t t0, t1, t2, t3;                  // needs32
-        tran_high_t x0, x1, x2, x3;                  // canbe16
-
-        // stage 1
-        s0 = in_high[0] + in_high[7];
-        s1 = in_high[1] + in_high[6];
-        s2 = in_high[2] + in_high[5];
-        s3 = in_high[3] + in_high[4];
-        s4 = in_high[3] - in_high[4];
-        s5 = in_high[2] - in_high[5];
-        s6 = in_high[1] - in_high[6];
-        s7 = in_high[0] - in_high[7];
-
-        // fdct4(step, step);
-        x0 = s0 + s3;
-        x1 = s1 + s2;
-        x2 = s1 - s2;
-        x3 = s0 - s3;
-        t0 = (x0 + x1) * cospi_16_64;
-        t1 = (x0 - x1) * cospi_16_64;
-        t2 = x3 * cospi_8_64 + x2 * cospi_24_64;
-        t3 = x3 * cospi_24_64 - x2 * cospi_8_64;
-        out[0] = (tran_low_t)fdct_round_shift(t0);
-        out[4] = (tran_low_t)fdct_round_shift(t2);
-        out[8] = (tran_low_t)fdct_round_shift(t1);
-        out[12] = (tran_low_t)fdct_round_shift(t3);
-
-        // Stage 2
-        t0 = (s6 - s5) * cospi_16_64;
-        t1 = (s6 + s5) * cospi_16_64;
-        t2 = fdct_round_shift(t0);
-        t3 = fdct_round_shift(t1);
-
-        // Stage 3
-        x0 = s4 + t2;
-        x1 = s4 - t2;
-        x2 = s7 - t3;
-        x3 = s7 + t3;
-
-        // Stage 4
-        t0 = x0 * cospi_28_64 + x3 * cospi_4_64;
-        t1 = x1 * cospi_12_64 + x2 * cospi_20_64;
-        t2 = x2 * cospi_12_64 + x1 * -cospi_20_64;
-        t3 = x3 * cospi_28_64 + x0 * -cospi_4_64;
-        out[2] = (tran_low_t)fdct_round_shift(t0);
-        out[6] = (tran_low_t)fdct_round_shift(t2);
-        out[10] = (tran_low_t)fdct_round_shift(t1);
-        out[14] = (tran_low_t)fdct_round_shift(t3);
-      }
-      // Work on the next eight values; step1 -> odd_results
-      {
-        // step 2
-        temp1 = (step1[5] - step1[2]) * cospi_16_64;
-        temp2 = (step1[4] - step1[3]) * cospi_16_64;
-        step2[2] = fdct_round_shift(temp1);
-        step2[3] = fdct_round_shift(temp2);
-        temp1 = (step1[4] + step1[3]) * cospi_16_64;
-        temp2 = (step1[5] + step1[2]) * cospi_16_64;
-        step2[4] = fdct_round_shift(temp1);
-        step2[5] = fdct_round_shift(temp2);
-        // step 3
-        step3[0] = step1[0] + step2[3];
-        step3[1] = step1[1] + step2[2];
-        step3[2] = step1[1] - step2[2];
-        step3[3] = step1[0] - step2[3];
-        step3[4] = step1[7] - step2[4];
-        step3[5] = step1[6] - step2[5];
-        step3[6] = step1[6] + step2[5];
-        step3[7] = step1[7] + step2[4];
-        // step 4
-        temp1 = step3[1] * -cospi_8_64 + step3[6] * cospi_24_64;
-        temp2 = step3[2] * cospi_24_64 + step3[5] * cospi_8_64;
-        step2[1] = fdct_round_shift(temp1);
-        step2[2] = fdct_round_shift(temp2);
-        temp1 = step3[2] * cospi_8_64 - step3[5] * cospi_24_64;
-        temp2 = step3[1] * cospi_24_64 + step3[6] * cospi_8_64;
-        step2[5] = fdct_round_shift(temp1);
-        step2[6] = fdct_round_shift(temp2);
-        // step 5
-        step1[0] = step3[0] + step2[1];
-        step1[1] = step3[0] - step2[1];
-        step1[2] = step3[3] + step2[2];
-        step1[3] = step3[3] - step2[2];
-        step1[4] = step3[4] - step2[5];
-        step1[5] = step3[4] + step2[5];
-        step1[6] = step3[7] - step2[6];
-        step1[7] = step3[7] + step2[6];
-        // step 6
-        temp1 = step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
-        temp2 = step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
-        out[1] = (tran_low_t)fdct_round_shift(temp1);
-        out[9] = (tran_low_t)fdct_round_shift(temp2);
-        temp1 = step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
-        temp2 = step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
-        out[5] = (tran_low_t)fdct_round_shift(temp1);
-        out[13] = (tran_low_t)fdct_round_shift(temp2);
-        temp1 = step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
-        temp2 = step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
-        out[3] = (tran_low_t)fdct_round_shift(temp1);
-        out[11] = (tran_low_t)fdct_round_shift(temp2);
-        temp1 = step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
-        temp2 = step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
-        out[7] = (tran_low_t)fdct_round_shift(temp1);
-        out[15] = (tran_low_t)fdct_round_shift(temp2);
-      }
-      // Do next column (which is a transposed row in second/horizontal pass)
-      input++;
-      out += 16;
-    }
-    // Setup in/out for next pass.
-    in_low = intermediate;
-    out = output;
-  }
-}
-
 static INLINE tran_high_t dct_32_round(tran_high_t input) {
   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
   // TODO(debargha, peter.derivaz): Find new bounds for this assert,
@@ -712,34 +532,6 @@
   }
 }
 
-// Note that although we use dct_32_round in dct32 computation flow,
-// this 2d fdct32x32 for rate-distortion optimization loop is operating
-// within 16 bits precision.
-void aom_fdct32x32_rd_c(const int16_t *input, tran_low_t *out, int stride) {
-  int i, j;
-  tran_high_t output[32 * 32];
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
-    aom_fdct32(temp_in, temp_out, 0);
-    for (j = 0; j < 32; ++j)
-      // TODO(cd): see quality impact of only doing
-      //           output[j * 32 + i] = (temp_out[j] + 1) >> 2;
-      //           PS: also change code in aom_dsp/x86/aom_dct_sse2.c
-      output[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
-  }
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    tran_high_t temp_in[32], temp_out[32];
-    for (j = 0; j < 32; ++j) temp_in[j] = output[j + i * 32];
-    aom_fdct32(temp_in, temp_out, 1);
-    for (j = 0; j < 32; ++j) out[j + i * 32] = (tran_low_t)temp_out[j];
-  }
-}
-
 void aom_highbd_fdct4x4_c(const int16_t *input, tran_low_t *output,
                           int stride) {
   aom_fdct4x4_c(input, output, stride);
@@ -750,15 +542,6 @@
   aom_fdct8x8_c(input, final_output, stride);
 }
 
-void aom_highbd_fdct16x16_c(const int16_t *input, tran_low_t *output,
-                            int stride) {
-  aom_fdct16x16_c(input, output, stride);
-}
-
 void aom_highbd_fdct32x32_c(const int16_t *input, tran_low_t *out, int stride) {
   aom_fdct32x32_c(input, out, stride);
 }
-void aom_highbd_fdct32x32_rd_c(const int16_t *input, tran_low_t *out,
-                               int stride) {
-  aom_fdct32x32_rd_c(input, out, stride);
-}
diff --git a/aom_dsp/inv_txfm.c b/aom_dsp/inv_txfm.c
index c4efae6..546b1c1 100644
--- a/aom_dsp/inv_txfm.c
+++ b/aom_dsp/inv_txfm.c
@@ -210,43 +210,6 @@
   output[7] = WRAPLOW(step1[0] - step1[7]);
 }
 
-void aom_idct8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[8], temp_out[8];
-
-  // First transform rows
-  for (i = 0; i < 8; ++i) {
-    aom_idct8_c(input, outptr);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-    aom_idct8_c(temp_in, temp_out);
-    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
-    }
-  }
-}
-
-void aom_idct8x8_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
-  a1 = ROUND_POWER_OF_TWO(out, 5);
-  if (a1 == 0) return;
-  for (j = 0; j < 8; ++j) {
-    for (i = 0; i < 8; ++i) dest[i] = clip_pixel_add(dest[i], a1);
-    dest += stride;
-  }
-}
-
 void aom_iadst4_c(const tran_low_t *input, tran_low_t *output) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7;
 
@@ -361,31 +324,6 @@
   output[7] = WRAPLOW(-x1);
 }
 
-void aom_idct8x8_12_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  tran_low_t out[8 * 8] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[8], temp_out[8];
-
-  // First transform rows
-  // only first 4 row has non-zero coefs
-  for (i = 0; i < 4; ++i) {
-    aom_idct8_c(input, outptr);
-    input += 8;
-    outptr += 8;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j) temp_in[j] = out[j * 8 + i];
-    aom_idct8_c(temp_in, temp_out);
-    for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
-    }
-  }
-}
-
 void aom_idct16_c(const tran_low_t *input, tran_low_t *output) {
   tran_low_t step1[16], step2[16];
   tran_high_t temp1, temp2;
@@ -551,31 +489,6 @@
   output[15] = WRAPLOW(step2[0] - step2[15]);
 }
 
-void aom_idct16x16_256_add_c(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[16], temp_out[16];
-
-  // First transform rows
-  for (i = 0; i < 16; ++i) {
-    aom_idct16_c(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    aom_idct16_c(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
 void aom_iadst16_c(const tran_low_t *input, tran_low_t *output) {
   tran_high_t s0, s1, s2, s3, s4, s5, s6, s7, s8;
   tran_high_t s9, s10, s11, s12, s13, s14, s15;
@@ -747,71 +660,6 @@
   output[15] = WRAPLOW(-x1);
 }
 
-void aom_idct16x16_38_add_c(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  int i, j;
-  tran_low_t out[16 * 16] = { 0 };
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[16], temp_out[16];
-
-  // First transform rows. Since all non-zero dct coefficients are in
-  // upper-left 8x8 area, we only need to calculate first 8 rows here.
-  for (i = 0; i < 8; ++i) {
-    aom_idct16_c(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    aom_idct16_c(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_idct16x16_10_add_c(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  tran_low_t out[16 * 16] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[16], temp_out[16];
-
-  // First transform rows. Since all non-zero dct coefficients are in
-  // upper-left 4x4 area, we only need to calculate first 4 rows here.
-  for (i = 0; i < 4; ++i) {
-    aom_idct16_c(input, outptr);
-    input += 16;
-    outptr += 16;
-  }
-
-  // Then transform columns
-  for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j) temp_in[j] = out[j * 16 + i];
-    aom_idct16_c(temp_in, temp_out);
-    for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_idct16x16_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_high_t a1;
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-  if (a1 == 0) return;
-  for (j = 0; j < 16; ++j) {
-    for (i = 0; i < 16; ++i) dest[i] = clip_pixel_add(dest[i], a1);
-    dest += stride;
-  }
-}
-
 void aom_idct32_c(const tran_low_t *input, tran_low_t *output) {
   tran_low_t step1[32], step2[32];
   tran_high_t temp1, temp2;
@@ -1178,111 +1026,6 @@
   output[30] = WRAPLOW(step1[1] - step1[30]);
   output[31] = WRAPLOW(step1[0] - step1[31]);
 }
-
-void aom_idct32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
-                              int stride) {
-  tran_low_t out[32 * 32];
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  for (i = 0; i < 32; ++i) {
-    int16_t zero_coeff[16];
-    for (j = 0; j < 16; ++j) zero_coeff[j] = input[2 * j] | input[2 * j + 1];
-    for (j = 0; j < 8; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 4; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-    for (j = 0; j < 2; ++j)
-      zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
-
-    if (zero_coeff[0] | zero_coeff[1])
-      aom_idct32_c(input, outptr);
-    else
-      memset(outptr, 0, sizeof(tran_low_t) * 32);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    aom_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_idct32x32_135_add_c(const tran_low_t *input, uint8_t *dest,
-                             int stride) {
-  tran_low_t out[32 * 32] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  // only upper-left 16x16 has non-zero coeff
-  for (i = 0; i < 16; ++i) {
-    aom_idct32_c(input, outptr);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    aom_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_idct32x32_34_add_c(const tran_low_t *input, uint8_t *dest,
-                            int stride) {
-  tran_low_t out[32 * 32] = { 0 };
-  tran_low_t *outptr = out;
-  int i, j;
-  tran_low_t temp_in[32], temp_out[32];
-
-  // Rows
-  // only upper-left 8x8 has non-zero coeff
-  for (i = 0; i < 8; ++i) {
-    aom_idct32_c(input, outptr);
-    input += 32;
-    outptr += 32;
-  }
-
-  // Columns
-  for (i = 0; i < 32; ++i) {
-    for (j = 0; j < 32; ++j) temp_in[j] = out[j * 32 + i];
-    aom_idct32_c(temp_in, temp_out);
-    for (j = 0; j < 32; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
-    }
-  }
-}
-
-void aom_idct32x32_1_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
-  int i, j;
-  tran_high_t a1;
-
-  tran_low_t out = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64));
-  out = WRAPLOW(dct_const_round_shift(out * cospi_16_64));
-  a1 = ROUND_POWER_OF_TWO(out, 6);
-  if (a1 == 0) return;
-
-  for (j = 0; j < 32; ++j) {
-    for (i = 0; i < 32; ++i) dest[i] = clip_pixel_add(dest[i], a1);
-    dest += stride;
-  }
-}
-
 void aom_highbd_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                  int stride, int bd) {
   /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
diff --git a/aom_dsp/x86/fwd_txfm_impl_sse2.h b/aom_dsp/x86/fwd_txfm_impl_sse2.h
index 20931db..c1c072d 100644
--- a/aom_dsp/x86/fwd_txfm_impl_sse2.h
+++ b/aom_dsp/x86/fwd_txfm_impl_sse2.h
@@ -566,451 +566,5 @@
   }
 }
 
-void FDCT16x16_2D(const int16_t *input, tran_low_t *output, int stride) {
-  // The 2D transform is done with two passes which are actually pretty
-  // similar. In the first one, we transform the columns and transpose
-  // the results. In the second one, we transform the rows. To achieve that,
-  // as the first pass results are transposed, we transpose the columns (that
-  // is the transposed rows) and transpose the results (so that it goes back
-  // in normal/row positions).
-  int pass;
-  // We need an intermediate buffer between passes.
-  DECLARE_ALIGNED(16, int16_t, intermediate[256]);
-  const int16_t *in = input;
-  int16_t *out0 = intermediate;
-  tran_low_t *out1 = output;
-  // Constants
-  //    When we use them, in one case, they are all the same. In all others
-  //    it's a pair of them that we need to repeat four times. This is done
-  //    by constructing the 32 bit constant corresponding to that pair.
-  const __m128i k__cospi_p16_p16 = _mm_set1_epi16((int16_t)cospi_16_64);
-  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
-  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
-  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
-  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
-  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
-  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
-  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
-  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
-  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
-  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
-  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
-  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
-  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
-  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
-  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
-  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const __m128i kOne = _mm_set1_epi16(1);
-  // Do the two transform/transpose passes
-  for (pass = 0; pass < 2; ++pass) {
-    // We process eight columns (transposed rows in second pass) at a time.
-    int column_start;
-#if DCT_HIGH_BIT_DEPTH
-    int overflow;
-#endif
-    for (column_start = 0; column_start < 16; column_start += 8) {
-      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
-      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
-      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
-      __m128i step1_0, step1_1, step1_2, step1_3;
-      __m128i step1_4, step1_5, step1_6, step1_7;
-      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
-      __m128i step3_0, step3_1, step3_2, step3_3;
-      __m128i step3_4, step3_5, step3_6, step3_7;
-      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
-      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
-      // Load and pre-condition input.
-      if (0 == pass) {
-        in00 = _mm_load_si128((const __m128i *)(in + 0 * stride));
-        in01 = _mm_load_si128((const __m128i *)(in + 1 * stride));
-        in02 = _mm_load_si128((const __m128i *)(in + 2 * stride));
-        in03 = _mm_load_si128((const __m128i *)(in + 3 * stride));
-        in04 = _mm_load_si128((const __m128i *)(in + 4 * stride));
-        in05 = _mm_load_si128((const __m128i *)(in + 5 * stride));
-        in06 = _mm_load_si128((const __m128i *)(in + 6 * stride));
-        in07 = _mm_load_si128((const __m128i *)(in + 7 * stride));
-        in08 = _mm_load_si128((const __m128i *)(in + 8 * stride));
-        in09 = _mm_load_si128((const __m128i *)(in + 9 * stride));
-        in10 = _mm_load_si128((const __m128i *)(in + 10 * stride));
-        in11 = _mm_load_si128((const __m128i *)(in + 11 * stride));
-        in12 = _mm_load_si128((const __m128i *)(in + 12 * stride));
-        in13 = _mm_load_si128((const __m128i *)(in + 13 * stride));
-        in14 = _mm_load_si128((const __m128i *)(in + 14 * stride));
-        in15 = _mm_load_si128((const __m128i *)(in + 15 * stride));
-        // x = x << 2
-        in00 = _mm_slli_epi16(in00, 2);
-        in01 = _mm_slli_epi16(in01, 2);
-        in02 = _mm_slli_epi16(in02, 2);
-        in03 = _mm_slli_epi16(in03, 2);
-        in04 = _mm_slli_epi16(in04, 2);
-        in05 = _mm_slli_epi16(in05, 2);
-        in06 = _mm_slli_epi16(in06, 2);
-        in07 = _mm_slli_epi16(in07, 2);
-        in08 = _mm_slli_epi16(in08, 2);
-        in09 = _mm_slli_epi16(in09, 2);
-        in10 = _mm_slli_epi16(in10, 2);
-        in11 = _mm_slli_epi16(in11, 2);
-        in12 = _mm_slli_epi16(in12, 2);
-        in13 = _mm_slli_epi16(in13, 2);
-        in14 = _mm_slli_epi16(in14, 2);
-        in15 = _mm_slli_epi16(in15, 2);
-      } else {
-        in00 = _mm_load_si128((const __m128i *)(in + 0 * 16));
-        in01 = _mm_load_si128((const __m128i *)(in + 1 * 16));
-        in02 = _mm_load_si128((const __m128i *)(in + 2 * 16));
-        in03 = _mm_load_si128((const __m128i *)(in + 3 * 16));
-        in04 = _mm_load_si128((const __m128i *)(in + 4 * 16));
-        in05 = _mm_load_si128((const __m128i *)(in + 5 * 16));
-        in06 = _mm_load_si128((const __m128i *)(in + 6 * 16));
-        in07 = _mm_load_si128((const __m128i *)(in + 7 * 16));
-        in08 = _mm_load_si128((const __m128i *)(in + 8 * 16));
-        in09 = _mm_load_si128((const __m128i *)(in + 9 * 16));
-        in10 = _mm_load_si128((const __m128i *)(in + 10 * 16));
-        in11 = _mm_load_si128((const __m128i *)(in + 11 * 16));
-        in12 = _mm_load_si128((const __m128i *)(in + 12 * 16));
-        in13 = _mm_load_si128((const __m128i *)(in + 13 * 16));
-        in14 = _mm_load_si128((const __m128i *)(in + 14 * 16));
-        in15 = _mm_load_si128((const __m128i *)(in + 15 * 16));
-        // x = (x + 1) >> 2
-        in00 = _mm_add_epi16(in00, kOne);
-        in01 = _mm_add_epi16(in01, kOne);
-        in02 = _mm_add_epi16(in02, kOne);
-        in03 = _mm_add_epi16(in03, kOne);
-        in04 = _mm_add_epi16(in04, kOne);
-        in05 = _mm_add_epi16(in05, kOne);
-        in06 = _mm_add_epi16(in06, kOne);
-        in07 = _mm_add_epi16(in07, kOne);
-        in08 = _mm_add_epi16(in08, kOne);
-        in09 = _mm_add_epi16(in09, kOne);
-        in10 = _mm_add_epi16(in10, kOne);
-        in11 = _mm_add_epi16(in11, kOne);
-        in12 = _mm_add_epi16(in12, kOne);
-        in13 = _mm_add_epi16(in13, kOne);
-        in14 = _mm_add_epi16(in14, kOne);
-        in15 = _mm_add_epi16(in15, kOne);
-        in00 = _mm_srai_epi16(in00, 2);
-        in01 = _mm_srai_epi16(in01, 2);
-        in02 = _mm_srai_epi16(in02, 2);
-        in03 = _mm_srai_epi16(in03, 2);
-        in04 = _mm_srai_epi16(in04, 2);
-        in05 = _mm_srai_epi16(in05, 2);
-        in06 = _mm_srai_epi16(in06, 2);
-        in07 = _mm_srai_epi16(in07, 2);
-        in08 = _mm_srai_epi16(in08, 2);
-        in09 = _mm_srai_epi16(in09, 2);
-        in10 = _mm_srai_epi16(in10, 2);
-        in11 = _mm_srai_epi16(in11, 2);
-        in12 = _mm_srai_epi16(in12, 2);
-        in13 = _mm_srai_epi16(in13, 2);
-        in14 = _mm_srai_epi16(in14, 2);
-        in15 = _mm_srai_epi16(in15, 2);
-      }
-      in += 8;
-      // Calculate input for the first 8 results.
-      {
-        input0 = ADD_EPI16(in00, in15);
-        input1 = ADD_EPI16(in01, in14);
-        input2 = ADD_EPI16(in02, in13);
-        input3 = ADD_EPI16(in03, in12);
-        input4 = ADD_EPI16(in04, in11);
-        input5 = ADD_EPI16(in05, in10);
-        input6 = ADD_EPI16(in06, in09);
-        input7 = ADD_EPI16(in07, in08);
-#if DCT_HIGH_BIT_DEPTH
-        overflow = check_epi16_overflow_x8(&input0, &input1, &input2, &input3,
-                                           &input4, &input5, &input6, &input7);
-        if (overflow) {
-          aom_highbd_fdct16x16_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-      // Calculate input for the next 8 results.
-      {
-        step1_0 = SUB_EPI16(in07, in08);
-        step1_1 = SUB_EPI16(in06, in09);
-        step1_2 = SUB_EPI16(in05, in10);
-        step1_3 = SUB_EPI16(in04, in11);
-        step1_4 = SUB_EPI16(in03, in12);
-        step1_5 = SUB_EPI16(in02, in13);
-        step1_6 = SUB_EPI16(in01, in14);
-        step1_7 = SUB_EPI16(in00, in15);
-#if DCT_HIGH_BIT_DEPTH
-        overflow =
-            check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
-                                    &step1_4, &step1_5, &step1_6, &step1_7);
-        if (overflow) {
-          aom_highbd_fdct16x16_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-      }
-      // Work on the first eight values; fdct8(input, even_results);
-      {
-        // Add/subtract
-        const __m128i q0 = ADD_EPI16(input0, input7);
-        const __m128i q1 = ADD_EPI16(input1, input6);
-        const __m128i q2 = ADD_EPI16(input2, input5);
-        const __m128i q3 = ADD_EPI16(input3, input4);
-        const __m128i q4 = SUB_EPI16(input3, input4);
-        const __m128i q5 = SUB_EPI16(input2, input5);
-        const __m128i q6 = SUB_EPI16(input1, input6);
-        const __m128i q7 = SUB_EPI16(input0, input7);
-#if DCT_HIGH_BIT_DEPTH
-        overflow =
-            check_epi16_overflow_x8(&q0, &q1, &q2, &q3, &q4, &q5, &q6, &q7);
-        if (overflow) {
-          aom_highbd_fdct16x16_c(input, output, stride);
-          return;
-        }
-#endif  // DCT_HIGH_BIT_DEPTH
-        // Work on first four results
-        {
-          // Add/subtract
-          const __m128i r0 = ADD_EPI16(q0, q3);
-          const __m128i r1 = ADD_EPI16(q1, q2);
-          const __m128i r2 = SUB_EPI16(q1, q2);
-          const __m128i r3 = SUB_EPI16(q0, q3);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&r0, &r1, &r2, &r3);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          {
-            const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
-            const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
-            const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
-            const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
-            res00 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res08 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res04 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-            res12 = mult_round_shift(&t2, &t3, &k__cospi_m08_p24,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-            overflow = check_epi16_overflow_x4(&res00, &res08, &res04, &res12);
-            if (overflow) {
-              aom_highbd_fdct16x16_c(input, output, stride);
-              return;
-            }
-#endif  // DCT_HIGH_BIT_DEPTH
-          }
-        }
-        // Work on next four results
-        {
-          // Interleave to do the multiply by constants which gets us
-          // into 32 bits.
-          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
-          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
-          const __m128i r0 =
-              mult_round_shift(&d0, &d1, &k__cospi_p16_m16,
-                               &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          const __m128i r1 =
-              mult_round_shift(&d0, &d1, &k__cospi_p16_p16,
-                               &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x2(&r0, &r1);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-          {
-            // Add/subtract
-            const __m128i x0 = ADD_EPI16(q4, r0);
-            const __m128i x1 = SUB_EPI16(q4, r0);
-            const __m128i x2 = SUB_EPI16(q7, r1);
-            const __m128i x3 = ADD_EPI16(q7, r1);
-#if DCT_HIGH_BIT_DEPTH
-            overflow = check_epi16_overflow_x4(&x0, &x1, &x2, &x3);
-            if (overflow) {
-              aom_highbd_fdct16x16_c(input, output, stride);
-              return;
-            }
-#endif  // DCT_HIGH_BIT_DEPTH
-
-            // Interleave to do the multiply by constants which gets us
-            // into 32 bits.
-            {
-              const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
-              const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
-              const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
-              const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
-              res02 = mult_round_shift(&t0, &t1, &k__cospi_p28_p04,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res14 = mult_round_shift(&t0, &t1, &k__cospi_m04_p28,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res10 = mult_round_shift(&t2, &t3, &k__cospi_p12_p20,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-              res06 = mult_round_shift(&t2, &t3, &k__cospi_m20_p12,
-                                       &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-              overflow =
-                  check_epi16_overflow_x4(&res02, &res14, &res10, &res06);
-              if (overflow) {
-                aom_highbd_fdct16x16_c(input, output, stride);
-                return;
-              }
-#endif  // DCT_HIGH_BIT_DEPTH
-            }
-          }
-        }
-      }
-      // Work on the next eight values; step1 -> odd_results
-      {
-        // step 2
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
-          step2_2 = mult_round_shift(&t0, &t1, &k__cospi_p16_m16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_3 = mult_round_shift(&t2, &t3, &k__cospi_p16_m16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_5 = mult_round_shift(&t0, &t1, &k__cospi_p16_p16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_4 = mult_round_shift(&t2, &t3, &k__cospi_p16_p16,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&step2_2, &step2_3, &step2_5, &step2_4);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 3
-        {
-          step3_0 = ADD_EPI16(step1_0, step2_3);
-          step3_1 = ADD_EPI16(step1_1, step2_2);
-          step3_2 = SUB_EPI16(step1_1, step2_2);
-          step3_3 = SUB_EPI16(step1_0, step2_3);
-          step3_4 = SUB_EPI16(step1_7, step2_4);
-          step3_5 = SUB_EPI16(step1_6, step2_5);
-          step3_6 = ADD_EPI16(step1_6, step2_5);
-          step3_7 = ADD_EPI16(step1_7, step2_4);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&step3_0, &step3_1, &step3_2, &step3_3,
-                                      &step3_4, &step3_5, &step3_6, &step3_7);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 4
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
-          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
-          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
-          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
-          step2_1 = mult_round_shift(&t0, &t1, &k__cospi_m08_p24,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_2 = mult_round_shift(&t2, &t3, &k__cospi_p24_p08,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_6 = mult_round_shift(&t0, &t1, &k__cospi_p24_p08,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          step2_5 = mult_round_shift(&t2, &t3, &k__cospi_p08_m24,
-                                     &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x4(&step2_1, &step2_2, &step2_6, &step2_5);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 5
-        {
-          step1_0 = ADD_EPI16(step3_0, step2_1);
-          step1_1 = SUB_EPI16(step3_0, step2_1);
-          step1_2 = ADD_EPI16(step3_3, step2_2);
-          step1_3 = SUB_EPI16(step3_3, step2_2);
-          step1_4 = SUB_EPI16(step3_4, step2_5);
-          step1_5 = ADD_EPI16(step3_4, step2_5);
-          step1_6 = SUB_EPI16(step3_7, step2_6);
-          step1_7 = ADD_EPI16(step3_7, step2_6);
-#if DCT_HIGH_BIT_DEPTH
-          overflow =
-              check_epi16_overflow_x8(&step1_0, &step1_1, &step1_2, &step1_3,
-                                      &step1_4, &step1_5, &step1_6, &step1_7);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        // step 6
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
-          res01 = mult_round_shift(&t0, &t1, &k__cospi_p30_p02,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res09 = mult_round_shift(&t2, &t3, &k__cospi_p14_p18,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res15 = mult_round_shift(&t0, &t1, &k__cospi_m02_p30,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res07 = mult_round_shift(&t2, &t3, &k__cospi_m18_p14,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&res01, &res09, &res15, &res07);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-        {
-          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
-          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
-          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
-          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
-          res05 = mult_round_shift(&t0, &t1, &k__cospi_p22_p10,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res13 = mult_round_shift(&t2, &t3, &k__cospi_p06_p26,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res11 = mult_round_shift(&t0, &t1, &k__cospi_m10_p22,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-          res03 = mult_round_shift(&t2, &t3, &k__cospi_m26_p06,
-                                   &k__DCT_CONST_ROUNDING, DCT_CONST_BITS);
-#if DCT_HIGH_BIT_DEPTH
-          overflow = check_epi16_overflow_x4(&res05, &res13, &res11, &res03);
-          if (overflow) {
-            aom_highbd_fdct16x16_c(input, output, stride);
-            return;
-          }
-#endif  // DCT_HIGH_BIT_DEPTH
-        }
-      }
-      // Transpose the results, do it as two 8x8 transposes.
-      transpose_and_output8x8(&res00, &res01, &res02, &res03, &res04, &res05,
-                              &res06, &res07, pass, out0, out1);
-      transpose_and_output8x8(&res08, &res09, &res10, &res11, &res12, &res13,
-                              &res14, &res15, pass, out0 + 8, out1 + 8);
-      if (pass == 0) {
-        out0 += 8 * 16;
-      } else {
-        out1 += 8 * 16;
-      }
-    }
-    // Setup in/out for next pass.
-    in = intermediate;
-  }
-}
-
 #undef ADD_EPI16
 #undef SUB_EPI16
diff --git a/aom_dsp/x86/fwd_txfm_sse2.c b/aom_dsp/x86/fwd_txfm_sse2.c
index 463d71f..edee0fb 100644
--- a/aom_dsp/x86/fwd_txfm_sse2.c
+++ b/aom_dsp/x86/fwd_txfm_sse2.c
@@ -88,43 +88,14 @@
 #define DCT_HIGH_BIT_DEPTH 0
 #define FDCT4x4_2D aom_fdct4x4_sse2
 #define FDCT8x8_2D aom_fdct8x8_sse2
-#define FDCT16x16_2D aom_fdct16x16_sse2
 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h"
 #undef FDCT4x4_2D
 #undef FDCT8x8_2D
-#undef FDCT16x16_2D
 
-#define FDCT32x32_2D aom_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D aom_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
 #undef DCT_HIGH_BIT_DEPTH
-
 #define DCT_HIGH_BIT_DEPTH 1
 #define FDCT4x4_2D aom_highbd_fdct4x4_sse2
 #define FDCT8x8_2D aom_highbd_fdct8x8_sse2
-#define FDCT16x16_2D aom_highbd_fdct16x16_sse2
 #include "aom_dsp/x86/fwd_txfm_impl_sse2.h"  // NOLINT
 #undef FDCT4x4_2D
 #undef FDCT8x8_2D
-#undef FDCT16x16_2D
-
-#define FDCT32x32_2D aom_highbd_fdct32x32_rd_sse2
-#define FDCT32x32_HIGH_PRECISION 0
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-
-#define FDCT32x32_2D aom_highbd_fdct32x32_sse2
-#define FDCT32x32_HIGH_PRECISION 1
-#include "aom_dsp/x86/fwd_dct32x32_impl_sse2.h"  // NOLINT
-#undef FDCT32x32_2D
-#undef FDCT32x32_HIGH_PRECISION
-#undef DCT_HIGH_BIT_DEPTH
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index 572feb8..9c1a70f 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -237,16 +237,6 @@
 typedef std::tr1::tuple<IdctFunc, IdctFunc, TX_TYPE, aom_bit_depth_t>
     Idct16x16Param;
 
-void fdct16x16_ref(const int16_t *in, tran_low_t *out, int stride,
-                   TxfmParam * /*txfm_param*/) {
-  aom_fdct16x16_c(in, out, stride);
-}
-
-void idct16x16_ref(const tran_low_t *in, uint8_t *dest, int stride,
-                   const TxfmParam * /*txfm_param*/) {
-  aom_idct16x16_256_add_c(in, dest, stride);
-}
-
 void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride,
                   TxfmParam *txfm_param) {
   av1_fht16x16_c(in, out, stride, txfm_param);
@@ -542,50 +532,6 @@
   TxfmParam txfm_param_;
 };
 
-class Trans16x16DCT : public Trans16x16TestBase,
-                      public ::testing::TestWithParam<Dct16x16Param> {
- public:
-  virtual ~Trans16x16DCT() {}
-
-  virtual void SetUp() {
-    fwd_txfm_ = GET_PARAM(0);
-    inv_txfm_ = GET_PARAM(1);
-    bit_depth_ = GET_PARAM(3);
-    pitch_ = 16;
-    fwd_txfm_ref = fdct16x16_ref;
-    inv_txfm_ref = idct16x16_ref;
-    mask_ = (1 << bit_depth_) - 1;
-    inv_txfm_ref = idct16x16_ref;
-    txfm_param_.tx_type = GET_PARAM(2);
-  }
-  virtual void TearDown() { libaom_test::ClearSystemState(); }
-
- protected:
-  void RunFwdTxfm(int16_t *in, tran_low_t *out, int stride) {
-    fwd_txfm_(in, out, stride);
-  }
-  void RunInvTxfm(tran_low_t *out, uint8_t *dst, int stride) {
-    inv_txfm_(out, dst, stride);
-  }
-
-  FdctFunc fwd_txfm_;
-  IdctFunc inv_txfm_;
-};
-
-TEST_P(Trans16x16DCT, AccuracyCheck) { RunAccuracyCheck(); }
-
-TEST_P(Trans16x16DCT, CoeffCheck) { RunCoeffCheck(); }
-
-TEST_P(Trans16x16DCT, MemCheck) { RunMemCheck(); }
-
-TEST_P(Trans16x16DCT, QuantCheck) {
-  // Use maximally allowed quantization step sizes for DC and AC
-  // coefficients respectively.
-  RunQuantCheck(1336, 1828);
-}
-
-TEST_P(Trans16x16DCT, InvAccuracyCheck) { RunInvAccuracyCheck(); }
-
 class Trans16x16HT : public Trans16x16TestBase,
                      public ::testing::TestWithParam<Ht16x16Param> {
  public:
@@ -723,10 +669,6 @@
 
 using std::tr1::make_tuple;
 
-INSTANTIATE_TEST_CASE_P(C, Trans16x16DCT,
-                        ::testing::Values(make_tuple(&aom_fdct16x16_c,
-                                                     &aom_idct16x16_256_add_c,
-                                                     DCT_DCT, AOM_BITS_8)));
 INSTANTIATE_TEST_CASE_P(
     C, Trans16x16HT,
     ::testing::Values(
@@ -748,10 +690,6 @@
                    AOM_BITS_8)));
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, Trans16x16DCT,
-                        ::testing::Values(make_tuple(&aom_fdct16x16_sse2,
-                                                     &aom_idct16x16_256_add_c,
-                                                     DCT_DCT, AOM_BITS_8)));
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans16x16HT,
     ::testing::Values(make_tuple(&av1_fht16x16_sse2, &av1_iht16x16_256_add_c,
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 290d069..d89c09c 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -146,76 +146,6 @@
       << "Error: 32x32 FDCT/IDCT has average round-trip error > 1 per block";
 }
 
-TEST_P(Trans32x32Test, CoeffCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 1000;
-
-  DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
-  for (int i = 0; i < count_test_block; ++i) {
-    for (int j = 0; j < kNumCoeffs; ++j)
-      input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
-
-    const int stride = 32;
-    aom_fdct32x32_c(input_block, output_ref_block, stride);
-    ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_block, output_block, stride));
-
-    if (version_ == 0) {
-      for (int j = 0; j < kNumCoeffs; ++j)
-        EXPECT_EQ(output_block[j], output_ref_block[j])
-            << "Error: 32x32 FDCT versions have mismatched coefficients";
-    } else {
-      for (int j = 0; j < kNumCoeffs; ++j)
-        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
-            << "Error: 32x32 FDCT rd has mismatched coefficients";
-    }
-  }
-}
-
-TEST_P(Trans32x32Test, MemCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 2000;
-
-  DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
-  DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
-  for (int i = 0; i < count_test_block; ++i) {
-    // Initialize a test block with input range [-mask_, mask_].
-    for (int j = 0; j < kNumCoeffs; ++j) {
-      input_extreme_block[j] = rnd.Rand8() & 1 ? mask_ : -mask_;
-    }
-    if (i == 0) {
-      for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = mask_;
-    } else if (i == 1) {
-      for (int j = 0; j < kNumCoeffs; ++j) input_extreme_block[j] = -mask_;
-    }
-
-    const int stride = 32;
-    aom_fdct32x32_c(input_extreme_block, output_ref_block, stride);
-    ASM_REGISTER_STATE_CHECK(
-        fwd_txfm_(input_extreme_block, output_block, stride));
-
-    // The minimum quant value is 4.
-    for (int j = 0; j < kNumCoeffs; ++j) {
-      if (version_ == 0) {
-        EXPECT_EQ(output_block[j], output_ref_block[j])
-            << "Error: 32x32 FDCT versions have mismatched coefficients";
-      } else {
-        EXPECT_GE(6, abs(output_block[j] - output_ref_block[j]))
-            << "Error: 32x32 FDCT rd has mismatched coefficients";
-      }
-      EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_ref_block[j]))
-          << "Error: 32x32 FDCT C has coefficient larger than 4*DCT_MAX_VALUE";
-      EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
-          << "Error: 32x32 FDCT has coefficient larger than "
-          << "4*DCT_MAX_VALUE";
-    }
-  }
-}
-
 TEST_P(Trans32x32Test, InverseAccuracy) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
   const int count_test_block = 1000;
@@ -313,33 +243,4 @@
   EXPECT_EQ(sum >> 3, output[0]);
 }
 
-using std::tr1::make_tuple;
-
-INSTANTIATE_TEST_CASE_P(
-    C, Trans32x32Test,
-    ::testing::Values(make_tuple(&aom_fdct32x32_c, &aom_idct32x32_1024_add_c, 0,
-                                 AOM_BITS_8),
-                      make_tuple(&aom_fdct32x32_rd_c, &aom_idct32x32_1024_add_c,
-                                 1, AOM_BITS_8)));
-
-#if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, Trans32x32Test,
-                        ::testing::Values(make_tuple(&aom_fdct32x32_sse2,
-                                                     &aom_idct32x32_1024_add_c,
-                                                     DCT_DCT, AOM_BITS_8),
-                                          make_tuple(&aom_fdct32x32_rd_sse2,
-                                                     &aom_idct32x32_1024_add_c,
-                                                     ADST_DCT, AOM_BITS_8)));
-#endif  // HAVE_SSE2
-
-#if HAVE_AVX2
-INSTANTIATE_TEST_CASE_P(
-    AVX2, Trans32x32Test,
-    ::testing::Values(make_tuple(&aom_fdct32x32_avx2,
-                                 &aom_idct32x32_1024_add_sse2, DCT_DCT,
-                                 AOM_BITS_8),
-                      make_tuple(&aom_fdct32x32_rd_avx2,
-                                 &aom_idct32x32_1024_add_sse2, ADST_DCT,
-                                 AOM_BITS_8)));
-#endif  // HAVE_AVX2
 }  // namespace
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 955b617..8178a2d 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -567,11 +567,6 @@
 
 using std::tr1::make_tuple;
 
-INSTANTIATE_TEST_CASE_P(C, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&aom_fdct8x8_c,
-                                                     &aom_idct8x8_64_add_c,
-                                                     DCT_DCT, AOM_BITS_8)));
-
 INSTANTIATE_TEST_CASE_P(
     C, FwdTrans8x8HT,
     ::testing::Values(
@@ -590,10 +585,6 @@
                    AOM_BITS_8)));
 
 #if HAVE_SSE2
-INSTANTIATE_TEST_CASE_P(SSE2, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&aom_fdct8x8_sse2,
-                                                     &aom_idct8x8_64_add_c,
-                                                     DCT_DCT, AOM_BITS_8)));
 INSTANTIATE_TEST_CASE_P(
     SSE2, FwdTrans8x8HT,
     ::testing::Values(make_tuple(&av1_fht8x8_sse2, &av1_iht8x8_64_add_c,
@@ -606,11 +597,4 @@
                                  ADST_ADST, AOM_BITS_8)));
 #endif  // HAVE_SSE2
 
-#if HAVE_SSSE3 && ARCH_X86_64
-INSTANTIATE_TEST_CASE_P(SSSE3, FwdTrans8x8DCT,
-                        ::testing::Values(make_tuple(&aom_fdct8x8_ssse3,
-                                                     &aom_idct8x8_64_add_ssse3,
-                                                     DCT_DCT, AOM_BITS_8)));
-#endif
-
 }  // namespace
diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc
index fc3d04b..060c76d 100644
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -54,33 +54,4 @@
   for (int i = 0; i < 64; ++i) output[i] *= 2;
 }
 
-TEST(AV1Idct8x8Test, AccuracyCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  const int count_test_block = 10000;
-  for (int i = 0; i < count_test_block; ++i) {
-    int16_t input[64];
-    tran_low_t coeff[64];
-    double output_r[64];
-    uint8_t dst[64], src[64];
-
-    for (int j = 0; j < 64; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-    }
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 64; ++j) input[j] = src[j] - dst[j];
-
-    reference_dct_2d(input, output_r);
-    for (int j = 0; j < 64; ++j)
-      coeff[j] = static_cast<tran_low_t>(round(output_r[j]));
-    aom_idct8x8_64_add_c(coeff, dst, 8);
-    for (int j = 0; j < 64; ++j) {
-      const int diff = dst[j] - src[j];
-      const int error = diff * diff;
-      EXPECT_GE(1, error) << "Error: 8x8 FDCT/IDCT has error " << error
-                          << " at index " << j;
-    }
-  }
-}
-
 }  // namespace