Change scales of fht 32x16, 16x32 and 32x32 functions

Performance drop with ext_tx and rect_tx enabled:
       BDRate
lowres -0.028
midres -0.075
hdres  -0.054

Change-Id: I50f89b9e9785d82ab05c3276a3c8b22b4dcfd408
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index cc76f3f..4a26124 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -1654,15 +1654,14 @@
           (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
     ht.rows(temp_in, temp_out);
     for (j = 0; j < n; ++j)
-      out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+      out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
   }
 
   // Columns
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.cols(temp_in, temp_out);
-    for (j = 0; j < n2; ++j)
-      output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+    for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
   }
   // Note: overall scale factor of transform is 4 times unitary
 }
@@ -1707,15 +1706,14 @@
           (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
     ht.cols(temp_in, temp_out);
     for (j = 0; j < n; ++j)
-      out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+      out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
   }
 
   // Rows
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < n2; ++j)
-      output[j + i * n2] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+    for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
   }
   // Note: overall scale factor of transform is 4 times unitary
 }
@@ -2074,17 +2072,6 @@
 }
 #endif  // CONFIG_AOM_HIGHBITDEPTH
 
-// TODO(luoyi): Adding this function to avoid DCT_DCT overflow.
-// Remove this function after we scale the column txfm output correctly.
-static INLINE int range_check_dct32x32(const int16_t *input, int16_t bound,
-                                       int size) {
-  int i;
-  for (i = 0; i < size; ++i) {
-    if (abs(input[i]) > bound) return 1;
-  }
-  return 0;
-}
-
 void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type) {
   static const transform_2d FHT[] = {
@@ -2117,27 +2104,19 @@
   maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type);
 #endif
 
-  if (DCT_DCT == tx_type) {
-    if (range_check_dct32x32(input, (1 << 6) - 1, 1 << 10)) {
-      aom_fdct32x32_c(input, output, stride);
-      return;
-    }
-  }
   // Columns
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
-      out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+      out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
   }
 
   // Rows
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < 32; ++j)
-      output[j + i * 32] =
-          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+    for (j = 0; j < 32; ++j) output[j + i * 32] = temp_out[j];
   }
 }
 
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 9505f66..b7adac2 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -92,14 +92,14 @@
                            int diff_stride, TX_TYPE tx_type,
                            FWD_TXFM_OPT fwd_txfm_opt) {
   (void)fwd_txfm_opt;
-  av1_fht16x32(src_diff, coeff, diff_stride, tx_type);
+  av1_fht16x32_c(src_diff, coeff, diff_stride, tx_type);
 }
 
 static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
                            int diff_stride, TX_TYPE tx_type,
                            FWD_TXFM_OPT fwd_txfm_opt) {
   (void)fwd_txfm_opt;
-  av1_fht32x16(src_diff, coeff, diff_stride, tx_type);
+  av1_fht32x16_c(src_diff, coeff, diff_stride, tx_type);
 }
 
 static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
@@ -135,7 +135,7 @@
     av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
   else
 #endif
-    av1_fht32x32(src_diff, coeff, diff_stride, tx_type);
+    av1_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
 }
 
 #if CONFIG_TX64X64