Change scales of the fht 32x16, 16x32, and 32x32 functions
Performance drop with ext_tx and rect_tx enabled
BDRate:
lowres -0.028
midres -0.075
hdres -0.054
Change-Id: I50f89b9e9785d82ab05c3276a3c8b22b4dcfd408
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index cc76f3f..4a26124 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -1654,15 +1654,14 @@
(tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
ht.rows(temp_in, temp_out);
for (j = 0; j < n; ++j)
- out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+ out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
}
// Columns
for (i = 0; i < n; ++i) {
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
ht.cols(temp_in, temp_out);
- for (j = 0; j < n2; ++j)
- output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+ for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
}
// Note: overall scale factor of transform is 4 times unitary
}
@@ -1707,15 +1706,14 @@
(tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
ht.cols(temp_in, temp_out);
for (j = 0; j < n; ++j)
- out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+ out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
}
// Rows
for (i = 0; i < n; ++i) {
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
ht.rows(temp_in, temp_out);
- for (j = 0; j < n2; ++j)
- output[j + i * n2] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+ for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
}
// Note: overall scale factor of transform is 4 times unitary
}
@@ -2074,17 +2072,6 @@
}
#endif // CONFIG_AOM_HIGHBITDEPTH
-// TODO(luoyi): Adding this function to avoid DCT_DCT overflow.
-// Remove this function after we scale the column txfm output correctly.
-static INLINE int range_check_dct32x32(const int16_t *input, int16_t bound,
- int size) {
- int i;
- for (i = 0; i < size; ++i) {
- if (abs(input[i]) > bound) return 1;
- }
- return 0;
-}
-
void av1_fht32x32_c(const int16_t *input, tran_low_t *output, int stride,
int tx_type) {
static const transform_2d FHT[] = {
@@ -2117,27 +2104,19 @@
maybe_flip_input(&input, &stride, 32, 32, flipped_input, tx_type);
#endif
- if (DCT_DCT == tx_type) {
- if (range_check_dct32x32(input, (1 << 6) - 1, 1 << 10)) {
- aom_fdct32x32_c(input, output, stride);
- return;
- }
- }
// Columns
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) temp_in[j] = input[j * stride + i] * 4;
ht.cols(temp_in, temp_out);
for (j = 0; j < 32; ++j)
- out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+ out[j * 32 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
}
// Rows
for (i = 0; i < 32; ++i) {
for (j = 0; j < 32; ++j) temp_in[j] = out[j + i * 32];
ht.rows(temp_in, temp_out);
- for (j = 0; j < 32; ++j)
- output[j + i * 32] =
- (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+ for (j = 0; j < 32; ++j) output[j + i * 32] = temp_out[j];
}
}
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 9505f66..b7adac2 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -92,14 +92,14 @@
int diff_stride, TX_TYPE tx_type,
FWD_TXFM_OPT fwd_txfm_opt) {
(void)fwd_txfm_opt;
- av1_fht16x32(src_diff, coeff, diff_stride, tx_type);
+ av1_fht16x32_c(src_diff, coeff, diff_stride, tx_type);
}
static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
int diff_stride, TX_TYPE tx_type,
FWD_TXFM_OPT fwd_txfm_opt) {
(void)fwd_txfm_opt;
- av1_fht32x16(src_diff, coeff, diff_stride, tx_type);
+ av1_fht32x16_c(src_diff, coeff, diff_stride, tx_type);
}
static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
@@ -135,7 +135,7 @@
av1_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
else
#endif
- av1_fht32x32(src_diff, coeff, diff_stride, tx_type);
+ av1_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
}
#if CONFIG_TX64X64