Tune the inv_shift
Let the second stage of 10-bit inverse transforms fit within 16 bits.
Change-Id: Ia087d65484cd410651190dcd9d3292cce6594d34
diff --git a/av1/common/av1_inv_txfm2d.c b/av1/common/av1_inv_txfm2d.c
index c929540..bf7d6a1 100644
--- a/av1/common/av1_inv_txfm2d.c
+++ b/av1/common/av1_inv_txfm2d.c
@@ -246,14 +246,14 @@
};
static const int8_t inv_shift_4x4[2] = { 0, -4 };
-static const int8_t inv_shift_8x8[2] = { 0, -5 };
-static const int8_t inv_shift_16x16[2] = { -1, -5 };
-static const int8_t inv_shift_32x32[2] = { -1, -5 };
+static const int8_t inv_shift_8x8[2] = { -1, -4 };
+static const int8_t inv_shift_16x16[2] = { -2, -4 };
+static const int8_t inv_shift_32x32[2] = { -2, -4 };
#if CONFIG_TX64X64
-static const int8_t inv_shift_64x64[2] = { -1, -5 };
+static const int8_t inv_shift_64x64[2] = { -2, -4 };
#endif
-static const int8_t inv_shift_4x8[2] = { 0, -4 };
-static const int8_t inv_shift_8x4[2] = { 0, -4 };
+static const int8_t inv_shift_4x8[2] = { -1, -3 };
+static const int8_t inv_shift_8x4[2] = { -1, -3 };
static const int8_t inv_shift_8x16[2] = { -1, -4 };
static const int8_t inv_shift_16x8[2] = { -1, -4 };
static const int8_t inv_shift_16x32[2] = { -1, -4 };
@@ -264,11 +264,11 @@
#endif
static const int8_t inv_shift_4x16[2] = { -1, -4 };
static const int8_t inv_shift_16x4[2] = { -1, -4 };
-static const int8_t inv_shift_8x32[2] = { -1, -5 };
-static const int8_t inv_shift_32x8[2] = { -1, -5 };
+static const int8_t inv_shift_8x32[2] = { -2, -4 };
+static const int8_t inv_shift_32x8[2] = { -2, -4 };
#if CONFIG_TX64X64
-static const int8_t inv_shift_16x64[2] = { -1, -5 };
-static const int8_t inv_shift_64x16[2] = { -1, -5 };
+static const int8_t inv_shift_16x64[2] = { -2, -4 };
+static const int8_t inv_shift_64x16[2] = { -2, -4 };
#endif // CONFIG_TX64X64
const int8_t *inv_txfm_shift_ls[TX_SIZES_ALL] = {
diff --git a/av1/common/x86/highbd_inv_txfm_sse4.c b/av1/common/x86/highbd_inv_txfm_sse4.c
index d0d54b5..3e004e2 100644
--- a/av1/common/x86/highbd_inv_txfm_sse4.c
+++ b/av1/common/x86/highbd_inv_txfm_sse4.c
@@ -831,6 +831,7 @@
transpose_8x8(in, out);
idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
+ round_shift_8x8(out, -shift[0]);
idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
@@ -841,6 +842,7 @@
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
+ round_shift_8x8(out, -shift[0]);
idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
@@ -851,6 +853,7 @@
transpose_8x8(in, out);
idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
+ round_shift_8x8(out, -shift[0]);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
@@ -861,6 +864,7 @@
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
+ round_shift_8x8(out, -shift[0]);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 0, 0, -shift[1], bd);
break;
@@ -871,6 +875,7 @@
transpose_8x8(in, out);
idct8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
+ round_shift_8x8(out, -shift[0]);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
break;
@@ -881,6 +886,7 @@
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
+ round_shift_8x8(out, -shift[0]);
idct8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
break;
@@ -891,6 +897,7 @@
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
+ round_shift_8x8(out, -shift[0]);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 1, 0, -shift[1], bd);
break;
@@ -901,6 +908,7 @@
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
+ round_shift_8x8(out, -shift[0]);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 1, 1, -shift[1], bd);
break;
@@ -911,6 +919,7 @@
transpose_8x8(in, out);
iadst8x8_sse4_1(out, in, row_cfg->cos_bit[2]);
transpose_8x8(in, out);
+ round_shift_8x8(out, -shift[0]);
iadst8x8_sse4_1(out, in, col_cfg->cos_bit[2]);
write_buffer_8x8(in, output, stride, 0, 1, -shift[1], bd);
break;
diff --git a/test/av1_inv_txfm2d_test.cc b/test/av1_inv_txfm2d_test.cc
index 6a50723..fd7ed86 100644
--- a/test/av1_inv_txfm2d_test.cc
+++ b/test/av1_inv_txfm2d_test.cc
@@ -145,19 +145,19 @@
for (int t = 0; t < TX_TYPES; ++t) {
const TX_TYPE tx_type = static_cast<TX_TYPE>(t);
param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_4X4, 2, 0.002));
- param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_8X8, 2, 0.025));
+ param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_8X8, 2, 0.05));
param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_16X16, 2, 0.04));
param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_32X32, 4, 0.4));
#if CONFIG_TX64X64
if (tx_type == DCT_DCT) { // Other types not supported by these tx sizes.
- param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_64X64, 3, 0.2));
+ param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_64X64, 3, 0.3));
}
#endif // CONFIG_TX64X64
- param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_4X8, 2, 0.016));
- param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_8X4, 2, 0.045));
- param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_8X16, 2, 0.2));
- param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_16X8, 2, 0.2));
+ param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_4X8, 2, 0.09));
+ param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_8X4, 2, 0.11));
+ param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_8X16, 2, 0.03));
+ param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_16X8, 2, 0.06));
param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_16X32, 3, 0.4));
param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_32X16, 3, 0.5));
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 09e6e01..096a3c6 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -244,9 +244,9 @@
<< "Error: 8x8 FDCT/IDCT or FHT/IHT has an individual"
<< " roundtrip error > 1";
- EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 5, total_error)
+ EXPECT_GE((count_test_block << 2 * (bit_depth_ - 8)) / 4, total_error)
<< "Error: 8x8 FDCT/IDCT or FHT/IHT has average roundtrip "
- << "error > 1/5 per block";
+ << "error > 1/4 per block";
}
void RunExtremalCheck() {