Move all of LBD Daala TX to up-4, down-1 shift
Now that tran_low_t is assumed to be 32 bit when Daala TX is active,
there's no reason for multi-stage shifting to fit coefficients into 16
bits for the inter-transform transpose matrix. Go to a consistent up by
four, down by one shifting scheme for all TX block sizes.
(Note this is for the current AV1 coefficient scaling scheme with
av1_get_tx_scale and deeper coefficients for higher bitdepth input.
Daala TX is moving to the long-intended constant-coefficient-depth in
upcoming patches).
subset 1:
monty-4-1-baseline-s1@2017-11-11T05:57:15.857Z ->
monty-4-1-test-s1@2017-11-11T05:57:52.983Z
PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000
-0.0117 | -0.0246 | 0.0530 | 0.0238 | 0.0254 | 0.0447 | -0.0442
Change-Id: I2214e94ac822542c504d472276723277ed350abf
diff --git a/av1/common/idct.c b/av1/common/idct.c
index dc07155..c3bde4c 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -1008,9 +1008,9 @@
for (int i = 0; i < n2; ++i) {
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
tran_low_t temp_in[16];
- for (int j = 0; j < n; j++) temp_in[j] = input[j] * 2;
+ for (int j = 0; j < n; j++) temp_in[j] = input[j] * 4;
IHT_16x32[tx_type].rows(temp_in, outtmp);
- for (int j = 0; j < n; ++j) tmp[j][i] = outtmp[j] * 4;
+ for (int j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
#else
IHT_16x32[tx_type].rows(input, outtmp);
for (int j = 0; j < n; ++j)
@@ -1030,7 +1030,7 @@
int d = i * stride + j;
int s = j * outstride + i;
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
#else
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
#endif
@@ -1095,9 +1095,9 @@
for (int i = 0; i < n; ++i) {
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
tran_low_t temp_in[32];
- for (int j = 0; j < n2; j++) temp_in[j] = input[j] * 2;
+ for (int j = 0; j < n2; j++) temp_in[j] = input[j] * 4;
IHT_32x16[tx_type].rows(temp_in, outtmp);
- for (int j = 0; j < n2; ++j) tmp[j][i] = outtmp[j] * 4;
+ for (int j = 0; j < n2; ++j) tmp[j][i] = outtmp[j];
#else
IHT_32x16[tx_type].rows(input, outtmp);
for (int j = 0; j < n2; ++j)
@@ -1117,7 +1117,7 @@
int d = i * stride + j;
int s = j * outstride + i;
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
#else
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
#endif
@@ -1360,7 +1360,7 @@
for (int i = 0; i < 32; ++i) {
#if CONFIG_DAALA_TX32
tran_low_t temp_in[32];
- for (int j = 0; j < 32; j++) temp_in[j] = input[j] * 2;
+ for (int j = 0; j < 32; j++) temp_in[j] = input[j] * 4;
IHT_32[tx_type].rows(temp_in, out[i]);
#else
IHT_32[tx_type].rows(input, out[i]);
@@ -1369,15 +1369,8 @@
}
// transpose
- for (int i = 0; i < 32; i++) {
- for (int j = 0; j < 32; j++) {
-#if CONFIG_DAALA_TX32
- tmp[j][i] = out[i][j] * 4;
-#else
- tmp[j][i] = out[i][j];
-#endif
- }
- }
+ for (int i = 0; i < 32; i++)
+ for (int j = 0; j < 32; j++) tmp[j][i] = out[i][j];
// inverse transform column vectors
for (int i = 0; i < 32; ++i) IHT_32[tx_type].cols(tmp[i], out[i]);
@@ -1390,7 +1383,7 @@
int d = i * stride + j;
int s = j * outstride + i;
#if CONFIG_DAALA_TX32
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
#else
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
#endif
@@ -1455,9 +1448,8 @@
for (int i = 0; i < 64; ++i) {
#if CONFIG_DAALA_TX64
tran_low_t temp_in[64];
- for (int j = 0; j < 64; j++) temp_in[j] = input[j] * 2;
+ for (int j = 0; j < 64; j++) temp_in[j] = input[j] * 8;
IHT_64[tx_type].rows(temp_in, out[i]);
-// Do not rescale intermediate for Daala
#else
IHT_64[tx_type].rows(input, out[i]);
for (int j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
@@ -1483,7 +1475,7 @@
int d = i * stride + j;
int s = j * outstride + i;
#if CONFIG_DAALA_TX64
- dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 2));
+ dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
#else
dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
#endif
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 6ae3080..43d059a 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -1965,7 +1965,7 @@
ht.rows(temp_in, temp_out);
for (j = 0; j < n; ++j) {
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+ out[j * n2 + i] = temp_out[j];
#else
out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
#endif
@@ -1976,7 +1976,12 @@
for (i = 0; i < n; ++i) {
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
ht.cols(temp_in, temp_out);
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+ for (j = 0; j < n2; ++j)
+ output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+#else
for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
+#endif
}
// Note: overall scale factor of transform is 4 times unitary
}
@@ -2049,7 +2054,7 @@
ht.cols(temp_in, temp_out);
for (j = 0; j < n; ++j) {
#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
- out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+ out[j * n2 + i] = temp_out[j];
#else
out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
#endif
@@ -2060,7 +2065,12 @@
for (i = 0; i < n; ++i) {
for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
ht.rows(temp_in, temp_out);
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+ for (j = 0; j < n2; ++j)
+ output[j + i * n2] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+#else
for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
+#endif
}
// Note: overall scale factor of transform is 4 times unitary
}
@@ -2489,8 +2499,7 @@
#if CONFIG_DAALA_TX64
for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i] * 16;
ht.cols(temp_in, temp_out);
- for (j = 0; j < 64; ++j)
- out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 3;
+ for (j = 0; j < 64; ++j) out[j * 64 + i] = temp_out[j];
#else
for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
@@ -2506,7 +2515,7 @@
ht.rows(temp_in, temp_out);
for (j = 0; j < 64; ++j)
#if CONFIG_DAALA_TX64
- output[j + i * 64] = temp_out[j];
+ output[j + i * 64] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 3);
#else
output[j + i * 64] =
(tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);