Add Daala TX to rectangular 32x64 and 64x32 transforms This patch adds Daala TX transforms to the 32x64 and 64x32 transform block sizes using Q3 (up 4, down 1) scaling. subset 1: monty-daalaTX-fulltest-Daalabaseline-s1@2017-11-07T00:01:46.582Z -> monty-daalaTX-LBD-Daala32x64-s1-Z@2017-11-07T06:10:58.523Z PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000 0.0112 | -0.0769 | 0.0799 | 0.0567 | 0.0099 | -0.0077 | -0.0446 objective 1 fast: monty-daalaTX-fulltest-Daalabaseline-o1f4@2017-11-07T05:59:16.553Z -> monty-daalaTX-LBD-Daala32x64-o1f4-Z@2017-11-07T06:10:11.519Z PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000 -0.0190 | 0.0926 | -0.0730 | -0.0516 | -0.0037 | -0.0588 | 0.1310 Change-Id: I6246ecba388ae81deadc7b306dc3404fa7869aab
diff --git a/av1/common/idct.c b/av1/common/idct.c index 7124eda..dd4eada 100644 --- a/av1/common/idct.c +++ b/av1/common/idct.c
@@ -1768,6 +1768,24 @@ assert(tx_type == DCT_DCT); #endif static const transform_2d IHT_64x32[] = { +#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64 + { daala_idct32, daala_idct64 }, // DCT_DCT + { daala_idst32, daala_idct64 }, // ADST_DCT + { daala_idct32, daala_idst64 }, // DCT_ADST + { daala_idst32, daala_idst64 }, // ADST_ADST + { daala_idst32, daala_idct64 }, // FLIPADST_DCT + { daala_idct32, daala_idst64 }, // DCT_FLIPADST + { daala_idst32, daala_idst64 }, // FLIPADST_FLIPADST + { daala_idst32, daala_idst64 }, // ADST_FLIPADST + { daala_idst32, daala_idst64 }, // FLIPADST_ADST + { daala_idtx32, daala_idtx64 }, // IDTX + { daala_idct32, daala_idtx64 }, // V_DCT + { daala_idtx32, daala_idct64 }, // H_DCT + { daala_idst32, daala_idtx64 }, // V_ADST + { daala_idtx32, daala_idst64 }, // H_ADST + { daala_idst32, daala_idtx64 }, // V_FLIPADST + { daala_idtx32, daala_idst64 }, // H_FLIPADST +#else { aom_idct32_c, idct64_row_c }, // DCT_DCT { ihalfright32_c, idct64_row_c }, // ADST_DCT { aom_idct32_c, ihalfright64_c }, // DCT_ADST @@ -1784,6 +1802,7 @@ { iidtx32_c, ihalfright64_c }, // H_ADST { ihalfright32_c, iidtx64_c }, // V_FLIPADST { iidtx32_c, ihalfright64_c }, // H_FLIPADST +#endif }; const int n = 32; const int n2 = 64; @@ -1795,9 +1814,16 @@ // inverse transform row vectors and transpose for (i = 0; i < n; ++i) { +#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64 + tran_low_t temp_in[64]; + for (j = 0; j < n2; j++) temp_in[j] = input[j] * 8; + IHT_64x32[tx_type].rows(temp_in, outtmp); + for (j = 0; j < n2; ++j) tmp[j][i] = outtmp[j]; +#else IHT_64x32[tx_type].rows(input, outtmp); for (j = 0; j < n2; ++j) tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2); +#endif input += n2; } @@ -1811,7 +1837,11 @@ for (j = 0; j < n2; ++j) { int d = i * stride + j; int s = j * outstride + i; +#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64 + dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4)); +#else dest[d] = clip_pixel_add(dest[d], 
ROUND_POWER_OF_TWO(outp[s], 5)); +#endif } } } @@ -1826,6 +1856,24 @@ assert(tx_type == DCT_DCT); #endif static const transform_2d IHT_32x64[] = { +#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64 + { daala_idct64, daala_idct32 }, // DCT_DCT + { daala_idst64, daala_idct32 }, // ADST_DCT + { daala_idct64, daala_idst32 }, // DCT_ADST + { daala_idst64, daala_idst32 }, // ADST_ADST + { daala_idst64, daala_idct32 }, // FLIPADST_DCT + { daala_idct64, daala_idst32 }, // DCT_FLIPADST + { daala_idst64, daala_idst32 }, // FLIPADST_FLIPADST + { daala_idst64, daala_idst32 }, // ADST_FLIPADST + { daala_idst64, daala_idst32 }, // FLIPADST_ADST + { daala_idtx64, daala_idtx32 }, // IDTX + { daala_idct64, daala_idtx32 }, // V_DCT + { daala_idtx64, daala_idct32 }, // H_DCT + { daala_idst64, daala_idtx32 }, // V_ADST + { daala_idtx64, daala_idst32 }, // H_ADST + { daala_idst64, daala_idtx32 }, // V_FLIPADST + { daala_idtx64, daala_idst32 }, // H_FLIPADST +#else { idct64_col_c, aom_idct32_c }, // DCT_DCT { ihalfright64_c, aom_idct32_c }, // ADST_DCT { idct64_col_c, ihalfright32_c }, // DCT_ADST @@ -1842,6 +1890,7 @@ { iidtx64_c, ihalfright32_c }, // H_ADST { ihalfright64_c, iidtx32_c }, // V_FLIPADST { iidtx64_c, ihalfright32_c }, // H_FLIPADST +#endif }; const int n = 32; @@ -1853,9 +1902,16 @@ // inverse transform row vectors and transpose for (i = 0; i < n2; ++i) { +#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64 + tran_low_t temp_in[32]; + for (j = 0; j < n; j++) temp_in[j] = input[j] * 8; + IHT_32x64[tx_type].rows(temp_in, outtmp); + for (j = 0; j < n; ++j) tmp[j][i] = outtmp[j]; +#else IHT_32x64[tx_type].rows(input, outtmp); for (j = 0; j < n; ++j) tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2); +#endif input += n; } @@ -1869,7 +1925,11 @@ for (j = 0; j < n; ++j) { int d = i * stride + j; int s = j * outstride + i; +#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64 + dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4)); +#else dest[d] = clip_pixel_add(dest[d], 
ROUND_POWER_OF_TWO(outp[s], 5)); +#endif } } } @@ -2158,12 +2218,20 @@ #if CONFIG_TX64X64 static void inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { +#if CONFIG_DAALA_TX64 && CONFIG_DAALA_TX32 + av1_iht32x64_2048_add_c(input, dest, stride, txfm_param); +#else av1_iht32x64_2048_add(input, dest, stride, txfm_param); +#endif } static void inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest, int stride, const TxfmParam *txfm_param) { +#if CONFIG_DAALA_TX64 && CONFIG_DAALA_TX32 + av1_iht64x32_2048_add_c(input, dest, stride, txfm_param); +#else av1_iht64x32_2048_add(input, dest, stride, txfm_param); +#endif } #endif // CONFIG_TX64X64
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c index 2db5214..d79d64a 100644 --- a/av1/encoder/dct.c +++ b/av1/encoder/dct.c
@@ -2713,6 +2713,24 @@ assert(tx_type == DCT_DCT); #endif static const transform_2d FHT[] = { +#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64 + { daala_fdct32, daala_fdct64 }, // DCT_DCT + { daala_fdst32, daala_fdct64 }, // ADST_DCT + { daala_fdct32, daala_fdst64 }, // DCT_ADST + { daala_fdst32, daala_fdst64 }, // ADST_ADST + { daala_fdst32, daala_fdct64 }, // FLIPADST_DCT + { daala_fdct32, daala_fdst64 }, // DCT_FLIPADST + { daala_fdst32, daala_fdst64 }, // FLIPADST_FLIPADST + { daala_fdst32, daala_fdst64 }, // ADST_FLIPADST + { daala_fdst32, daala_fdst64 }, // FLIPADST_ADST + { daala_idtx32, daala_idtx64 }, // IDTX + { daala_fdct32, daala_idtx64 }, // V_DCT + { daala_idtx32, daala_fdct64 }, // H_DCT + { daala_fdst32, daala_idtx64 }, // V_ADST + { daala_idtx32, daala_fdst64 }, // H_ADST + { daala_fdst32, daala_idtx64 }, // V_FLIPADST + { daala_idtx32, daala_fdst64 }, // H_FLIPADST +#else { fdct32, fdct64_row }, // DCT_DCT { fhalfright32, fdct64_row }, // ADST_DCT { fdct32, fhalfright64 }, // DCT_ADST @@ -2729,6 +2747,7 @@ { fidtx32, fhalfright64 }, // H_ADST { fhalfright32, fidtx64 }, // V_FLIPADST { fidtx32, fhalfright64 }, // H_FLIPADST +#endif }; const transform_2d ht = FHT[tx_type]; tran_low_t out[2048]; @@ -2741,20 +2760,36 @@ // Columns for (i = 0; i < n2; ++i) { - for (j = 0; j < n; ++j) + for (j = 0; j < n; ++j) { +#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64 + temp_in[j] = input[j * stride + i] * 16; +#else temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * Sqrt2); +#endif + } ht.cols(temp_in, temp_out); - for (j = 0; j < n; ++j) + for (j = 0; j < n; ++j) { +#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64 + out[j * n2 + i] = temp_out[j]; +#else out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); +#endif + } } // Rows for (i = 0; i < n; ++i) { for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; ht.rows(temp_in, temp_out); - for (j = 0; j < n2; ++j) + for (j = 0; j < n2; ++j) { +#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64 + 
output[j + i * n2] = + (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 3); +#else output[j + i * n2] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); +#endif + } } // Zero out right 32x32 area. @@ -2773,6 +2808,24 @@ assert(tx_type == DCT_DCT); #endif static const transform_2d FHT[] = { +#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64 + { daala_fdct64, daala_fdct32 }, // DCT_DCT + { daala_fdst64, daala_fdct32 }, // ADST_DCT + { daala_fdct64, daala_fdst32 }, // DCT_ADST + { daala_fdst64, daala_fdst32 }, // ADST_ADST + { daala_fdst64, daala_fdct32 }, // FLIPADST_DCT + { daala_fdct64, daala_fdst32 }, // DCT_FLIPADST + { daala_fdst64, daala_fdst32 }, // FLIPADST_FLIPADST + { daala_fdst64, daala_fdst32 }, // ADST_FLIPADST + { daala_fdst64, daala_fdst32 }, // FLIPADST_ADST + { daala_idtx64, daala_idtx32 }, // IDTX + { daala_fdct64, daala_idtx32 }, // V_DCT + { daala_idtx64, daala_fdct32 }, // H_DCT + { daala_fdst64, daala_idtx32 }, // V_ADST + { daala_idtx64, daala_fdst32 }, // H_ADST + { daala_fdst64, daala_idtx32 }, // V_FLIPADST + { daala_idtx64, daala_fdst32 }, // H_FLIPADST +#else { fdct64_row, fdct32 }, // DCT_DCT { fhalfright64, fdct32 }, // ADST_DCT { fdct64_row, fhalfright32 }, // DCT_ADST @@ -2789,6 +2842,7 @@ { fidtx64, fhalfright32 }, // H_ADST { fhalfright64, fidtx32 }, // V_FLIPADST { fidtx64, fhalfright32 }, // H_FLIPADST +#endif }; const transform_2d ht = FHT[tx_type]; tran_low_t out[32 * 64]; @@ -2801,19 +2855,34 @@ // Rows for (i = 0; i < n2; ++i) { - for (j = 0; j < n; ++j) + for (j = 0; j < n; ++j) { +#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64 + temp_in[j] = input[i * stride + j] * 16; +#else temp_in[j] = (tran_low_t)fdct_round_shift(input[i * stride + j] * Sqrt2); +#endif + } ht.rows(temp_in, temp_out); - for (j = 0; j < n; ++j) + for (j = 0; j < n; ++j) { +#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64 + out[j * n2 + i] = temp_out[j]; +#else out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); +#endif + } } // Columns for (i = 
0; i < n; ++i) { for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2]; ht.cols(temp_in, temp_out); - for (j = 0; j < n2; ++j) + for (j = 0; j < n2; ++j) { +#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64 + output[i + j * n] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 3); +#else output[i + j * n] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2); +#endif + } } // Zero out the bottom 32x32 area.