Add Daala TX to rectangular 32x64 and 64x32 transforms This patch adds Daala TX transforms to the 32x64 and 64x32 transform block sizes using Q3 (up 4, down 1) scaling. subset 1: monty-daalaTX-fulltest-Daalabaseline-s1@2017-11-07T00:01:46.582Z -> monty-daalaTX-LBD-Daala32x64-s1-Z@2017-11-07T06:10:58.523Z PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000 0.0112 | -0.0769 | 0.0799 | 0.0567 | 0.0099 | -0.0077 | -0.0446 objective 1 fast: monty-daalaTX-fulltest-Daalabaseline-o1f4@2017-11-07T05:59:16.553Z -> monty-daalaTX-LBD-Daala32x64-o1f4-Z@2017-11-07T06:10:11.519Z PSNR | PSNR Cb | PSNR Cr | PSNR HVS | SSIM | MS SSIM | CIEDE 2000 -0.0190 | 0.0926 | -0.0730 | -0.0516 | -0.0037 | -0.0588 | 0.1310 Change-Id: I6246ecba388ae81deadc7b306dc3404fa7869aab

diff --git a/av1/common/idct.c b/av1/common/idct.c
index 7124eda..dd4eada 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c

@@ -1768,6 +1768,24 @@
   assert(tx_type == DCT_DCT);
 #endif
   static const transform_2d IHT_64x32[] = {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+    { daala_idct32, daala_idct64 },  // DCT_DCT
+    { daala_idst32, daala_idct64 },  // ADST_DCT
+    { daala_idct32, daala_idst64 },  // DCT_ADST
+    { daala_idst32, daala_idst64 },  // ADST_ADST
+    { daala_idst32, daala_idct64 },  // FLIPADST_DCT
+    { daala_idct32, daala_idst64 },  // DCT_FLIPADST
+    { daala_idst32, daala_idst64 },  // FLIPADST_FLIPADST
+    { daala_idst32, daala_idst64 },  // ADST_FLIPADST
+    { daala_idst32, daala_idst64 },  // FLIPADST_ADST
+    { daala_idtx32, daala_idtx64 },  // IDTX
+    { daala_idct32, daala_idtx64 },  // V_DCT
+    { daala_idtx32, daala_idct64 },  // H_DCT
+    { daala_idst32, daala_idtx64 },  // V_ADST
+    { daala_idtx32, daala_idst64 },  // H_ADST
+    { daala_idst32, daala_idtx64 },  // V_FLIPADST
+    { daala_idtx32, daala_idst64 },  // H_FLIPADST
+#else
     { aom_idct32_c, idct64_row_c },      // DCT_DCT
     { ihalfright32_c, idct64_row_c },    // ADST_DCT
     { aom_idct32_c, ihalfright64_c },    // DCT_ADST
@@ -1784,6 +1802,7 @@
     { iidtx32_c, ihalfright64_c },       // H_ADST
     { ihalfright32_c, iidtx64_c },       // V_FLIPADST
     { iidtx32_c, ihalfright64_c },       // H_FLIPADST
+#endif
   };
   const int n = 32;
   const int n2 = 64;
@@ -1795,9 +1814,16 @@
 
   // inverse transform row vectors and transpose
   for (i = 0; i < n; ++i) {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+    tran_low_t temp_in[64];
+    for (j = 0; j < n2; j++) temp_in[j] = input[j] * 8;
+    IHT_64x32[tx_type].rows(temp_in, outtmp);
+    for (j = 0; j < n2; ++j) tmp[j][i] = outtmp[j];
+#else
     IHT_64x32[tx_type].rows(input, outtmp);
     for (j = 0; j < n2; ++j)
       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2);
+#endif
     input += n2;
   }
 
@@ -1811,7 +1837,11 @@
     for (j = 0; j < n2; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
+#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+#endif
     }
   }
 }
@@ -1826,6 +1856,24 @@
   assert(tx_type == DCT_DCT);
 #endif
   static const transform_2d IHT_32x64[] = {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+    { daala_idct64, daala_idct32 },  // DCT_DCT
+    { daala_idst64, daala_idct32 },  // ADST_DCT
+    { daala_idct64, daala_idst32 },  // DCT_ADST
+    { daala_idst64, daala_idst32 },  // ADST_ADST
+    { daala_idst64, daala_idct32 },  // FLIPADST_DCT
+    { daala_idct64, daala_idst32 },  // DCT_FLIPADST
+    { daala_idst64, daala_idst32 },  // FLIPADST_FLIPADST
+    { daala_idst64, daala_idst32 },  // ADST_FLIPADST
+    { daala_idst64, daala_idst32 },  // FLIPADST_ADST
+    { daala_idtx64, daala_idtx32 },  // IDTX
+    { daala_idct64, daala_idtx32 },  // V_DCT
+    { daala_idtx64, daala_idct32 },  // H_DCT
+    { daala_idst64, daala_idtx32 },  // V_ADST
+    { daala_idtx64, daala_idst32 },  // H_ADST
+    { daala_idst64, daala_idtx32 },  // V_FLIPADST
+    { daala_idtx64, daala_idst32 },  // H_FLIPADST
+#else
     { idct64_col_c, aom_idct32_c },      // DCT_DCT
     { ihalfright64_c, aom_idct32_c },    // ADST_DCT
     { idct64_col_c, ihalfright32_c },    // DCT_ADST
@@ -1842,6 +1890,7 @@
     { iidtx64_c, ihalfright32_c },       // H_ADST
     { ihalfright64_c, iidtx32_c },       // V_FLIPADST
     { iidtx64_c, ihalfright32_c },       // H_FLIPADST
+#endif
   };
 
   const int n = 32;
@@ -1853,9 +1902,16 @@
 
   // inverse transform row vectors and transpose
   for (i = 0; i < n2; ++i) {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+    tran_low_t temp_in[32];
+    for (j = 0; j < n; j++) temp_in[j] = input[j] * 8;
+    IHT_32x64[tx_type].rows(temp_in, outtmp);
+    for (j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
+#else
     IHT_32x64[tx_type].rows(input, outtmp);
     for (j = 0; j < n; ++j)
       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2);
+#endif
     input += n;
   }
 
@@ -1869,7 +1925,11 @@
     for (j = 0; j < n; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
+#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+#endif
     }
   }
 }
@@ -2158,12 +2218,20 @@
 #if CONFIG_TX64X64
 static void inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
                                int stride, const TxfmParam *txfm_param) {
+#if CONFIG_DAALA_TX64 && CONFIG_DAALA_TX32
+  av1_iht32x64_2048_add_c(input, dest, stride, txfm_param);
+#else
   av1_iht32x64_2048_add(input, dest, stride, txfm_param);
+#endif
 }
 
 static void inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
                                int stride, const TxfmParam *txfm_param) {
+#if CONFIG_DAALA_TX64 && CONFIG_DAALA_TX32
+  av1_iht64x32_2048_add_c(input, dest, stride, txfm_param);
+#else
   av1_iht64x32_2048_add(input, dest, stride, txfm_param);
+#endif
 }
 #endif  // CONFIG_TX64X64
 

diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 2db5214..d79d64a 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c

@@ -2713,6 +2713,24 @@
   assert(tx_type == DCT_DCT);
 #endif
   static const transform_2d FHT[] = {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+    { daala_fdct32, daala_fdct64 },  // DCT_DCT
+    { daala_fdst32, daala_fdct64 },  // ADST_DCT
+    { daala_fdct32, daala_fdst64 },  // DCT_ADST
+    { daala_fdst32, daala_fdst64 },  // ADST_ADST
+    { daala_fdst32, daala_fdct64 },  // FLIPADST_DCT
+    { daala_fdct32, daala_fdst64 },  // DCT_FLIPADST
+    { daala_fdst32, daala_fdst64 },  // FLIPADST_FLIPADST
+    { daala_fdst32, daala_fdst64 },  // ADST_FLIPADST
+    { daala_fdst32, daala_fdst64 },  // FLIPADST_ADST
+    { daala_idtx32, daala_idtx64 },  // IDTX
+    { daala_fdct32, daala_idtx64 },  // V_DCT
+    { daala_idtx32, daala_fdct64 },  // H_DCT
+    { daala_fdst32, daala_idtx64 },  // V_ADST
+    { daala_idtx32, daala_fdst64 },  // H_ADST
+    { daala_fdst32, daala_idtx64 },  // V_FLIPADST
+    { daala_idtx32, daala_fdst64 },  // H_FLIPADST
+#else
     { fdct32, fdct64_row },          // DCT_DCT
     { fhalfright32, fdct64_row },    // ADST_DCT
     { fdct32, fhalfright64 },        // DCT_ADST
@@ -2729,6 +2747,7 @@
     { fidtx32, fhalfright64 },       // H_ADST
     { fhalfright32, fidtx64 },       // V_FLIPADST
     { fidtx32, fhalfright64 },       // H_FLIPADST
+#endif
   };
   const transform_2d ht = FHT[tx_type];
   tran_low_t out[2048];
@@ -2741,20 +2760,36 @@
 
   // Columns
   for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
+    for (j = 0; j < n; ++j) {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+      temp_in[j] = input[j * stride + i] * 16;
+#else
       temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * Sqrt2);
+#endif
+    }
     ht.cols(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
+    for (j = 0; j < n; ++j) {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+      out[j * n2 + i] = temp_out[j];
+#else
       out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+#endif
+    }
   }
 
   // Rows
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < n2; ++j)
+    for (j = 0; j < n2; ++j) {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+      output[j + i * n2] =
+          (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 3);
+#else
       output[j + i * n2] =
           (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+#endif
+    }
   }
 
   // Zero out right 32x32 area.
@@ -2773,6 +2808,24 @@
   assert(tx_type == DCT_DCT);
 #endif
   static const transform_2d FHT[] = {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+    { daala_fdct64, daala_fdct32 },  // DCT_DCT
+    { daala_fdst64, daala_fdct32 },  // ADST_DCT
+    { daala_fdct64, daala_fdst32 },  // DCT_ADST
+    { daala_fdst64, daala_fdst32 },  // ADST_ADST
+    { daala_fdst64, daala_fdct32 },  // FLIPADST_DCT
+    { daala_fdct64, daala_fdst32 },  // DCT_FLIPADST
+    { daala_fdst64, daala_fdst32 },  // FLIPADST_FLIPADST
+    { daala_fdst64, daala_fdst32 },  // ADST_FLIPADST
+    { daala_fdst64, daala_fdst32 },  // FLIPADST_ADST
+    { daala_idtx64, daala_idtx32 },  // IDTX
+    { daala_fdct64, daala_idtx32 },  // V_DCT
+    { daala_idtx64, daala_fdct32 },  // H_DCT
+    { daala_fdst64, daala_idtx32 },  // V_ADST
+    { daala_idtx64, daala_fdst32 },  // H_ADST
+    { daala_fdst64, daala_idtx32 },  // V_FLIPADST
+    { daala_idtx64, daala_fdst32 },  // H_FLIPADST
+#else
     { fdct64_row, fdct32 },          // DCT_DCT
     { fhalfright64, fdct32 },        // ADST_DCT
     { fdct64_row, fhalfright32 },    // DCT_ADST
@@ -2789,6 +2842,7 @@
     { fidtx64, fhalfright32 },       // H_ADST
     { fhalfright64, fidtx32 },       // V_FLIPADST
     { fidtx64, fhalfright32 },       // H_FLIPADST
+#endif
   };
   const transform_2d ht = FHT[tx_type];
   tran_low_t out[32 * 64];
@@ -2801,19 +2855,34 @@
 
   // Rows
   for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
+    for (j = 0; j < n; ++j) {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+      temp_in[j] = input[i * stride + j] * 16;
+#else
       temp_in[j] = (tran_low_t)fdct_round_shift(input[i * stride + j] * Sqrt2);
+#endif
+    }
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
+    for (j = 0; j < n; ++j) {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+      out[j * n2 + i] = temp_out[j];
+#else
       out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+#endif
+    }
   }
 
   // Columns
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.cols(temp_in, temp_out);
-    for (j = 0; j < n2; ++j)
+    for (j = 0; j < n2; ++j) {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+      output[i + j * n] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 3);
+#else
       output[i + j * n] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+#endif
+    }
   }
 
   // Zero out the bottom 32x32 area.