Add Daala TX to rectangular 32x64 and 64x32 transforms

This patch adds Daala TX transforms to the 32x64 and 64x32 transform
block sizes using Q3 (up 4, down 1) scaling.

subset 1:
monty-daalaTX-fulltest-Daalabaseline-s1@2017-11-07T00:01:46.582Z ->
 monty-daalaTX-LBD-Daala32x64-s1-Z@2017-11-07T06:10:58.523Z

  PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
0.0112 | -0.0769 |  0.0799 |   0.0567 | 0.0099 | -0.0077 |    -0.0446

objective 1 fast:
monty-daalaTX-fulltest-Daalabaseline-o1f4@2017-11-07T05:59:16.553Z ->
 monty-daalaTX-LBD-Daala32x64-o1f4-Z@2017-11-07T06:10:11.519Z

   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |    SSIM | MS SSIM | CIEDE 2000
-0.0190 |  0.0926 | -0.0730 |  -0.0516 | -0.0037 | -0.0588 |     0.1310

Change-Id: I6246ecba388ae81deadc7b306dc3404fa7869aab
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 7124eda..dd4eada 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -1768,6 +1768,24 @@
   assert(tx_type == DCT_DCT);
 #endif
   static const transform_2d IHT_64x32[] = {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+    { daala_idct32, daala_idct64 },  // DCT_DCT
+    { daala_idst32, daala_idct64 },  // ADST_DCT
+    { daala_idct32, daala_idst64 },  // DCT_ADST
+    { daala_idst32, daala_idst64 },  // ADST_ADST
+    { daala_idst32, daala_idct64 },  // FLIPADST_DCT
+    { daala_idct32, daala_idst64 },  // DCT_FLIPADST
+    { daala_idst32, daala_idst64 },  // FLIPADST_FLIPADST
+    { daala_idst32, daala_idst64 },  // ADST_FLIPADST
+    { daala_idst32, daala_idst64 },  // FLIPADST_ADST
+    { daala_idtx32, daala_idtx64 },  // IDTX
+    { daala_idct32, daala_idtx64 },  // V_DCT
+    { daala_idtx32, daala_idct64 },  // H_DCT
+    { daala_idst32, daala_idtx64 },  // V_ADST
+    { daala_idtx32, daala_idst64 },  // H_ADST
+    { daala_idst32, daala_idtx64 },  // V_FLIPADST
+    { daala_idtx32, daala_idst64 },  // H_FLIPADST
+#else
     { aom_idct32_c, idct64_row_c },      // DCT_DCT
     { ihalfright32_c, idct64_row_c },    // ADST_DCT
     { aom_idct32_c, ihalfright64_c },    // DCT_ADST
@@ -1784,6 +1802,7 @@
     { iidtx32_c, ihalfright64_c },       // H_ADST
     { ihalfright32_c, iidtx64_c },       // V_FLIPADST
     { iidtx32_c, ihalfright64_c },       // H_FLIPADST
+#endif
   };
   const int n = 32;
   const int n2 = 64;
@@ -1795,9 +1814,16 @@
 
   // inverse transform row vectors and transpose
   for (i = 0; i < n; ++i) {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+    tran_low_t temp_in[64];
+    for (j = 0; j < n2; j++) temp_in[j] = input[j] * 8;
+    IHT_64x32[tx_type].rows(temp_in, outtmp);
+    for (j = 0; j < n2; ++j) tmp[j][i] = outtmp[j];
+#else
     IHT_64x32[tx_type].rows(input, outtmp);
     for (j = 0; j < n2; ++j)
       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2);
+#endif
     input += n2;
   }
 
@@ -1811,7 +1837,11 @@
     for (j = 0; j < n2; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
+#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+#endif
     }
   }
 }
@@ -1826,6 +1856,24 @@
   assert(tx_type == DCT_DCT);
 #endif
   static const transform_2d IHT_32x64[] = {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+    { daala_idct64, daala_idct32 },  // DCT_DCT
+    { daala_idst64, daala_idct32 },  // ADST_DCT
+    { daala_idct64, daala_idst32 },  // DCT_ADST
+    { daala_idst64, daala_idst32 },  // ADST_ADST
+    { daala_idst64, daala_idct32 },  // FLIPADST_DCT
+    { daala_idct64, daala_idst32 },  // DCT_FLIPADST
+    { daala_idst64, daala_idst32 },  // FLIPADST_FLIPADST
+    { daala_idst64, daala_idst32 },  // ADST_FLIPADST
+    { daala_idst64, daala_idst32 },  // FLIPADST_ADST
+    { daala_idtx64, daala_idtx32 },  // IDTX
+    { daala_idct64, daala_idtx32 },  // V_DCT
+    { daala_idtx64, daala_idct32 },  // H_DCT
+    { daala_idst64, daala_idtx32 },  // V_ADST
+    { daala_idtx64, daala_idst32 },  // H_ADST
+    { daala_idst64, daala_idtx32 },  // V_FLIPADST
+    { daala_idtx64, daala_idst32 },  // H_FLIPADST
+#else
     { idct64_col_c, aom_idct32_c },      // DCT_DCT
     { ihalfright64_c, aom_idct32_c },    // ADST_DCT
     { idct64_col_c, ihalfright32_c },    // DCT_ADST
@@ -1842,6 +1890,7 @@
     { iidtx64_c, ihalfright32_c },       // H_ADST
     { ihalfright64_c, iidtx32_c },       // V_FLIPADST
     { iidtx64_c, ihalfright32_c },       // H_FLIPADST
+#endif
   };
 
   const int n = 32;
@@ -1853,9 +1902,16 @@
 
   // inverse transform row vectors and transpose
   for (i = 0; i < n2; ++i) {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+    tran_low_t temp_in[32];
+    for (j = 0; j < n; j++) temp_in[j] = input[j] * 8;
+    IHT_32x64[tx_type].rows(temp_in, outtmp);
+    for (j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
+#else
     IHT_32x64[tx_type].rows(input, outtmp);
     for (j = 0; j < n; ++j)
       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2);
+#endif
     input += n;
   }
 
@@ -1869,7 +1925,11 @@
     for (j = 0; j < n; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
+#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+#endif
     }
   }
 }
@@ -2158,12 +2218,20 @@
 #if CONFIG_TX64X64
 static void inv_txfm_add_32x64(const tran_low_t *input, uint8_t *dest,
                                int stride, const TxfmParam *txfm_param) {
+#if CONFIG_DAALA_TX64 && CONFIG_DAALA_TX32
+  av1_iht32x64_2048_add_c(input, dest, stride, txfm_param);
+#else
   av1_iht32x64_2048_add(input, dest, stride, txfm_param);
+#endif
 }
 
 static void inv_txfm_add_64x32(const tran_low_t *input, uint8_t *dest,
                                int stride, const TxfmParam *txfm_param) {
+#if CONFIG_DAALA_TX64 && CONFIG_DAALA_TX32
+  av1_iht64x32_2048_add_c(input, dest, stride, txfm_param);
+#else
   av1_iht64x32_2048_add(input, dest, stride, txfm_param);
+#endif
 }
 #endif  // CONFIG_TX64X64
 
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 2db5214..d79d64a 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -2713,6 +2713,24 @@
   assert(tx_type == DCT_DCT);
 #endif
   static const transform_2d FHT[] = {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+    { daala_fdct32, daala_fdct64 },  // DCT_DCT
+    { daala_fdst32, daala_fdct64 },  // ADST_DCT
+    { daala_fdct32, daala_fdst64 },  // DCT_ADST
+    { daala_fdst32, daala_fdst64 },  // ADST_ADST
+    { daala_fdst32, daala_fdct64 },  // FLIPADST_DCT
+    { daala_fdct32, daala_fdst64 },  // DCT_FLIPADST
+    { daala_fdst32, daala_fdst64 },  // FLIPADST_FLIPADST
+    { daala_fdst32, daala_fdst64 },  // ADST_FLIPADST
+    { daala_fdst32, daala_fdst64 },  // FLIPADST_ADST
+    { daala_idtx32, daala_idtx64 },  // IDTX
+    { daala_fdct32, daala_idtx64 },  // V_DCT
+    { daala_idtx32, daala_fdct64 },  // H_DCT
+    { daala_fdst32, daala_idtx64 },  // V_ADST
+    { daala_idtx32, daala_fdst64 },  // H_ADST
+    { daala_fdst32, daala_idtx64 },  // V_FLIPADST
+    { daala_idtx32, daala_fdst64 },  // H_FLIPADST
+#else
     { fdct32, fdct64_row },          // DCT_DCT
     { fhalfright32, fdct64_row },    // ADST_DCT
     { fdct32, fhalfright64 },        // DCT_ADST
@@ -2729,6 +2747,7 @@
     { fidtx32, fhalfright64 },       // H_ADST
     { fhalfright32, fidtx64 },       // V_FLIPADST
     { fidtx32, fhalfright64 },       // H_FLIPADST
+#endif
   };
   const transform_2d ht = FHT[tx_type];
   tran_low_t out[2048];
@@ -2741,20 +2760,36 @@
 
   // Columns
   for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
+    for (j = 0; j < n; ++j) {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+      temp_in[j] = input[j * stride + i] * 16;
+#else
       temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * Sqrt2);
+#endif
+    }
     ht.cols(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
+    for (j = 0; j < n; ++j) {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+      out[j * n2 + i] = temp_out[j];
+#else
       out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+#endif
+    }
   }
 
   // Rows
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < n2; ++j)
+    for (j = 0; j < n2; ++j) {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+      output[j + i * n2] =
+          (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 3);
+#else
       output[j + i * n2] =
           (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+#endif
+    }
   }
 
   // Zero out right 32x32 area.
@@ -2773,6 +2808,24 @@
   assert(tx_type == DCT_DCT);
 #endif
   static const transform_2d FHT[] = {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+    { daala_fdct64, daala_fdct32 },  // DCT_DCT
+    { daala_fdst64, daala_fdct32 },  // ADST_DCT
+    { daala_fdct64, daala_fdst32 },  // DCT_ADST
+    { daala_fdst64, daala_fdst32 },  // ADST_ADST
+    { daala_fdst64, daala_fdct32 },  // FLIPADST_DCT
+    { daala_fdct64, daala_fdst32 },  // DCT_FLIPADST
+    { daala_fdst64, daala_fdst32 },  // FLIPADST_FLIPADST
+    { daala_fdst64, daala_fdst32 },  // ADST_FLIPADST
+    { daala_fdst64, daala_fdst32 },  // FLIPADST_ADST
+    { daala_idtx64, daala_idtx32 },  // IDTX
+    { daala_fdct64, daala_idtx32 },  // V_DCT
+    { daala_idtx64, daala_fdct32 },  // H_DCT
+    { daala_fdst64, daala_idtx32 },  // V_ADST
+    { daala_idtx64, daala_fdst32 },  // H_ADST
+    { daala_fdst64, daala_idtx32 },  // V_FLIPADST
+    { daala_idtx64, daala_fdst32 },  // H_FLIPADST
+#else
     { fdct64_row, fdct32 },          // DCT_DCT
     { fhalfright64, fdct32 },        // ADST_DCT
     { fdct64_row, fhalfright32 },    // DCT_ADST
@@ -2789,6 +2842,7 @@
     { fidtx64, fhalfright32 },       // H_ADST
     { fhalfright64, fidtx32 },       // V_FLIPADST
     { fidtx64, fhalfright32 },       // H_FLIPADST
+#endif
   };
   const transform_2d ht = FHT[tx_type];
   tran_low_t out[32 * 64];
@@ -2801,19 +2855,34 @@
 
   // Rows
   for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
+    for (j = 0; j < n; ++j) {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+      temp_in[j] = input[i * stride + j] * 16;
+#else
       temp_in[j] = (tran_low_t)fdct_round_shift(input[i * stride + j] * Sqrt2);
+#endif
+    }
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
+    for (j = 0; j < n; ++j) {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+      out[j * n2 + i] = temp_out[j];
+#else
       out[j * n2 + i] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+#endif
+    }
   }
 
   // Columns
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.cols(temp_in, temp_out);
-    for (j = 0; j < n2; ++j)
+    for (j = 0; j < n2; ++j) {
+#if CONFIG_DAALA_TX32 && CONFIG_DAALA_TX64
+      output[i + j * n] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 3);
+#else
       output[i + j * n] = (tran_low_t)ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+#endif
+    }
   }
 
   // Zero out the bottom 32x32 area.