Add Daala TX to 16x32 and 32x16 transforms

Rectangular 16x32 and 32x16 transforms will now use Daala TX when
CONFIG_DAALA_TX16 and CONFIG_DAALA_TX32 are both enabled.
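
As a point of reference, here is a minimal standalone sketch (not part
of the patch) of the rounding change on the inverse side: lowering the
final reconstruction shift from 6 to 5 is exactly equivalent to
doubling the coefficient before a shift of 6, which presumably absorbs
the power-of-two scaling the Daala wrappers apply around the 1-D row
transforms (x2 on input, x4 on output) in place of the generic path's
Sqrt2 rescale.  The RPOT macro is a local stand-in for the aom
ROUND_POWER_OF_TWO rounding, and the coefficient value is made up for
illustration.

    #include <assert.h>
    #include <stdint.h>

    /* Local stand-in for the aom ROUND_POWER_OF_TWO rounding. */
    #define RPOT(value, n) (((value) + (1 << ((n)-1))) >> (n))

    int main(void) {
      int32_t coeff = 300; /* hypothetical column-transform output */
      /* Shifting by 5 removes one bit less than shifting by 6, i.e. it
       * behaves as if the coefficient had been doubled first; that one
       * bit is the net power-of-two scale difference between the Daala
       * and generic coefficient conventions at this point. */
      assert(RPOT(coeff, 5) == RPOT(coeff * 2, 6));
      return 0;
    }

The forward hunks make the matching change on the encoder side: the
row input is pre-scaled by a plain 16 rather than
fdct_round_shift(x * 4 * Sqrt2), and the row rounding drops from 4
bits to 2.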

Change-Id: Iab3737605fa10dc09ceab18856a26165c502e6e5
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 0ac9612..358f558 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -1696,6 +1696,26 @@
   assert(tx_type == DCT_DCT);
 #endif
   static const transform_2d IHT_16x32[] = {
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+    { daala_idct32, daala_idct16 },  // DCT_DCT  = 0
+    { daala_idst32, daala_idct16 },  // ADST_DCT = 1
+    { daala_idct32, daala_idst16 },  // DCT_ADST = 2
+    { daala_idst32, daala_idst16 },  // ADST_ADST = 3
+#if CONFIG_EXT_TX
+    { daala_idst32, daala_idct16 },  // FLIPADST_DCT
+    { daala_idct32, daala_idst16 },  // DCT_FLIPADST
+    { daala_idst32, daala_idst16 },  // FLIPADST_FLIPADST
+    { daala_idst32, daala_idst16 },  // ADST_FLIPADST
+    { daala_idst32, daala_idst16 },  // FLIPADST_ADST
+    { daala_idtx32, daala_idtx16 },  // IDTX
+    { daala_idct32, daala_idtx16 },  // V_DCT
+    { daala_idtx32, daala_idct16 },  // H_DCT
+    { daala_idst32, daala_idtx16 },  // V_ADST
+    { daala_idtx32, daala_idst16 },  // H_ADST
+    { daala_idst32, daala_idtx16 },  // V_FLIPADST
+    { daala_idtx32, daala_idst16 },  // H_FLIPADST
+#endif
+#else
     { aom_idct32_c, aom_idct16_c },     // DCT_DCT
     { ihalfright32_c, aom_idct16_c },   // ADST_DCT
     { aom_idct32_c, aom_iadst16_c },    // DCT_ADST
@@ -1714,6 +1734,7 @@
     { ihalfright32_c, iidtx16_c },      // V_FLIPADST
     { iidtx32_c, aom_iadst16_c },       // H_FLIPADST
 #endif
+#endif
   };
 
   const int n = 16;
@@ -1725,9 +1746,16 @@
 
   // inverse transform row vectors and transpose
   for (i = 0; i < n2; ++i) {
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+    tran_low_t temp_in[16];
+    for (j = 0; j < n; j++) temp_in[j] = input[j] * 2;
+    IHT_16x32[tx_type].rows(temp_in, outtmp);
+    for (j = 0; j < n; ++j) tmp[j][i] = outtmp[j] * 4;
+#else
     IHT_16x32[tx_type].rows(input, outtmp);
     for (j = 0; j < n; ++j)
       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+#endif
     input += n;
   }
 
@@ -1743,7 +1771,11 @@
     for (j = 0; j < n; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+#endif
     }
   }
 }
@@ -1758,6 +1790,26 @@
   assert(tx_type == DCT_DCT);
 #endif
   static const transform_2d IHT_32x16[] = {
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+    { daala_idct16, daala_idct32 },  // DCT_DCT  = 0
+    { daala_idst16, daala_idct32 },  // ADST_DCT = 1
+    { daala_idct16, daala_idst32 },  // DCT_ADST = 2
+    { daala_idst16, daala_idst32 },  // ADST_ADST = 3
+#if CONFIG_EXT_TX
+    { daala_idst16, daala_idct32 },  // FLIPADST_DCT
+    { daala_idct16, daala_idst32 },  // DCT_FLIPADST
+    { daala_idst16, daala_idst32 },  // FLIPADST_FLIPADST
+    { daala_idst16, daala_idst32 },  // ADST_FLIPADST
+    { daala_idst16, daala_idst32 },  // FLIPADST_ADST
+    { daala_idtx16, daala_idtx32 },  // IDTX
+    { daala_idct16, daala_idtx32 },  // V_DCT
+    { daala_idtx16, daala_idct32 },  // H_DCT
+    { daala_idst16, daala_idtx32 },  // V_ADST
+    { daala_idtx16, daala_idst32 },  // H_ADST
+    { daala_idst16, daala_idtx32 },  // V_FLIPADST
+    { daala_idtx16, daala_idst32 },  // H_FLIPADST
+#endif
+#else
     { aom_idct16_c, aom_idct32_c },     // DCT_DCT
     { aom_iadst16_c, aom_idct32_c },    // ADST_DCT
     { aom_idct16_c, ihalfright32_c },   // DCT_ADST
@@ -1776,6 +1828,7 @@
     { aom_iadst16_c, iidtx32_c },       // V_FLIPADST
     { iidtx16_c, ihalfright32_c },      // H_FLIPADST
 #endif
+#endif
   };
   const int n = 16;
   const int n2 = 32;
@@ -1787,9 +1840,16 @@
 
   // inverse transform row vectors and transpose
   for (i = 0; i < n; ++i) {
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+    tran_low_t temp_in[32];
+    for (j = 0; j < n2; j++) temp_in[j] = input[j] * 2;
+    IHT_32x16[tx_type].rows(temp_in, outtmp);
+    for (j = 0; j < n2; ++j) tmp[j][i] = outtmp[j] * 4;
+#else
     IHT_32x16[tx_type].rows(input, outtmp);
     for (j = 0; j < n2; ++j)
       tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+#endif
     input += n2;
   }
 
@@ -1805,7 +1865,11 @@
     for (j = 0; j < n2; ++j) {
       int d = i * stride + j;
       int s = j * outstride + i;
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+#else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+#endif
     }
   }
 }
@@ -2623,12 +2687,20 @@
 
 static void inv_txfm_add_16x32(const tran_low_t *input, uint8_t *dest,
                                int stride, const TxfmParam *txfm_param) {
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+  av1_iht16x32_512_add_c(input, dest, stride, txfm_param);
+#else
   av1_iht16x32_512_add(input, dest, stride, txfm_param);
+#endif
 }
 
 static void inv_txfm_add_32x16(const tran_low_t *input, uint8_t *dest,
                                int stride, const TxfmParam *txfm_param) {
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+  av1_iht32x16_512_add_c(input, dest, stride, txfm_param);
+#else
   av1_iht32x16_512_add(input, dest, stride, txfm_param);
+#endif
 }
 
 #if CONFIG_TX64X64
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index ce4ca4d..c1a255b 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -2273,6 +2273,26 @@
   assert(tx_type == DCT_DCT);
 #endif
   static const transform_2d FHT[] = {
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+    { daala_fdct32, daala_fdct16 },  // DCT_DCT
+    { daala_fdst32, daala_fdct16 },  // ADST_DCT
+    { daala_fdct32, daala_fdst16 },  // DCT_ADST
+    { daala_fdst32, daala_fdst16 },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { daala_fdst32, daala_fdct16 },  // FLIPADST_DCT
+    { daala_fdct32, daala_fdst16 },  // DCT_FLIPADST
+    { daala_fdst32, daala_fdst16 },  // FLIPADST_FLIPADST
+    { daala_fdst32, daala_fdst16 },  // ADST_FLIPADST
+    { daala_fdst32, daala_fdst16 },  // FLIPADST_ADST
+    { daala_idtx32, daala_idtx16 },  // IDTX
+    { daala_fdct32, daala_idtx16 },  // V_DCT
+    { daala_idtx32, daala_fdct16 },  // H_DCT
+    { daala_fdst32, daala_idtx16 },  // V_ADST
+    { daala_idtx32, daala_fdst16 },  // H_ADST
+    { daala_fdst32, daala_idtx16 },  // V_FLIPADST
+    { daala_idtx32, daala_fdst16 },  // H_FLIPADST
+#endif
+#else
     { fdct32, fdct16 },         // DCT_DCT
     { fhalfright32, fdct16 },   // ADST_DCT
     { fdct32, fadst16 },        // DCT_ADST
@@ -2291,6 +2311,7 @@
     { fhalfright32, fidtx16 },  // V_FLIPADST
     { fidtx32, fadst16 },       // H_FLIPADST
 #endif
+#endif
   };
   const transform_2d ht = FHT[tx_type];
   const int n = 16;
@@ -2305,12 +2326,22 @@
 
   // Rows
   for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
+    for (j = 0; j < n; ++j) {
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+      temp_in[j] = input[i * stride + j] * 16;
+#else
       temp_in[j] =
           (tran_low_t)fdct_round_shift(input[i * stride + j] * 4 * Sqrt2);
+#endif
+    }
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
+    for (j = 0; j < n; ++j) {
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+      out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+#else
       out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
+#endif
+    }
   }
 
   // Columns
@@ -2332,6 +2363,26 @@
   assert(tx_type == DCT_DCT);
 #endif
   static const transform_2d FHT[] = {
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+    { daala_fdct16, daala_fdct32 },  // DCT_DCT
+    { daala_fdst16, daala_fdct32 },  // ADST_DCT
+    { daala_fdct16, daala_fdst32 },  // DCT_ADST
+    { daala_fdst16, daala_fdst32 },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { daala_fdst16, daala_fdct32 },  // FLIPADST_DCT
+    { daala_fdct16, daala_fdst32 },  // DCT_FLIPADST
+    { daala_fdst16, daala_fdst32 },  // FLIPADST_FLIPADST
+    { daala_fdst16, daala_fdst32 },  // ADST_FLIPADST
+    { daala_fdst16, daala_fdst32 },  // FLIPADST_ADST
+    { daala_idtx16, daala_idtx32 },  // IDTX
+    { daala_fdct16, daala_idtx32 },  // V_DCT
+    { daala_idtx16, daala_fdct32 },  // H_DCT
+    { daala_fdst16, daala_idtx32 },  // V_ADST
+    { daala_idtx16, daala_fdst32 },  // H_ADST
+    { daala_fdst16, daala_idtx32 },  // V_FLIPADST
+    { daala_idtx16, daala_fdst32 },  // H_FLIPADST
+#endif
+#else
     { fdct16, fdct32 },         // DCT_DCT
     { fadst16, fdct32 },        // ADST_DCT
     { fdct16, fhalfright32 },   // DCT_ADST
@@ -2350,6 +2401,7 @@
     { fadst16, fidtx32 },       // V_FLIPADST
     { fidtx16, fhalfright32 },  // H_FLIPADST
 #endif
+#endif
   };
   const transform_2d ht = FHT[tx_type];
   const int n = 16;
@@ -2364,12 +2416,22 @@
 
   // Columns
   for (i = 0; i < n2; ++i) {
-    for (j = 0; j < n; ++j)
+    for (j = 0; j < n; ++j) {
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+      temp_in[j] = input[j * stride + i] * 16;
+#else
       temp_in[j] =
           (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
+#endif
+    }
     ht.cols(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
+    for (j = 0; j < n; ++j) {
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+      out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+#else
       out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
+#endif
+    }
   }
 
   // Rows
diff --git a/av1/encoder/hybrid_fwd_txfm.c b/av1/encoder/hybrid_fwd_txfm.c
index 772231e..486c604 100644
--- a/av1/encoder/hybrid_fwd_txfm.c
+++ b/av1/encoder/hybrid_fwd_txfm.c
@@ -70,12 +70,20 @@
 
 static void fwd_txfm_16x32(const int16_t *src_diff, tran_low_t *coeff,
                            int diff_stride, TxfmParam *txfm_param) {
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+  av1_fht16x32_c(src_diff, coeff, diff_stride, txfm_param);
+#else
   av1_fht16x32(src_diff, coeff, diff_stride, txfm_param);
+#endif
 }
 
 static void fwd_txfm_32x16(const int16_t *src_diff, tran_low_t *coeff,
                            int diff_stride, TxfmParam *txfm_param) {
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+  av1_fht32x16_c(src_diff, coeff, diff_stride, txfm_param);
+#else
   av1_fht32x16(src_diff, coeff, diff_stride, txfm_param);
+#endif
 }
 
 static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,