Misc fixes for 32x64 and 64x32 transforms

- Add InvSqrt2 and use it (instead of Sqrt2) to scale the intermediate
  rows of the 32x64 and 64x32 inverse hybrid transforms.
- Add 64-point identity transforms and their 1D configs, and guard the
  64-point DCT prototypes and configs, under CONFIG_TX64X64.
- Select the ext-tx set from the squared-up tx size so that 32x64 and
  64x32 blocks use DCT_DCT only.
- Fix the 32X64/64X32 entries in the common_data.h lookup tables.
- Make av1_inv_txfm2d_add_64x32_c use TX_64X32 (via a transposed
  TX_32X64 pass under CONFIG_TXMG) instead of TX_32X64.
- Append the missing trailing 0, 0 pair to a scan neighbors table.

Change-Id: Ic843e99bd9b79cb9a0a26b95e3a48717ff2ec2a5
diff --git a/aom_dsp/txfm_common.h b/aom_dsp/txfm_common.h
index c33e38e..2bf0403 100644
--- a/aom_dsp/txfm_common.h
+++ b/aom_dsp/txfm_common.h
@@ -91,6 +91,7 @@
 
 // 16384 * sqrt(2)
 static const tran_high_t Sqrt2 = 23170;
+static const tran_high_t InvSqrt2 = 11585;  // 16384 / sqrt(2)
 
 static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
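
Note on the new constant: like Sqrt2 just above it, InvSqrt2 is the Q14 encoding of its real value, i.e. 16384 / sqrt(2) = 11585.24..., which rounds to 11585. A throwaway standalone check (illustrative only, plain C, compile with -lm):

#include <assert.h>
#include <math.h>

int main(void) {
  assert(lround(16384 * sqrt(2.0)) == 23170); /* Sqrt2    */
  assert(lround(16384 / sqrt(2.0)) == 11585); /* InvSqrt2 */
  return 0;
}
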
diff --git a/av1/common/av1_fwd_txfm1d.c b/av1/common/av1_fwd_txfm1d.c
index cfe2741..c9c7f43 100644
--- a/av1/common/av1_fwd_txfm1d.c
+++ b/av1/common/av1_fwd_txfm1d.c
@@ -1547,6 +1547,16 @@
   for (int i = 0; i < 32; ++i) output[i] = input[i] * 4;
   range_check(0, input, output, 32, stage_range[0]);
 }
+
+#if CONFIG_TX64X64
+void av1_fidentity64_c(const int32_t *input, int32_t *output,
+                       const int8_t *cos_bit, const int8_t *stage_range) {
+  (void)cos_bit;
+  for (int i = 0; i < 64; ++i)
+    output[i] = (int32_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
+  range_check(0, input, output, 64, stage_range[0]);
+}
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_EXT_TX
 
 #if CONFIG_TX64X64
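
Relative to av1_fidentity32_c's plain x4 scaling, the new 64-point identity scales by 4*sqrt(2), realized in Q14 as dct_const_round_shift(input[i] * 4 * Sqrt2). A minimal standalone sketch checking that fixed-point form against the real-valued scale (local stand-in names, compile with -lm):

#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <stdlib.h>

#define DCT_CONST_BITS 14
static const int64_t kSqrt2 = 23170; /* 16384 * sqrt(2) */

static int64_t dct_round_shift_q14(int64_t x) {
  return (x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
}

int main(void) {
  for (int32_t x = -1024; x <= 1024; ++x) {
    int64_t fixed = dct_round_shift_q14((int64_t)x * 4 * kSqrt2);
    int64_t real = llround(x * 4.0 * sqrt(2.0));
    assert(llabs(fixed - real) <= 1); /* agree up to one rounding step */
  }
  return 0;
}
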
diff --git a/av1/common/av1_fwd_txfm1d.h b/av1/common/av1_fwd_txfm1d.h
index f641930..f880239 100644
--- a/av1/common/av1_fwd_txfm1d.h
+++ b/av1/common/av1_fwd_txfm1d.h
@@ -26,8 +26,10 @@
                     const int8_t *cos_bit, const int8_t *stage_range);
 void av1_fdct32_new(const int32_t *input, int32_t *output,
                     const int8_t *cos_bit, const int8_t *stage_range);
+#if CONFIG_TX64X64
 void av1_fdct64_new(const int32_t *input, int32_t *output,
                     const int8_t *cos_bit, const int8_t *stage_range);
+#endif  // CONFIG_TX64X64
 
 void av1_fadst4_new(const int32_t *input, int32_t *output,
                     const int8_t *cos_bit, const int8_t *stage_range);
@@ -46,6 +48,10 @@
                        const int8_t *cos_bit, const int8_t *stage_range);
 void av1_fidentity32_c(const int32_t *input, int32_t *output,
                        const int8_t *cos_bit, const int8_t *stage_range);
+#if CONFIG_TX64X64
+void av1_fidentity64_c(const int32_t *input, int32_t *output,
+                       const int8_t *cos_bit, const int8_t *stage_range);
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_EXT_TX
 
 #ifdef __cplusplus
diff --git a/av1/common/av1_fwd_txfm2d.c b/av1/common/av1_fwd_txfm2d.c
index 650b5bd..d4ff86b 100644
--- a/av1/common/av1_fwd_txfm2d.c
+++ b/av1/common/av1_fwd_txfm2d.c
@@ -36,6 +36,9 @@
     case TXFM_TYPE_IDENTITY8: return av1_fidentity8_c;
     case TXFM_TYPE_IDENTITY16: return av1_fidentity16_c;
     case TXFM_TYPE_IDENTITY32: return av1_fidentity32_c;
+#if CONFIG_TX64X64
+    case TXFM_TYPE_IDENTITY64: return av1_fidentity64_c;
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_EXT_TX
     default: assert(0); return NULL;
   }
diff --git a/av1/common/av1_inv_txfm1d.c b/av1/common/av1_inv_txfm1d.c
index 3bd8686..51f4b63 100644
--- a/av1/common/av1_inv_txfm1d.c
+++ b/av1/common/av1_inv_txfm1d.c
@@ -1593,6 +1593,16 @@
   for (int i = 0; i < 32; ++i) output[i] = input[i] * 4;
   range_check(0, input, output, 32, stage_range[0]);
 }
+
+#if CONFIG_TX64X64
+void av1_iidentity64_c(const int32_t *input, int32_t *output,
+                       const int8_t *cos_bit, const int8_t *stage_range) {
+  (void)cos_bit;
+  for (int i = 0; i < 64; ++i)
+    output[i] = (int32_t)dct_const_round_shift(input[i] * 4 * Sqrt2);
+  range_check(0, input, output, 64, stage_range[0]);
+}
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_EXT_TX
 
 #if CONFIG_TX64X64
diff --git a/av1/common/av1_inv_txfm1d.h b/av1/common/av1_inv_txfm1d.h
index 037a3c6..8996f7c 100644
--- a/av1/common/av1_inv_txfm1d.h
+++ b/av1/common/av1_inv_txfm1d.h
@@ -26,8 +26,10 @@
                     const int8_t *cos_bit, const int8_t *stage_range);
 void av1_idct32_new(const int32_t *input, int32_t *output,
                     const int8_t *cos_bit, const int8_t *stage_range);
+#if CONFIG_TX64X64
 void av1_idct64_new(const int32_t *input, int32_t *output,
                     const int8_t *cos_bit, const int8_t *stage_range);
+#endif  // CONFIG_TX64X64
 
 void av1_iadst4_new(const int32_t *input, int32_t *output,
                     const int8_t *cos_bit, const int8_t *stage_range);
@@ -46,6 +48,10 @@
                        const int8_t *cos_bit, const int8_t *stage_range);
 void av1_iidentity32_c(const int32_t *input, int32_t *output,
                        const int8_t *cos_bit, const int8_t *stage_range);
+#if CONFIG_TX64X64
+void av1_iidentity64_c(const int32_t *input, int32_t *output,
+                       const int8_t *cos_bit, const int8_t *stage_range);
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_EXT_TX
 
 #ifdef __cplusplus
diff --git a/av1/common/av1_inv_txfm1d_cfg.h b/av1/common/av1_inv_txfm1d_cfg.h
index 78c9585..aeb2aec 100644
--- a/av1/common/av1_inv_txfm1d_cfg.h
+++ b/av1/common/av1_inv_txfm1d_cfg.h
@@ -173,6 +173,7 @@
   TXFM_TYPE_DCT32              // .txfm_type
 };
 
+#if CONFIG_TX64X64
 //  ---------------- row config inv_dct_64 ----------------
 static const TXFM_1D_CFG inv_txfm_1d_row_cfg_dct_64 = {
   64,                          // .txfm_size
@@ -182,6 +183,7 @@
   inv_cos_bit_row_dct_64,      // .cos_bit
   TXFM_TYPE_DCT64,             // .txfm_type_col
 };
+#endif  // CONFIG_TX64X64
 
 //  ---------------- row config inv_adst_4 ----------------
 static const TXFM_1D_CFG inv_txfm_1d_row_cfg_adst_4 = {
@@ -353,5 +355,17 @@
   NULL,                    // .cos_bit
   TXFM_TYPE_IDENTITY32,    // .txfm_type
 };
+
+#if CONFIG_TX64X64
+//  ---------------- row/col config inv_identity_64 ----------------
+static const TXFM_1D_CFG inv_txfm_1d_cfg_identity_64 = {
+  64,                      // .txfm_size
+  1,                       // .stage_num
+  inv_shift_64,            // .shift
+  inv_stage_range_idx_64,  // .stage_range
+  NULL,                    // .cos_bit
+  TXFM_TYPE_IDENTITY64,    // .txfm_type
+};
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_EXT_TX
 #endif  // AV1_INV_TXFM2D_CFG_H_
diff --git a/av1/common/av1_inv_txfm2d.c b/av1/common/av1_inv_txfm2d.c
index 86d16b3..2c01f46 100644
--- a/av1/common/av1_inv_txfm2d.c
+++ b/av1/common/av1_inv_txfm2d.c
@@ -34,6 +34,9 @@
     case TXFM_TYPE_IDENTITY8: return av1_iidentity8_c;
     case TXFM_TYPE_IDENTITY16: return av1_iidentity16_c;
     case TXFM_TYPE_IDENTITY32: return av1_iidentity32_c;
+#if CONFIG_TX64X64
+    case TXFM_TYPE_IDENTITY64: return av1_iidentity64_c;
+#endif  // CONFIG_TX64X64
 #endif  // CONFIG_EXT_TX
     default: assert(0); return NULL;
   }
@@ -46,14 +49,22 @@
       NULL,
 #endif
       &inv_txfm_1d_col_cfg_dct_4, &inv_txfm_1d_col_cfg_dct_8,
-      &inv_txfm_1d_col_cfg_dct_16, &inv_txfm_1d_col_cfg_dct_32 },
+      &inv_txfm_1d_col_cfg_dct_16, &inv_txfm_1d_col_cfg_dct_32,
+#if CONFIG_TX64X64
+      &inv_txfm_1d_col_cfg_dct_64
+#endif  // CONFIG_TX64X64
+  },
   // ADST
   {
 #if CONFIG_CHROMA_2X2
       NULL,
 #endif
       &inv_txfm_1d_col_cfg_adst_4, &inv_txfm_1d_col_cfg_adst_8,
-      &inv_txfm_1d_col_cfg_adst_16, &inv_txfm_1d_col_cfg_adst_32 },
+      &inv_txfm_1d_col_cfg_adst_16, &inv_txfm_1d_col_cfg_adst_32,
+#if CONFIG_TX64X64
+      NULL
+#endif  // CONFIG_TX64X64
+  },
 #if CONFIG_EXT_TX
   // FLIPADST
   {
@@ -61,14 +72,22 @@
       NULL,
 #endif
       &inv_txfm_1d_col_cfg_adst_4, &inv_txfm_1d_col_cfg_adst_8,
-      &inv_txfm_1d_col_cfg_adst_16, &inv_txfm_1d_col_cfg_adst_32 },
+      &inv_txfm_1d_col_cfg_adst_16, &inv_txfm_1d_col_cfg_adst_32,
+#if CONFIG_TX64X64
+      NULL
+#endif  // CONFIG_TX64X64
+  },
   // IDENTITY
   {
 #if CONFIG_CHROMA_2X2
       NULL,
 #endif
       &inv_txfm_1d_cfg_identity_4, &inv_txfm_1d_cfg_identity_8,
-      &inv_txfm_1d_cfg_identity_16, &inv_txfm_1d_cfg_identity_32 },
+      &inv_txfm_1d_cfg_identity_16, &inv_txfm_1d_cfg_identity_32,
+#if CONFIG_TX64X64
+      &inv_txfm_1d_cfg_identity_64
+#endif  // CONFIG_TX64X64
+  },
 #endif  // CONFIG_EXT_TX
 };
 
@@ -79,14 +98,22 @@
       NULL,
 #endif
       &inv_txfm_1d_row_cfg_dct_4, &inv_txfm_1d_row_cfg_dct_8,
-      &inv_txfm_1d_row_cfg_dct_16, &inv_txfm_1d_row_cfg_dct_32 },
+      &inv_txfm_1d_row_cfg_dct_16, &inv_txfm_1d_row_cfg_dct_32,
+#if CONFIG_TX64X64
+      &inv_txfm_1d_row_cfg_dct_64,
+#endif  // CONFIG_TX64X64
+  },
   // ADST
   {
 #if CONFIG_CHROMA_2X2
       NULL,
 #endif
       &inv_txfm_1d_row_cfg_adst_4, &inv_txfm_1d_row_cfg_adst_8,
-      &inv_txfm_1d_row_cfg_adst_16, &inv_txfm_1d_row_cfg_adst_32 },
+      &inv_txfm_1d_row_cfg_adst_16, &inv_txfm_1d_row_cfg_adst_32,
+#if CONFIG_TX64X64
+      NULL
+#endif  // CONFIG_TX64X64
+  },
 #if CONFIG_EXT_TX
   // FLIPADST
   {
@@ -94,14 +121,22 @@
       NULL,
 #endif
       &inv_txfm_1d_row_cfg_adst_4, &inv_txfm_1d_row_cfg_adst_8,
-      &inv_txfm_1d_row_cfg_adst_16, &inv_txfm_1d_row_cfg_adst_32 },
+      &inv_txfm_1d_row_cfg_adst_16, &inv_txfm_1d_row_cfg_adst_32,
+#if CONFIG_TX64X64
+      NULL
+#endif  // CONFIG_TX64X64
+  },
   // IDENTITY
   {
 #if CONFIG_CHROMA_2X2
       NULL,
 #endif
       &inv_txfm_1d_cfg_identity_4, &inv_txfm_1d_cfg_identity_8,
-      &inv_txfm_1d_cfg_identity_16, &inv_txfm_1d_cfg_identity_32 },
+      &inv_txfm_1d_cfg_identity_16, &inv_txfm_1d_cfg_identity_32,
+#if CONFIG_TX64X64
+      &inv_txfm_1d_cfg_identity_64
+#endif  // CONFIG_TX64X64
+  },
 #endif  // CONFIG_EXT_TX
 };
 
@@ -117,6 +152,7 @@
   return cfg;
 }
 
+#if CONFIG_TX64X64
 TXFM_2D_FLIP_CFG av1_get_inv_txfm_64x64_cfg(int tx_type) {
   TXFM_2D_FLIP_CFG cfg = { 0, 0, NULL, NULL };
   switch (tx_type) {
@@ -130,6 +166,33 @@
   return cfg;
 }
 
+TXFM_2D_FLIP_CFG av1_get_inv_txfm_32x64_cfg(int tx_type) {
+  TXFM_2D_FLIP_CFG cfg = { 0, 0, NULL, NULL };
+  switch (tx_type) {
+    case DCT_DCT:
+      cfg.col_cfg = &inv_txfm_1d_col_cfg_dct_64;
+      cfg.row_cfg = &inv_txfm_1d_row_cfg_dct_32;
+      set_flip_cfg(tx_type, &cfg);
+      break;
+    default: assert(0);
+  }
+  return cfg;
+}
+
+TXFM_2D_FLIP_CFG av1_get_inv_txfm_64x32_cfg(int tx_type) {
+  TXFM_2D_FLIP_CFG cfg = { 0, 0, NULL, NULL };
+  switch (tx_type) {
+    case DCT_DCT:
+      cfg.col_cfg = &inv_txfm_1d_col_cfg_dct_32;
+      cfg.row_cfg = &inv_txfm_1d_row_cfg_dct_64;
+      set_flip_cfg(tx_type, &cfg);
+      break;
+    default: assert(0);
+  }
+  return cfg;
+}
+#endif  // CONFIG_TX64X64
+
 void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
                              const TXFM_2D_FLIP_CFG *cfg, int8_t fwd_shift,
                              int bd) {
@@ -353,15 +416,30 @@
 void av1_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output,
                                 int stride, int tx_type, int bd) {
   int txfm_buf[64 * 64 + 64 + 64];
-  TXFM_2D_FLIP_CFG cfg = av1_get_inv_txfm_64x64_cfg(tx_type);
-  inv_txfm2d_add_c(input, output, stride, &cfg, txfm_buf, -4, bd);
-  assert(fwd_shift_sum[TX_64X64] == -4);
+  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_64X64, bd);
 }
 
 void av1_inv_txfm2d_add_64x32_c(const int32_t *input, uint16_t *output,
                                 int stride, int tx_type, int bd) {
+#if CONFIG_TXMG
   int txfm_buf[64 * 32 + 64 + 64];
-  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X64, bd);
+  int32_t rinput[64 * 32];
+  uint16_t routput[64 * 32];
+  int tx_size = TX_64X32;
+  int rtx_size = av1_rotate_tx_size(tx_size);
+  int rtx_type = av1_rotate_tx_type(tx_type);
+  int w = tx_size_wide[tx_size];
+  int h = tx_size_high[tx_size];
+  int rw = h;
+  int rh = w;
+  transpose_int32(rinput, rw, input, w, w, h);
+  transpose_uint16(routput, rw, output, stride, w, h);
+  inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
+  transpose_uint16(output, stride, routput, rw, rw, rh);
+#else
+  int txfm_buf[64 * 32 + 64 + 64];
+  inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_64X32, bd);
+#endif
 }
 
 void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output,
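
The CONFIG_TXMG branch above handles TX_64X32 by transposing the input into a 32x64 layout, running the rotated transform, and transposing the result back into place; transpose_int32, transpose_uint16, av1_rotate_tx_size and av1_rotate_tx_type are assumed to come from the existing TXMG support code. A minimal standalone sketch of just the transpose round-trip, using a local helper with the same call pattern as transpose_int32 above:

#include <assert.h>
#include <stdint.h>

/* Local stand-in, same argument order as the transpose_int32() call above. */
static void transpose_i32(int32_t *dst, int dst_stride, const int32_t *src,
                          int src_stride, int w, int h) {
  for (int r = 0; r < h; ++r)
    for (int c = 0; c < w; ++c) dst[c * dst_stride + r] = src[r * src_stride + c];
}

int main(void) {
  enum { W = 64, H = 32 };              /* TX_64X32: w = 64, h = 32     */
  static int32_t in[W * H], rin[W * H], back[W * H];
  for (int i = 0; i < W * H; ++i) in[i] = i;
  transpose_i32(rin, H, in, W, W, H);   /* 64x32 -> 32x64 (rw = h)      */
  /* ... the rotated (TX_32X64) transform would run on rin here ...     */
  transpose_i32(back, W, rin, H, H, W); /* 32x64 -> back to 64x32       */
  for (int i = 0; i < W * H; ++i) assert(back[i] == in[i]);
  return 0;
}
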
diff --git a/av1/common/av1_txfm.h b/av1/common/av1_txfm.h
index a61e25d..4c0a2d1 100644
--- a/av1/common/av1_txfm.h
+++ b/av1/common/av1_txfm.h
@@ -134,6 +134,7 @@
   TXFM_TYPE_IDENTITY8,
   TXFM_TYPE_IDENTITY16,
   TXFM_TYPE_IDENTITY32,
+  TXFM_TYPE_IDENTITY64,
 } TXFM_TYPE;
 
 typedef struct TXFM_1D_CFG {
diff --git a/av1/common/blockd.h b/av1/common/blockd.h
index 01214f3..eadd4eb 100644
--- a/av1/common/blockd.h
+++ b/av1/common/blockd.h
@@ -988,9 +988,9 @@
   const TX_SIZE tx_size_sqr = txsize_sqr_map[tx_size];
 #if CONFIG_CB4X4 && USE_TXTYPE_SEARCH_FOR_SUB8X8_IN_CB4X4
   (void)bs;
-  if (tx_size_sqr > TX_32X32) return EXT_TX_SET_DCTONLY;
+  if (tx_size_sqr_up > TX_32X32) return EXT_TX_SET_DCTONLY;
 #else
-  if (tx_size_sqr > TX_32X32 || bs < BLOCK_8X8) return EXT_TX_SET_DCTONLY;
+  if (tx_size_sqr_up > TX_32X32 || bs < BLOCK_8X8) return EXT_TX_SET_DCTONLY;
 #endif
   if (use_reduced_set)
     return is_inter ? EXT_TX_SET_DCT_IDTX : EXT_TX_SET_DTT4_IDTX;
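
The switch from tx_size_sqr to tx_size_sqr_up matters exactly for the rectangular 64-pixel sizes: as I read the two maps, 32x64 and 64x32 square down to TX_32X32 but square up to TX_64X64, so only the squared-up test routes them to EXT_TX_SET_DCTONLY. A toy standalone illustration of that min/max-dimension rule (hypothetical helpers, not the libaom tables):

#include <assert.h>

/* Hypothetical stand-ins for txsize_sqr_map / txsize_sqr_up_map:
 * square of the smaller vs. the larger block dimension. */
static int sqr_side(int w, int h) { return w < h ? w : h; }
static int sqr_up_side(int w, int h) { return w > h ? w : h; }

int main(void) {
  /* 32x64 and 64x32: sqr stays at 32 (<= 32x32, ext-tx allowed before this
   * change), sqr_up reaches 64 (> 32x32, EXT_TX_SET_DCTONLY after it). */
  assert(sqr_side(32, 64) == 32 && sqr_up_side(32, 64) == 64);
  assert(sqr_side(64, 32) == 32 && sqr_up_side(64, 32) == 64);
  return 0;
}
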
diff --git a/av1/common/common_data.h b/av1/common/common_data.h
index ec28000..d063122 100644
--- a/av1/common/common_data.h
+++ b/av1/common/common_data.h
@@ -665,9 +665,9 @@
   TX_8X16,   TX_16X8,  TX_16X16,
   // 16X32,  32X16,    32X32
   TX_16X32,  TX_32X16, TX_32X32,
+#if CONFIG_TX64X64
   // 32X64,  64X32,
   TX_32X32,  TX_32X32,
-#if CONFIG_TX64X64
   // 64X64
   TX_64X64,
 #if CONFIG_EXT_PARTITION
@@ -675,6 +675,8 @@
   TX_64X64,  TX_64X64, TX_64X64,
 #endif  // CONFIG_EXT_PARTITION
 #else
+  // 32X64,  64X32,
+  TX_32X32,  TX_32X32,
   // 64X64
   TX_32X32,
 #if CONFIG_EXT_PARTITION
@@ -775,9 +777,9 @@
   TX_16X16 - TX_8X8,  TX_16X16 - TX_8X8,  TX_16X16 - TX_8X8,
   // 16X32,           32X16,              32X32
   TX_32X32 - TX_8X8,  TX_32X32 - TX_8X8,  TX_32X32 - TX_8X8,
-  // 32X64,           64X32,
-  TX_32X32 - TX_8X8,  TX_32X32 - TX_8X8,
 #if CONFIG_TX64X64
+  // 32X64,           64X32,
+  TX_64X64 - TX_8X8,  TX_64X64 - TX_8X8,
   // 64X64
   TX_64X64 - TX_8X8,
 #if CONFIG_EXT_PARTITION
@@ -785,6 +787,8 @@
   TX_64X64 - TX_8X8,  TX_64X64 - TX_8X8,  TX_64X64 - TX_8X8,
 #endif  // CONFIG_EXT_PARTITION
 #else
+  // 32X64,           64X32,
+  TX_32X32 - TX_8X8,  TX_32X32 - TX_8X8,
   // 64X64
   TX_32X32 - TX_8X8,
 #if CONFIG_EXT_PARTITION
@@ -818,9 +822,9 @@
   TX_8X8 - TX_8X8,    TX_8X8 - TX_8X8,    TX_16X16 - TX_8X8,
   // 16X32,           32X16,              32X32
   TX_16X16 - TX_8X8,  TX_16X16 - TX_8X8,  TX_32X32 - TX_8X8,
-  // 32X64,           64X32,
-  TX_32X32 - TX_8X8,  TX_32X32 - TX_8X8,
 #if CONFIG_TX64X64
+  // 32X64,           64X32,
+  TX_64X64 - TX_8X8,  TX_64X64 - TX_8X8,
   // 64X64
   TX_64X64 - TX_8X8,
 #if CONFIG_EXT_PARTITION
@@ -828,6 +832,8 @@
   TX_64X64 - TX_8X8,  TX_64X64 - TX_8X8,  TX_64X64 - TX_8X8,
 #endif  // CONFIG_EXT_PARTITION
 #else
+  // 32X64,           64X32,
+  TX_32X32 - TX_8X8,  TX_32X32 - TX_8X8,
   // 64X64
   TX_32X32 - TX_8X8,
 #if CONFIG_EXT_PARTITION
diff --git a/av1/common/idct.c b/av1/common/idct.c
index ca1c361..5cde3b7 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -1566,7 +1566,7 @@
   for (i = 0; i < n; ++i) {
     IHT_64x32[tx_type].rows(input, outtmp);
     for (j = 0; j < n2; ++j)
-      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2);
     input += n2;
   }
 
@@ -1628,7 +1628,7 @@
   for (i = 0; i < n2; ++i) {
     IHT_32x64[tx_type].rows(input, outtmp);
     for (j = 0; j < n; ++j)
-      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * Sqrt2);
+      tmp[j][i] = (tran_low_t)dct_const_round_shift(outtmp[j] * InvSqrt2);
     input += n;
   }
 
@@ -2107,6 +2107,7 @@
 static void inv_txfm_add_64x64(const tran_low_t *input, uint8_t *dest,
                                int stride, const TxfmParam *txfm_param) {
   const TX_TYPE tx_type = txfm_param->tx_type;
+  assert(tx_type == DCT_DCT);
   switch (tx_type) {
 #if !CONFIG_DAALA_DCT64
     case DCT_DCT: idct64x64_add(input, dest, stride, txfm_param); break;
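
The two idct.c hunks above replace the xSqrt2 scaling of the intermediate rows with xInvSqrt2 for the 64x32 and 32x64 inverse transforms. In Q14 arithmetic the difference is a factor of two: scaling by Sqrt2 and then by InvSqrt2 is (up to rounding) a no-op, while applying Sqrt2 twice doubles the value. A standalone illustration (stand-in names, not libaom code):

#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

#define DCT_CONST_BITS 14
static const int64_t kSqrt2 = 23170;    /* 16384 * sqrt(2) */
static const int64_t kInvSqrt2 = 11585; /* 16384 / sqrt(2) */

static int64_t dct_round_shift_q14(int64_t x) {
  return (x + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS;
}

int main(void) {
  for (int64_t x = -4096; x <= 4096; x += 7) {
    int64_t up = dct_round_shift_q14(x * kSqrt2);                    /* ~ x * sqrt(2) */
    assert(llabs(dct_round_shift_q14(up * kInvSqrt2) - x) <= 1);     /* ~ x           */
    assert(llabs(dct_round_shift_q14(up * kSqrt2) - 2 * x) <= 2);    /* ~ 2 * x       */
  }
  return 0;
}
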
diff --git a/av1/common/scan.c b/av1/common/scan.c
index 146e748..a5a1a9f 100644
--- a/av1/common/scan.c
+++ b/av1/common/scan.c
@@ -4572,7 +4572,7 @@
   1726, 1789, 1789, 1852, 1852, 1915, 1915, 1978, 1978, 2041, 1727, 1790, 1790,
   1853, 1853, 1916, 1916, 1979, 1979, 2042, 1791, 1854, 1854, 1917, 1917, 1980,
   1980, 2043, 1855, 1918, 1918, 1981, 1981, 2044, 1919, 1982, 1982, 2045, 1983,
-  2046,
+  2046, 0,    0
 };
 
 DECLARE_ALIGNED(16, static const int16_t,