Add txmg experiment

This experiment aims to merge the low-bitdepth (lbd) and high-bitdepth
(hbd) transforms (txfms).

So far, this experiment uses the hbd transform on the lbd path.
The performance changes I observed are:
lowres -0.089%
midres  0.065%
(a negative value means a performance drop)

Starting from here, two main things need to be done:
1) Fix overflow due to quantizer noise
2) Generate a 16-bit version from the hbd txfm

Change-Id: I35bb1fc0cbb78decad2570ff5826ed665f739752
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 1677780..58cfc45 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -2195,11 +2195,13 @@
 #endif
 }
 
+#if !CONFIG_TXMG
 typedef void (*InvTxfmFunc)(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                             TxfmParam *txfm_param);
 
 static InvTxfmFunc inv_txfm_func[2] = { av1_inv_txfm_add,
                                         av1_highbd_inv_txfm_add };
+#endif
 
 // TODO(kslu) Change input arguments to TxfmParam, which contains mode,
 // tx_type, tx_size, dst, stride, eob. Thus, the additional argument when LGT
@@ -2240,8 +2242,29 @@
   txfm_param.mode = mode;
 #endif
 
+#if CONFIG_TXMG
+  DECLARE_ALIGNED(16, uint16_t, tmp[MAX_TX_SQUARE]);
+  int tmp_stride = MAX_TX_SIZE;
+  int w = tx_size_wide[tx_size];
+  int h = tx_size_high[tx_size];
+  for (int r = 0; r < h; ++r) {
+    for (int c = 0; c < w; ++c) {
+      tmp[r * tmp_stride + c] = dst[r * stride + c];
+    }
+  }
+
+  av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
+                          &txfm_param);
+
+  for (int r = 0; r < h; ++r) {
+    for (int c = 0; c < w; ++c) {
+      dst[r * stride + c] = tmp[r * tmp_stride + c];
+    }
+  }
+#else   // CONFIG_TXMG
   const int is_hbd = get_bitdepth_data_path_index(xd);
   inv_txfm_func[is_hbd](dqcoeff, dst, stride, &txfm_param);
+#endif  // CONFIG_TXMG
 }
 
 void av1_inverse_transform_block_facade(MACROBLOCKD *xd, int plane, int block,
diff --git a/av1/encoder/encodemb.c b/av1/encoder/encodemb.c
index e7f4d31..de3f428 100644
--- a/av1/encoder/encodemb.c
+++ b/av1/encoder/encodemb.c
@@ -492,10 +492,12 @@
     };
 #endif  // !CONFIG_PVQ
 
+#if !CONFIG_TXMG
 typedef void (*fwdTxfmFunc)(const int16_t *diff, tran_low_t *coeff, int stride,
                             TxfmParam *txfm_param);
 static const fwdTxfmFunc fwd_txfm_func[2] = { av1_fwd_txfm,
                                               av1_highbd_fwd_txfm };
+#endif
 
 void av1_xform_quant(const AV1_COMMON *cm, MACROBLOCK *x, int plane, int block,
                      int blk_row, int blk_col, BLOCK_SIZE plane_bsize,
@@ -642,7 +644,12 @@
 #if !CONFIG_PVQ
   txfm_param.bd = xd->bd;
   const int is_hbd = get_bitdepth_data_path_index(xd);
+
+#if CONFIG_TXMG
+  av1_highbd_fwd_txfm(src_diff, coeff, diff_stride, &txfm_param);
+#else   // CONFIG_TXMG
   fwd_txfm_func[is_hbd](src_diff, coeff, diff_stride, &txfm_param);
+#endif  // CONFIG_TXMG
 
   if (xform_quant_idx != AV1_XFORM_QUANT_SKIP_QUANT) {
     if (LIKELY(!x->skip_block)) {
diff --git a/configure b/configure
index 0a13f21..07d3a8d 100755
--- a/configure
+++ b/configure
@@ -339,6 +339,7 @@
     lpf_direct
     uv_lvl
     no_frame_context_signaling
+    txmg
 "
 CONFIG_LIST="
     dependency_tracking