Add txmg experiment

This experiment aims at merging lbd/hbd txfms

So far this experiment uses the hbd transform on the lbd path.
The performance changes I observed are:
lowres -0.089%
midres  0.065%
(negative means performance drop)

Starting from here, two main things need to be done:
1) Fix overflow due to quantizer noise
2) Generate a 16-bit version from the hbd txfm

Change-Id: I35bb1fc0cbb78decad2570ff5826ed665f739752
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 1677780..58cfc45 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -2195,11 +2195,13 @@
 #endif
 }
 
+#if !CONFIG_TXMG
 typedef void (*InvTxfmFunc)(const tran_low_t *dqcoeff, uint8_t *dst, int stride,
                             TxfmParam *txfm_param);
 
 static InvTxfmFunc inv_txfm_func[2] = { av1_inv_txfm_add,
                                         av1_highbd_inv_txfm_add };
+#endif
 
 // TODO(kslu) Change input arguments to TxfmParam, which contains mode,
 // tx_type, tx_size, dst, stride, eob. Thus, the additional argument when LGT
@@ -2240,8 +2242,29 @@
   txfm_param.mode = mode;
 #endif
 
+#if CONFIG_TXMG
+  DECLARE_ALIGNED(16, uint16_t, tmp[MAX_TX_SQUARE]);
+  int tmp_stride = MAX_TX_SIZE;
+  int w = tx_size_wide[tx_size];
+  int h = tx_size_high[tx_size];
+  for (int r = 0; r < h; ++r) {
+    for (int c = 0; c < w; ++c) {
+      tmp[r * tmp_stride + c] = dst[r * stride + c];
+    }
+  }
+
+  av1_highbd_inv_txfm_add(dqcoeff, CONVERT_TO_BYTEPTR(tmp), tmp_stride,
+                          &txfm_param);
+
+  for (int r = 0; r < h; ++r) {
+    for (int c = 0; c < w; ++c) {
+      dst[r * stride + c] = tmp[r * tmp_stride + c];
+    }
+  }
+#else   // CONFIG_TXMG
   const int is_hbd = get_bitdepth_data_path_index(xd);
   inv_txfm_func[is_hbd](dqcoeff, dst, stride, &txfm_param);
+#endif  // CONFIG_TXMG
 }
 
 void av1_inverse_transform_block_facade(MACROBLOCKD *xd, int plane, int block,