daala_tx: Add inverse TX SIMD dispatch

This just adds a top-level daala_inv_txfm_add_avx2(), but no actual
SIMD functions yet. It dispatches back to the C version for all TX
types and sizes for the moment.

Change-Id: I7a578a4af363f989615d01ea67ce031d8ceff977
diff --git a/av1/av1.cmake b/av1/av1.cmake
index 18249a6..1b59bf4 100644
--- a/av1/av1.cmake
+++ b/av1/av1.cmake
@@ -185,6 +185,11 @@
 set(AOM_AV1_COMMON_INTRIN_AVX2
     "${AOM_ROOT}/av1/common/x86/highbd_inv_txfm_avx2.c"
     "${AOM_ROOT}/av1/common/x86/hybrid_inv_txfm_avx2.c")
+if (CONFIG_DAALA_TX)
+  set(AOM_AV1_COMMON_INTRIN_AVX2
+      ${AOM_AV1_COMMON_INTRIN_AVX2}
+      "${AOM_ROOT}/av1/common/x86/daala_inv_txfm_avx2.c")
+endif ()
 
 set(AOM_AV1_COMMON_INTRIN_MSA
     "${AOM_ROOT}/av1/common/mips/msa/av1_idct16x16_msa.c"
diff --git a/av1/av1_common.mk b/av1/av1_common.mk
index 6fcb4d9..40694a5 100644
--- a/av1/av1_common.mk
+++ b/av1/av1_common.mk
@@ -28,6 +28,7 @@
 AV1_COMMON_SRCS-yes += common/daala_tx.h
 AV1_COMMON_SRCS-yes += common/daala_inv_txfm.c
 AV1_COMMON_SRCS-yes += common/daala_inv_txfm.h
+AV1_COMMON_SRCS-$(HAVE_AVX2) += common/x86/daala_inv_txfm_avx2.c
 AV1_COMMON_SRCS-yes += common/entropy.h
 AV1_COMMON_SRCS-yes += common/entropymode.h
 AV1_COMMON_SRCS-yes += common/entropymv.h
diff --git a/av1/common/av1_rtcd_defs.pl b/av1/common/av1_rtcd_defs.pl
index 826884b..cbf79b8 100755
--- a/av1/common/av1_rtcd_defs.pl
+++ b/av1/common/av1_rtcd_defs.pl
@@ -18,6 +18,10 @@
 #include "av1/common/restoration.h"
 #endif
 
+#if CONFIG_DAALA_TX
+#include "av1/common/daala_inv_txfm.h"
+#endif
+
 struct macroblockd;
 
 /* Encoder forward decls */
@@ -624,4 +628,10 @@
   }
 }
 
+# DAALA_TX functions
+if (aom_config("CONFIG_DAALA_TX") eq "yes") {
+  add_proto qw/void daala_inv_txfm_add/, "const tran_low_t *input_coeffs, void *output_pixels, int output_stride, TxfmParam *txfm_param";
+  specialize qw/daala_inv_txfm_add avx2/;
+}
+
 1;
diff --git a/av1/common/daala_inv_txfm.c b/av1/common/daala_inv_txfm.c
index ceb5961..6b75cf6 100644
--- a/av1/common/daala_inv_txfm.c
+++ b/av1/common/daala_inv_txfm.c
@@ -46,8 +46,8 @@
 // intended for both low and high bitdepth cases with a tran_low_t of
 // 32 bits (matching od_coeff), and a passed-in pixel buffer of either
 // bytes (hbd=0) or shorts (hbd=1).
-void daala_inv_txfm_add(const tran_low_t *input_coeffs, void *output_pixels,
-                        int output_stride, TxfmParam *txfm_param) {
+void daala_inv_txfm_add_c(const tran_low_t *input_coeffs, void *output_pixels,
+                          int output_stride, TxfmParam *txfm_param) {
   const TX_SIZE tx_size = txfm_param->tx_size;
   const TX_TYPE tx_type = txfm_param->tx_type;
   const int px_depth = txfm_param->bd;
diff --git a/av1/common/daala_inv_txfm.h b/av1/common/daala_inv_txfm.h
index aa79a14..3e0df30 100644
--- a/av1/common/daala_inv_txfm.h
+++ b/av1/common/daala_inv_txfm.h
@@ -18,8 +18,8 @@
 extern "C" {
 #endif
 
-void daala_inv_txfm_add(const tran_low_t *input_coeffs, void *output_pixels,
-                        int output_stride, TxfmParam *txfm_param);
+void daala_inv_txfm_add_c(const tran_low_t *input_coeffs, void *output_pixels,
+                          int output_stride, TxfmParam *txfm_param);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/av1/common/x86/daala_inv_txfm_avx2.c b/av1/common/x86/daala_inv_txfm_avx2.c
new file mode 100644
index 0000000..6765bc7
--- /dev/null
+++ b/av1/common/x86/daala_inv_txfm_avx2.c
@@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2017, Alliance for Open Media. All rights reserved
+ *
+ * This source code is subject to the terms of the BSD 2 Clause License and
+ * the Alliance for Open Media Patent License 1.0. If the BSD 2 Clause License
+ * was not distributed with this source code in the LICENSE file, you can
+ * obtain it at www.aomedia.org/license/software. If the Alliance for Open
+ * Media Patent License 1.0 was not distributed with this source code in the
+ * PATENTS file, you can obtain it at www.aomedia.org/license/patent.
+ */
+
+#include "./av1_rtcd.h"
+#include "./aom_config.h"
+#include "./aom_dsp_rtcd.h"
+#include "av1/common/daala_tx.h"
+#include "av1/common/daala_inv_txfm.h"
+#include "av1/common/idct.h"
+
+#if CONFIG_DAALA_TX
+// Kernel signatures: row pass into int16_t scratch, column pass onto pixels.
+typedef void (*daala_row_itx)(int16_t *out, int rows, const tran_low_t *in);
+typedef void (*daala_col_itx_add)(unsigned char *output_pixels,
+                                  int output_stride, int cols,
+                                  const int16_t *in, int bd);
+// Row kernels by [1-D size][1-D type]; NULL selects the C fallback.
+static const daala_row_itx TX_ROW_MAP[TX_SIZES][TX_TYPES] = {
+  // 4-point transforms
+  { NULL, NULL, NULL, NULL },
+  // 8-point transforms
+  { NULL, NULL, NULL, NULL },
+  // 16-point transforms
+  { NULL, NULL, NULL, NULL },
+  // 32-point transforms
+  { NULL, NULL, NULL, NULL },
+#if CONFIG_TX64X64
+  // 64-point transforms
+  { NULL, NULL, NULL, NULL },
+#endif
+};
+// Column kernels by [is_hbd][1-D size][1-D type]; NULL selects the C path.
+static const daala_col_itx_add TX_COL_MAP[2][TX_SIZES][TX_TYPES] = {
+  // Low bit depth output
+  {
+      // 4-point transforms
+      { NULL, NULL, NULL, NULL },
+      // 8-point transforms
+      { NULL, NULL, NULL, NULL },
+      // 16-point transforms
+      { NULL, NULL, NULL, NULL },
+      // 32-point transforms
+      { NULL, NULL, NULL, NULL },
+#if CONFIG_TX64X64
+      // 64-point transforms
+      { NULL, NULL, NULL, NULL },
+#endif
+  },
+  // High bit depth output
+  {
+      // 4-point transforms
+      { NULL, NULL, NULL, NULL },
+      // 8-point transforms
+      { NULL, NULL, NULL, NULL },
+      // 16-point transforms
+      { NULL, NULL, NULL, NULL },
+      // 32-point transforms
+      { NULL, NULL, NULL, NULL },
+#if CONFIG_TX64X64
+      // 64-point transforms
+      { NULL, NULL, NULL, NULL },
+#endif
+  }
+};
+
+// AVX2 entry point: inverse-transforms input_coeffs and adds the result to
+// output_pixels. Lossless blocks, and any size/type without both a row and a
+// column kernel in the tables above, are handed to daala_inv_txfm_add_c().
+void daala_inv_txfm_add_avx2(const tran_low_t *input_coeffs,
+                             void *output_pixels, int output_stride,
+                             TxfmParam *txfm_param) {
+  const TX_SIZE tx_size = txfm_param->tx_size;
+  const TX_TYPE tx_type = txfm_param->tx_type;
+  assert(tx_size < TX_SIZES_ALL);
+  assert(tx_type < TX_TYPES);
+
+  if (txfm_param->lossless) {
+    daala_inv_txfm_add_c(input_coeffs, output_pixels, output_stride,
+                         txfm_param);
+    return;
+  }
+  // General TX case
+  assert(sizeof(tran_low_t) == sizeof(od_coeff));
+  assert(sizeof(tran_low_t) >= 4);
+
+  // Select kernels via the existing size/type map translation tables.
+  const TX_SIZE col_idx = txsize_vert_map[tx_size];
+  const TX_SIZE row_idx = txsize_horz_map[tx_size];
+  // Bounds are exclusive: TX_SIZES is the table dimension, not a valid index.
+  assert(col_idx < TX_SIZES);
+  assert(row_idx < TX_SIZES);
+  assert(vtx_tab[tx_type] < (int)TX_TYPES_1D);
+  assert(htx_tab[tx_type] < (int)TX_TYPES_1D);
+  const daala_row_itx row_tx = TX_ROW_MAP[row_idx][htx_tab[tx_type]];
+  const daala_col_itx_add col_tx =
+      TX_COL_MAP[txfm_param->is_hbd][col_idx][vtx_tab[tx_type]];
+  if (row_tx == NULL || col_tx == NULL) {
+    daala_inv_txfm_add_c(input_coeffs, output_pixels, output_stride,
+                         txfm_param);
+    return;
+  }
+  // Row pass writes 16-bit intermediates; column pass adds them to the output.
+  int16_t tmpsq[MAX_TX_SQUARE];
+  row_tx(tmpsq, tx_size_high[tx_size], input_coeffs);
+  col_tx(output_pixels, output_stride, tx_size_wide[tx_size], tmpsq,
+         txfm_param->bd);
+}
+
+#endif