Various fixes to scale-managed txfms and tests
This patch clears all test failures with coefficient range
checking enabled for forward and inverse transforms. It also
ensures that no transposes are used for any of the
rectangular transforms.
Some fine-tuning and refactoring are still pending.
Some of the tests still need to be rewritten.
Change-Id: Ib0e3a4ceccef665ba007d121f536fad7135f38d5
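Note: the coefficient range checking referred to above verifies, after each
butterfly stage, that intermediate values fit the bit range declared for that
stage in the cfg tables. A minimal sketch of the idea, assuming a per-value
hook (check_range below is hypothetical; libaom's actual checker and its hook
points differ):

#include <assert.h>
#include <stdint.h>

// 'bit' is the declared stage range: values must lie in
// [-2^(bit-1), 2^(bit-1) - 1].
static inline int32_t check_range(int32_t value, int8_t bit) {
  const int64_t lim = (int64_t)1 << (bit - 1);
  assert(value >= -lim && value < lim);
  (void)lim;  // silence unused-variable warnings under NDEBUG
  return value;
}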
diff --git a/av1/common/av1_inv_txfm1d_cfg.h b/av1/common/av1_inv_txfm1d_cfg.h
index ba037c2..97beacf 100644
--- a/av1/common/av1_inv_txfm1d_cfg.h
+++ b/av1/common/av1_inv_txfm1d_cfg.h
@@ -16,8 +16,13 @@
// sum of fwd_shift_##
#if CONFIG_TX64X64
static const int8_t fwd_shift_sum[TX_SIZES] = { 2, 1, 0, -2, -4 };
-#else // CONFIG_TX64X64
+static const int8_t inv_start_range[TX_SIZES_ALL] = {
+ 5, 6, 7, 7, 7, 6, 6, 7, 7, 7, 7, 7, 7, 6, 6, 7, 7, 7, 7
+};
+#else // CONFIG_TX64X64
static const int8_t fwd_shift_sum[TX_SIZES] = { 2, 1, 0, -2 };
+static const int8_t inv_start_range[TX_SIZES_ALL] = { 5, 6, 7, 7, 6, 6, 7,
+ 7, 7, 7, 6, 6, 7, 7 };
#endif // CONFIG_TX64X64
// ---------------- 4x4 1D config -----------------------
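Note: inv_start_range[] is indexed by TX_SIZE in TX_SIZES_ALL order and gives
the assumed bit range (before adding bit depth) of the coefficients entering
the inverse transform; av1_gen_inv_stage_range() in av1_inv_txfm2d.c reads it
directly. A hedged accessor sketch (get_inv_start_range is illustrative, not
part of the patch):

static INLINE int get_inv_start_range(TX_SIZE tx_size) {
  assert(tx_size < TX_SIZES_ALL);  // one table entry per transform size
  return inv_start_range[tx_size];
}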
@@ -25,7 +30,7 @@
static const int8_t inv_shift_4[2] = { 0, -4 };
// stage range
-static const int8_t inv_stage_range_col_dct_4[4] = { 3, 3, 2, 2 };
+static const int8_t inv_stage_range_col_dct_4[4] = { 3, 3, 3, 3 };
static const int8_t inv_stage_range_row_dct_4[4] = { 3, 3, 3, 3 };
static const int8_t inv_stage_range_col_adst_4[6] = { 3, 3, 3, 3, 2, 2 };
static const int8_t inv_stage_range_row_adst_4[6] = { 3, 3, 3, 3, 3, 3 };
@@ -111,11 +116,18 @@
// ---------------- 8x16 1D constants -----------------------
#define inv_shift_8x16 inv_shift_16
// stage range
+static const int8_t inv_stage_range_row_dct_8x16[6] = { 5, 5, 5, 5, 5, 5 };
+static const int8_t inv_stage_range_row_adst_8x16[8] = {
+ 5, 5, 5, 5, 5, 5, 5, 5
+};
static const int8_t inv_stage_range_col_dct_8x16[8] =
ARRAYOFFSET8(-2, 7, 7, 7, 7, 7, 7, 7, 7);
static const int8_t inv_stage_range_col_adst_8x16[10] =
ARRAYOFFSET10(-2, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7);
// cos bit
+static const int8_t inv_cos_bit_row_dct_8x16[6] = { 12, 12, 12, 12, 12, 12 };
+static const int8_t inv_cos_bit_row_adst_8x16[8] = { 12, 12, 12, 12,
+ 12, 12, 12, 12 };
static const int8_t inv_cos_bit_col_dct_8x16[8] = { 13, 13, 13, 13,
13, 13, 13, 13 };
static const int8_t inv_cos_bit_col_adst_8x16[10] = { 13, 13, 13, 13, 13,
@@ -249,11 +261,18 @@
// ---------------- 8x32 1D constants -----------------------
#define inv_shift_8x32 inv_shift_32
// stage range
+static const int8_t inv_stage_range_row_dct_8x32[6] = { 5, 5, 5, 5, 5, 5 };
+static const int8_t inv_stage_range_row_adst_8x32[8] = {
+ 5, 5, 5, 5, 5, 5, 5, 5
+};
static const int8_t inv_stage_range_col_dct_8x32[10] =
ARRAYOFFSET10(-4, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9);
static const int8_t inv_stage_range_col_adst_8x32[12] =
ARRAYOFFSET12(-4, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9);
// cos bit
+static const int8_t inv_cos_bit_row_dct_8x32[6] = { 12, 12, 12, 12, 12, 12 };
+static const int8_t inv_cos_bit_row_adst_8x32[8] = { 12, 12, 12, 12,
+ 12, 12, 12, 12 };
static const int8_t inv_cos_bit_col_dct_8x32[10] = { 13, 13, 13, 13, 13,
13, 13, 13, 13, 13 };
static const int8_t inv_cos_bit_col_adst_8x32[12] = { 13, 13, 13, 13, 13, 13,
@@ -692,6 +711,16 @@
TXFM_TYPE_ADST16, // .txfm_type
};
+// ---------------- row config inv_dct_8x16 ----------------
+static const TXFM_1D_CFG inv_txfm_1d_row_cfg_dct_8x16 = {
+ 8, // .txfm_size
+ 6, // .stage_num
+ inv_shift_8x16, // .shift
+ inv_stage_range_row_dct_8x16, // .stage_range
+ inv_cos_bit_row_dct_8x16, // .cos_bit
+ TXFM_TYPE_DCT8 // .txfm_type
+};
+
// ---------------- col config inv_dct_8x16 ----------------
static const TXFM_1D_CFG inv_txfm_1d_col_cfg_dct_8x16 = {
16, // .txfm_size
@@ -702,6 +731,16 @@
TXFM_TYPE_DCT16 // .txfm_type
};
+// ---------------- row config inv_adst_8x16 ----------------
+static const TXFM_1D_CFG inv_txfm_1d_row_cfg_adst_8x16 = {
+ 8, // .txfm_size
+ 8, // .stage_num
+ inv_shift_8x16, // .shift
+ inv_stage_range_row_adst_8x16, // .stage_range
+ inv_cos_bit_row_adst_8x16, // .cos_bit
+ TXFM_TYPE_ADST8, // .txfm_type
+};
+
// ---------------- col config inv_adst_8x16 ----------------
static const TXFM_1D_CFG inv_txfm_1d_col_cfg_adst_8x16 = {
16, // .txfm_size
@@ -744,6 +783,16 @@
};
#endif // CONFIG_TX64X64
+// ---------------- row config inv_dct_8x32 ----------------
+static const TXFM_1D_CFG inv_txfm_1d_row_cfg_dct_8x32 = {
+ 8, // .txfm_size
+ 6, // .stage_num
+ inv_shift_8x32, // .shift
+ inv_stage_range_row_dct_8x32, // .stage_range
+ inv_cos_bit_row_dct_8x32, // .cos_bit
+ TXFM_TYPE_DCT8 // .txfm_type
+};
+
// ---------------- col config inv_dct_8x32 ----------------
static const TXFM_1D_CFG inv_txfm_1d_col_cfg_dct_8x32 = {
32, // .txfm_size
@@ -754,6 +803,16 @@
TXFM_TYPE_DCT32 // .txfm_type
};
+// ---------------- row config inv_adst_8x32 ----------------
+static const TXFM_1D_CFG inv_txfm_1d_row_cfg_adst_8x32 = {
+ 8, // .txfm_size
+ 8, // .stage_num
+ inv_shift_8x32, // .shift
+ inv_stage_range_row_adst_8x32, // .stage_range
+ inv_cos_bit_row_adst_8x32, // .cos_bit
+ TXFM_TYPE_ADST8, // .txfm_type
+};
+
// ---------------- col config inv_adst_8x32 ----------------
static const TXFM_1D_CFG inv_txfm_1d_col_cfg_adst_8x32 = {
32, // .txfm_size
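Note: the new 8x16 / 8x32 row configs reuse the 8-point kernels
(TXFM_TYPE_DCT8, TXFM_TYPE_ADST8) but carry their own stage ranges and cos
bits, so rectangular rows no longer borrow the square size-8 cfg. A hedged
sketch of how a 1D cfg drives a pass (run_inv_1d is illustrative;
inv_txfm_type_to_func() is the real dispatcher in av1_inv_txfm2d.c):

static void run_inv_1d(const TXFM_1D_CFG *cfg, const int32_t *in, int32_t *out,
                       const int8_t *stage_range) {
  // cfg->txfm_size samples, cfg->stage_num stages, per-stage cos bits.
  const TxfmFunc txfm_func = inv_txfm_type_to_func(cfg->txfm_type);
  txfm_func(in, out, cfg->cos_bit, stage_range);
}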
diff --git a/av1/common/av1_inv_txfm2d.c b/av1/common/av1_inv_txfm2d.c
index 1874819..3c83769 100644
--- a/av1/common/av1_inv_txfm2d.c
+++ b/av1/common/av1_inv_txfm2d.c
@@ -17,6 +17,8 @@
#include "av1/common/av1_inv_txfm1d.h"
#include "av1/common/av1_inv_txfm1d_cfg.h"
+#define NO_INV_TRANSPOSE 1
+
static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) {
switch (txfm_type) {
case TXFM_TYPE_DCT4: return av1_idct4_new;
@@ -141,21 +143,21 @@
static const TXFM_1D_CFG *inv_txfm_row_cfg_ls[TX_TYPES_1D][TX_SIZES_ALL] = {
// DCT
{
- &inv_txfm_1d_row_cfg_dct_4, &inv_txfm_1d_row_cfg_dct_8,
- &inv_txfm_1d_row_cfg_dct_16, &inv_txfm_1d_row_cfg_dct_32,
+ &inv_txfm_1d_row_cfg_dct_4, &inv_txfm_1d_row_cfg_dct_8,
+ &inv_txfm_1d_row_cfg_dct_16, &inv_txfm_1d_row_cfg_dct_32,
#if CONFIG_TX64X64
&inv_txfm_1d_row_cfg_dct_64,
#endif // CONFIG_TX64X64
- &inv_txfm_1d_row_cfg_dct_4, &inv_txfm_1d_row_cfg_dct_8,
- &inv_txfm_1d_row_cfg_dct_8, &inv_txfm_1d_row_cfg_dct_16,
- &inv_txfm_1d_row_cfg_dct_16, &inv_txfm_1d_row_cfg_dct_32,
+ &inv_txfm_1d_row_cfg_dct_4, &inv_txfm_1d_row_cfg_dct_8,
+ &inv_txfm_1d_row_cfg_dct_8x16, &inv_txfm_1d_row_cfg_dct_16,
+ &inv_txfm_1d_row_cfg_dct_16, &inv_txfm_1d_row_cfg_dct_32,
#if CONFIG_TX64X64
- &inv_txfm_1d_row_cfg_dct_32, &inv_txfm_1d_row_cfg_dct_64,
+ &inv_txfm_1d_row_cfg_dct_32, &inv_txfm_1d_row_cfg_dct_64,
#endif // CONFIG_TX64X64
- &inv_txfm_1d_row_cfg_dct_4, &inv_txfm_1d_row_cfg_dct_16,
- &inv_txfm_1d_row_cfg_dct_8, &inv_txfm_1d_row_cfg_dct_32,
+ &inv_txfm_1d_row_cfg_dct_4, &inv_txfm_1d_row_cfg_dct_16,
+ &inv_txfm_1d_row_cfg_dct_8x32, &inv_txfm_1d_row_cfg_dct_32,
#if CONFIG_TX64X64
- &inv_txfm_1d_row_cfg_dct_16, &inv_txfm_1d_row_cfg_dct_64,
+ &inv_txfm_1d_row_cfg_dct_16, &inv_txfm_1d_row_cfg_dct_64,
#endif // CONFIG_TX64X64
},
// ADST
@@ -169,7 +171,7 @@
#endif // CONFIG_TX64X64
&inv_txfm_1d_row_cfg_adst_4,
&inv_txfm_1d_row_cfg_adst_8,
- &inv_txfm_1d_row_cfg_adst_8,
+ &inv_txfm_1d_row_cfg_adst_8x16,
&inv_txfm_1d_row_cfg_adst_16,
&inv_txfm_1d_row_cfg_adst_16,
&inv_txfm_1d_row_cfg_adst_32,
@@ -179,7 +181,7 @@
#endif // CONFIG_TX64X64
&inv_txfm_1d_row_cfg_adst_4,
&inv_txfm_1d_row_cfg_adst_16,
- &inv_txfm_1d_row_cfg_adst_8,
+ &inv_txfm_1d_row_cfg_adst_8x32,
&inv_txfm_1d_row_cfg_adst_32,
#if CONFIG_TX64X64
&inv_txfm_1d_row_cfg_adst_16,
@@ -197,7 +199,7 @@
#endif // CONFIG_TX64X64
&inv_txfm_1d_row_cfg_adst_4,
&inv_txfm_1d_row_cfg_adst_8,
- &inv_txfm_1d_row_cfg_adst_8,
+ &inv_txfm_1d_row_cfg_adst_8x16,
&inv_txfm_1d_row_cfg_adst_16,
&inv_txfm_1d_row_cfg_adst_16,
&inv_txfm_1d_row_cfg_adst_32,
@@ -207,7 +209,7 @@
#endif // CONFIG_TX64X64
&inv_txfm_1d_row_cfg_adst_4,
&inv_txfm_1d_row_cfg_adst_16,
- &inv_txfm_1d_row_cfg_adst_8,
+ &inv_txfm_1d_row_cfg_adst_8x32,
&inv_txfm_1d_row_cfg_adst_32,
#if CONFIG_TX64X64
&inv_txfm_1d_row_cfg_adst_16,
@@ -246,7 +248,7 @@
}
void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
- const TXFM_2D_FLIP_CFG *cfg, int8_t fwd_shift,
+ const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size,
int bd) {
// Note when assigning txfm_size_col, we use the txfm_size from the
// row configuration and vice versa. This is intentionally done to
@@ -254,35 +256,12 @@
// rectangular, the number of columns will be the same as the
// txfm_size stored in the row cfg struct. It will make no difference
// for square transforms.
- const int txfm_size_col = cfg->row_cfg->txfm_size;
- const int txfm_size_row = cfg->col_cfg->txfm_size;
- // Take the shift from the larger dimension in the rectangular case.
- const int8_t *shift = (txfm_size_col > txfm_size_row) ? cfg->row_cfg->shift
- : cfg->col_cfg->shift;
- // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
- for (int i = 0; i < cfg->row_cfg->stage_num && i < MAX_TXFM_STAGE_NUM; ++i) {
- stage_range_row[i] = cfg->row_cfg->stage_range[i] + fwd_shift + bd + 1;
- }
- // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
- for (int i = 0; i < cfg->col_cfg->stage_num && i < MAX_TXFM_STAGE_NUM; ++i) {
- stage_range_col[i] =
- cfg->col_cfg->stage_range[i] + fwd_shift + shift[0] + bd + 1;
- }
-}
-
-static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output,
- int stride, TXFM_2D_FLIP_CFG *cfg,
- int32_t *txfm_buf, int8_t fwd_shift,
- int bd) {
- // Note when assigning txfm_size_col, we use the txfm_size from the
- // row configuration and vice versa. This is intentionally done to
- // accurately perform rectangular transforms. When the transform is
- // rectangular, the number of columns will be the same as the
- // txfm_size stored in the row cfg struct. It will make no difference
- // for square transforms.
+ // The starting range is taken directly for the exact tx size. (Previously
+ // the forward shifts were summed over the enclosing square size:
+ // fwd_shift_sum[txsize_sqr_up_map[tx_size]].)
+ const int start_range = inv_start_range[tx_size];
const int txfm_size_col = cfg->row_cfg->txfm_size;
const int txfm_size_row = cfg->col_cfg->txfm_size;
const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ if (txfm_size_col == txfm_size_row) assert(rect_type == 0);
int rect_type2_shift = 0;
if (rect_type == 2 || rect_type == -2) {
const int txfm_size_max = AOMMAX(txfm_size_col, txfm_size_row);
@@ -293,11 +272,57 @@
// Take the shift from the larger dimension in the rectangular case.
const int8_t *shift = (txfm_size_col > txfm_size_row) ? cfg->row_cfg->shift
: cfg->col_cfg->shift;
+ int shift1 = shift[1];
+ while (rect_type2_shift > 0 && shift1 < 0) {
+ shift1++;
+ rect_type2_shift--;
+ }
+ // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
+ for (int i = 0; i < cfg->row_cfg->stage_num && i < MAX_TXFM_STAGE_NUM; ++i) {
+ stage_range_row[i] = cfg->row_cfg->stage_range[i] + start_range + bd + 1 -
+ cfg->row_cfg->stage_range[0];
+ }
+ // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
+ for (int i = 0; i < cfg->col_cfg->stage_num && i < MAX_TXFM_STAGE_NUM; ++i) {
+ stage_range_col[i] = cfg->col_cfg->stage_range[i] + start_range + shift[0] +
+ bd + 1 - cfg->col_cfg->stage_range[0] +
+ rect_type2_shift;
+ }
+}
+
+static INLINE void inv_txfm2d_add_c(const int32_t *input, uint16_t *output,
+ int stride, TXFM_2D_FLIP_CFG *cfg,
+ int32_t *txfm_buf, TX_SIZE tx_size,
+ int bd) {
+ // Note when assigning txfm_size_col, we use the txfm_size from the
+ // row configuration and vice versa. This is intentionally done to
+ // accurately perform rectangular transforms. When the transform is
+ // rectangular, the number of columns will be the same as the
+ // txfm_size stored in the row cfg struct. It will make no difference
+ // for square transforms.
+ const int txfm_size_col = cfg->row_cfg->txfm_size;
+ const int txfm_size_row = cfg->col_cfg->txfm_size;
+ // Take the shift from the larger dimension in the rectangular case.
+ const int8_t *shift = (txfm_size_col > txfm_size_row) ? cfg->row_cfg->shift
+ : cfg->col_cfg->shift;
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ int rect_type2_shift = 0;
+ int shift1 = shift[1];
+ if (rect_type == 2 || rect_type == -2) {
+ const int txfm_size_max = AOMMAX(txfm_size_col, txfm_size_row);
+ // For 16x4 / 4x16 shift 1 bit, for 32x8 / 8x32 / 64x16 / 16x64 no need
+ // for any additional shift.
+ rect_type2_shift = (txfm_size_max == 16 ? 1 : 0);
+ while (rect_type2_shift > 0 && shift1 < 0) {
+ shift1++;
+ rect_type2_shift--;
+ }
+ }
int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
assert(cfg->row_cfg->stage_num <= MAX_TXFM_STAGE_NUM);
assert(cfg->col_cfg->stage_num <= MAX_TXFM_STAGE_NUM);
- av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, fwd_shift, bd);
+ av1_gen_inv_stage_range(stage_range_col, stage_range_row, cfg, tx_size, bd);
const int8_t *cos_bit_col = cfg->col_cfg->cos_bit;
const int8_t *cos_bit_row = cfg->row_cfg->cos_bit;
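Note: as a worked example of the range derivation in av1_gen_inv_stage_range()
above, take the 8x16 row DCT with an 8-bit pipeline. Because stage_range[0] is
subtracted back out, every row stage gets the same budget:

// Hedged arithmetic sketch; the cfg values are from this patch, the TX_8X16
// start-range value of 7 is an assumed table entry.
static INLINE int8_t example_inv_row_range(void) {
  const int8_t cfg_range_i = 5;  // inv_stage_range_row_dct_8x16[i]
  const int8_t cfg_range_0 = 5;  // inv_stage_range_row_dct_8x16[0]
  const int8_t start_range = 7;  // inv_start_range[TX_8X16]
  const int bd = 8;              // bit depth
  return (int8_t)(cfg_range_i + start_range + bd + 1 - cfg_range_0);  // 16
}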
@@ -340,7 +365,7 @@
temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
}
txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
- av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
+ av1_round_shift_array(temp_out, txfm_size_row, -shift1);
if (cfg->ud_flip == 0) {
for (r = 0; r < txfm_size_row; ++r) {
output[r * stride + c] =
@@ -364,9 +389,5 @@
   av1_get_inv_txfm_cfg(tx_type, tx_size, &cfg);
- // Forward shift sum uses larger square size, to be consistent with what
- // av1_gen_inv_stage_range() does for inverse shifts.
- const TX_SIZE tx_size_sqr_up = txsize_sqr_up_map[tx_size];
- inv_txfm2d_add_c(input, output, stride, &cfg, txfm_buf,
- fwd_shift_sum[tx_size_sqr_up], bd);
+ inv_txfm2d_add_c(input, output, stride, &cfg, txfm_buf, tx_size, bd);
}
void av1_inv_txfm2d_add_4x8_c(const int32_t *input, uint16_t *output,
@@ -378,6 +401,9 @@
void av1_inv_txfm2d_add_8x4_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
DECLARE_ALIGNED(32, int, txfm_buf[8 * 4 + 8 + 8]);
+#if NO_INV_TRANSPOSE
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_8X4, bd);
+#else
int32_t rinput[8 * 4];
uint16_t routput[8 * 4];
TX_SIZE tx_size = TX_8X4;
@@ -391,6 +417,7 @@
transpose_uint16(routput, rw, output, stride, w, h);
inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
transpose_uint16(output, stride, routput, rw, rw, rh);
+#endif // NO_INV_TRANSPOSE
}
void av1_inv_txfm2d_add_8x16_c(const int32_t *input, uint16_t *output,
@@ -402,6 +429,9 @@
void av1_inv_txfm2d_add_16x8_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
DECLARE_ALIGNED(32, int, txfm_buf[16 * 8 + 16 + 16]);
+#if NO_INV_TRANSPOSE
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X8, bd);
+#else
int32_t rinput[16 * 8];
uint16_t routput[16 * 8];
TX_SIZE tx_size = TX_16X8;
@@ -415,6 +445,7 @@
transpose_uint16(routput, rw, output, stride, w, h);
inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
transpose_uint16(output, stride, routput, rw, rw, rh);
+#endif // NO_INV_TRANSPOSE
}
void av1_inv_txfm2d_add_16x32_c(const int32_t *input, uint16_t *output,
@@ -426,6 +457,9 @@
void av1_inv_txfm2d_add_32x16_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
DECLARE_ALIGNED(32, int, txfm_buf[32 * 16 + 32 + 32]);
+#if NO_INV_TRANSPOSE
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X16, bd);
+#else
int32_t rinput[32 * 16];
uint16_t routput[32 * 16];
TX_SIZE tx_size = TX_32X16;
@@ -439,6 +473,7 @@
transpose_uint16(routput, rw, output, stride, w, h);
inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
transpose_uint16(output, stride, routput, rw, rw, rh);
+#endif // NO_INV_TRANSPOSE
}
void av1_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output,
@@ -494,6 +529,10 @@
memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
}
DECLARE_ALIGNED(32, int, txfm_buf[64 * 32 + 64 + 64]);
+#if NO_INV_TRANSPOSE
+ inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X32,
+ bd);
+#else
int32_t rinput[64 * 32];
uint16_t routput[64 * 32];
TX_SIZE tx_size = TX_64X32;
@@ -507,6 +546,7 @@
transpose_uint16(routput, rw, output, stride, w, h);
inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
transpose_uint16(output, stride, routput, rw, rw, rh);
+#endif // NO_INV_TRANSPOSE
}
void av1_inv_txfm2d_add_32x64_c(const int32_t *input, uint16_t *output,
@@ -546,6 +586,10 @@
memset(mod_input + row * 64 + 32, 0, 32 * sizeof(*mod_input));
}
DECLARE_ALIGNED(32, int, txfm_buf[16 * 64 + 64 + 64]);
+#if NO_INV_TRANSPOSE
+ inv_txfm2d_add_facade(mod_input, output, stride, txfm_buf, tx_type, TX_64X16,
+ bd);
+#else
int32_t rinput[16 * 64];
uint16_t routput[16 * 64];
TX_SIZE tx_size = TX_64X16;
@@ -559,6 +603,7 @@
transpose_uint16(routput, rw, output, stride, w, h);
inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
transpose_uint16(output, stride, routput, rw, rw, rh);
+#endif // NO_INV_TRANSPOSE
}
#endif // CONFIG_TX64X64
@@ -571,6 +616,9 @@
void av1_inv_txfm2d_add_16x4_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
DECLARE_ALIGNED(32, int, txfm_buf[4 * 16 + 16 + 16]);
+#if NO_INV_TRANSPOSE
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_16X4, bd);
+#else
int32_t rinput[4 * 16];
uint16_t routput[4 * 16];
TX_SIZE tx_size = TX_16X4;
@@ -584,6 +632,7 @@
transpose_uint16(routput, rw, output, stride, w, h);
inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
transpose_uint16(output, stride, routput, rw, rw, rh);
+#endif // NO_INV_TRANSPOSE
}
void av1_inv_txfm2d_add_8x32_c(const int32_t *input, uint16_t *output,
@@ -595,6 +644,9 @@
void av1_inv_txfm2d_add_32x8_c(const int32_t *input, uint16_t *output,
int stride, TX_TYPE tx_type, int bd) {
DECLARE_ALIGNED(32, int, txfm_buf[8 * 32 + 32 + 32]);
+#if NO_INV_TRANSPOSE
+ inv_txfm2d_add_facade(input, output, stride, txfm_buf, tx_type, TX_32X8, bd);
+#else
int32_t rinput[8 * 32];
uint16_t routput[8 * 32];
TX_SIZE tx_size = TX_32X8;
@@ -608,4 +660,5 @@
transpose_uint16(routput, rw, output, stride, w, h);
inv_txfm2d_add_facade(rinput, routput, rw, txfm_buf, rtx_type, rtx_size, bd);
transpose_uint16(output, stride, routput, rw, rw, rh);
+#endif // NO_INV_TRANSPOSE
}
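Note: the while loops added in av1_gen_inv_stage_range() and
inv_txfm2d_add_c() cancel the rectangular up-shift against the final
down-shift bit for bit: the overall scale 2^(up - down) is unchanged, but
intermediates carry one bit less per cancelled pair. A standalone sketch of
the invariant, using non-negative counts (the patch carries the down-shift as
a negative shift1):

// up = rect_type2_shift (left-shift bits applied mid-pipeline);
// down = -shift1 (right-shift bits applied at the very end).
static void cancel_shift_pair(int *up, int *down) {
  while (*up > 0 && *down > 0) {
    --*up;    // apply one fewer up-shift bit mid-pipeline
    --*down;  // and one fewer down-shift bit at the end
  }
}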
diff --git a/av1/common/av1_txfm.h b/av1/common/av1_txfm.h
index 061e0d8..df9fc3c 100644
--- a/av1/common/av1_txfm.h
+++ b/av1/common/av1_txfm.h
@@ -263,7 +263,7 @@
const TXFM_2D_FLIP_CFG *cfg, int bd);
void av1_gen_inv_stage_range(int8_t *stage_range_col, int8_t *stage_range_row,
- const TXFM_2D_FLIP_CFG *cfg, int8_t fwd_shift,
+ const TXFM_2D_FLIP_CFG *cfg, TX_SIZE tx_size,
int bd);
void av1_get_fwd_txfm_cfg(TX_TYPE tx_type, TX_SIZE tx_size,
diff --git a/av1/encoder/av1_fwd_txfm1d_cfg.h b/av1/encoder/av1_fwd_txfm1d_cfg.h
index 1d925ed..187f62d 100644
--- a/av1/encoder/av1_fwd_txfm1d_cfg.h
+++ b/av1/encoder/av1_fwd_txfm1d_cfg.h
@@ -209,8 +209,8 @@
ARRAYOFFSET4(4, 0, 1, 2, 2);
static const int8_t fwd_stage_range_row_adst_4x16[6] =
ARRAYOFFSET6(4, 0, 0, 1, 2, 2, 2);
-static const int8_t fwd_cos_bit_row_dct_4x16[6] = { 13, 13, 13, 13 };
-static const int8_t fwd_cos_bit_row_adst_4x16[6] = { 13, 13, 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_4x16[6] = { 12, 12, 12, 12 };
+static const int8_t fwd_cos_bit_row_adst_4x16[6] = { 12, 12, 12, 12, 12, 12 };
// ---------------- 16x4 1D constants -----------------------
#define fwd_shift_16x4 fwd_shift_16
@@ -218,10 +218,10 @@
ARRAYOFFSET8(2, 0, 1, 2, 3, 4, 4, 4, 4);
static const int8_t fwd_stage_range_row_adst_16x4[10] =
ARRAYOFFSET10(2, 0, 0, 1, 2, 2, 3, 3, 4, 4, 4);
-static const int8_t fwd_cos_bit_row_dct_16x4[8] = { 13, 13, 13, 13,
- 13, 13, 13, 13 };
-static const int8_t fwd_cos_bit_row_adst_16x4[10] = { 13, 13, 13, 13, 13,
- 13, 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_16x4[8] = { 12, 12, 12, 12,
+ 12, 12, 12, 12 };
+static const int8_t fwd_cos_bit_row_adst_16x4[10] = { 12, 12, 12, 12, 12,
+ 12, 12, 12, 12, 12 };
// ---------------- 8x32 1D constants -----------------------
#define fwd_shift_8x32 fwd_shift_32
@@ -229,9 +229,9 @@
ARRAYOFFSET6(5, 0, 1, 2, 3, 3, 3);
static const int8_t fwd_stage_range_row_adst_8x32[8] =
ARRAYOFFSET8(5, 0, 0, 1, 2, 2, 3, 3, 3);
-static const int8_t fwd_cos_bit_row_dct_8x32[6] = { 13, 13, 13, 13, 13, 13 };
-static const int8_t fwd_cos_bit_row_adst_8x32[8] = { 13, 13, 13, 13,
- 13, 13, 13, 13 };
+static const int8_t fwd_cos_bit_row_dct_8x32[6] = { 12, 12, 11, 11, 11, 11 };
+static const int8_t fwd_cos_bit_row_adst_8x32[8] = { 12, 12, 12, 12,
+ 11, 11, 11, 11 };
// ---------------- 32x8 1D constants -----------------------
#define fwd_shift_32x8 fwd_shift_32
@@ -239,17 +239,17 @@
ARRAYOFFSET10(3, 0, 1, 2, 3, 4, 5, 5, 5, 5, 5);
static const int8_t fwd_stage_range_row_adst_32x8[12] =
ARRAYOFFSET12(3, 0, 0, 1, 2, 2, 3, 3, 4, 4, 5, 5, 5);
-static const int8_t fwd_cos_bit_row_dct_32x8[10] = { 12, 12, 12, 12, 12,
- 12, 12, 12, 12, 12 };
+static const int8_t fwd_cos_bit_row_dct_32x8[10] = { 12, 12, 12, 12, 11,
+ 11, 11, 11, 11, 11 };
static const int8_t fwd_cos_bit_row_adst_32x8[12] = { 12, 12, 12, 12, 12, 12,
- 12, 12, 12, 12, 12, 12 };
+ 12, 11, 11, 11, 11, 11 };
// ---------------- 16x64 1D constants -----------------------
#define fwd_shift_16x64 fwd_shift_64
static const int8_t fwd_stage_range_row_dct_16x64[8] =
ARRAYOFFSET8(6, 0, 1, 2, 3, 4, 4, 4, 4);
-static const int8_t fwd_cos_bit_row_dct_16x64[8] = { 12, 12, 12, 11,
- 11, 11, 11, 11 };
+static const int8_t fwd_cos_bit_row_dct_16x64[8] = { 12, 11, 10, 10,
+ 10, 10, 10, 10 };
// ---------------- 64x16 1D constants -----------------------
#define fwd_shift_64x16 fwd_shift_64
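Note: the cos bit reductions in this header (13 to 12, 12 to 11, and so on)
trade one bit of cosine precision for one bit of headroom. A plausible
reading, assumed here rather than taken from the patch: a butterfly output
w0 * x0 + w1 * x1, with weights below 2^cos_bit and inputs within the declared
stage range, must still fit a 32-bit accumulator.

// Hedged feasibility bound (the exact accounting is an assumption):
static INLINE int butterfly_fits_int32(int8_t stage_range, int8_t cos_bit) {
  return stage_range + cos_bit + 1 <= 32;
}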
diff --git a/av1/encoder/av1_fwd_txfm2d.c b/av1/encoder/av1_fwd_txfm2d.c
index 99102f2..250521e 100644
--- a/av1/encoder/av1_fwd_txfm2d.c
+++ b/av1/encoder/av1_fwd_txfm2d.c
@@ -19,6 +19,8 @@
#include "av1/encoder/av1_fwd_txfm1d.h"
#include "av1/encoder/av1_fwd_txfm1d_cfg.h"
+#define NO_FWD_TRANSPOSE 1
+
static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
switch (txfm_type) {
case TXFM_TYPE_DCT4: return av1_fdct4_new;
@@ -61,10 +63,23 @@
stage_range_col[i] = cfg->col_cfg->stage_range[i] + shift[0] + bd + 1;
}
+ const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
+ int rect_shift = 0;
+ int shift2 = shift[2];
+ if (rect_type == 2 || rect_type == -2) {
+ const int txfm_size_max = AOMMAX(txfm_size_col, txfm_size_row);
+ // For 64x16 / 16x64 / 32x8 / 8x32 shift 2 bits, and
+ // For 16x4 / 4x16 shift by 1 bit.
+ rect_shift = (txfm_size_max >= 32) ? 2 : 1;
+ }
+ while (rect_shift > 0 && shift2 < 0) {
+ shift2++;
+ rect_shift--;
+ }
// i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
for (int i = 0; i < cfg->row_cfg->stage_num && i < MAX_TXFM_STAGE_NUM; ++i) {
- stage_range_row[i] =
- cfg->row_cfg->stage_range[i] + shift[0] + shift[1] + bd + 1;
+ stage_range_row[i] = cfg->row_cfg->stage_range[i] + shift[0] + shift[1] +
+ bd + 1 + rect_shift;
}
}
@@ -80,6 +95,10 @@
// for square transforms.
const int txfm_size_col = cfg->row_cfg->txfm_size;
const int txfm_size_row = cfg->col_cfg->txfm_size;
+ // Take the shift from the larger dimension in the rectangular case.
+ const int8_t *shift = (txfm_size_col > txfm_size_row) ? cfg->row_cfg->shift
+ : cfg->col_cfg->shift;
+ int shift2 = shift[2];
const int rect_type = get_rect_tx_log_ratio(txfm_size_col, txfm_size_row);
int rect_type2_shift = 0;
if (rect_type == 2 || rect_type == -2) {
@@ -87,10 +106,11 @@
// For 64x16 / 16x64 / 32x8 / 8x32 shift 2 bits, and
// For 16x4 / 4x16 shift by 1 bit.
rect_type2_shift = (txfm_size_max >= 32) ? 2 : 1;
+ while (rect_type2_shift > 0 && shift2 < 0) {
+ shift2++;
+ rect_type2_shift--;
+ }
}
- // Take the shift from the larger dimension in the rectangular case.
- const int8_t *shift = (txfm_size_col > txfm_size_row) ? cfg->row_cfg->shift
- : cfg->col_cfg->shift;
int8_t stage_range_col[MAX_TXFM_STAGE_NUM];
int8_t stage_range_row[MAX_TXFM_STAGE_NUM];
assert(cfg->col_cfg->stage_num <= MAX_TXFM_STAGE_NUM);
@@ -116,17 +136,17 @@
temp_in[r] = input[(txfm_size_row - r - 1) * stride + c];
}
av1_round_shift_array(temp_in, txfm_size_row, -shift[0]);
+ txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
// Multiply everything by Sqrt2 on the larger dimension if the
// transform is rectangular and the size difference is a factor of 2.
// If the size difference is a factor of 4, multiply by
// 2^rect_type_2_extra_shift.
if (rect_type == 1) {
for (r = 0; r < txfm_size_row; ++r)
- temp_in[r] = (int32_t)fdct_round_shift(temp_in[r] * Sqrt2);
+ temp_out[r] = (int32_t)fdct_round_shift(temp_out[r] * Sqrt2);
} else if (rect_type == 2) {
- av1_round_shift_array(temp_in, txfm_size_row, -rect_type2_shift);
+ av1_round_shift_array(temp_out, txfm_size_row, -rect_type2_shift);
}
- txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
if (cfg->lr_flip == 0) {
for (r = 0; r < txfm_size_row; ++r)
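Note: the reordering above matters for range checking: the Sqrt2 /
rect-type-2 scaling now applies to the column transform's output (temp_out)
rather than its input, so values entering the column kernel match the
unscaled cfg ranges. A hedged sketch of the fixed-point multiply involved
(5793 ~= sqrt(2) * 2^12 is an assumed representation; the patch uses libaom's
Sqrt2 constant and fdct_round_shift):

static INLINE int32_t scale_by_sqrt2(int32_t x) {
  const int64_t kSqrt2 = 5793;                // round(sqrt(2) * 2^12)
  const int64_t rounding = (int64_t)1 << 11;  // half of 2^12
  return (int32_t)((x * kSqrt2 + rounding) >> 12);
}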
@@ -154,13 +174,18 @@
}
txfm_func_row(buf + r * txfm_size_col, output + r * txfm_size_col,
cos_bit_row, stage_range_row);
- av1_round_shift_array(output + r * txfm_size_col, txfm_size_col, -shift[2]);
+ av1_round_shift_array(output + r * txfm_size_col, txfm_size_col, -shift2);
}
}
void av1_fwd_txfm2d_4x8_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 8]);
+ TXFM_2D_FLIP_CFG cfg;
+#if NO_FWD_TRANSPOSE
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X8, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+#else
int16_t rinput[4 * 8];
TX_SIZE tx_size = TX_4X8;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
@@ -170,10 +195,10 @@
int rw = h;
int rh = w;
transpose_int16(rinput, rw, input, stride, w, h);
- TXFM_2D_FLIP_CFG cfg;
av1_get_fwd_txfm_cfg(rtx_type, rtx_size, &cfg);
fwd_txfm2d_c(rinput, txfm_buf, rw, &cfg, output, bd);
transpose_int32(output, w, txfm_buf, rw, rw, rh);
+#endif // NO_FWD_TRANSPOSE
}
void av1_fwd_txfm2d_8x4_c(const int16_t *input, int32_t *output, int stride,
@@ -187,6 +212,11 @@
void av1_fwd_txfm2d_8x16_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
DECLARE_ALIGNED(32, int32_t, txfm_buf[8 * 16]);
+ TXFM_2D_FLIP_CFG cfg;
+#if NO_FWD_TRANSPOSE
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+#else
int16_t rinput[8 * 16];
TX_SIZE tx_size = TX_8X16;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
@@ -196,10 +226,10 @@
int rw = h;
int rh = w;
transpose_int16(rinput, rw, input, stride, w, h);
- TXFM_2D_FLIP_CFG cfg;
av1_get_fwd_txfm_cfg(rtx_type, rtx_size, &cfg);
fwd_txfm2d_c(rinput, txfm_buf, rw, &cfg, output, bd);
transpose_int32(output, w, txfm_buf, rw, rw, rh);
+#endif // NO_FWD_TRANSPOSE
}
void av1_fwd_txfm2d_16x8_c(const int16_t *input, int32_t *output, int stride,
@@ -213,6 +243,11 @@
void av1_fwd_txfm2d_16x32_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
DECLARE_ALIGNED(32, int32_t, txfm_buf[16 * 32]);
+ TXFM_2D_FLIP_CFG cfg;
+#if NO_FWD_TRANSPOSE
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+#else
int16_t rinput[16 * 32];
TX_SIZE tx_size = TX_16X32;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
@@ -222,10 +257,10 @@
int rw = h;
int rh = w;
transpose_int16(rinput, rw, input, stride, w, h);
- TXFM_2D_FLIP_CFG cfg;
av1_get_fwd_txfm_cfg(rtx_type, rtx_size, &cfg);
fwd_txfm2d_c(rinput, txfm_buf, rw, &cfg, output, bd);
transpose_int32(output, w, txfm_buf, rw, rw, rh);
+#endif // NO_FWD_TRANSPOSE
}
void av1_fwd_txfm2d_32x16_c(const int16_t *input, int32_t *output, int stride,
@@ -239,6 +274,11 @@
void av1_fwd_txfm2d_4x16_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
DECLARE_ALIGNED(32, int32_t, txfm_buf[4 * 16]);
+ TXFM_2D_FLIP_CFG cfg;
+#if NO_FWD_TRANSPOSE
+ av1_get_fwd_txfm_cfg(tx_type, TX_4X16, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+#else
int16_t rinput[4 * 16];
TX_SIZE tx_size = TX_4X16;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
@@ -248,10 +288,10 @@
int rw = h;
int rh = w;
transpose_int16(rinput, rw, input, stride, w, h);
- TXFM_2D_FLIP_CFG cfg;
av1_get_fwd_txfm_cfg(rtx_type, rtx_size, &cfg);
fwd_txfm2d_c(rinput, txfm_buf, rw, &cfg, output, bd);
transpose_int32(output, w, txfm_buf, rw, rw, rh);
+#endif // NO_FWD_TRANSPOSE
}
void av1_fwd_txfm2d_16x4_c(const int16_t *input, int32_t *output, int stride,
@@ -265,6 +305,11 @@
void av1_fwd_txfm2d_8x32_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 8]);
+ TXFM_2D_FLIP_CFG cfg;
+#if NO_FWD_TRANSPOSE
+ av1_get_fwd_txfm_cfg(tx_type, TX_8X32, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+#else
int16_t rinput[32 * 8];
TX_SIZE tx_size = TX_8X32;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
@@ -274,10 +319,10 @@
int rw = h;
int rh = w;
transpose_int16(rinput, rw, input, stride, w, h);
- TXFM_2D_FLIP_CFG cfg;
av1_get_fwd_txfm_cfg(rtx_type, rtx_size, &cfg);
fwd_txfm2d_c(rinput, txfm_buf, rw, &cfg, output, bd);
transpose_int32(output, w, txfm_buf, rw, rw, rh);
+#endif // NO_FWD_TRANSPOSE
}
void av1_fwd_txfm2d_32x8_c(const int16_t *input, int32_t *output, int stride,
@@ -343,6 +388,11 @@
void av1_fwd_txfm2d_32x64_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
DECLARE_ALIGNED(32, int32_t, txfm_buf[32 * 64]);
+ TXFM_2D_FLIP_CFG cfg;
+#if NO_FWD_TRANSPOSE
+ av1_get_fwd_txfm_cfg(tx_type, TX_32X64, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+#else
int16_t rinput[64 * 32];
TX_SIZE tx_size = TX_32X64;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
@@ -352,10 +402,10 @@
int rw = h;
int rh = w;
transpose_int16(rinput, rw, input, stride, w, h);
- TXFM_2D_FLIP_CFG cfg;
av1_get_fwd_txfm_cfg(rtx_type, rtx_size, &cfg);
fwd_txfm2d_c(rinput, txfm_buf, rw, &cfg, output, bd);
transpose_int32(output, w, txfm_buf, rw, rw, rh);
+#endif // NO_FWD_TRANSPOSE
// Zero out the bottom 32x32 area.
memset(output + 32 * 32, 0, 32 * 32 * sizeof(*output));
@@ -382,6 +432,11 @@
void av1_fwd_txfm2d_16x64_c(const int16_t *input, int32_t *output, int stride,
TX_TYPE tx_type, int bd) {
DECLARE_ALIGNED(32, int32_t, txfm_buf[64 * 16]);
+ TXFM_2D_FLIP_CFG cfg;
+#if NO_FWD_TRANSPOSE
+ av1_get_fwd_txfm_cfg(tx_type, TX_16X64, &cfg);
+ fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf, bd);
+#else
int16_t rinput[64 * 16];
TX_SIZE tx_size = TX_16X64;
TX_SIZE rtx_size = av1_rotate_tx_size(tx_size);
@@ -391,10 +446,10 @@
int rw = h;
int rh = w;
transpose_int16(rinput, rw, input, stride, w, h);
- TXFM_2D_FLIP_CFG cfg;
av1_get_fwd_txfm_cfg(rtx_type, rtx_size, &cfg);
fwd_txfm2d_c(rinput, txfm_buf, rw, &cfg, output, bd);
transpose_int32(output, w, txfm_buf, rw, rw, rh);
+#endif // NO_FWD_TRANSPOSE
// Zero out the bottom 16x32 area.
memset(output + 16 * 32, 0, 16 * 32 * sizeof(*output));
// Note: no repacking needed here.