Move InvSqrt2 to the front of inv_txfm2d_add_c

This will simplify the range management of rect txfm

Change-Id: Icf678fe735dd299c6c42a215c592611025e87ba6
diff --git a/av1/common/av1_inv_txfm2d.c b/av1/common/av1_inv_txfm2d.c
index cf98f71..c929540 100644
--- a/av1/common/av1_inv_txfm2d.c
+++ b/av1/common/av1_inv_txfm2d.c
@@ -308,7 +308,6 @@
     stage_range_row[i] = cfg->row_cfg->stage_range[i] + fwd_shift + bd + 1;
   }
   // i < MAX_TXFM_STAGE_NUM will mute above array bounds warning
-  // TODO(angiebird): correct the range for rect txfms
   for (int i = 0; i < cfg->col_cfg->stage_num && i < MAX_TXFM_STAGE_NUM; ++i) {
     stage_range_col[i] =
         cfg->col_cfg->stage_range[i] + fwd_shift + shift[0] + bd + 1;
@@ -341,17 +340,25 @@
   const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->col_cfg->txfm_type);
   const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->row_cfg->txfm_type);
 
-  // txfm_buf's length is  txfm_size_row * txfm_size_col + 2 * txfm_size_row
+  // txfm_buf's length is  txfm_size_row * txfm_size_col + 2 *
+  // AOMMAX(txfm_size_row, txfm_size_col)
   // it is used for intermediate data buffering
   int32_t *temp_in = txfm_buf;
-  int32_t *temp_out = temp_in + txfm_size_row;
-  int32_t *buf = temp_out + txfm_size_row;
+  int32_t *temp_out = temp_in + AOMMAX(txfm_size_row, txfm_size_col);
+  int32_t *buf = temp_out + AOMMAX(txfm_size_row, txfm_size_col);
   int32_t *buf_ptr = buf;
   int c, r;
 
   // Rows
   for (r = 0; r < txfm_size_row; ++r) {
-    txfm_func_row(input, buf_ptr, cos_bit_row, stage_range_row);
+    if (abs(rect_type) == 1) {
+      for (c = 0; c < txfm_size_col; ++c) {
+        temp_in[c] = (int32_t)dct_const_round_shift(input[c] * InvSqrt2);
+      }
+      txfm_func_row(temp_in, buf_ptr, cos_bit_row, stage_range_row);
+    } else {
+      txfm_func_row(input, buf_ptr, cos_bit_row, stage_range_row);
+    }
     av1_round_shift_array(buf_ptr, txfm_size_col, -shift[0]);
     clamp_buf(buf_ptr, txfm_size_col, bd + 8);
     input += txfm_size_col;
@@ -369,11 +376,6 @@
         temp_in[r] = buf[r * txfm_size_col + (txfm_size_col - c - 1)];
     }
     txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
-    if (abs(rect_type) == 1) {
-      for (r = 0; r < txfm_size_row; ++r) {
-        temp_out[r] = (int32_t)dct_const_round_shift(temp_out[r] * InvSqrt2);
-      }
-    }
     av1_round_shift_array(temp_out, txfm_size_row, -shift[1]);
     clamp_buf(temp_out, txfm_size_row, bd + 1);
     if (cfg->ud_flip == 0) {
diff --git a/test/av1_inv_txfm2d_test.cc b/test/av1_inv_txfm2d_test.cc
index 9866eb5..6260424 100644
--- a/test/av1_inv_txfm2d_test.cc
+++ b/test/av1_inv_txfm2d_test.cc
@@ -169,7 +169,7 @@
 #if CONFIG_TX64X64
     if (tx_type == DCT_DCT) {  // Other types not supported by these tx sizes.
       param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_32X64, 5, 0.38));
-      param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_64X32, 5, 0.38));
+      param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_64X32, 5, 0.39));
       param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_16X64, 3, 0.38));
       param_list.push_back(AV1InvTxfm2dParam(tx_type, TX_64X16, 3, 0.38));
     }