Move all of LBD Daala TX to up-4, down-1 shift

Now that tran_low_t is assumed to be 32 bit when Daala TX is active,
there's no reason for multi-stage shifting to fit coefficients into 16
bits for the inter-transform transpose matrix. Go to a consistent up by
four, down by one shifting scheme for all TX block sizes.

(Note this is for the current AV1 coefficient scaling scheme with
av1_get_tx_scale and deeper coefficients for higher bitdepth input.
Daala TX is moving to the long-intended constant-coefficient-depth in
upcoming patches).

subset 1:
monty-4-1-baseline-s1@2017-11-11T05:57:15.857Z ->
 monty-4-1-test-s1@2017-11-11T05:57:52.983Z

   PSNR | PSNR Cb | PSNR Cr | PSNR HVS |   SSIM | MS SSIM | CIEDE 2000
-0.0117 | -0.0246 |  0.0530 |   0.0238 | 0.0254 |  0.0447 |    -0.0442

Change-Id: I2214e94ac822542c504d472276723277ed350abf
diff --git a/av1/common/idct.c b/av1/common/idct.c
index dc07155..c3bde4c 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -1008,9 +1008,9 @@
   for (int i = 0; i < n2; ++i) {
 #if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
     tran_low_t temp_in[16];
-    for (int j = 0; j < n; j++) temp_in[j] = input[j] * 2;
+    for (int j = 0; j < n; j++) temp_in[j] = input[j] * 4;
     IHT_16x32[tx_type].rows(temp_in, outtmp);
-    for (int j = 0; j < n; ++j) tmp[j][i] = outtmp[j] * 4;
+    for (int j = 0; j < n; ++j) tmp[j][i] = outtmp[j];
 #else
     IHT_16x32[tx_type].rows(input, outtmp);
     for (int j = 0; j < n; ++j)
@@ -1030,7 +1030,7 @@
       int d = i * stride + j;
       int s = j * outstride + i;
 #if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
 #else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
 #endif
@@ -1095,9 +1095,9 @@
   for (int i = 0; i < n; ++i) {
 #if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
     tran_low_t temp_in[32];
-    for (int j = 0; j < n2; j++) temp_in[j] = input[j] * 2;
+    for (int j = 0; j < n2; j++) temp_in[j] = input[j] * 4;
     IHT_32x16[tx_type].rows(temp_in, outtmp);
-    for (int j = 0; j < n2; ++j) tmp[j][i] = outtmp[j] * 4;
+    for (int j = 0; j < n2; ++j) tmp[j][i] = outtmp[j];
 #else
     IHT_32x16[tx_type].rows(input, outtmp);
     for (int j = 0; j < n2; ++j)
@@ -1117,7 +1117,7 @@
       int d = i * stride + j;
       int s = j * outstride + i;
 #if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
 #else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
 #endif
@@ -1360,7 +1360,7 @@
   for (int i = 0; i < 32; ++i) {
 #if CONFIG_DAALA_TX32
     tran_low_t temp_in[32];
-    for (int j = 0; j < 32; j++) temp_in[j] = input[j] * 2;
+    for (int j = 0; j < 32; j++) temp_in[j] = input[j] * 4;
     IHT_32[tx_type].rows(temp_in, out[i]);
 #else
     IHT_32[tx_type].rows(input, out[i]);
@@ -1369,15 +1369,8 @@
   }
 
   // transpose
-  for (int i = 0; i < 32; i++) {
-    for (int j = 0; j < 32; j++) {
-#if CONFIG_DAALA_TX32
-      tmp[j][i] = out[i][j] * 4;
-#else
-      tmp[j][i] = out[i][j];
-#endif
-    }
-  }
+  for (int i = 0; i < 32; i++)
+    for (int j = 0; j < 32; j++) tmp[j][i] = out[i][j];
 
   // inverse transform column vectors
   for (int i = 0; i < 32; ++i) IHT_32[tx_type].cols(tmp[i], out[i]);
@@ -1390,7 +1383,7 @@
       int d = i * stride + j;
       int s = j * outstride + i;
 #if CONFIG_DAALA_TX32
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
 #else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
 #endif
@@ -1455,9 +1448,8 @@
   for (int i = 0; i < 64; ++i) {
 #if CONFIG_DAALA_TX64
     tran_low_t temp_in[64];
-    for (int j = 0; j < 64; j++) temp_in[j] = input[j] * 2;
+    for (int j = 0; j < 64; j++) temp_in[j] = input[j] * 8;
     IHT_64[tx_type].rows(temp_in, out[i]);
-// Do not rescale intermediate for Daala
 #else
     IHT_64[tx_type].rows(input, out[i]);
     for (int j = 0; j < 64; ++j) out[i][j] = ROUND_POWER_OF_TWO(out[i][j], 1);
@@ -1483,7 +1475,7 @@
       int d = i * stride + j;
       int s = j * outstride + i;
 #if CONFIG_DAALA_TX64
-      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 2));
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
 #else
       dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
 #endif
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 6ae3080..43d059a 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -1965,7 +1965,7 @@
     ht.rows(temp_in, temp_out);
     for (j = 0; j < n; ++j) {
 #if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-      out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+      out[j * n2 + i] = temp_out[j];
 #else
       out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
 #endif
@@ -1976,7 +1976,12 @@
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.cols(temp_in, temp_out);
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+    for (j = 0; j < n2; ++j)
+      output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+#else
     for (j = 0; j < n2; ++j) output[i + j * n] = temp_out[j];
+#endif
   }
   // Note: overall scale factor of transform is 4 times unitary
 }
@@ -2049,7 +2054,7 @@
     ht.cols(temp_in, temp_out);
     for (j = 0; j < n; ++j) {
 #if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
-      out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+      out[j * n2 + i] = temp_out[j];
 #else
       out[j * n2 + i] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 4);
 #endif
@@ -2060,7 +2065,12 @@
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.rows(temp_in, temp_out);
+#if CONFIG_DAALA_TX16 && CONFIG_DAALA_TX32
+    for (j = 0; j < n2; ++j)
+      output[j + i * n2] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
+#else
     for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j];
+#endif
   }
   // Note: overall scale factor of transform is 4 times unitary
 }
@@ -2489,8 +2499,7 @@
 #if CONFIG_DAALA_TX64
     for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i] * 16;
     ht.cols(temp_in, temp_out);
-    for (j = 0; j < 64; ++j)
-      out[j * 64 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 3;
+    for (j = 0; j < 64; ++j) out[j * 64 + i] = temp_out[j];
 
 #else
     for (j = 0; j < 64; ++j) temp_in[j] = input[j * stride + i];
@@ -2506,7 +2515,7 @@
     ht.rows(temp_in, temp_out);
     for (j = 0; j < 64; ++j)
 #if CONFIG_DAALA_TX64
-      output[j + i * 64] = temp_out[j];
+      output[j + i * 64] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 3);
 #else
       output[j + i * 64] =
           (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);