Fix av1_get_tx_scale() for 32x8 and 8x32 tx

Return a scale of 0 for these sizes so that they run at higher precision.

Change-Id: I51decbf9179efa18a1a06dcc3f0e939d9895a5cd
diff --git a/av1/common/idct.c b/av1/common/idct.c
index 878e9ed..469f5ba 100644
--- a/av1/common/idct.c
+++ b/av1/common/idct.c
@@ -21,13 +21,8 @@
 #include "av1/common/idct.h"
 
 int av1_get_tx_scale(const TX_SIZE tx_size) {
-  if (txsize_sqr_up_map[tx_size] == TX_32X32) return 1;
-#if CONFIG_TX64X64
-  else if (txsize_sqr_up_map[tx_size] == TX_64X64)
-    return 2;
-#endif  // CONFIG_TX64X64
-  else
-    return 0;
+  const int pels = tx_size_2d[tx_size];
+  return (pels > 256) + (pels > 1024) + (pels > 4096);
 }
 
 // NOTE: The implementation of all inverses need to be aware of the fact
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 3b47fe9..b179674 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -1928,7 +1928,7 @@
     for (j = 0; j < n4; ++j)
       output[i + j * n] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
   }
-  // Note: overall scale factor of transform is 4 times unitary
+  // Note: overall scale factor of transform is 8 times unitary
 }
 
 void av1_fht32x8_c(const int16_t *input, tran_low_t *output, int stride,
@@ -1995,7 +1995,7 @@
     for (j = 0; j < n4; ++j)
       output[j + i * n4] = ROUND_POWER_OF_TWO_SIGNED(temp_out[j], 2);
   }
-  // Note: overall scale factor of transform is 4 times unitary
+  // Note: overall scale factor of transform is 8 times unitary
 }
 
 void av1_fht16x32_c(const int16_t *input, tran_low_t *output, int stride,