Fix rectangle transform computation overflow

- Add 16-bit saturation in fdct_round_shift().
- Add extreme value tests and round trip error tests.
- Fix inv 4x8 txfm calculation accuracy.
- Fix 4x8, 8x4, 8x16, 16x8, 16x32, 32x16 extreme value tests.
- BDRate: lowres: -0.034
          midres: -0.036
          hdres:  -0.013
BUG=webm:1340

Change-Id: I48365c1e50a03a7b1aa69b8856b732b483299fb5
diff --git a/aom_dsp/fwd_txfm.h b/aom_dsp/fwd_txfm.h
index fe8c182..ddc8283 100644
--- a/aom_dsp/fwd_txfm.h
+++ b/aom_dsp/fwd_txfm.h
@@ -14,12 +14,15 @@
 
 #include "aom_dsp/txfm_common.h"
 
+static INLINE tran_high_t saturate_int16(tran_high_t value) {
+  tran_high_t result;
+  result = value > INT16_MAX ? INT16_MAX : value;
+  return result < INT16_MIN ? INT16_MIN : result;
+}
+
 static INLINE tran_high_t fdct_round_shift(tran_high_t input) {
   tran_high_t rv = ROUND_POWER_OF_TWO(input, DCT_CONST_BITS);
-  // TODO(debargha, peter.derivaz): Find new bounds for this assert
-  // and make the bounds consts.
-  // assert(INT16_MIN <= rv && rv <= INT16_MAX);
-  return rv;
+  return saturate_int16(rv);
 }
 
 void aom_fdct32(const tran_high_t *input, tran_high_t *output, int round);
diff --git a/av1/common/x86/idct_intrin_sse2.c b/av1/common/x86/idct_intrin_sse2.c
index 315d1c3..828ed67 100644
--- a/av1/common/x86/idct_intrin_sse2.c
+++ b/av1/common/x86/idct_intrin_sse2.c
@@ -1190,8 +1190,6 @@
   in[6] = load_input_data(input + 2 * 8);
   in[7] = load_input_data(input + 3 * 8);
 
-  scale_sqrt2_8x4(in + 4);
-
   // Row transform
   switch (tx_type) {
     case DCT_DCT:
@@ -1230,6 +1228,8 @@
     default: assert(0); break;
   }
 
+  scale_sqrt2_8x4(in + 4);
+
   // Repack data
   in[0] = _mm_unpacklo_epi64(in[4], in[6]);
   in[1] = _mm_unpackhi_epi64(in[4], in[6]);
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 600acbe..03a0c92 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -787,10 +787,10 @@
   s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
   s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
 
-  x0 = fdct_round_shift(s0 + s4);
-  x1 = fdct_round_shift(s1 + s5);
-  x2 = fdct_round_shift(s2 + s6);
-  x3 = fdct_round_shift(s3 + s7);
+  x0 = s0 + s4;
+  x1 = s1 + s5;
+  x2 = s2 + s6;
+  x3 = s3 + s7;
   x4 = fdct_round_shift(s0 - s4);
   x5 = fdct_round_shift(s1 - s5);
   x6 = fdct_round_shift(s2 - s6);
@@ -806,10 +806,10 @@
   s6 = -cospi_24_64 * x6 + cospi_8_64 * x7;
   s7 = cospi_8_64 * x6 + cospi_24_64 * x7;
 
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
+  x0 = fdct_round_shift(s0 + s2);
+  x1 = fdct_round_shift(s1 + s3);
+  x2 = fdct_round_shift(s0 - s2);
+  x3 = fdct_round_shift(s1 - s3);
   x4 = fdct_round_shift(s4 + s6);
   x5 = fdct_round_shift(s5 + s7);
   x6 = fdct_round_shift(s4 - s6);
@@ -875,14 +875,15 @@
   s14 = x14 * cospi_29_64 + x15 * cospi_3_64;
   s15 = x14 * cospi_3_64 - x15 * cospi_29_64;
 
-  x0 = fdct_round_shift(s0 + s8);
-  x1 = fdct_round_shift(s1 + s9);
-  x2 = fdct_round_shift(s2 + s10);
-  x3 = fdct_round_shift(s3 + s11);
-  x4 = fdct_round_shift(s4 + s12);
-  x5 = fdct_round_shift(s5 + s13);
-  x6 = fdct_round_shift(s6 + s14);
-  x7 = fdct_round_shift(s7 + s15);
+  x0 = s0 + s8;
+  x1 = s1 + s9;
+  x2 = s2 + s10;
+  x3 = s3 + s11;
+  x4 = s4 + s12;
+  x5 = s5 + s13;
+  x6 = s6 + s14;
+  x7 = s7 + s15;
+
   x8 = fdct_round_shift(s0 - s8);
   x9 = fdct_round_shift(s1 - s9);
   x10 = fdct_round_shift(s2 - s10);
@@ -914,14 +915,15 @@
   x1 = s1 + s5;
   x2 = s2 + s6;
   x3 = s3 + s7;
-  x4 = s0 - s4;
-  x5 = s1 - s5;
-  x6 = s2 - s6;
-  x7 = s3 - s7;
-  x8 = fdct_round_shift(s8 + s12);
-  x9 = fdct_round_shift(s9 + s13);
-  x10 = fdct_round_shift(s10 + s14);
-  x11 = fdct_round_shift(s11 + s15);
+  x4 = fdct_round_shift(s0 - s4);
+  x5 = fdct_round_shift(s1 - s5);
+  x6 = fdct_round_shift(s2 - s6);
+  x7 = fdct_round_shift(s3 - s7);
+
+  x8 = s8 + s12;
+  x9 = s9 + s13;
+  x10 = s10 + s14;
+  x11 = s11 + s15;
   x12 = fdct_round_shift(s8 - s12);
   x13 = fdct_round_shift(s9 - s13);
   x14 = fdct_round_shift(s10 - s14);
@@ -945,18 +947,21 @@
   s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
   s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
 
-  x0 = s0 + s2;
-  x1 = s1 + s3;
-  x2 = s0 - s2;
-  x3 = s1 - s3;
+  x0 = fdct_round_shift(s0 + s2);
+  x1 = fdct_round_shift(s1 + s3);
+  x2 = fdct_round_shift(s0 - s2);
+  x3 = fdct_round_shift(s1 - s3);
+
   x4 = fdct_round_shift(s4 + s6);
   x5 = fdct_round_shift(s5 + s7);
   x6 = fdct_round_shift(s4 - s6);
   x7 = fdct_round_shift(s5 - s7);
-  x8 = s8 + s10;
-  x9 = s9 + s11;
-  x10 = s8 - s10;
-  x11 = s9 - s11;
+
+  x8 = fdct_round_shift(s8 + s10);
+  x9 = fdct_round_shift(s9 + s11);
+  x10 = fdct_round_shift(s8 - s10);
+  x11 = fdct_round_shift(s9 - s11);
+
   x12 = fdct_round_shift(s12 + s14);
   x13 = fdct_round_shift(s13 + s15);
   x14 = fdct_round_shift(s12 - s14);
@@ -1230,7 +1235,7 @@
   for (i = 0; i < n2; ++i) {
     for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j) output[j + i * n] = (temp_out[j] + 1) >> 2;
+    for (j = 0; j < n; ++j) output[j + i * n] = temp_out[j] >> 2;
   }
   // Note: overall scale factor of transform is 8 times unitary
 }
@@ -1281,7 +1286,7 @@
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < n2; ++j) output[j + i * n2] = (temp_out[j] + 1) >> 2;
+    for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j] >> 2;
   }
   // Note: overall scale factor of transform is 8 times unitary
 }
@@ -1332,8 +1337,7 @@
   for (i = 0; i < n2; ++i) {
     for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
-      output[j + i * n] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+    for (j = 0; j < n; ++j) output[j + i * n] = temp_out[j] >> 2;
   }
   // Note: overall scale factor of transform is 8 times unitary
 }
@@ -1384,8 +1388,7 @@
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < n2; ++j)
-      output[j + i * n2] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+    for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j] >> 2;
   }
   // Note: overall scale factor of transform is 8 times unitary
 }
@@ -1435,9 +1438,7 @@
   for (i = 0; i < n2; ++i) {
     for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j)
-      output[j + i * n] =
-          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+    for (j = 0; j < n; ++j) output[j + i * n] = temp_out[j] >> 2;
   }
   // Note: overall scale factor of transform is 4 times unitary
 }
@@ -1487,9 +1488,7 @@
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < n2; ++j)
-      output[j + i * n2] =
-          (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+    for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j] >> 2;
   }
   // Note: overall scale factor of transform is 4 times unitary
 }
diff --git a/av1/encoder/x86/dct_intrin_sse2.c b/av1/encoder/x86/dct_intrin_sse2.c
index 8d602f1..1f3b669 100644
--- a/av1/encoder/x86/dct_intrin_sse2.c
+++ b/av1/encoder/x86/dct_intrin_sse2.c
@@ -1123,14 +1123,6 @@
   w15 = _mm_sub_epi32(u7, u15);
 
   // shift and rounding
-  v0 = _mm_add_epi32(w0, k__DCT_CONST_ROUNDING);
-  v1 = _mm_add_epi32(w1, k__DCT_CONST_ROUNDING);
-  v2 = _mm_add_epi32(w2, k__DCT_CONST_ROUNDING);
-  v3 = _mm_add_epi32(w3, k__DCT_CONST_ROUNDING);
-  v4 = _mm_add_epi32(w4, k__DCT_CONST_ROUNDING);
-  v5 = _mm_add_epi32(w5, k__DCT_CONST_ROUNDING);
-  v6 = _mm_add_epi32(w6, k__DCT_CONST_ROUNDING);
-  v7 = _mm_add_epi32(w7, k__DCT_CONST_ROUNDING);
   v8 = _mm_add_epi32(w8, k__DCT_CONST_ROUNDING);
   v9 = _mm_add_epi32(w9, k__DCT_CONST_ROUNDING);
   v10 = _mm_add_epi32(w10, k__DCT_CONST_ROUNDING);
@@ -1140,14 +1132,6 @@
   v14 = _mm_add_epi32(w14, k__DCT_CONST_ROUNDING);
   v15 = _mm_add_epi32(w15, k__DCT_CONST_ROUNDING);
 
-  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
-  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
-  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
-  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
-  u4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
-  u5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
-  u6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
-  u7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
   u8 = _mm_srai_epi32(v8, DCT_CONST_BITS);
   u9 = _mm_srai_epi32(v9, DCT_CONST_BITS);
   u10 = _mm_srai_epi32(v10, DCT_CONST_BITS);
@@ -1158,20 +1142,44 @@
   u15 = _mm_srai_epi32(v15, DCT_CONST_BITS);
 
   // back to 16-bit and pack 8 integers into __m128i
-  in[0] = _mm_packs_epi32(u0, u1);
-  in[1] = _mm_packs_epi32(u2, u3);
-  in[2] = _mm_packs_epi32(u4, u5);
-  in[3] = _mm_packs_epi32(u6, u7);
+  v0 = _mm_add_epi32(w0, w4);
+  v1 = _mm_add_epi32(w1, w5);
+  v2 = _mm_add_epi32(w2, w6);
+  v3 = _mm_add_epi32(w3, w7);
+  v4 = _mm_sub_epi32(w0, w4);
+  v5 = _mm_sub_epi32(w1, w5);
+  v6 = _mm_sub_epi32(w2, w6);
+  v7 = _mm_sub_epi32(w3, w7);
+
+  w0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  w1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  w2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  w3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  w4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  w5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  w6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  w7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(w0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(w1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(w2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(w3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(w4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(w5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(w6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(w7, DCT_CONST_BITS);
+
   in[4] = _mm_packs_epi32(u8, u9);
   in[5] = _mm_packs_epi32(u10, u11);
   in[6] = _mm_packs_epi32(u12, u13);
   in[7] = _mm_packs_epi32(u14, u15);
 
   // stage 2
-  s0 = _mm_add_epi16(in[0], in[2]);
-  s1 = _mm_add_epi16(in[1], in[3]);
-  s2 = _mm_sub_epi16(in[0], in[2]);
-  s3 = _mm_sub_epi16(in[1], in[3]);
+  s0 = _mm_packs_epi32(v0, v1);
+  s1 = _mm_packs_epi32(v2, v3);
+  s2 = _mm_packs_epi32(v4, v5);
+  s3 = _mm_packs_epi32(v6, v7);
+
   u0 = _mm_unpacklo_epi16(in[4], in[5]);
   u1 = _mm_unpackhi_epi16(in[4], in[5]);
   u2 = _mm_unpacklo_epi16(in[6], in[7]);
@@ -1914,22 +1922,6 @@
   u[30] = _mm_sub_epi32(v[14], v[30]);
   u[31] = _mm_sub_epi32(v[15], v[31]);
 
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
-  v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
-  v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
-  v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
-  v[11] = _mm_add_epi32(u[11], k__DCT_CONST_ROUNDING);
-  v[12] = _mm_add_epi32(u[12], k__DCT_CONST_ROUNDING);
-  v[13] = _mm_add_epi32(u[13], k__DCT_CONST_ROUNDING);
-  v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
-  v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
   v[16] = _mm_add_epi32(u[16], k__DCT_CONST_ROUNDING);
   v[17] = _mm_add_epi32(u[17], k__DCT_CONST_ROUNDING);
   v[18] = _mm_add_epi32(u[18], k__DCT_CONST_ROUNDING);
@@ -1947,22 +1939,6 @@
   v[30] = _mm_add_epi32(u[30], k__DCT_CONST_ROUNDING);
   v[31] = _mm_add_epi32(u[31], k__DCT_CONST_ROUNDING);
 
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
-  u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
-  u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
-  u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
-  u[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
-  u[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
-  u[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
-  u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
-  u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
   u[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
   u[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
   u[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
@@ -1980,14 +1956,77 @@
   u[30] = _mm_srai_epi32(v[30], DCT_CONST_BITS);
   u[31] = _mm_srai_epi32(v[31], DCT_CONST_BITS);
 
-  s[0] = _mm_packs_epi32(u[0], u[1]);
-  s[1] = _mm_packs_epi32(u[2], u[3]);
-  s[2] = _mm_packs_epi32(u[4], u[5]);
-  s[3] = _mm_packs_epi32(u[6], u[7]);
-  s[4] = _mm_packs_epi32(u[8], u[9]);
-  s[5] = _mm_packs_epi32(u[10], u[11]);
-  s[6] = _mm_packs_epi32(u[12], u[13]);
-  s[7] = _mm_packs_epi32(u[14], u[15]);
+  v[0] = _mm_add_epi32(u[0], u[8]);
+  v[1] = _mm_add_epi32(u[1], u[9]);
+  v[2] = _mm_add_epi32(u[2], u[10]);
+  v[3] = _mm_add_epi32(u[3], u[11]);
+  v[4] = _mm_add_epi32(u[4], u[12]);
+  v[5] = _mm_add_epi32(u[5], u[13]);
+  v[6] = _mm_add_epi32(u[6], u[14]);
+  v[7] = _mm_add_epi32(u[7], u[15]);
+
+  v[16] = _mm_add_epi32(v[0], v[4]);
+  v[17] = _mm_add_epi32(v[1], v[5]);
+  v[18] = _mm_add_epi32(v[2], v[6]);
+  v[19] = _mm_add_epi32(v[3], v[7]);
+  v[20] = _mm_sub_epi32(v[0], v[4]);
+  v[21] = _mm_sub_epi32(v[1], v[5]);
+  v[22] = _mm_sub_epi32(v[2], v[6]);
+  v[23] = _mm_sub_epi32(v[3], v[7]);
+  v[16] = _mm_add_epi32(v[16], k__DCT_CONST_ROUNDING);
+  v[17] = _mm_add_epi32(v[17], k__DCT_CONST_ROUNDING);
+  v[18] = _mm_add_epi32(v[18], k__DCT_CONST_ROUNDING);
+  v[19] = _mm_add_epi32(v[19], k__DCT_CONST_ROUNDING);
+  v[20] = _mm_add_epi32(v[20], k__DCT_CONST_ROUNDING);
+  v[21] = _mm_add_epi32(v[21], k__DCT_CONST_ROUNDING);
+  v[22] = _mm_add_epi32(v[22], k__DCT_CONST_ROUNDING);
+  v[23] = _mm_add_epi32(v[23], k__DCT_CONST_ROUNDING);
+  v[16] = _mm_srai_epi32(v[16], DCT_CONST_BITS);
+  v[17] = _mm_srai_epi32(v[17], DCT_CONST_BITS);
+  v[18] = _mm_srai_epi32(v[18], DCT_CONST_BITS);
+  v[19] = _mm_srai_epi32(v[19], DCT_CONST_BITS);
+  v[20] = _mm_srai_epi32(v[20], DCT_CONST_BITS);
+  v[21] = _mm_srai_epi32(v[21], DCT_CONST_BITS);
+  v[22] = _mm_srai_epi32(v[22], DCT_CONST_BITS);
+  v[23] = _mm_srai_epi32(v[23], DCT_CONST_BITS);
+  s[0] = _mm_packs_epi32(v[16], v[17]);
+  s[1] = _mm_packs_epi32(v[18], v[19]);
+  s[2] = _mm_packs_epi32(v[20], v[21]);
+  s[3] = _mm_packs_epi32(v[22], v[23]);
+
+  v[8] = _mm_sub_epi32(u[0], u[8]);
+  v[9] = _mm_sub_epi32(u[1], u[9]);
+  v[10] = _mm_sub_epi32(u[2], u[10]);
+  v[11] = _mm_sub_epi32(u[3], u[11]);
+  v[12] = _mm_sub_epi32(u[4], u[12]);
+  v[13] = _mm_sub_epi32(u[5], u[13]);
+  v[14] = _mm_sub_epi32(u[6], u[14]);
+  v[15] = _mm_sub_epi32(u[7], u[15]);
+
+  v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+
+  v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+
+  s[4] = _mm_packs_epi32(v[8], v[9]);
+  s[5] = _mm_packs_epi32(v[10], v[11]);
+  s[6] = _mm_packs_epi32(v[12], v[13]);
+  s[7] = _mm_packs_epi32(v[14], v[15]);
+  //
+
   s[8] = _mm_packs_epi32(u[16], u[17]);
   s[9] = _mm_packs_epi32(u[18], u[19]);
   s[10] = _mm_packs_epi32(u[20], u[21]);
@@ -2041,14 +2080,6 @@
   u[14] = _mm_sub_epi32(v[6], v[14]);
   u[15] = _mm_sub_epi32(v[7], v[15]);
 
-  v[0] = _mm_add_epi32(u[0], k__DCT_CONST_ROUNDING);
-  v[1] = _mm_add_epi32(u[1], k__DCT_CONST_ROUNDING);
-  v[2] = _mm_add_epi32(u[2], k__DCT_CONST_ROUNDING);
-  v[3] = _mm_add_epi32(u[3], k__DCT_CONST_ROUNDING);
-  v[4] = _mm_add_epi32(u[4], k__DCT_CONST_ROUNDING);
-  v[5] = _mm_add_epi32(u[5], k__DCT_CONST_ROUNDING);
-  v[6] = _mm_add_epi32(u[6], k__DCT_CONST_ROUNDING);
-  v[7] = _mm_add_epi32(u[7], k__DCT_CONST_ROUNDING);
   v[8] = _mm_add_epi32(u[8], k__DCT_CONST_ROUNDING);
   v[9] = _mm_add_epi32(u[9], k__DCT_CONST_ROUNDING);
   v[10] = _mm_add_epi32(u[10], k__DCT_CONST_ROUNDING);
@@ -2058,14 +2089,6 @@
   v[14] = _mm_add_epi32(u[14], k__DCT_CONST_ROUNDING);
   v[15] = _mm_add_epi32(u[15], k__DCT_CONST_ROUNDING);
 
-  u[0] = _mm_srai_epi32(v[0], DCT_CONST_BITS);
-  u[1] = _mm_srai_epi32(v[1], DCT_CONST_BITS);
-  u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
-  u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
-  u[4] = _mm_srai_epi32(v[4], DCT_CONST_BITS);
-  u[5] = _mm_srai_epi32(v[5], DCT_CONST_BITS);
-  u[6] = _mm_srai_epi32(v[6], DCT_CONST_BITS);
-  u[7] = _mm_srai_epi32(v[7], DCT_CONST_BITS);
   u[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
   u[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
   u[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
@@ -2075,28 +2098,46 @@
   u[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
   u[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
 
-  x[0] = _mm_add_epi16(s[0], s[4]);
-  x[1] = _mm_add_epi16(s[1], s[5]);
-  x[2] = _mm_add_epi16(s[2], s[6]);
-  x[3] = _mm_add_epi16(s[3], s[7]);
-  x[4] = _mm_sub_epi16(s[0], s[4]);
-  x[5] = _mm_sub_epi16(s[1], s[5]);
-  x[6] = _mm_sub_epi16(s[2], s[6]);
-  x[7] = _mm_sub_epi16(s[3], s[7]);
-  x[8] = _mm_packs_epi32(u[0], u[1]);
-  x[9] = _mm_packs_epi32(u[2], u[3]);
-  x[10] = _mm_packs_epi32(u[4], u[5]);
-  x[11] = _mm_packs_epi32(u[6], u[7]);
+  v[8] = _mm_add_epi32(u[0], u[4]);
+  v[9] = _mm_add_epi32(u[1], u[5]);
+  v[10] = _mm_add_epi32(u[2], u[6]);
+  v[11] = _mm_add_epi32(u[3], u[7]);
+  v[12] = _mm_sub_epi32(u[0], u[4]);
+  v[13] = _mm_sub_epi32(u[1], u[5]);
+  v[14] = _mm_sub_epi32(u[2], u[6]);
+  v[15] = _mm_sub_epi32(u[3], u[7]);
+
+  v[8] = _mm_add_epi32(v[8], k__DCT_CONST_ROUNDING);
+  v[9] = _mm_add_epi32(v[9], k__DCT_CONST_ROUNDING);
+  v[10] = _mm_add_epi32(v[10], k__DCT_CONST_ROUNDING);
+  v[11] = _mm_add_epi32(v[11], k__DCT_CONST_ROUNDING);
+  v[12] = _mm_add_epi32(v[12], k__DCT_CONST_ROUNDING);
+  v[13] = _mm_add_epi32(v[13], k__DCT_CONST_ROUNDING);
+  v[14] = _mm_add_epi32(v[14], k__DCT_CONST_ROUNDING);
+  v[15] = _mm_add_epi32(v[15], k__DCT_CONST_ROUNDING);
+  v[8] = _mm_srai_epi32(v[8], DCT_CONST_BITS);
+  v[9] = _mm_srai_epi32(v[9], DCT_CONST_BITS);
+  v[10] = _mm_srai_epi32(v[10], DCT_CONST_BITS);
+  v[11] = _mm_srai_epi32(v[11], DCT_CONST_BITS);
+  v[12] = _mm_srai_epi32(v[12], DCT_CONST_BITS);
+  v[13] = _mm_srai_epi32(v[13], DCT_CONST_BITS);
+  v[14] = _mm_srai_epi32(v[14], DCT_CONST_BITS);
+  v[15] = _mm_srai_epi32(v[15], DCT_CONST_BITS);
+  s[8] = _mm_packs_epi32(v[8], v[9]);
+  s[9] = _mm_packs_epi32(v[10], v[11]);
+  s[10] = _mm_packs_epi32(v[12], v[13]);
+  s[11] = _mm_packs_epi32(v[14], v[15]);
+
   x[12] = _mm_packs_epi32(u[8], u[9]);
   x[13] = _mm_packs_epi32(u[10], u[11]);
   x[14] = _mm_packs_epi32(u[12], u[13]);
   x[15] = _mm_packs_epi32(u[14], u[15]);
 
   // stage 3
-  u[0] = _mm_unpacklo_epi16(x[4], x[5]);
-  u[1] = _mm_unpackhi_epi16(x[4], x[5]);
-  u[2] = _mm_unpacklo_epi16(x[6], x[7]);
-  u[3] = _mm_unpackhi_epi16(x[6], x[7]);
+  u[0] = _mm_unpacklo_epi16(s[4], s[5]);
+  u[1] = _mm_unpackhi_epi16(s[4], s[5]);
+  u[2] = _mm_unpacklo_epi16(s[6], s[7]);
+  u[3] = _mm_unpackhi_epi16(s[6], s[7]);
   u[4] = _mm_unpacklo_epi16(x[12], x[13]);
   u[5] = _mm_unpackhi_epi16(x[12], x[13]);
   u[6] = _mm_unpacklo_epi16(x[14], x[15]);
@@ -2170,18 +2211,11 @@
   v[14] = _mm_srai_epi32(u[14], DCT_CONST_BITS);
   v[15] = _mm_srai_epi32(u[15], DCT_CONST_BITS);
 
-  s[0] = _mm_add_epi16(x[0], x[2]);
-  s[1] = _mm_add_epi16(x[1], x[3]);
-  s[2] = _mm_sub_epi16(x[0], x[2]);
-  s[3] = _mm_sub_epi16(x[1], x[3]);
   s[4] = _mm_packs_epi32(v[0], v[1]);
   s[5] = _mm_packs_epi32(v[2], v[3]);
   s[6] = _mm_packs_epi32(v[4], v[5]);
   s[7] = _mm_packs_epi32(v[6], v[7]);
-  s[8] = _mm_add_epi16(x[8], x[10]);
-  s[9] = _mm_add_epi16(x[9], x[11]);
-  s[10] = _mm_sub_epi16(x[8], x[10]);
-  s[11] = _mm_sub_epi16(x[9], x[11]);
+
   s[12] = _mm_packs_epi32(v[8], v[9]);
   s[13] = _mm_packs_epi32(v[10], v[11]);
   s[14] = _mm_packs_epi32(v[12], v[13]);
@@ -2740,26 +2774,20 @@
 }
 
 static INLINE void write_buffer_4x8(tran_low_t *output, __m128i *res) {
-  const __m128i kOne = _mm_set1_epi16(1);
   __m128i in01 = _mm_unpacklo_epi64(res[0], res[1]);
   __m128i in23 = _mm_unpacklo_epi64(res[2], res[3]);
   __m128i in45 = _mm_unpacklo_epi64(res[4], res[5]);
   __m128i in67 = _mm_unpacklo_epi64(res[6], res[7]);
 
-  __m128i out01 = _mm_add_epi16(in01, kOne);
-  __m128i out23 = _mm_add_epi16(in23, kOne);
-  __m128i out45 = _mm_add_epi16(in45, kOne);
-  __m128i out67 = _mm_add_epi16(in67, kOne);
+  in01 = _mm_srai_epi16(in01, 2);
+  in23 = _mm_srai_epi16(in23, 2);
+  in45 = _mm_srai_epi16(in45, 2);
+  in67 = _mm_srai_epi16(in67, 2);
 
-  out01 = _mm_srai_epi16(out01, 2);
-  out23 = _mm_srai_epi16(out23, 2);
-  out45 = _mm_srai_epi16(out45, 2);
-  out67 = _mm_srai_epi16(out67, 2);
-
-  store_output(&out01, (output + 0 * 8));
-  store_output(&out23, (output + 1 * 8));
-  store_output(&out45, (output + 2 * 8));
-  store_output(&out67, (output + 3 * 8));
+  store_output(&in01, (output + 0 * 8));
+  store_output(&in23, (output + 1 * 8));
+  store_output(&in45, (output + 2 * 8));
+  store_output(&in67, (output + 3 * 8));
 }
 
 void av1_fht4x8_sse2(const int16_t *input, tran_low_t *output, int stride,
@@ -2975,16 +3003,10 @@
 }
 
 static INLINE void write_buffer_8x4(tran_low_t *output, __m128i *res) {
-  const __m128i kOne = _mm_set1_epi16(1);
-
-  __m128i out0 = _mm_add_epi16(res[0], kOne);
-  __m128i out1 = _mm_add_epi16(res[1], kOne);
-  __m128i out2 = _mm_add_epi16(res[2], kOne);
-  __m128i out3 = _mm_add_epi16(res[3], kOne);
-  out0 = _mm_srai_epi16(out0, 2);
-  out1 = _mm_srai_epi16(out1, 2);
-  out2 = _mm_srai_epi16(out2, 2);
-  out3 = _mm_srai_epi16(out3, 2);
+  const __m128i out0 = _mm_srai_epi16(res[0], 2);
+  const __m128i out1 = _mm_srai_epi16(res[1], 2);
+  const __m128i out2 = _mm_srai_epi16(res[2], 2);
+  const __m128i out3 = _mm_srai_epi16(res[3], 2);
 
   store_output(&out0, (output + 0 * 8));
   store_output(&out1, (output + 1 * 8));
@@ -3118,6 +3140,14 @@
   scale_sqrt2_8x8_signed(in + 8);
 }
 
+static INLINE void right_shift(__m128i *in, int size, int bit) {
+  int i = 0;
+  while (i < size) {
+    in[i] = _mm_srai_epi16(in[i], bit);
+    i += 1;
+  }
+}
+
 void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                       int tx_type) {
   __m128i in[16];
@@ -3258,8 +3288,8 @@
 #endif
     default: assert(0); break;
   }
-  right_shift_8x8(t, 2);
-  right_shift_8x8(b, 2);
+  right_shift(t, 8, 2);
+  right_shift(b, 8, 2);
   write_buffer_8x8(output, t, 8);
   write_buffer_8x8(output + 64, b, 8);
 }
@@ -3394,8 +3424,8 @@
   }
   array_transpose_8x8(l, l);
   array_transpose_8x8(r, r);
-  right_shift_8x8(l, 2);
-  right_shift_8x8(r, 2);
+  right_shift(l, 8, 2);
+  right_shift(r, 8, 2);
   write_buffer_8x8(output, l, 16);
   write_buffer_8x8(output + 8, r, 16);
 }
@@ -3492,19 +3522,14 @@
   scale_sqrt2_8x16(inbr);
 }
 
-static INLINE void right_shift_8x16(__m128i *res, const int bit) {
-  right_shift_8x8(res, bit);
-  right_shift_8x8(res + 8, bit);
-}
-
 static INLINE void write_buffer_16x32(tran_low_t *output, __m128i *restl,
                                       __m128i *restr, __m128i *resbl,
                                       __m128i *resbr) {
   int i;
-  right_shift_8x16(restl, 2);
-  right_shift_8x16(restr, 2);
-  right_shift_8x16(resbl, 2);
-  right_shift_8x16(resbr, 2);
+  right_shift(restl, 16, 2);
+  right_shift(restr, 16, 2);
+  right_shift(resbl, 16, 2);
+  right_shift(resbr, 16, 2);
   for (i = 0; i < 16; ++i) {
     store_output(&restl[i], output + i * 16 + 0);
     store_output(&restr[i], output + i * 16 + 8);
@@ -3663,10 +3688,10 @@
                                       __m128i *res1, __m128i *res2,
                                       __m128i *res3) {
   int i;
-  right_shift_8x16(res0, 2);
-  right_shift_8x16(res1, 2);
-  right_shift_8x16(res2, 2);
-  right_shift_8x16(res3, 2);
+  right_shift(res0, 16, 2);
+  right_shift(res1, 16, 2);
+  right_shift(res2, 16, 2);
+  right_shift(res3, 16, 2);
   for (i = 0; i < 16; ++i) {
     store_output(&res0[i], output + i * 32 + 0);
     store_output(&res1[i], output + i * 32 + 8);
diff --git a/test/av1_fht16x32_test.cc b/test/av1_fht16x32_test.cc
index 8ff96b3..d45fcde 100644
--- a/test/av1_fht16x32_test.cc
+++ b/test/av1_fht16x32_test.cc
@@ -69,10 +69,35 @@
   IhtFunc inv_txfm_;
 };
 
+TEST_P(AV1Trans16x32HT, AccuracyCheck) { RunAccuracyCheck(48); }
 TEST_P(AV1Trans16x32HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans16x32HT, MemCheck) { RunMemCheck(); }
 TEST_P(AV1Trans16x32HT, InvCoeffCheck) { RunInvCoeffCheck(); }
+TEST_P(AV1Trans16x32HT, InvAccuracyCheck) { RunInvAccuracyCheck(9); }
 
 using std::tr1::make_tuple;
+const Ht16x32Param kArrayHt16x32Param_c[] = {
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 0, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 1, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 2, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 3, AOM_BITS_8, 512),
+#if CONFIG_EXT_TX
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 4, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 5, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 6, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 7, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 8, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 9, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 10, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 11, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 12, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 13, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 14, AOM_BITS_8, 512),
+  make_tuple(&av1_fht16x32_c, &av1_iht16x32_512_add_c, 15, AOM_BITS_8, 512)
+#endif  // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(C, AV1Trans16x32HT,
+                        ::testing::ValuesIn(kArrayHt16x32Param_c));
 
 #if HAVE_SSE2
 const Ht16x32Param kArrayHt16x32Param_sse2[] = {
diff --git a/test/av1_fht16x8_test.cc b/test/av1_fht16x8_test.cc
index c2878c4..a70356a 100644
--- a/test/av1_fht16x8_test.cc
+++ b/test/av1_fht16x8_test.cc
@@ -69,8 +69,11 @@
   IhtFunc inv_txfm_;
 };
 
+TEST_P(AV1Trans16x8HT, AccuracyCheck) { RunAccuracyCheck(1); }
 TEST_P(AV1Trans16x8HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans16x8HT, MemCheck) { RunMemCheck(); }
 TEST_P(AV1Trans16x8HT, InvCoeffCheck) { RunInvCoeffCheck(); }
+TEST_P(AV1Trans16x8HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
 
 using std::tr1::make_tuple;
 
diff --git a/test/av1_fht32x16_test.cc b/test/av1_fht32x16_test.cc
index 41c0b1c..2470b83 100644
--- a/test/av1_fht32x16_test.cc
+++ b/test/av1_fht32x16_test.cc
@@ -69,10 +69,35 @@
   IhtFunc inv_txfm_;
 };
 
+TEST_P(AV1Trans32x16HT, MemCheck) { RunMemCheck(); }
+TEST_P(AV1Trans32x16HT, AccuracyCheck) { RunAccuracyCheck(43); }
 TEST_P(AV1Trans32x16HT, CoeffCheck) { RunCoeffCheck(); }
 TEST_P(AV1Trans32x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
+TEST_P(AV1Trans32x16HT, InvAccuracyCheck) { RunInvAccuracyCheck(9); }
 
 using std::tr1::make_tuple;
+const Ht32x16Param kArrayHt32x16Param_c[] = {
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 0, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 1, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 2, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 3, AOM_BITS_8, 512),
+#if CONFIG_EXT_TX
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 4, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 5, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 6, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 7, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 8, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 9, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 10, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 11, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 12, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 13, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 14, AOM_BITS_8, 512),
+  make_tuple(&av1_fht32x16_c, &av1_iht32x16_512_add_c, 15, AOM_BITS_8, 512)
+#endif  // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(C, AV1Trans32x16HT,
+                        ::testing::ValuesIn(kArrayHt32x16Param_c));
 
 #if HAVE_SSE2
 const Ht32x16Param kArrayHt32x16Param_sse2[] = {
diff --git a/test/av1_fht4x8_test.cc b/test/av1_fht4x8_test.cc
index 4a5f3ff..d01c133 100644
--- a/test/av1_fht4x8_test.cc
+++ b/test/av1_fht4x8_test.cc
@@ -69,8 +69,11 @@
   IhtFunc inv_txfm_;
 };
 
+TEST_P(AV1Trans4x8HT, AccuracyCheck) { RunAccuracyCheck(0); }
 TEST_P(AV1Trans4x8HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans4x8HT, MemCheck) { RunMemCheck(); }
 TEST_P(AV1Trans4x8HT, InvCoeffCheck) { RunInvCoeffCheck(); }
+TEST_P(AV1Trans4x8HT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
 
 using std::tr1::make_tuple;
 
diff --git a/test/av1_fht8x16_test.cc b/test/av1_fht8x16_test.cc
index 42907fd..e3e1819 100644
--- a/test/av1_fht8x16_test.cc
+++ b/test/av1_fht8x16_test.cc
@@ -69,8 +69,11 @@
   IhtFunc inv_txfm_;
 };
 
+TEST_P(AV1Trans8x16HT, MemCheck) { RunMemCheck(); }
+TEST_P(AV1Trans8x16HT, AccuracyCheck) { RunAccuracyCheck(1); }
 TEST_P(AV1Trans8x16HT, CoeffCheck) { RunCoeffCheck(); }
 TEST_P(AV1Trans8x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
+TEST_P(AV1Trans8x16HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
 
 using std::tr1::make_tuple;
 
diff --git a/test/av1_fht8x4_test.cc b/test/av1_fht8x4_test.cc
index 46e8c62..fe1ec5e 100644
--- a/test/av1_fht8x4_test.cc
+++ b/test/av1_fht8x4_test.cc
@@ -69,8 +69,11 @@
   IhtFunc inv_txfm_;
 };
 
+TEST_P(AV1Trans8x4HT, AccuracyCheck) { RunAccuracyCheck(0); }
 TEST_P(AV1Trans8x4HT, CoeffCheck) { RunCoeffCheck(); }
+TEST_P(AV1Trans8x4HT, MemCheck) { RunMemCheck(); }
 TEST_P(AV1Trans8x4HT, InvCoeffCheck) { RunInvCoeffCheck(); }
+TEST_P(AV1Trans8x4HT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
 
 using std::tr1::make_tuple;
 
diff --git a/test/transform_test_base.h b/test/transform_test_base.h
index 64bf2d6..5f35f57 100644
--- a/test/transform_test_base.h
+++ b/test/transform_test_base.h
@@ -103,10 +103,10 @@
     }
 
     EXPECT_GE(static_cast<uint32_t>(limit), max_error)
-        << "Error: 4x4 FHT/IHT has an individual round trip error > " << limit;
+        << "Error: FHT/IHT has an individual round trip error > " << limit;
 
     EXPECT_GE(count_test_block * limit, total_error)
-        << "Error: 4x4 FHT/IHT has average round trip error > " << limit
+        << "Error: FHT/IHT has average round trip error > " << limit
         << " per block";
 
     aom_free(test_input_block);
@@ -249,7 +249,9 @@
       int row_length = FindRowLength();
       // The minimum quant value is 4.
       for (int j = 0; j < num_coeffs_; ++j) {
-        EXPECT_EQ(output_block[j], output_ref_block[j]);
+        EXPECT_EQ(output_block[j], output_ref_block[j])
+            << "Not bit-exact at test index: " << i << ", "
+            << "j = " << j << std::endl;
         EXPECT_GE(row_length * kDctMaxValue << (bit_depth_ - 8),
                   abs(output_block[j]))
             << "Error: NxN FDCT has coefficient larger than N*DCT_MAX_VALUE";