Improve rectangular transform accuracy

By adjusting the internal scaling and rounding in the rectangular
transforms (sketched below), the maximum round-trip errors are reduced to:
* 8x16 and 16x8: 0 pixel values (i.e., the transforms are exact)
* 16x32: 1 pixel value
* 32x16: 2 pixel values
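
The accuracy gain comes from two changes, both visible in the diff
below. First, the final ">> 2" no longer rounds toward negative
infinity (which biases negative coefficients downward): a symmetric
offset of +1 for non-negative and +2 for negative values is added and
the sum is saturated before the shift, mirrored in SSE2 by switching
_mm_add_epi16 to the saturating _mm_adds_epi16. A minimal sketch of
the rounding step, assuming the tree's tran_low_t, saturate_int16()
and INLINE definitions (the helper name is illustrative only):

    /* Add a symmetric rounding offset, saturate to int16 range, then
     * divide by 4 with an arithmetic right shift; this is the
     * expression now used in the rectangular fht output loops. */
    static INLINE tran_low_t round_shift_div4(tran_low_t x) {
      return saturate_int16(x + 1 + (x < 0)) >> 2;
    }

Second, the 16x32 and 32x16 paths scale the input up by 4 (the extra
"* 4" on Sqrt2 in C, _mm_slli_epi16(..., 2) in the SSE2 loads) and
shift back down by 2 between the two 1-D passes, keeping two extra
bits of intermediate precision through the first pass.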

Change-Id: I0ba691a8d27042dcf1dd5ae81568d07a92d68781
diff --git a/av1/encoder/dct.c b/av1/encoder/dct.c
index 03a0c92..168ffd2 100644
--- a/av1/encoder/dct.c
+++ b/av1/encoder/dct.c
@@ -1337,7 +1337,9 @@
   for (i = 0; i < n2; ++i) {
     for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j) output[j + i * n] = temp_out[j] >> 2;
+    for (j = 0; j < n; ++j)
+      output[j + i * n] =
+          saturate_int16(temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   }
   // Note: overall scale factor of transform is 8 times unitary
 }
@@ -1388,7 +1390,9 @@
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j] >> 2;
+    for (j = 0; j < n2; ++j)
+      output[j + i * n2] =
+          saturate_int16(temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   }
   // Note: overall scale factor of transform is 8 times unitary
 }
@@ -1429,16 +1433,20 @@
   // Columns
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n2; ++j)
-      temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * Sqrt2);
+      temp_in[j] =
+          (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
     ht.cols(temp_in, temp_out);
-    for (j = 0; j < n2; ++j) out[j * n + i] = temp_out[j];
+    for (j = 0; j < n2; ++j)
+      out[j * n + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   }
 
   // Rows
   for (i = 0; i < n2; ++i) {
     for (j = 0; j < n; ++j) temp_in[j] = out[j + i * n];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < n; ++j) output[j + i * n] = temp_out[j] >> 2;
+    for (j = 0; j < n; ++j)
+      output[j + i * n] =
+          saturate_int16(temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   }
   // Note: overall scale factor of transform is 4 times unitary
 }
@@ -1479,16 +1487,20 @@
   // Columns
   for (i = 0; i < n2; ++i) {
     for (j = 0; j < n; ++j)
-      temp_in[j] = (tran_low_t)fdct_round_shift(input[j * stride + i] * Sqrt2);
+      temp_in[j] =
+          (tran_low_t)fdct_round_shift(input[j * stride + i] * 4 * Sqrt2);
     ht.cols(temp_in, temp_out);
-    for (j = 0; j < n; ++j) out[j * n2 + i] = temp_out[j];
+    for (j = 0; j < n; ++j)
+      out[j * n2 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   }
 
   // Rows
   for (i = 0; i < n; ++i) {
     for (j = 0; j < n2; ++j) temp_in[j] = out[j + i * n2];
     ht.rows(temp_in, temp_out);
-    for (j = 0; j < n2; ++j) output[j + i * n2] = temp_out[j] >> 2;
+    for (j = 0; j < n2; ++j)
+      output[j + i * n2] =
+          saturate_int16(temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
   }
   // Note: overall scale factor of transform is 4 times unitary
 }
diff --git a/av1/encoder/x86/dct_intrin_sse2.c b/av1/encoder/x86/dct_intrin_sse2.c
index 1f3b669..727ff19 100644
--- a/av1/encoder/x86/dct_intrin_sse2.c
+++ b/av1/encoder/x86/dct_intrin_sse2.c
@@ -796,14 +796,14 @@
 
   if (bit == 2) {
     const __m128i const_rounding = _mm_set1_epi16(1);
-    res[0] = _mm_add_epi16(res[0], const_rounding);
-    res[1] = _mm_add_epi16(res[1], const_rounding);
-    res[2] = _mm_add_epi16(res[2], const_rounding);
-    res[3] = _mm_add_epi16(res[3], const_rounding);
-    res[4] = _mm_add_epi16(res[4], const_rounding);
-    res[5] = _mm_add_epi16(res[5], const_rounding);
-    res[6] = _mm_add_epi16(res[6], const_rounding);
-    res[7] = _mm_add_epi16(res[7], const_rounding);
+    res[0] = _mm_adds_epi16(res[0], const_rounding);
+    res[1] = _mm_adds_epi16(res[1], const_rounding);
+    res[2] = _mm_adds_epi16(res[2], const_rounding);
+    res[3] = _mm_adds_epi16(res[3], const_rounding);
+    res[4] = _mm_adds_epi16(res[4], const_rounding);
+    res[5] = _mm_adds_epi16(res[5], const_rounding);
+    res[6] = _mm_adds_epi16(res[6], const_rounding);
+    res[7] = _mm_adds_epi16(res[7], const_rounding);
   }
 
   res[0] = _mm_sub_epi16(res[0], sign0);
@@ -3140,14 +3140,6 @@
   scale_sqrt2_8x8_signed(in + 8);
 }
 
-static INLINE void right_shift(__m128i *in, int size, int bit) {
-  int i = 0;
-  while (i < size) {
-    in[i] = _mm_srai_epi16(in[i], bit);
-    i += 1;
-  }
-}
-
 void av1_fht8x16_sse2(const int16_t *input, tran_low_t *output, int stride,
                       int tx_type) {
   __m128i in[16];
@@ -3288,8 +3280,8 @@
 #endif
     default: assert(0); break;
   }
-  right_shift(t, 8, 2);
-  right_shift(b, 8, 2);
+  right_shift_8x8(t, 2);
+  right_shift_8x8(b, 2);
   write_buffer_8x8(output, t, 8);
   write_buffer_8x8(output + 64, b, 8);
 }
@@ -3424,8 +3416,8 @@
   }
   array_transpose_8x8(l, l);
   array_transpose_8x8(r, r);
-  right_shift(l, 8, 2);
-  right_shift(r, 8, 2);
+  right_shift_8x8(l, 2);
+  right_shift_8x8(r, 2);
   write_buffer_8x8(output, l, 16);
   write_buffer_8x8(output + 8, r, 16);
 }
@@ -3496,12 +3488,14 @@
   }
 
   for (i = 0; i < 16; ++i) {
-    intl[i + 0] = _mm_load_si128((const __m128i *)(input + i * stride + 0));
-    intr[i + 0] = _mm_load_si128((const __m128i *)(input + i * stride + 8));
-    inbl[i + 0] =
-        _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 0));
-    inbr[i + 0] =
-        _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 8));
+    intl[i] = _mm_slli_epi16(
+        _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
+    intr[i] = _mm_slli_epi16(
+        _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
+    inbl[i] = _mm_slli_epi16(
+        _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 0)), 2);
+    inbr[i] = _mm_slli_epi16(
+        _mm_load_si128((const __m128i *)(input + (i + 16) * stride + 8)), 2);
   }
 
   if (fliplr) {
@@ -3526,10 +3520,8 @@
                                       __m128i *restr, __m128i *resbl,
                                       __m128i *resbr) {
   int i;
-  right_shift(restl, 16, 2);
-  right_shift(restr, 16, 2);
-  right_shift(resbl, 16, 2);
-  right_shift(resbr, 16, 2);
+  right_shift_16x16(restl, restr);
+  right_shift_16x16(resbl, resbr);
   for (i = 0; i < 16; ++i) {
     store_output(&restl[i], output + i * 16 + 0);
     store_output(&restr[i], output + i * 16 + 8);
@@ -3551,24 +3543,32 @@
     case DCT_DCT:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
       fdct32_16col(intl, intr, inbl, inbr);
+      right_shift_16x16(intl, intr);
+      right_shift_16x16(inbl, inbr);
       fdct16_sse2(intl, intr);
       fdct16_sse2(inbl, inbr);
       break;
     case ADST_DCT:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
       fhalfright32_16col(intl, intr, inbl, inbr);
+      right_shift_16x16(intl, intr);
+      right_shift_16x16(inbl, inbr);
       fdct16_sse2(intl, intr);
       fdct16_sse2(inbl, inbr);
       break;
     case DCT_ADST:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
       fdct32_16col(intl, intr, inbl, inbr);
+      right_shift_16x16(intl, intr);
+      right_shift_16x16(inbl, inbr);
       fadst16_sse2(intl, intr);
       fadst16_sse2(inbl, inbr);
       break;
     case ADST_ADST:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
       fhalfright32_16col(intl, intr, inbl, inbr);
+      right_shift_16x16(intl, intr);
+      right_shift_16x16(inbl, inbr);
       fadst16_sse2(intl, intr);
       fadst16_sse2(inbl, inbr);
       break;
@@ -3576,72 +3576,96 @@
     case FLIPADST_DCT:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
       fhalfright32_16col(intl, intr, inbl, inbr);
+      right_shift_16x16(intl, intr);
+      right_shift_16x16(inbl, inbr);
       fdct16_sse2(intl, intr);
       fdct16_sse2(inbl, inbr);
       break;
     case DCT_FLIPADST:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
       fdct32_16col(intl, intr, inbl, inbr);
+      right_shift_16x16(intl, intr);
+      right_shift_16x16(inbl, inbr);
       fadst16_sse2(intl, intr);
       fadst16_sse2(inbl, inbr);
       break;
     case FLIPADST_FLIPADST:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 1);
       fhalfright32_16col(intl, intr, inbl, inbr);
+      right_shift_16x16(intl, intr);
+      right_shift_16x16(inbl, inbr);
       fadst16_sse2(intl, intr);
       fadst16_sse2(inbl, inbr);
       break;
     case ADST_FLIPADST:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
       fhalfright32_16col(intl, intr, inbl, inbr);
+      right_shift_16x16(intl, intr);
+      right_shift_16x16(inbl, inbr);
       fadst16_sse2(intl, intr);
       fadst16_sse2(inbl, inbr);
       break;
     case FLIPADST_ADST:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
       fhalfright32_16col(intl, intr, inbl, inbr);
+      right_shift_16x16(intl, intr);
+      right_shift_16x16(inbl, inbr);
       fadst16_sse2(intl, intr);
       fadst16_sse2(inbl, inbr);
       break;
     case IDTX:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
       fidtx32_16col(intl, intr, inbl, inbr);
+      right_shift_16x16(intl, intr);
+      right_shift_16x16(inbl, inbr);
       fidtx16_sse2(intl, intr);
       fidtx16_sse2(inbl, inbr);
       break;
     case V_DCT:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
       fdct32_16col(intl, intr, inbl, inbr);
+      right_shift_16x16(intl, intr);
+      right_shift_16x16(inbl, inbr);
       fidtx16_sse2(intl, intr);
       fidtx16_sse2(inbl, inbr);
       break;
     case H_DCT:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
       fidtx32_16col(intl, intr, inbl, inbr);
+      right_shift_16x16(intl, intr);
+      right_shift_16x16(inbl, inbr);
       fdct16_sse2(intl, intr);
       fdct16_sse2(inbl, inbr);
       break;
     case V_ADST:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
       fhalfright32_16col(intl, intr, inbl, inbr);
+      right_shift_16x16(intl, intr);
+      right_shift_16x16(inbl, inbr);
       fidtx16_sse2(intl, intr);
       fidtx16_sse2(inbl, inbr);
       break;
     case H_ADST:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 0);
       fidtx32_16col(intl, intr, inbl, inbr);
+      right_shift_16x16(intl, intr);
+      right_shift_16x16(inbl, inbr);
       fadst16_sse2(intl, intr);
       fadst16_sse2(inbl, inbr);
       break;
     case V_FLIPADST:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 1, 0);
       fhalfright32_16col(intl, intr, inbl, inbr);
+      right_shift_16x16(intl, intr);
+      right_shift_16x16(inbl, inbr);
       fidtx16_sse2(intl, intr);
       fidtx16_sse2(inbl, inbr);
       break;
     case H_FLIPADST:
       load_buffer_16x32(input, intl, intr, inbl, inbr, stride, 0, 1);
       fidtx32_16col(intl, intr, inbl, inbr);
+      right_shift_16x16(intl, intr);
+      right_shift_16x16(inbl, inbr);
       fadst16_sse2(intl, intr);
       fadst16_sse2(inbl, inbr);
       break;
@@ -3661,10 +3685,14 @@
   }
 
   for (i = 0; i < 16; ++i) {
-    in0[i] = _mm_load_si128((const __m128i *)(input + i * stride + 0));
-    in1[i] = _mm_load_si128((const __m128i *)(input + i * stride + 8));
-    in2[i] = _mm_load_si128((const __m128i *)(input + i * stride + 16));
-    in3[i] = _mm_load_si128((const __m128i *)(input + i * stride + 24));
+    in0[i] = _mm_slli_epi16(
+        _mm_load_si128((const __m128i *)(input + i * stride + 0)), 2);
+    in1[i] = _mm_slli_epi16(
+        _mm_load_si128((const __m128i *)(input + i * stride + 8)), 2);
+    in2[i] = _mm_slli_epi16(
+        _mm_load_si128((const __m128i *)(input + i * stride + 16)), 2);
+    in3[i] = _mm_slli_epi16(
+        _mm_load_si128((const __m128i *)(input + i * stride + 24)), 2);
   }
 
   if (fliplr) {
@@ -3688,10 +3716,8 @@
                                       __m128i *res1, __m128i *res2,
                                       __m128i *res3) {
   int i;
-  right_shift(res0, 16, 2);
-  right_shift(res1, 16, 2);
-  right_shift(res2, 16, 2);
-  right_shift(res3, 16, 2);
+  right_shift_16x16(res0, res1);
+  right_shift_16x16(res2, res3);
   for (i = 0; i < 16; ++i) {
     store_output(&res0[i], output + i * 32 + 0);
     store_output(&res1[i], output + i * 32 + 8);
@@ -3709,21 +3735,29 @@
     case DCT_DCT:
       fdct16_sse2(in0, in1);
       fdct16_sse2(in2, in3);
+      right_shift_16x16(in0, in1);
+      right_shift_16x16(in2, in3);
       fdct32_16col(in0, in1, in2, in3);
       break;
     case ADST_DCT:
       fadst16_sse2(in0, in1);
       fadst16_sse2(in2, in3);
+      right_shift_16x16(in0, in1);
+      right_shift_16x16(in2, in3);
       fdct32_16col(in0, in1, in2, in3);
       break;
     case DCT_ADST:
       fdct16_sse2(in0, in1);
       fdct16_sse2(in2, in3);
+      right_shift_16x16(in0, in1);
+      right_shift_16x16(in2, in3);
       fhalfright32_16col(in0, in1, in2, in3);
       break;
     case ADST_ADST:
       fadst16_sse2(in0, in1);
       fadst16_sse2(in2, in3);
+      right_shift_16x16(in0, in1);
+      right_shift_16x16(in2, in3);
       fhalfright32_16col(in0, in1, in2, in3);
       break;
 #if CONFIG_EXT_TX
@@ -3731,72 +3765,96 @@
       load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
       fadst16_sse2(in0, in1);
       fadst16_sse2(in2, in3);
+      right_shift_16x16(in0, in1);
+      right_shift_16x16(in2, in3);
       fdct32_16col(in0, in1, in2, in3);
       break;
     case DCT_FLIPADST:
       load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
       fdct16_sse2(in0, in1);
       fdct16_sse2(in2, in3);
+      right_shift_16x16(in0, in1);
+      right_shift_16x16(in2, in3);
       fhalfright32_16col(in0, in1, in2, in3);
       break;
     case FLIPADST_FLIPADST:
       load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 1);
       fadst16_sse2(in0, in1);
       fadst16_sse2(in2, in3);
+      right_shift_16x16(in0, in1);
+      right_shift_16x16(in2, in3);
       fhalfright32_16col(in0, in1, in2, in3);
       break;
     case ADST_FLIPADST:
       load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
       fadst16_sse2(in0, in1);
       fadst16_sse2(in2, in3);
+      right_shift_16x16(in0, in1);
+      right_shift_16x16(in2, in3);
       fhalfright32_16col(in0, in1, in2, in3);
       break;
     case FLIPADST_ADST:
       load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
       fadst16_sse2(in0, in1);
       fadst16_sse2(in2, in3);
+      right_shift_16x16(in0, in1);
+      right_shift_16x16(in2, in3);
       fhalfright32_16col(in0, in1, in2, in3);
       break;
     case IDTX:
       load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
       fidtx16_sse2(in0, in1);
       fidtx16_sse2(in2, in3);
+      right_shift_16x16(in0, in1);
+      right_shift_16x16(in2, in3);
       fidtx32_16col(in0, in1, in2, in3);
       break;
     case V_DCT:
       load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
       fdct16_sse2(in0, in1);
       fdct16_sse2(in2, in3);
+      right_shift_16x16(in0, in1);
+      right_shift_16x16(in2, in3);
       fidtx32_16col(in0, in1, in2, in3);
       break;
     case H_DCT:
       load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
       fidtx16_sse2(in0, in1);
       fidtx16_sse2(in2, in3);
+      right_shift_16x16(in0, in1);
+      right_shift_16x16(in2, in3);
       fdct32_16col(in0, in1, in2, in3);
       break;
     case V_ADST:
       load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
       fadst16_sse2(in0, in1);
       fadst16_sse2(in2, in3);
+      right_shift_16x16(in0, in1);
+      right_shift_16x16(in2, in3);
       fidtx32_16col(in0, in1, in2, in3);
       break;
     case H_ADST:
       load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 0);
       fidtx16_sse2(in0, in1);
       fidtx16_sse2(in2, in3);
+      right_shift_16x16(in0, in1);
+      right_shift_16x16(in2, in3);
       fhalfright32_16col(in0, in1, in2, in3);
       break;
     case V_FLIPADST:
       load_buffer_32x16(input, in0, in1, in2, in3, stride, 1, 0);
       fadst16_sse2(in0, in1);
       fadst16_sse2(in2, in3);
+      right_shift_16x16(in0, in1);
+      right_shift_16x16(in2, in3);
       fidtx32_16col(in0, in1, in2, in3);
       break;
     case H_FLIPADST:
       load_buffer_32x16(input, in0, in1, in2, in3, stride, 0, 1);
       fidtx16_sse2(in0, in1);
       fidtx16_sse2(in2, in3);
+      right_shift_16x16(in0, in1);
+      right_shift_16x16(in2, in3);
       fhalfright32_16col(in0, in1, in2, in3);
       break;
 #endif
diff --git a/test/av1_fht16x32_test.cc b/test/av1_fht16x32_test.cc
index d45fcde..0bba3d6 100644
--- a/test/av1_fht16x32_test.cc
+++ b/test/av1_fht16x32_test.cc
@@ -69,11 +69,11 @@
   IhtFunc inv_txfm_;
 };
 
-TEST_P(AV1Trans16x32HT, AccuracyCheck) { RunAccuracyCheck(48); }
+TEST_P(AV1Trans16x32HT, AccuracyCheck) { RunAccuracyCheck(1); }
 TEST_P(AV1Trans16x32HT, CoeffCheck) { RunCoeffCheck(); }
 TEST_P(AV1Trans16x32HT, MemCheck) { RunMemCheck(); }
 TEST_P(AV1Trans16x32HT, InvCoeffCheck) { RunInvCoeffCheck(); }
-TEST_P(AV1Trans16x32HT, InvAccuracyCheck) { RunInvAccuracyCheck(9); }
+TEST_P(AV1Trans16x32HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
 
 using std::tr1::make_tuple;
 const Ht16x32Param kArrayHt16x32Param_c[] = {
diff --git a/test/av1_fht16x8_test.cc b/test/av1_fht16x8_test.cc
index a70356a..fb0b8e8 100644
--- a/test/av1_fht16x8_test.cc
+++ b/test/av1_fht16x8_test.cc
@@ -69,11 +69,11 @@
   IhtFunc inv_txfm_;
 };
 
-TEST_P(AV1Trans16x8HT, AccuracyCheck) { RunAccuracyCheck(1); }
+TEST_P(AV1Trans16x8HT, AccuracyCheck) { RunAccuracyCheck(0); }
 TEST_P(AV1Trans16x8HT, CoeffCheck) { RunCoeffCheck(); }
 TEST_P(AV1Trans16x8HT, MemCheck) { RunMemCheck(); }
 TEST_P(AV1Trans16x8HT, InvCoeffCheck) { RunInvCoeffCheck(); }
-TEST_P(AV1Trans16x8HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
+TEST_P(AV1Trans16x8HT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
 
 using std::tr1::make_tuple;
 
diff --git a/test/av1_fht32x16_test.cc b/test/av1_fht32x16_test.cc
index 2470b83..f86e305 100644
--- a/test/av1_fht32x16_test.cc
+++ b/test/av1_fht32x16_test.cc
@@ -70,10 +70,10 @@
 };
 
 TEST_P(AV1Trans32x16HT, MemCheck) { RunMemCheck(); }
-TEST_P(AV1Trans32x16HT, AccuracyCheck) { RunAccuracyCheck(43); }
+TEST_P(AV1Trans32x16HT, AccuracyCheck) { RunAccuracyCheck(2); }
 TEST_P(AV1Trans32x16HT, CoeffCheck) { RunCoeffCheck(); }
 TEST_P(AV1Trans32x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
-TEST_P(AV1Trans32x16HT, InvAccuracyCheck) { RunInvAccuracyCheck(9); }
+TEST_P(AV1Trans32x16HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
 
 using std::tr1::make_tuple;
 const Ht32x16Param kArrayHt32x16Param_c[] = {
diff --git a/test/av1_fht8x16_test.cc b/test/av1_fht8x16_test.cc
index e3e1819..7936074 100644
--- a/test/av1_fht8x16_test.cc
+++ b/test/av1_fht8x16_test.cc
@@ -70,10 +70,10 @@
 };
 
 TEST_P(AV1Trans8x16HT, MemCheck) { RunMemCheck(); }
-TEST_P(AV1Trans8x16HT, AccuracyCheck) { RunAccuracyCheck(1); }
+TEST_P(AV1Trans8x16HT, AccuracyCheck) { RunAccuracyCheck(0); }
 TEST_P(AV1Trans8x16HT, CoeffCheck) { RunCoeffCheck(); }
 TEST_P(AV1Trans8x16HT, InvCoeffCheck) { RunInvCoeffCheck(); }
-TEST_P(AV1Trans8x16HT, InvAccuracyCheck) { RunInvAccuracyCheck(1); }
+TEST_P(AV1Trans8x16HT, InvAccuracyCheck) { RunInvAccuracyCheck(0); }
 
 using std::tr1::make_tuple;