WIP: 16x16 idct/recon merge

This patch eliminates the intermediate diff buffer usage by
combining the short idct and the add residual into one function.
The encoder can use the same code as well.

Change-Id: Iea7976b22b1927d24b8004d2a3fddae7ecca3ba1
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index f6d2d59..9fb45d6 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -17,6 +17,7 @@
 extern "C" {
 #include "vp9/common/vp9_entropy.h"
 #include "vp9_rtcd.h"
+void vp9_short_idct16x16_add_c(short *input, uint8_t *output, int pitch);
 }
 
 #include "acm_random.h"
@@ -269,19 +270,23 @@
   const int count_test_block = 1000;
   for (int i = 0; i < count_test_block; ++i) {
     int16_t in[256], coeff[256];
-    int16_t out_c[256];
+    uint8_t dst[256], src[256];
     double out_r[256];
 
+    for (int j = 0; j < 256; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 256; ++j)
-      in[j] = rnd.Rand8() - rnd.Rand8();
+      in[j] = src[j] - dst[j];
 
     reference_16x16_dct_2d(in, out_r);
     for (int j = 0; j < 256; j++)
       coeff[j] = round(out_r[j]);
-    vp9_short_idct16x16_c(coeff, out_c, 32);
+    vp9_short_idct16x16_add_c(coeff, dst, 16);
     for (int j = 0; j < 256; ++j) {
-      const int diff = out_c[j] - in[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       EXPECT_GE(1, error)
           << "Error: 16x16 IDCT has error " << error
@@ -289,7 +294,7 @@
     }
   }
 }
-#if 1
+
 // we need enable fdct test once we re-do the 16 point fdct.
 TEST(VP9Fdct16x16Test, AccuracyCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -299,18 +304,22 @@
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[256];
     int16_t test_temp_block[256];
-    int16_t test_output_block[256];
+    uint8_t dst[256], src[256];
 
+    for (int j = 0; j < 256; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 256; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
 
     const int pitch = 32;
     vp9_short_fdct16x16_c(test_input_block, test_temp_block, pitch);
-    vp9_short_idct16x16_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct16x16_add_c(test_temp_block, dst, 16);
 
     for (int j = 0; j < 256; ++j) {
-      const int diff = test_input_block[j] - test_output_block[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       if (max_error < error)
         max_error = error;
@@ -354,6 +363,4 @@
     }
   }
 }
-#endif
-
 }  // namespace
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 5e6384c..b166fcb 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -621,10 +621,9 @@
   output[15] = step2[0] - step2[15];
 }
 
-void vp9_short_idct16x16_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[16 * 16];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[16], temp_out[16];
 
@@ -641,7 +640,8 @@
       temp_in[j] = out[j * 16 + i];
     idct16_1d(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
@@ -823,8 +823,8 @@
   { iadst16_1d, iadst16_1d }   // ADST_ADST = 3
 };
 
-void vp9_short_iht16x16_c(int16_t *input, int16_t *output,
-                          int pitch, int tx_type) {
+void vp9_short_iht16x16_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+                              int tx_type) {
   int i, j;
   int16_t out[16 * 16];
   int16_t *outptr = out;
@@ -844,38 +844,38 @@
       temp_in[j] = out[j * 16 + i];
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);  }
+}
+
+void vp9_short_idct10_16x16_add_c(int16_t *input, uint8_t *dest,
+                                  int dest_stride) {
+  int16_t out[16 * 16];
+  int16_t *outptr = out;
+  int i, j;
+  int16_t temp_in[16], temp_out[16];
+
+  /* First transform rows. Since all non-zero dct coefficients are in
+   * upper-left 4x4 area, we only need to calculate first 4 rows here.
+   */
+  vpx_memset(out, 0, sizeof(out));
+  for (i = 0; i < 4; ++i) {
+    idct16_1d(input, outptr);
+    input += 16;
+    outptr += 16;
+  }
+
+  // Then transform columns
+  for (i = 0; i < 16; ++i) {
+    for (j = 0; j < 16; ++j)
+      temp_in[j] = out[j*16 + i];
+    idct16_1d(temp_in, temp_out);
+    for (j = 0; j < 16; ++j)
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
-void vp9_short_idct10_16x16_c(int16_t *input, int16_t *output, int pitch) {
-    int16_t out[16 * 16];
-    int16_t *outptr = out;
-    const int half_pitch = pitch >> 1;
-    int i, j;
-    int16_t temp_in[16], temp_out[16];
-
-    /* First transform rows. Since all non-zero dct coefficients are in
-     * upper-left 4x4 area, we only need to calculate first 4 rows here.
-     */
-    vpx_memset(out, 0, sizeof(out));
-    for (i = 0; i < 4; ++i) {
-      idct16_1d(input, outptr);
-      input += 16;
-      outptr += 16;
-    }
-
-    // Then transform columns
-    for (i = 0; i < 16; ++i) {
-      for (j = 0; j < 16; ++j)
-        temp_in[j] = out[j*16 + i];
-      idct16_1d(temp_in, temp_out);
-      for (j = 0; j < 16; ++j)
-        output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 6);
-    }
-}
-
-
 void vp9_short_idct1_16x16_c(int16_t *input, int16_t *output) {
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index c45d030..ea60fbb 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -91,9 +91,6 @@
 prototype void vp9_add_residual_8x8 "const int16_t *diff, uint8_t *dest, int stride"
 specialize vp9_add_residual_8x8 sse2
 
-prototype void vp9_add_residual_16x16 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_16x16 sse2
-
 prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_8x8 sse2
 
@@ -200,11 +197,11 @@
 prototype void vp9_short_idct1_8x8 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_8x8
 
-prototype void vp9_short_idct16x16 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct16x16 sse2
+prototype void vp9_short_idct16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct16x16_add sse2
 
-prototype void vp9_short_idct10_16x16 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct10_16x16 sse2
+prototype void vp9_short_idct10_16x16_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct10_16x16_add sse2
 
 prototype void vp9_short_idct1_16x16 "int16_t *input, int16_t *output"
 specialize vp9_short_idct1_16x16
@@ -224,8 +221,8 @@
 prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
 specialize vp9_short_iht4x4
 
-prototype void vp9_short_iht16x16 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht16x16
+prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
+specialize vp9_short_iht16x16_add
 
 prototype void vp9_idct4_1d "int16_t *input, int16_t *output"
 specialize vp9_idct4_1d sse2
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index e53a937..667da33 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -752,8 +752,17 @@
                            stp2_10, stp2_13, stp2_11, stp2_12) \
   }
 
-void vp9_short_idct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+#define RECON_AND_STORE(dest, in_x) \
+  {                                                     \
+     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
+      d0 = _mm_unpacklo_epi8(d0, zero); \
+      in_x = _mm_add_epi16(in_x, d0); \
+      in_x = _mm_packus_epi16(in_x, in_x); \
+      _mm_storel_epi64((__m128i *)(dest), in_x); \
+      dest += stride; \
+  }
+
+void vp9_short_idct16x16_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   const __m128i zero = _mm_setzero_si128();
@@ -938,31 +947,30 @@
       in14 = _mm_srai_epi16(in14, 6);
       in15 = _mm_srai_epi16(in15, 6);
 
-      // Store results
-      _mm_store_si128((__m128i *)output, in0);
-      _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-      _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-      _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-      _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-      _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-      _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-      _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
-      _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
-      _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
-      _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
-      _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
-      _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
-      _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
-      _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
-      _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
+      RECON_AND_STORE(dest, in0);
+      RECON_AND_STORE(dest, in1);
+      RECON_AND_STORE(dest, in2);
+      RECON_AND_STORE(dest, in3);
+      RECON_AND_STORE(dest, in4);
+      RECON_AND_STORE(dest, in5);
+      RECON_AND_STORE(dest, in6);
+      RECON_AND_STORE(dest, in7);
+      RECON_AND_STORE(dest, in8);
+      RECON_AND_STORE(dest, in9);
+      RECON_AND_STORE(dest, in10);
+      RECON_AND_STORE(dest, in11);
+      RECON_AND_STORE(dest, in12);
+      RECON_AND_STORE(dest, in13);
+      RECON_AND_STORE(dest, in14);
+      RECON_AND_STORE(dest, in15);
 
-      output += 8;
+      dest += 8 - (stride * 16);
     }
   }
 }
 
-void vp9_short_idct10_16x16_sse2(int16_t *input, int16_t *output, int pitch) {
-  const int half_pitch = pitch >> 1;
+void vp9_short_idct10_16x16_add_sse2(int16_t *input, uint8_t *dest,
+                                     int stride) {
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
   const __m128i final_rounding = _mm_set1_epi16(1<<5);
   const __m128i zero = _mm_setzero_si128();
@@ -1007,7 +1015,6 @@
           stp2_8, stp2_9, stp2_10, stp2_11, stp2_12, stp2_13, stp2_14, stp2_15;
   __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
   int i;
-
   // 1-D idct. Load input data.
   in0 = _mm_load_si128((__m128i *)input);
   in8 = _mm_load_si128((__m128i *)(input + 8 * 1));
@@ -1298,24 +1305,24 @@
     in14 = _mm_srai_epi16(in14, 6);
     in15 = _mm_srai_epi16(in15, 6);
 
-    // Store results
-    _mm_store_si128((__m128i *)output, in0);
-    _mm_store_si128((__m128i *)(output + half_pitch * 1), in1);
-    _mm_store_si128((__m128i *)(output + half_pitch * 2), in2);
-    _mm_store_si128((__m128i *)(output + half_pitch * 3), in3);
-    _mm_store_si128((__m128i *)(output + half_pitch * 4), in4);
-    _mm_store_si128((__m128i *)(output + half_pitch * 5), in5);
-    _mm_store_si128((__m128i *)(output + half_pitch * 6), in6);
-    _mm_store_si128((__m128i *)(output + half_pitch * 7), in7);
-    _mm_store_si128((__m128i *)(output + half_pitch * 8), in8);
-    _mm_store_si128((__m128i *)(output + half_pitch * 9), in9);
-    _mm_store_si128((__m128i *)(output + half_pitch * 10), in10);
-    _mm_store_si128((__m128i *)(output + half_pitch * 11), in11);
-    _mm_store_si128((__m128i *)(output + half_pitch * 12), in12);
-    _mm_store_si128((__m128i *)(output + half_pitch * 13), in13);
-    _mm_store_si128((__m128i *)(output + half_pitch * 14), in14);
-    _mm_store_si128((__m128i *)(output + half_pitch * 15), in15);
-    output += 8;
+    RECON_AND_STORE(dest, in0);
+    RECON_AND_STORE(dest, in1);
+    RECON_AND_STORE(dest, in2);
+    RECON_AND_STORE(dest, in3);
+    RECON_AND_STORE(dest, in4);
+    RECON_AND_STORE(dest, in5);
+    RECON_AND_STORE(dest, in6);
+    RECON_AND_STORE(dest, in7);
+    RECON_AND_STORE(dest, in8);
+    RECON_AND_STORE(dest, in9);
+    RECON_AND_STORE(dest, in10);
+    RECON_AND_STORE(dest, in11);
+    RECON_AND_STORE(dest, in12);
+    RECON_AND_STORE(dest, in13);
+    RECON_AND_STORE(dest, in14);
+    RECON_AND_STORE(dest, in15);
+
+    dest += 8 - (stride * 16);
   }
 }
 
@@ -1934,16 +1941,6 @@
       in30 = _mm_srai_epi16(in30, 6);
       in31 = _mm_srai_epi16(in31, 6);
 
-#define RECON_AND_STORE(dest, in_x) \
-  {                                                     \
-     __m128i d0 = _mm_loadl_epi64((__m128i *)(dest)); \
-      d0 = _mm_unpacklo_epi8(d0, zero); \
-      in_x = _mm_add_epi16(in_x, d0); \
-      in_x = _mm_packus_epi16(in_x, in_x); \
-      _mm_storel_epi64((__m128i *)(dest), in_x); \
-      dest += stride; \
-  }
-
       RECON_AND_STORE(dest, in0);
       RECON_AND_STORE(dest, in1);
       RECON_AND_STORE(dest, in2);
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index faaee73..bc943fa 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -105,10 +105,6 @@
   add_residual(diff, dest, stride, 8, 8);
 }
 
-void vp9_add_residual_16x16_c(const int16_t *diff, uint8_t *dest, int stride) {
-  add_residual(diff, dest, stride, 16, 16);
-}
-
 static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
                                   int width, int height) {
   int r, c;
@@ -260,19 +256,14 @@
   if (tx_type == DCT_DCT) {
     vp9_idct_add_16x16(input, dest, stride, eob);
   } else {
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
-
     if (eob > 0) {
-      vp9_short_iht16x16(input, output, 16, tx_type);
+      vp9_short_iht16x16_add(input, dest, stride, tx_type);
       vpx_memset(input, 0, 512);
-      vp9_add_residual_16x16(output, dest, stride);
     }
   }
 }
 
 void vp9_idct_add_16x16_c(int16_t *input, uint8_t *dest, int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 256);
-
   /* The calculation can be simplified if there are not many non-zero dct
    * coefficients. Use eobs to separate different cases. */
   if (eob) {
@@ -288,21 +279,15 @@
       vp9_add_constant_residual_16x16(out, dest, stride);
 #if !CONFIG_SCATTERSCAN
     } else if (eob <= 10) {
-      // the idct halves ( >> 1) the pitch
-      vp9_short_idct10_16x16(input, output, 32);
-
+      vp9_short_idct10_16x16_add(input, dest, stride);
       input[0] = input[1] = input[2] = input[3] = 0;
       input[16] = input[17] = input[18] = 0;
       input[32] = input[33] = 0;
       input[48] = 0;
-
-      vp9_add_residual_16x16(output, dest, stride);
 #endif
     } else {
-      // the idct halves ( >> 1) the pitch
-      vp9_short_idct16x16(input, output, 16 << 1);
+      vp9_short_idct16x16_add(input, dest, stride);
       vpx_memset(input, 0, 512);
-      vp9_add_residual_16x16(output, dest, stride);
     }
   }
 }
diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c
index 38fd5aa..796fc12 100644
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -122,65 +122,6 @@
   _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
 }
 
-void vp9_add_residual_16x16_sse2(const int16_t *diff, uint8_t *dest,
-                                 int stride) {
-  const int width = 16;
-  int i = 4;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  __m128i d0, d1, d2, d3, d4, d5, d6, d7;
-  __m128i p0, p1, p2, p3, p4, p5, p6, p7;
-
-  do {
-    d0 = _mm_load_si128((const __m128i *)(diff + 0 * width));
-    d1 = _mm_load_si128((const __m128i *)(diff + 0 * width + 8));
-    d2 = _mm_load_si128((const __m128i *)(diff + 1 * width));
-    d3 = _mm_load_si128((const __m128i *)(diff + 1 * width + 8));
-    d4 = _mm_load_si128((const __m128i *)(diff + 2 * width));
-    d5 = _mm_load_si128((const __m128i *)(diff + 2 * width + 8));
-    d6 = _mm_load_si128((const __m128i *)(diff + 3 * width));
-    d7 = _mm_load_si128((const __m128i *)(diff + 3 * width + 8));
-
-    // Prediction data.
-    p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
-    p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
-    p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
-    p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
-
-    p0 = _mm_unpacklo_epi8(p1, zero);
-    p1 = _mm_unpackhi_epi8(p1, zero);
-    p2 = _mm_unpacklo_epi8(p3, zero);
-    p3 = _mm_unpackhi_epi8(p3, zero);
-    p4 = _mm_unpacklo_epi8(p5, zero);
-    p5 = _mm_unpackhi_epi8(p5, zero);
-    p6 = _mm_unpacklo_epi8(p7, zero);
-    p7 = _mm_unpackhi_epi8(p7, zero);
-
-    p0 = _mm_add_epi16(p0, d0);
-    p1 = _mm_add_epi16(p1, d1);
-    p2 = _mm_add_epi16(p2, d2);
-    p3 = _mm_add_epi16(p3, d3);
-    p4 = _mm_add_epi16(p4, d4);
-    p5 = _mm_add_epi16(p5, d5);
-    p6 = _mm_add_epi16(p6, d6);
-    p7 = _mm_add_epi16(p7, d7);
-
-    p0 = _mm_packus_epi16(p0, p1);
-    p1 = _mm_packus_epi16(p2, p3);
-    p2 = _mm_packus_epi16(p4, p5);
-    p3 = _mm_packus_epi16(p6, p7);
-
-    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
-    _mm_store_si128((__m128i *)(dest + 1 * stride), p1);
-    _mm_store_si128((__m128i *)(dest + 2 * stride), p2);
-    _mm_store_si128((__m128i *)(dest + 3 * stride), p3);
-
-    diff += 4 * width;
-    dest += 4 * stride;
-  } while (--i);
-}
-
 void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                         int stride) {
   uint8_t abs_diff;
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index db18555..221de74 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -522,11 +522,12 @@
     case TX_16X16:
       tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
       if (tx_type == DCT_DCT) {
-        vp9_short_idct16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                            diff, bw * 2);
+        vp9_short_idct16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+                                block, 16), dst, xd->plane[plane].dst.stride);
       } else {
-        vp9_short_iht16x16(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                           diff, bw, tx_type);
+        vp9_short_iht16x16_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
+                               block, 16), dst, xd->plane[plane].dst.stride,
+                               tx_type);
       }
       *wip_txfrm_size = 16;
       break;
@@ -605,7 +606,7 @@
 
   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
 
-  if (wip_txfrm_size < 32)
+  if (wip_txfrm_size < 16)
     vp9_recon_sbuv(xd, bsize);
 }
 
@@ -627,13 +628,13 @@
   // wip version... will use foreach_transformed_block when done
   foreach_transformed_block_in_plane(xd, bsize, 0,
                                      encode_block, &arg);
-  if (wip_txfrm_size < 32)
+  if (wip_txfrm_size < 16)
     vp9_recon_sby(xd, bsize);
   wip_txfrm_size = 0;
 
   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
 
-  if (wip_txfrm_size < 32)
+  if (wip_txfrm_size < 16)
     vp9_recon_sbuv(xd, bsize);
 #endif
 }