Merge "Refinements on modelcoef expt to reduce storage" into experimental
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index dfb64c3..1c887bb 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -96,11 +96,15 @@
   for (int i = 0; i < count_test_block; ++i) {
     int16_t test_input_block[16];
     int16_t test_temp_block[16];
-    int16_t test_output_block[16];
+    uint8_t dst[16], src[16];
 
+    for (int j = 0; j < 16; ++j) {
+      src[j] = rnd.Rand8();
+      dst[j] = rnd.Rand8();
+    }
     // Initialize a test block with input range [-255, 255].
     for (int j = 0; j < 16; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+      test_input_block[j] = src[j] - dst[j];
 
     // TODO(Yaowu): this should be converted to a parameterized test
     // to test optimized versions of this function.
@@ -120,10 +124,10 @@
     }
 
     // Because the bitstream is not frozen yet, use the idct in the codebase.
-    vp9_short_idct4x4_c(test_temp_block, test_output_block, pitch);
+    vp9_short_idct4x4_add_c(test_temp_block, dst, 4);
 
     for (int j = 0; j < 16; ++j) {
-      const int diff = test_input_block[j] - test_output_block[j];
+      const int diff = dst[j] - src[j];
       const int error = diff * diff;
       if (max_error < error)
         max_error = error;
diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index ec81fbd..d149f31 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c
@@ -205,7 +205,6 @@
   oci->no_lpf = 0;
   oci->use_bilinear_mc_filter = 0;
   oci->clr_type = REG_YUV;
-  oci->clamp_type = RECON_CLAMP_REQUIRED;
 
   // Initialize reference frame sign bias structure to defaults
   vpx_memset(oci->ref_frame_sign_bias, 0, sizeof(oci->ref_frame_sign_bias));
diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index b58945e..e795bba 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h
@@ -307,7 +307,6 @@
   DECLARE_ALIGNED(16, int16_t,  qcoeff[64 * 64]);
   DECLARE_ALIGNED(16, int16_t,  dqcoeff[64 * 64]);
   DECLARE_ALIGNED(16, uint16_t, eobs[256]);
-  DECLARE_ALIGNED(16, int16_t,  diff[64 * 64]);
   PLANE_TYPE plane_type;
   int subsampling_x;
   int subsampling_y;
@@ -391,8 +390,8 @@
 
   int lossless;
   /* Inverse transform function pointers. */
-  void (*inv_txm4x4_1)(int16_t *input, int16_t *output, int pitch);
-  void (*inv_txm4x4)(int16_t *input, int16_t *output, int pitch);
+  void (*inv_txm4x4_1_add)(int16_t *input, uint8_t *dest, int stride);
+  void (*inv_txm4x4_add)(int16_t *input, uint8_t *dest, int stride);
   void (*itxm_add)(int16_t *input, uint8_t *dest, int stride, int eob);
   void (*itxm_add_y_block)(int16_t *q, uint8_t *dst, int stride,
     struct macroblockd *xd);
diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index 2ff7696..80af49e 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c
@@ -18,12 +18,12 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_short_iwalsh4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_iwalsh4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int i;
+  int16_t output[16];
   int a1, b1, c1, d1;
   int16_t *ip = input;
   int16_t *op = output;
-  const int half_pitch = pitch >> 1;
 
   for (i = 0; i < 4; i++) {
     a1 = (ip[0] + ip[3]) >> WHT_UPSCALE_FACTOR;
@@ -37,63 +37,60 @@
     op[3] = (d1 - c1) >> 1;
 
     ip += 4;
-    op += half_pitch;
+    op += 4;
   }
 
   ip = output;
-  op = output;
   for (i = 0; i < 4; i++) {
-    a1 = ip[half_pitch * 0] + ip[half_pitch * 3];
-    b1 = ip[half_pitch * 1] + ip[half_pitch * 2];
-    c1 = ip[half_pitch * 1] - ip[half_pitch * 2];
-    d1 = ip[half_pitch * 0] - ip[half_pitch * 3];
+    a1 = ip[4 * 0] + ip[4 * 3];
+    b1 = ip[4 * 1] + ip[4 * 2];
+    c1 = ip[4 * 1] - ip[4 * 2];
+    d1 = ip[4 * 0] - ip[4 * 3];
 
 
-    op[half_pitch * 0] = (a1 + b1 + 1) >> 1;
-    op[half_pitch * 1] = (c1 + d1) >> 1;
-    op[half_pitch * 2] = (a1 - b1) >> 1;
-    op[half_pitch * 3] = (d1 - c1) >> 1;
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +
+                                       ((a1 + b1 + 1) >> 1));
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +
+                                       ((c1 + d1) >> 1));
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
+                                       ((a1 - b1) >> 1));
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
+                                       ((d1 - c1) >> 1));
 
     ip++;
-    op++;
+    dest++;
   }
 }
 
-void vp9_short_iwalsh4x4_1_c(int16_t *in, int16_t *out, int pitch) {
+void vp9_short_iwalsh4x4_1_add_c(int16_t *in, uint8_t *dest, int dest_stride) {
   int i;
   int16_t tmp[4];
   int16_t *ip = in;
   int16_t *op = tmp;
-  const int half_pitch = pitch >> 1;
 
   op[0] = ((ip[0] >> WHT_UPSCALE_FACTOR) + 1) >> 1;
   op[1] = op[2] = op[3] = (ip[0] >> WHT_UPSCALE_FACTOR) >> 1;
 
   ip = tmp;
-  op = out;
   for (i = 0; i < 4; i++) {
-    op[half_pitch * 0] = (ip[0] + 1) >> 1;
-    op[half_pitch * 1] = op[half_pitch * 2] = op[half_pitch * 3] = ip[0] >> 1;
+    dest[dest_stride * 0] = clip_pixel(dest[dest_stride * 0] +
+                                       ((ip[0] + 1) >> 1));
+    dest[dest_stride * 1] = clip_pixel(dest[dest_stride * 1] +
+                                       (ip[0] >> 1));
+    dest[dest_stride * 2] = clip_pixel(dest[dest_stride * 2] +
+                                       (ip[0] >> 1));
+    dest[dest_stride * 3] = clip_pixel(dest[dest_stride * 3] +
+                                       (ip[0] >> 1));
     ip++;
-    op++;
+    dest++;
   }
 }
 
 void vp9_dc_only_inv_walsh_add_c(int input_dc, uint8_t *pred_ptr,
                                  uint8_t *dst_ptr,
                                  int pitch, int stride) {
-  int r, c;
   int16_t dc = input_dc;
-  int16_t tmp[4 * 4];
-  vp9_short_iwalsh4x4_1_c(&dc, tmp, 4 << 1);
-
-  for (r = 0; r < 4; r++) {
-    for (c = 0; c < 4; c++)
-      dst_ptr[c] = clip_pixel(tmp[r * 4 + c] + pred_ptr[c]);
-
-    dst_ptr += stride;
-    pred_ptr += pitch;
-  }
+  vp9_short_iwalsh4x4_1_add_c(&dc, dst_ptr, stride);
 }
 
 void vp9_idct4_1d_c(int16_t *input, int16_t *output) {
@@ -116,10 +113,9 @@
   output[3] = step[0] - step[3];
 }
 
-void vp9_short_idct4x4_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int16_t out[4 * 4];
   int16_t *outptr = out;
-  const int half_pitch = pitch >> 1;
   int i, j;
   int16_t temp_in[4], temp_out[4];
 
@@ -138,22 +134,24 @@
       temp_in[j] = out[j * 4 + i];
     vp9_idct4_1d(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      output[j * half_pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                  + dest[j * dest_stride + i]);
   }
 }
 
-void vp9_short_idct4x4_1_c(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_1_add_c(int16_t *input, uint8_t *dest, int dest_stride) {
   int i;
   int a1;
-  int16_t *op = output;
-  const int half_pitch = pitch >> 1;
   int16_t out = dct_const_round_shift(input[0] * cospi_16_64);
   out = dct_const_round_shift(out * cospi_16_64);
   a1 = ROUND_POWER_OF_TWO(out, 4);
 
   for (i = 0; i < 4; i++) {
-    op[0] = op[1] = op[2] = op[3] = a1;
-    op += half_pitch;
+    dest[0] = clip_pixel(dest[0] + a1);
+    dest[1] = clip_pixel(dest[1] + a1);
+    dest[2] = clip_pixel(dest[2] + a1);
+    dest[3] = clip_pixel(dest[3] + a1);
+    dest += dest_stride;
   }
 }
 
@@ -285,8 +283,8 @@
   output[3] = dct_const_round_shift(s3);
 }
 
-void vp9_short_iht4x4_c(int16_t *input, int16_t *output,
-                        int pitch, int tx_type) {
+void vp9_short_iht4x4_add_c(int16_t *input, uint8_t *dest, int dest_stride,
+                            int tx_type) {
   const transform_2d IHT_4[] = {
     { vp9_idct4_1d,  vp9_idct4_1d  },  // DCT_DCT  = 0
     { iadst4_1d, vp9_idct4_1d  },      // ADST_DCT = 1
@@ -312,10 +310,10 @@
       temp_in[j] = out[j * 4 + i];
     IHT_4[tx_type].cols(temp_in, temp_out);
     for (j = 0; j < 4; ++j)
-      output[j * pitch + i] = ROUND_POWER_OF_TWO(temp_out[j], 4);
+      dest[j * dest_stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 4)
+                                  + dest[j * dest_stride + i]);
   }
 }
-
 static void iadst8_1d(int16_t *input, int16_t *output) {
   int s0, s1, s2, s3, s4, s5, s6, s7;
 
diff --git a/vp9/common/vp9_invtrans.c b/vp9/common/vp9_invtrans.c
index 01859df..d47fca1 100644
--- a/vp9/common/vp9_invtrans.c
+++ b/vp9/common/vp9_invtrans.c
@@ -11,11 +11,10 @@
 #include "vp9/common/vp9_invtrans.h"
 #include "./vp9_rtcd.h"
 
-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
-                                 int16_t *dqcoeff, int16_t *diff,
-                                 int pitch) {
+void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
+                                     uint8_t *dest, int stride) {
   if (eob <= 1)
-    xd->inv_txm4x4_1(dqcoeff, diff, pitch);
+    xd->inv_txm4x4_1_add(dqcoeff, dest, stride);
   else
-    xd->inv_txm4x4(dqcoeff, diff, pitch);
+    xd->inv_txm4x4_add(dqcoeff, dest, stride);
 }
diff --git a/vp9/common/vp9_invtrans.h b/vp9/common/vp9_invtrans.h
index 2aeb584..dbdc50a 100644
--- a/vp9/common/vp9_invtrans.h
+++ b/vp9/common/vp9_invtrans.h
@@ -15,7 +15,6 @@
 #include "vpx/vpx_integer.h"
 #include "vp9/common/vp9_blockd.h"
 
-void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int eob,
-                                 int16_t *dqcoeff, int16_t *diff,
-                                 int pitch);
+void vp9_inverse_transform_b_4x4_add(MACROBLOCKD *xd, int eob, int16_t *dqcoeff,
+                                     uint8_t *dest, int stride);
 #endif  // VP9_COMMON_VP9_INVTRANS_H_
diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index a39694a..6ddc356 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h
@@ -112,11 +112,6 @@
 } FRAME_CONTEXT;
 
 typedef enum {
-  RECON_CLAMP_REQUIRED        = 0,
-  RECON_CLAMP_NOTREQUIRED     = 1
-} CLAMP_TYPE;
-
-typedef enum {
   SINGLE_PREDICTION_ONLY = 0,
   COMP_PREDICTION_ONLY   = 1,
   HYBRID_PREDICTION      = 2,
@@ -152,7 +147,6 @@
   int subsampling_y;
 
   YUV_TYPE clr_type;
-  CLAMP_TYPE  clamp_type;
 
   YV12_BUFFER_CONFIG *frame_to_show;
 
diff --git a/vp9/common/vp9_recon.c b/vp9/common/vp9_recon.c
deleted file mode 100644
index 69a4720..0000000
--- a/vp9/common/vp9_recon.c
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "./vpx_config.h"
-#include "vp9_rtcd.h"
-#include "vp9/common/vp9_blockd.h"
-
-static INLINE void recon(int rows, int cols,
-                         const int16_t *diff_ptr, int diff_stride,
-                         uint8_t *dst_ptr, int dst_stride) {
-  int r, c;
-
-  for (r = 0; r < rows; r++) {
-    for (c = 0; c < cols; c++)
-      dst_ptr[c] = clip_pixel(diff_ptr[c] + dst_ptr[c]);
-
-    dst_ptr += dst_stride;
-    diff_ptr += diff_stride;
-  }
-}
-
-
-void vp9_recon_b_c(uint8_t *pred_ptr, int16_t *diff_ptr, int diff_stride,
-                   uint8_t *dst_ptr, int stride) {
-  assert(pred_ptr == dst_ptr);
-  recon(4, 4, diff_ptr, diff_stride, dst_ptr, stride);
-}
-
-static void recon_plane(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize, int plane) {
-  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
-  const int bh = 4 << (b_height_log2(bsize) - xd->plane[plane].subsampling_y);
-  recon(bh, bw,
-        xd->plane[plane].diff, bw,
-        xd->plane[plane].dst.buf, xd->plane[plane].dst.stride);
-}
-
-void vp9_recon_sby_c(MACROBLOCKD *mb, BLOCK_SIZE_TYPE bsize) {
-  recon_plane(mb, bsize, 0);
-}
-
-void vp9_recon_sbuv_c(MACROBLOCKD *mb, BLOCK_SIZE_TYPE bsize) {
-  int i;
-
-  for (i = 1; i < MAX_MB_PLANE; i++)
-    recon_plane(mb, bsize, i);
-}
-
-void vp9_recon_sb_c(MACROBLOCKD *xd, BLOCK_SIZE_TYPE bsize) {
-  vp9_recon_sby(xd, bsize);
-  vp9_recon_sbuv(xd, bsize);
-}
diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index cf8dd33..c68658d 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh
@@ -60,18 +60,6 @@
 prototype void vp9_copy_mem8x4 "const uint8_t *src, int src_pitch, uint8_t *dst, int dst_pitch"
 specialize vp9_copy_mem8x4 mmx
 
-prototype void vp9_recon_b "uint8_t *pred_ptr, int16_t *diff_ptr, int diff_stride, uint8_t *dst_ptr, int stride"
-specialize vp9_recon_b
-
-prototype void vp9_recon_sb "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize"
-specialize vp9_recon_sb
-
-prototype void vp9_recon_sby "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize"
-specialize vp9_recon_sby
-
-prototype void vp9_recon_sbuv "struct macroblockd *x, enum BLOCK_SIZE_TYPE bsize"
-specialize void vp9_recon_sbuv
-
 prototype void vp9_build_intra_predictors "uint8_t *src, int src_stride, uint8_t *pred, int y_stride, int mode, int bw, int bh, int up_available, int left_available, int right_available"
 specialize void vp9_build_intra_predictors
 
@@ -85,9 +73,6 @@
 specialize vp9_intra4x4_predict;
 
 if [ "$CONFIG_VP9_DECODER" = "yes" ]; then
-prototype void vp9_add_residual_4x4 "const int16_t *diff, uint8_t *dest, int stride"
-specialize vp9_add_residual_4x4 sse2
-
 prototype void vp9_add_constant_residual_8x8 "const int16_t diff, uint8_t *dest, int stride"
 specialize vp9_add_constant_residual_8x8 sse2
 
@@ -179,11 +164,11 @@
 #
 # dct
 #
-prototype void vp9_short_idct4x4_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4_1
+prototype void vp9_short_idct4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct4x4_1_add
 
-prototype void vp9_short_idct4x4 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_idct4x4 sse2
+prototype void vp9_short_idct4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_idct4x4_add sse2
 
 prototype void vp9_short_idct8x8_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct8x8_add sse2
@@ -212,12 +197,12 @@
 prototype void vp9_short_idct10_32x32_add "int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_short_idct10_32x32_add
 
+prototype void vp9_short_iht4x4_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
+specialize vp9_short_iht4x4_add
+
 prototype void vp9_short_iht8x8_add "int16_t *input, uint8_t *dest, int dest_stride, int tx_type"
 specialize vp9_short_iht8x8_add
 
-prototype void vp9_short_iht4x4 "int16_t *input, int16_t *output, int pitch, int tx_type"
-specialize vp9_short_iht4x4
-
 prototype void vp9_short_iht16x16_add "int16_t *input, uint8_t *output, int pitch, int tx_type"
 specialize vp9_short_iht16x16_add
 
@@ -229,12 +214,11 @@
 prototype void vp9_dc_only_idct_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
 specialize vp9_dc_only_idct_add sse2
 
-prototype void vp9_short_iwalsh4x4_1 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_iwalsh4x4_1
-prototype void vp9_short_iwalsh4x4 "int16_t *input, int16_t *output, int pitch"
-specialize vp9_short_iwalsh4x4
-prototype void vp9_dc_only_inv_walsh_add "int input_dc, uint8_t *pred_ptr, uint8_t *dst_ptr, int pitch, int stride"
-specialize vp9_dc_only_inv_walsh_add
+prototype void vp9_short_iwalsh4x4_1_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_iwalsh4x4_1_add
+
+prototype void vp9_short_iwalsh4x4_add "int16_t *input, uint8_t *dest, int dest_stride"
+specialize vp9_short_iwalsh4x4_add
 
 prototype unsigned int vp9_sad32x3 "const uint8_t *src_ptr, int  src_stride, const uint8_t *ref_ptr, int ref_stride, int max_sad"
 specialize vp9_sad32x3
diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index ab8604c..599dcff 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c
@@ -73,7 +73,7 @@
   *(int *)dst_ptr = _mm_cvtsi128_si32(p1);
 }
 
-void vp9_short_idct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
+void vp9_short_idct4x4_add_sse2(int16_t *input, uint8_t *dest, int stride) {
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
   const __m128i cst = _mm_setr_epi16((int16_t)cospi_16_64, (int16_t)cospi_16_64,
@@ -81,7 +81,6 @@
                                     (int16_t)cospi_24_64, (int16_t)-cospi_8_64,
                                     (int16_t)cospi_8_64, (int16_t)cospi_24_64);
   const __m128i rounding = _mm_set1_epi32(DCT_CONST_ROUNDING);
-  const int half_pitch = pitch >> 1;
   __m128i input0, input1, input2, input3;
 
   // Rows
@@ -188,14 +187,23 @@
   input2 = _mm_srai_epi16(input2, 4);
   input3 = _mm_srai_epi16(input3, 4);
 
-  // Store results
-  _mm_storel_epi64((__m128i *)output, input2);
-  input2 = _mm_srli_si128(input2, 8);
-  _mm_storel_epi64((__m128i *)(output + half_pitch), input2);
+#define RECON_AND_STORE4X4(dest, in_x) \
+  {                                                     \
+      __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); \
+      d0 = _mm_unpacklo_epi8(d0, zero); \
+      d0 = _mm_add_epi16(in_x, d0); \
+      d0 = _mm_packus_epi16(d0, d0); \
+      *(int *)dest = _mm_cvtsi128_si32(d0); \
+      dest += stride; \
+  }
 
-  _mm_storel_epi64((__m128i *)(output + 3 * half_pitch), input3);
-  input3 = _mm_srli_si128(input3, 8);
-  _mm_storel_epi64((__m128i *)(output + 2 * half_pitch), input3);
+  input0 = _mm_srli_si128(input2, 8);
+  input1 = _mm_srli_si128(input3, 8);
+
+  RECON_AND_STORE4X4(dest, input2);
+  RECON_AND_STORE4X4(dest, input0);
+  RECON_AND_STORE4X4(dest, input1);
+  RECON_AND_STORE4X4(dest, input3);
 }
 
 void vp9_idct4_1d_sse2(int16_t *input, int16_t *output) {
diff --git a/vp9/common/x86/vp9_recon_mmx.asm b/vp9/common/x86/vp9_recon_mmx.asm
index fc03d3f..6fbbe48 100644
--- a/vp9/common/x86/vp9_recon_mmx.asm
+++ b/vp9/common/x86/vp9_recon_mmx.asm
@@ -10,55 +10,6 @@
 
 
 %include "vpx_ports/x86_abi_support.asm"
-;void vp9_recon_b_mmx(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon_b_mmx) PRIVATE
-sym(vp9_recon_b_mmx):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov       rsi, arg(0) ;s
-        mov       rdi, arg(2) ;d
-        mov       rdx, arg(1) ;q
-        movsxd    rax, dword ptr arg(3) ;stride
-        pxor      mm0, mm0
-
-        movd      mm1, [rsi]
-        punpcklbw mm1, mm0
-        paddsw    mm1, [rdx]
-        packuswb  mm1,  mm0              ; pack and unpack to saturate
-        movd      [rdi], mm1
-
-        movd      mm2, [rsi+16]
-        punpcklbw mm2, mm0
-        paddsw    mm2, [rdx+32]
-        packuswb  mm2, mm0              ; pack and unpack to saturate
-        movd      [rdi+rax], mm2
-
-        movd      mm3, [rsi+32]
-        punpcklbw mm3, mm0
-        paddsw    mm3, [rdx+64]
-        packuswb  mm3,  mm0              ; pack and unpack to saturate
-        movd      [rdi+2*rax], mm3
-
-        add       rdi, rax
-        movd      mm4, [rsi+48]
-        punpcklbw mm4, mm0
-        paddsw    mm4, [rdx+96]
-        packuswb  mm4, mm0              ; pack and unpack to saturate
-        movd      [rdi+2*rax], mm4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 ;void copy_mem8x8_mmx(
 ;    unsigned char *src,
 ;    int src_stride,
diff --git a/vp9/common/x86/vp9_recon_sse2.asm b/vp9/common/x86/vp9_recon_sse2.asm
index 154442d..9ee3043 100644
--- a/vp9/common/x86/vp9_recon_sse2.asm
+++ b/vp9/common/x86/vp9_recon_sse2.asm
@@ -10,122 +10,6 @@
 
 
 %include "vpx_ports/x86_abi_support.asm"
-;void vp9_recon2b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon2b_sse2) PRIVATE
-sym(vp9_recon2b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;s
-        mov         rdi,        arg(2) ;d
-        mov         rdx,        arg(1) ;q
-        movsxd      rax,        dword ptr arg(3) ;stride
-        pxor        xmm0,       xmm0
-
-        movq        xmm1,       MMWORD PTR [rsi]
-        punpcklbw   xmm1,       xmm0
-        paddsw      xmm1,       XMMWORD PTR [rdx]
-        packuswb    xmm1,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi],   xmm1
-
-
-        movq        xmm2,       MMWORD PTR [rsi+8]
-        punpcklbw   xmm2,       xmm0
-        paddsw      xmm2,       XMMWORD PTR [rdx+16]
-        packuswb    xmm2,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi+rax],   xmm2
-
-
-        movq        xmm3,       MMWORD PTR [rsi+16]
-        punpcklbw   xmm3,       xmm0
-        paddsw      xmm3,       XMMWORD PTR [rdx+32]
-        packuswb    xmm3,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi+rax*2], xmm3
-
-        add         rdi, rax
-        movq        xmm4,       MMWORD PTR [rsi+24]
-        punpcklbw   xmm4,       xmm0
-        paddsw      xmm4,       XMMWORD PTR [rdx+48]
-        packuswb    xmm4,       xmm0              ; pack and unpack to saturate
-        movq        MMWORD PTR [rdi+rax*2], xmm4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vp9_recon4b_sse2(unsigned char *s, short *q, unsigned char *d, int stride)
-global sym(vp9_recon4b_sse2) PRIVATE
-sym(vp9_recon4b_sse2):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 4
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    ; end prolog
-
-        mov         rsi,        arg(0) ;s
-        mov         rdi,        arg(2) ;d
-        mov         rdx,        arg(1) ;q
-        movsxd      rax,        dword ptr arg(3) ;stride
-        pxor        xmm0,       xmm0
-
-        movdqa      xmm1,       XMMWORD PTR [rsi]
-        movdqa      xmm5,       xmm1
-        punpcklbw   xmm1,       xmm0
-        punpckhbw   xmm5,       xmm0
-        paddsw      xmm1,       XMMWORD PTR [rdx]
-        paddsw      xmm5,       XMMWORD PTR [rdx+16]
-        packuswb    xmm1,       xmm5              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi],  xmm1
-
-
-        movdqa      xmm2,       XMMWORD PTR [rsi+16]
-        movdqa      xmm6,       xmm2
-        punpcklbw   xmm2,       xmm0
-        punpckhbw   xmm6,       xmm0
-        paddsw      xmm2,       XMMWORD PTR [rdx+32]
-        paddsw      xmm6,       XMMWORD PTR [rdx+48]
-        packuswb    xmm2,       xmm6              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi+rax],  xmm2
-
-
-        movdqa      xmm3,       XMMWORD PTR [rsi+32]
-        movdqa      xmm7,       xmm3
-        punpcklbw   xmm3,       xmm0
-        punpckhbw   xmm7,       xmm0
-        paddsw      xmm3,       XMMWORD PTR [rdx+64]
-        paddsw      xmm7,       XMMWORD PTR [rdx+80]
-        packuswb    xmm3,       xmm7              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi+rax*2],    xmm3
-
-        add       rdi, rax
-        movdqa      xmm4,       XMMWORD PTR [rsi+48]
-        movdqa      xmm5,       xmm4
-        punpcklbw   xmm4,       xmm0
-        punpckhbw   xmm5,       xmm0
-        paddsw      xmm4,       XMMWORD PTR [rdx+96]
-        paddsw      xmm5,       XMMWORD PTR [rdx+112]
-        packuswb    xmm4,       xmm5              ; pack and unpack to saturate
-        movdqa      XMMWORD PTR [rdi+rax*2],    xmm4
-
-    ; begin epilog
-    pop rdi
-    pop rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
 ;void copy_mem16x16_sse2(
 ;    unsigned char *src,
 ;    int src_stride,
diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index d34bfa7..d116685 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c
@@ -530,7 +530,6 @@
 }
 
 static void read_mb_modes_mv(VP9D_COMP *pbi, MODE_INFO *mi, MB_MODE_INFO *mbmi,
-                             MODE_INFO *prev_mi,
                              int mi_row, int mi_col,
                              vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
@@ -559,7 +558,6 @@
   // Make sure the MACROBLOCKD mode info pointer is pointed at the
   // correct entry for the current macroblock.
   xd->mode_info_context = mi;
-  xd->prev_mode_info_context = prev_mi;
 
   // Distance of Mb to the various image edges.
   // These specified to 8th pel as they are always compared to MV values
@@ -605,7 +603,8 @@
         printf("%d %d\n", xd->mode_info_context->mbmi.mv[0].as_mv.row,
                xd->mode_info_context->mbmi.mv[0].as_mv.col);
 #endif
-      vp9_find_mv_refs(cm, xd, mi, use_prev_in_find_mv_refs ? prev_mi : NULL,
+      vp9_find_mv_refs(cm, xd, mi, use_prev_in_find_mv_refs ?
+                       xd->prev_mode_info_context : NULL,
                        ref_frame, mbmi->ref_mvs[ref_frame],
                        cm->ref_frame_sign_bias);
 
@@ -671,7 +670,8 @@
                          mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv);
 
         vp9_find_mv_refs(cm, xd, mi,
-                         use_prev_in_find_mv_refs ? prev_mi : NULL,
+                         use_prev_in_find_mv_refs ?
+                         xd->prev_mode_info_context : NULL,
                          second_ref_frame, mbmi->ref_mvs[second_ref_frame],
                          cm->ref_frame_sign_bias);
 
@@ -921,13 +921,12 @@
                            vp9_reader *r) {
   VP9_COMMON *const cm = &pbi->common;
   MODE_INFO *mi = xd->mode_info_context;
-  MODE_INFO *prev_mi = xd->prev_mode_info_context;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
 
   if (cm->frame_type == KEY_FRAME) {
     kfread_modes(pbi, mi, mi_row, mi_col, r);
   } else {
-    read_mb_modes_mv(pbi, mi, &mi->mbmi, prev_mi, mi_row, mi_col, r);
+    read_mb_modes_mv(pbi, mi, &mi->mbmi, mi_row, mi_col, r);
     set_scale_factors(xd,
                       mi->mbmi.ref_frame - 1, mi->mbmi.second_ref_frame - 1,
                       cm->active_ref_scale);
diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodframe.c
index 348961b..83fb8f9 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodframe.c
@@ -995,7 +995,6 @@
                        "Failed to allocate bool decoder 0");
 
   pc->clr_type = (YUV_TYPE)vp9_read_bit(&header_bc);
-  pc->clamp_type = (CLAMP_TYPE)vp9_read_bit(&header_bc);
   pc->error_resilient_mode = vp9_read_bit(&header_bc);
 
   setup_loopfilter(pc, xd, &header_bc);
@@ -1007,14 +1006,10 @@
                  pc->uv_dc_delta_q == 0 &&
                  pc->uv_ac_delta_q == 0;
   if (xd->lossless) {
-    xd->inv_txm4x4_1      = vp9_short_iwalsh4x4_1;
-    xd->inv_txm4x4        = vp9_short_iwalsh4x4;
     xd->itxm_add          = vp9_idct_add_lossless_c;
     xd->itxm_add_y_block  = vp9_idct_add_y_block_lossless_c;
     xd->itxm_add_uv_block = vp9_idct_add_uv_block_lossless_c;
   } else {
-    xd->inv_txm4x4_1      = vp9_short_idct4x4_1;
-    xd->inv_txm4x4        = vp9_short_idct4x4;
     xd->itxm_add          = vp9_idct_add;
     xd->itxm_add_y_block  = vp9_idct_add_y_block;
     xd->itxm_add_uv_block = vp9_idct_add_uv_block;
diff --git a/vp9/decoder/vp9_idct_blk.c b/vp9/decoder/vp9_idct_blk.c
index 10b585b..7726598 100644
--- a/vp9/decoder/vp9_idct_blk.c
+++ b/vp9/decoder/vp9_idct_blk.c
@@ -84,23 +84,6 @@
   }
 }
 
-static void add_residual(const int16_t *diff, uint8_t *dest, int stride,
-                         int width, int height) {
-  int r, c;
-
-  for (r = 0; r < height; r++) {
-    for (c = 0; c < width; c++)
-      dest[c] = clip_pixel(diff[c] + dest[c]);
-
-    dest += stride;
-    diff += width;
-  }
-}
-
-void vp9_add_residual_4x4_c(const int16_t *diff, uint8_t *dest, int stride) {
-  add_residual(diff, dest, stride, 4, 4);
-}
-
 static void add_constant_residual(const int16_t diff, uint8_t *dest, int stride,
                                   int width, int height) {
   int r, c;
@@ -133,11 +116,8 @@
   if (tx_type == DCT_DCT) {
     vp9_idct_add(input, dest, stride, eob);
   } else {
-    DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
-    vp9_short_iht4x4(input, output, 4, tx_type);
+    vp9_short_iht4x4_add(input, dest, stride, tx_type);
     vpx_memset(input, 0, 32);
-    vp9_add_residual_4x4(output, dest, stride);
   }
 }
 
@@ -154,13 +134,9 @@
 }
 
 void vp9_idct_add_c(int16_t *input, uint8_t *dest, int stride, int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
   if (eob > 1) {
-    // the idct halves ( >> 1) the pitch
-    vp9_short_idct4x4(input, output, 4 << 1);
+    vp9_short_idct4x4_add(input, dest, stride);
     vpx_memset(input, 0, 32);
-    vp9_add_residual_4x4(output, dest, stride);
   } else {
     vp9_dc_only_idct_add(input[0], dest, dest, stride, stride);
     ((int *)input)[0] = 0;
@@ -168,38 +144,27 @@
 }
 
 void vp9_dc_idct_add_c(int16_t *input, uint8_t *dest, int stride, int dc) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
   input[0] = dc;
-
-  // the idct halves ( >> 1) the pitch
-  vp9_short_idct4x4(input, output, 4 << 1);
+  vp9_short_idct4x4_add(input, dest, stride);
   vpx_memset(input, 0, 32);
-  vp9_add_residual_4x4(output, dest, stride);
 }
 
 void vp9_idct_add_lossless_c(int16_t *input, uint8_t *dest, int stride,
                              int eob) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
   if (eob > 1) {
-    vp9_short_iwalsh4x4_c(input, output, 4 << 1);
+    vp9_short_iwalsh4x4_add(input, dest, stride);
     vpx_memset(input, 0, 32);
-    vp9_add_residual_4x4(output, dest, stride);
   } else {
-    vp9_dc_only_inv_walsh_add(input[0], dest, dest, stride, stride);
+    vp9_short_iwalsh4x4_1_add_c(input, dest, stride);
     ((int *)input)[0] = 0;
   }
 }
 
 void vp9_dc_idct_add_lossless_c(int16_t *input, uint8_t *dest,
                                 int stride, int dc) {
-  DECLARE_ALIGNED_ARRAY(16, int16_t, output, 16);
-
   input[0] = dc;
-  vp9_short_iwalsh4x4_c(input, output, 4 << 1);
+  vp9_short_iwalsh4x4_add(input, dest, stride);
   vpx_memset(input, 0, 32);
-  vp9_add_residual_4x4(output, dest, stride);
 }
 
 void vp9_idct_add_8x8_c(int16_t *input, uint8_t *dest, int stride, int eob) {
diff --git a/vp9/decoder/x86/vp9_dequantize_sse2.c b/vp9/decoder/x86/vp9_dequantize_sse2.c
index 72036c2..54ec67f 100644
--- a/vp9/decoder/x86/vp9_dequantize_sse2.c
+++ b/vp9/decoder/x86/vp9_dequantize_sse2.c
@@ -15,49 +15,6 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_idct.h"
 
-void vp9_add_residual_4x4_sse2(const int16_t *diff, uint8_t *dest, int stride) {
-  const int width = 4;
-  const __m128i zero = _mm_setzero_si128();
-
-  // Diff data
-  const __m128i d0 = _mm_loadl_epi64((const __m128i *)(diff + 0 * width));
-  const __m128i d1 = _mm_loadl_epi64((const __m128i *)(diff + 1 * width));
-  const __m128i d2 = _mm_loadl_epi64((const __m128i *)(diff + 2 * width));
-  const __m128i d3 = _mm_loadl_epi64((const __m128i *)(diff + 3 * width));
-
-  // Prediction data.
-  __m128i p0 = _mm_cvtsi32_si128(*(const int *)(dest + 0 * stride));
-  __m128i p1 = _mm_cvtsi32_si128(*(const int *)(dest + 1 * stride));
-  __m128i p2 = _mm_cvtsi32_si128(*(const int *)(dest + 2 * stride));
-  __m128i p3 = _mm_cvtsi32_si128(*(const int *)(dest + 3 * stride));
-
-  p0 = _mm_unpacklo_epi8(p0, zero);
-  p1 = _mm_unpacklo_epi8(p1, zero);
-  p2 = _mm_unpacklo_epi8(p2, zero);
-  p3 = _mm_unpacklo_epi8(p3, zero);
-
-  p0 = _mm_add_epi16(p0, d0);
-  p1 = _mm_add_epi16(p1, d1);
-  p2 = _mm_add_epi16(p2, d2);
-  p3 = _mm_add_epi16(p3, d3);
-
-  p0 = _mm_packus_epi16(p0, p1);
-  p2 = _mm_packus_epi16(p2, p3);
-
-  *(int *)dest = _mm_cvtsi128_si32(p0);
-  dest += stride;
-
-  p0 = _mm_srli_si128(p0, 8);
-  *(int *)dest = _mm_cvtsi128_si32(p0);
-  dest += stride;
-
-  *(int *)dest = _mm_cvtsi128_si32(p2);
-  dest += stride;
-
-  p2 = _mm_srli_si128(p2, 8);
-  *(int *)dest = _mm_cvtsi128_si32(p2);
-}
-
 void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                         int stride) {
   uint8_t abs_diff;
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index 70b2333..b5b1355 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -67,10 +67,111 @@
   p[3] = value >> 24;
 }
 
+void vp9_encode_unsigned_max(vp9_writer *br, int data, int max) {
+  assert(data <= max);
+  while (max) {
+    vp9_write_bit(br, data & 1);
+    data >>= 1;
+    max >>= 1;
+  }
+}
+
+int recenter_nonneg(int v, int m) {
+  if (v > (m << 1))
+    return v;
+  else if (v >= m)
+    return ((v - m) << 1);
+  else
+    return ((m - v) << 1) - 1;
+}
+
+static int get_unsigned_bits(unsigned num_values) {
+  int cat = 0;
+  if ((num_values--) <= 1) return 0;
+  while (num_values > 0) {
+    cat++;
+    num_values >>= 1;
+  }
+  return cat;
+}
+
+void encode_uniform(vp9_writer *w, int v, int n) {
+  int l = get_unsigned_bits(n);
+  int m;
+  if (l == 0)
+    return;
+  m = (1 << l) - n;
+  if (v < m) {
+    vp9_write_literal(w, v, l - 1);
+  } else {
+    vp9_write_literal(w, m + ((v - m) >> 1), l - 1);
+    vp9_write_literal(w, (v - m) & 1, 1);
+  }
+}
+
+int count_uniform(int v, int n) {
+  int l = get_unsigned_bits(n);
+  int m;
+  if (l == 0) return 0;
+  m = (1 << l) - n;
+  if (v < m)
+    return l - 1;
+  else
+    return l;
+}
+
+void encode_term_subexp(vp9_writer *w, int word, int k, int num_syms) {
+  int i = 0;
+  int mk = 0;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (num_syms <= mk + 3 * a) {
+      encode_uniform(w, word - mk, num_syms - mk);
+      break;
+    } else {
+      int t = (word >= mk + a);
+      vp9_write_literal(w, t, 1);
+      if (t) {
+        i = i + 1;
+        mk += a;
+      } else {
+        vp9_write_literal(w, word - mk, b);
+        break;
+      }
+    }
+  }
+}
+
+int count_term_subexp(int word, int k, int num_syms) {
+  int count = 0;
+  int i = 0;
+  int mk = 0;
+  while (1) {
+    int b = (i ? k + i - 1 : k);
+    int a = (1 << b);
+    if (num_syms <= mk + 3 * a) {
+      count += count_uniform(word - mk, num_syms - mk);
+      break;
+    } else {
+      int t = (word >= mk + a);
+      count++;
+      if (t) {
+        i = i + 1;
+        mk += a;
+      } else {
+        count += b;
+        break;
+      }
+    }
+  }
+  return count;
+}
+
 static void compute_update_table() {
   int i;
   for (i = 0; i < 255; i++)
-    update_bits[i] = vp9_count_term_subexp(i, SUBEXP_PARAM, 255);
+    update_bits[i] = count_term_subexp(i, SUBEXP_PARAM, 255);
 }
 
 static int split_index(int i, int n, int modulus) {
@@ -85,18 +186,18 @@
   const int modulus = MODULUS_PARAM;
   int i;
   if ((m << 1) <= n)
-    i = vp9_recenter_nonneg(v, m) - 1;
+    i = recenter_nonneg(v, m) - 1;
   else
-    i = vp9_recenter_nonneg(n - 1 - v, n - 1 - m) - 1;
+    i = recenter_nonneg(n - 1 - v, n - 1 - m) - 1;
 
   i = split_index(i, n - 1, modulus);
   return i;
 }
 
-static void write_prob_diff_update(vp9_writer *const bc,
+static void write_prob_diff_update(vp9_writer *w,
                                    vp9_prob newp, vp9_prob oldp) {
   int delp = remap_prob(newp, oldp);
-  vp9_encode_term_subexp(bc, delp, SUBEXP_PARAM, 255);
+  encode_term_subexp(w, delp, SUBEXP_PARAM, 255);
 }
 
 static int prob_diff_update_cost(vp9_prob newp, vp9_prob oldp) {
@@ -105,7 +206,7 @@
 }
 
 static void update_mode(
-  vp9_writer *const bc,
+  vp9_writer *w,
   int n,
   const struct vp9_token tok[/* n */],
   vp9_tree tree,
@@ -128,15 +229,15 @@
   if (new_b + (n << 8) < old_b) {
     int i = 0;
 
-    vp9_write_bit(bc, 1);
+    vp9_write_bit(w, 1);
 
     do {
       const vp9_prob p = Pnew[i];
 
-      vp9_write_literal(bc, Pcur[i] = p ? p : 1, 8);
+      vp9_write_literal(w, Pcur[i] = p ? p : 1, 8);
     } while (++i < n);
   } else
-    vp9_write_bit(bc, 0);
+    vp9_write_bit(w, 0);
 }
 
 static void update_mbintra_mode_probs(VP9_COMP* const cpi,
@@ -1543,7 +1644,6 @@
 
   // TODO(jkoleszar): remove these two unused bits?
   vp9_write_bit(&header_bc, pc->clr_type);
-  vp9_write_bit(&header_bc, pc->clamp_type);
 
   // error resilient mode
   vp9_write_bit(&header_bc, pc->error_resilient_mode);
diff --git a/vp9/encoder/vp9_boolhuff.c b/vp9/encoder/vp9_boolhuff.c
index e9436af..0fcb257 100644
--- a/vp9/encoder/vp9_boolhuff.c
+++ b/vp9/encoder/vp9_boolhuff.c
@@ -59,100 +59,3 @@
     br->buffer[br->pos++] = 0;
 }
 
-
-void vp9_encode_unsigned_max(vp9_writer *br, int data, int max) {
-  assert(data <= max);
-  while (max) {
-    vp9_write_bit(br, data & 1);
-    data >>= 1;
-    max >>= 1;
-  }
-}
-
-int vp9_recenter_nonneg(int v, int m) {
-  if (v > (m << 1)) return v;
-  else if (v >= m) return ((v - m) << 1);
-  else return ((m - v) << 1) - 1;
-}
-
-static int get_unsigned_bits(unsigned num_values) {
-  int cat = 0;
-  if ((num_values--) <= 1) return 0;
-  while (num_values > 0) {
-    cat++;
-    num_values >>= 1;
-  }
-  return cat;
-}
-
-void vp9_encode_uniform(vp9_writer *br, int v, int n) {
-  int l = get_unsigned_bits(n);
-  int m;
-  if (l == 0) return;
-  m = (1 << l) - n;
-  if (v < m)
-    vp9_write_literal(br, v, l - 1);
-  else {
-    vp9_write_literal(br, m + ((v - m) >> 1), l - 1);
-    vp9_write_literal(br, (v - m) & 1, 1);
-  }
-}
-
-int vp9_count_uniform(int v, int n) {
-  int l = get_unsigned_bits(n);
-  int m;
-  if (l == 0) return 0;
-  m = (1 << l) - n;
-  if (v < m)
-    return l - 1;
-  else
-    return l;
-}
-
-void vp9_encode_term_subexp(vp9_writer *br, int word, int k, int num_syms) {
-  int i = 0;
-  int mk = 0;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (num_syms <= mk + 3 * a) {
-      vp9_encode_uniform(br, word - mk, num_syms - mk);
-      break;
-    } else {
-      int t = (word >= mk + a);
-      vp9_write_literal(br, t, 1);
-      if (t) {
-        i = i + 1;
-        mk += a;
-      } else {
-        vp9_write_literal(br, word - mk, b);
-        break;
-      }
-    }
-  }
-}
-
-int vp9_count_term_subexp(int word, int k, int num_syms) {
-  int count = 0;
-  int i = 0;
-  int mk = 0;
-  while (1) {
-    int b = (i ? k + i - 1 : k);
-    int a = (1 << b);
-    if (num_syms <= mk + 3 * a) {
-      count += vp9_count_uniform(word - mk, num_syms - mk);
-      break;
-    } else {
-      int t = (word >= mk + a);
-      count++;
-      if (t) {
-        i = i + 1;
-        mk += a;
-      } else {
-        count += b;
-        break;
-      }
-    }
-  }
-  return count;
-}
diff --git a/vp9/encoder/vp9_boolhuff.h b/vp9/encoder/vp9_boolhuff.h
index 58b40fb..c3f340d 100644
--- a/vp9/encoder/vp9_boolhuff.h
+++ b/vp9/encoder/vp9_boolhuff.h
@@ -37,19 +37,10 @@
 extern const unsigned int vp9_prob_cost[256];
 
 void vp9_start_encode(vp9_writer *bc, uint8_t *buffer);
-void vp9_encode_unsigned_max(vp9_writer *br, int data, int max);
 void vp9_stop_encode(vp9_writer *bc);
 
-
-void vp9_encode_uniform(vp9_writer *bc, int v, int n);
-void vp9_encode_term_subexp(vp9_writer *bc, int v, int k, int n);
-int vp9_count_uniform(int v, int n);
-int vp9_count_term_subexp(int v, int k, int n);
-int vp9_recenter_nonneg(int v, int m);
-
 DECLARE_ALIGNED(16, extern const unsigned char, vp9_norm[256]);
 
-
 static void vp9_write(vp9_writer *br, int bit, int probability) {
   unsigned int split;
   int count = br->count;
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 3e108c8..f3a03f3 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -1169,8 +1169,8 @@
   if (lossless) {
     cpi->mb.fwd_txm8x4            = vp9_short_walsh8x4;
     cpi->mb.fwd_txm4x4            = vp9_short_walsh4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_iwalsh4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_iwalsh4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_iwalsh4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_iwalsh4x4_add;
     cpi->mb.optimize              = 0;
     cpi->common.filter_level      = 0;
     cpi->zbin_mode_boost_enabled  = 0;
@@ -1178,8 +1178,8 @@
   } else {
     cpi->mb.fwd_txm8x4            = vp9_short_fdct8x4;
     cpi->mb.fwd_txm4x4            = vp9_short_fdct4x4;
-    cpi->mb.e_mbd.inv_txm4x4_1    = vp9_short_idct4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4      = vp9_short_idct4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_idct4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_idct4x4_add;
   }
 }
 
@@ -1650,8 +1650,9 @@
   MACROBLOCK *const x = &cpi->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   int n;
-  MODE_INFO *mi = x->e_mbd.mode_info_context;
-  unsigned int segment_id = mi->mbmi.segment_id;
+  MODE_INFO *mi = xd->mode_info_context;
+  MB_MODE_INFO *mbmi = &mi->mbmi;
+  unsigned int segment_id = mbmi->segment_id;
   const int mis = cm->mode_info_stride;
   const int bwl = mi_width_log2(bsize);
   const int bw = 1 << bwl, bh = 1 << mi_height_log2(bsize);
@@ -1662,7 +1663,7 @@
       vp9_update_zbin_extra(cpi, x);
     }
   } else {
-    vp9_setup_interp_filters(xd, xd->mode_info_context->mbmi.interp_filter, cm);
+    vp9_setup_interp_filters(xd, mbmi->interp_filter, cm);
 
     if (cpi->oxcf.tuning == VP8_TUNE_SSIM) {
       // Adjust the zbin based on this MB rate.
@@ -1673,13 +1674,13 @@
     // Increase zbin size to suppress noise
     cpi->zbin_mode_boost = 0;
     if (cpi->zbin_mode_boost_enabled) {
-      if (xd->mode_info_context->mbmi.ref_frame != INTRA_FRAME) {
-        if (xd->mode_info_context->mbmi.mode == ZEROMV) {
-          if (xd->mode_info_context->mbmi.ref_frame != LAST_FRAME)
+      if (mbmi->ref_frame != INTRA_FRAME) {
+        if (mbmi->mode == ZEROMV) {
+          if (mbmi->ref_frame != LAST_FRAME)
             cpi->zbin_mode_boost = GF_ZEROMV_ZBIN_BOOST;
           else
             cpi->zbin_mode_boost = LF_ZEROMV_ZBIN_BOOST;
-        } else if (xd->mode_info_context->mbmi.mode == SPLITMV) {
+        } else if (mbmi->mode == SPLITMV) {
           cpi->zbin_mode_boost = SPLIT_MV_ZBIN_BOOST;
         } else {
           cpi->zbin_mode_boost = MV_ZBIN_BOOST;
@@ -1693,73 +1694,60 @@
   }
 
 #if CONFIG_AB4X4
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME &&
+  if (mbmi->ref_frame == INTRA_FRAME &&
       bsize < BLOCK_SIZE_SB8X8) {
 #else
-  if (xd->mode_info_context->mbmi.mode == I4X4_PRED) {
-    assert(bsize == BLOCK_SIZE_SB8X8 &&
-           xd->mode_info_context->mbmi.txfm_size == TX_4X4);
+  if (mbmi->mode == I4X4_PRED) {
+    assert(bsize == BLOCK_SIZE_SB8X8 && mbmi->txfm_size == TX_4X4);
 #endif
     vp9_encode_intra4x4mby(x, BLOCK_SIZE_SB8X8);
-    vp9_build_intra_predictors_sbuv_s(&x->e_mbd, BLOCK_SIZE_SB8X8);
+    vp9_build_intra_predictors_sbuv_s(xd, BLOCK_SIZE_SB8X8);
     vp9_encode_sbuv(cm, x, BLOCK_SIZE_SB8X8);
 
     if (output_enabled)
       sum_intra_stats(cpi, x);
-  } else if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME) {
-    vp9_build_intra_predictors_sby_s(&x->e_mbd, bsize);
-    vp9_build_intra_predictors_sbuv_s(&x->e_mbd, bsize);
+  } else if (mbmi->ref_frame == INTRA_FRAME) {
+    vp9_build_intra_predictors_sby_s(xd, bsize);
+    vp9_build_intra_predictors_sbuv_s(xd, bsize);
     if (output_enabled)
       sum_intra_stats(cpi, x);
   } else {
-    int ref_fb_idx, second_ref_fb_idx;
+    int idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->ref_frame)];
+    YV12_BUFFER_CONFIG *ref_fb = &cm->yv12_fb[idx];
+    YV12_BUFFER_CONFIG *second_ref_fb = NULL;
+    if (mbmi->second_ref_frame > 0) {
+      idx = cm->ref_frame_map[get_ref_frame_idx(cpi, mbmi->second_ref_frame)];
+      second_ref_fb = &cm->yv12_fb[idx];
+    }
 
     assert(cm->frame_type != KEY_FRAME);
 
-    if (xd->mode_info_context->mbmi.ref_frame == LAST_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-    else if (xd->mode_info_context->mbmi.ref_frame == GOLDEN_FRAME)
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-    else
-      ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-
-    if (xd->mode_info_context->mbmi.second_ref_frame > 0) {
-      if (xd->mode_info_context->mbmi.second_ref_frame == LAST_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->lst_fb_idx];
-      else if (xd->mode_info_context->mbmi.second_ref_frame == GOLDEN_FRAME)
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->gld_fb_idx];
-      else
-        second_ref_fb_idx = cpi->common.ref_frame_map[cpi->alt_fb_idx];
-    }
-
-    setup_pre_planes(xd,
-        &cpi->common.yv12_fb[ref_fb_idx],
-        xd->mode_info_context->mbmi.second_ref_frame > 0
-            ? &cpi->common.yv12_fb[second_ref_fb_idx] : NULL,
-        mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv);
+    setup_pre_planes(xd, ref_fb, second_ref_fb,
+                     mi_row, mi_col, xd->scale_factor, xd->scale_factor_uv);
 
     vp9_build_inter_predictors_sb(xd, mi_row, mi_col,
-                     (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
+                                  bsize < BLOCK_SIZE_SB8X8 ? BLOCK_SIZE_SB8X8
+                                                           : bsize);
   }
 
 #if CONFIG_AB4X4
-  if (xd->mode_info_context->mbmi.ref_frame == INTRA_FRAME &&
+  if (mbmi->ref_frame == INTRA_FRAME &&
       bsize < BLOCK_SIZE_SB8X8) {
 #else
-  if (xd->mode_info_context->mbmi.mode == I4X4_PRED) {
+  if (mbmi->mode == I4X4_PRED) {
     assert(bsize == BLOCK_SIZE_SB8X8);
 #endif
-    vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled, BLOCK_SIZE_SB8X8);
+    vp9_tokenize_sb(cpi, xd, t, !output_enabled, BLOCK_SIZE_SB8X8);
   } else if (!x->skip) {
     vp9_encode_sb(cm, x, (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
-    vp9_tokenize_sb(cpi, &x->e_mbd, t, !output_enabled,
+    vp9_tokenize_sb(cpi, xd, t, !output_enabled,
                     (bsize < BLOCK_SIZE_SB8X8) ? BLOCK_SIZE_SB8X8 : bsize);
   } else {
     // FIXME(rbultje): not tile-aware (mi - 1)
     int mb_skip_context =
         (mi - 1)->mbmi.mb_skip_coeff + (mi - mis)->mbmi.mb_skip_coeff;
 
-    xd->mode_info_context->mbmi.mb_skip_coeff = 1;
+    mbmi->mb_skip_coeff = 1;
     if (output_enabled)
       cpi->skip_true_count[mb_skip_context]++;
     vp9_reset_sb_tokens_context(xd,
@@ -1776,14 +1764,14 @@
 
   if (output_enabled) {
     if (cm->txfm_mode == TX_MODE_SELECT &&
-        !(mi->mbmi.mb_skip_coeff ||
+        !(mbmi->mb_skip_coeff ||
           vp9_segfeature_active(xd, segment_id, SEG_LVL_SKIP))) {
       if (bsize >= BLOCK_SIZE_SB32X32) {
-        cpi->txfm_count_32x32p[mi->mbmi.txfm_size]++;
+        cpi->txfm_count_32x32p[mbmi->txfm_size]++;
       } else if (bsize >= BLOCK_SIZE_MB16X16) {
-        cpi->txfm_count_16x16p[mi->mbmi.txfm_size]++;
+        cpi->txfm_count_16x16p[mbmi->txfm_size]++;
       } else {
-        cpi->txfm_count_8x8p[mi->mbmi.txfm_size]++;
+        cpi->txfm_count_8x8p[mbmi->txfm_size]++;
       }
     } else {
       int x, y;
@@ -1796,8 +1784,8 @@
 #if CONFIG_AB4X4
       if (sz == TX_8X8 && bsize < BLOCK_SIZE_SB8X8)
 #else
-      if (sz == TX_8X8 && (xd->mode_info_context->mbmi.mode == SPLITMV ||
-                           xd->mode_info_context->mbmi.mode == I4X4_PRED))
+      if (sz == TX_8X8 && (mbmi->mode == SPLITMV ||
+                           mbmi->mode == I4X4_PRED))
 #endif
         sz = TX_4X4;
 
diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
index fe5bdb3..f8cf50f 100644
--- a/vp9/encoder/vp9_encodeintra.c
+++ b/vp9/encoder/vp9_encodeintra.c
@@ -53,9 +53,6 @@
   int16_t* const src_diff =
       raster_block_offset_int16(xd, bsize, 0, ib,
                                 x->plane[0].src_diff);
-  int16_t* const diff =
-      raster_block_offset_int16(xd, bsize, 0, ib,
-                                xd->plane[0].diff);
   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
   const int bwl = b_width_log2(bsize), bhl = b_height_log2(bsize);
 
@@ -72,17 +69,15 @@
   if (tx_type != DCT_DCT) {
     vp9_short_fht4x4(src_diff, coeff, 4 << bwl, tx_type);
     x->quantize_b_4x4(x, ib, tx_type, 16);
-    vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                     diff, 4 << bwl, tx_type);
+    vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16), dst,
+                         xd->plane[0].dst.stride, tx_type);
   } else {
     x->fwd_txm4x4(src_diff, coeff, 8 << bwl);
     x->quantize_b_4x4(x, ib, tx_type, 16);
-    vp9_inverse_transform_b_4x4(&x->e_mbd, xd->plane[0].eobs[ib],
+    vp9_inverse_transform_b_4x4_add(&x->e_mbd, xd->plane[0].eobs[ib],
                                 BLOCK_OFFSET(xd->plane[0].dqcoeff, ib, 16),
-                                diff, 8 << bwl);
+                                dst, xd->plane[0].dst.stride);
   }
-
-  vp9_recon_b(dst, diff, 4 << bwl, dst, xd->plane[0].dst.stride);
 }
 
 void vp9_encode_intra4x4mby(MACROBLOCK *mb, BLOCK_SIZE_TYPE bsize) {
diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index bbc97da..84b3507 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c
@@ -425,7 +425,6 @@
   VP9_COMMON *cm;
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
-  int *wip_txfrm_size;  // for "work in progress" only... will remove once done
 };
 
 static void xform_quant(int plane, int block, BLOCK_SIZE_TYPE bsize,
@@ -494,14 +493,9 @@
                          int ss_txfrm_size, void *arg) {
   struct encode_b_args* const args = arg;
   MACROBLOCK* const x = args->x;
-  int *wip_txfrm_size = args->wip_txfrm_size;
   MACROBLOCKD* const xd = &x->e_mbd;
-  const int bw = 4 << (b_width_log2(bsize) - xd->plane[plane].subsampling_x);
   const int raster_block = txfrm_block_to_raster_block(xd, bsize, plane,
                                                        block, ss_txfrm_size);
-  int16_t* const diff = raster_block_offset_int16(xd, bsize, plane,
-                                                  raster_block,
-                                                  xd->plane[plane].diff);
   uint8_t* const dst = raster_block_offset_uint8(xd, bsize, plane,
                                                  raster_block,
                                                  xd->plane[plane].dst.buf,
@@ -517,7 +511,6 @@
     case TX_32X32:
         vp9_short_idct32x32_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff,
                                 block, 16), dst, xd->plane[plane].dst.stride);
-        *wip_txfrm_size = 32;
       break;
     case TX_16X16:
       tx_type = plane == 0 ? get_tx_type_16x16(xd, raster_block) : DCT_DCT;
@@ -529,7 +522,6 @@
                                block, 16), dst, xd->plane[plane].dst.stride,
                                tx_type);
       }
-      *wip_txfrm_size = 16;
       break;
     case TX_8X8:
       tx_type = plane == 0 ? get_tx_type_8x8(xd, raster_block) : DCT_DCT;
@@ -541,7 +533,6 @@
                              block, 16), dst, xd->plane[plane].dst.stride,
                              tx_type);
       }
-      *wip_txfrm_size = 8;
       break;
     case TX_4X4:
       tx_type = plane == 0 ? get_tx_type_4x4(xd, raster_block) : DCT_DCT;
@@ -549,13 +540,13 @@
         // this is like vp9_short_idct4x4 but has a special case around eob<=1
         // which is significant (not just an optimization) for the lossless
         // case.
-        vp9_inverse_transform_b_4x4(xd, xd->plane[plane].eobs[block],
-            BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), diff, bw * 2);
+        vp9_inverse_transform_b_4x4_add(xd, xd->plane[plane].eobs[block],
+            BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16), dst,
+            xd->plane[plane].dst.stride);
       } else {
-        vp9_short_iht4x4(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
-                         diff, bw, tx_type);
+        vp9_short_iht4x4_add(BLOCK_OFFSET(xd->plane[plane].dqcoeff, block, 16),
+                             dst, xd->plane[plane].dst.stride, tx_type);
       }
-      *wip_txfrm_size = 4;
       break;
   }
 }
@@ -563,16 +554,15 @@
 void vp9_xform_quant_sby(VP9_COMMON *const cm, MACROBLOCK *x,
                          BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
-  struct encode_b_args arg = {cm, x, NULL, NULL};
+  struct encode_b_args arg = {cm, x, NULL};
 
-  foreach_transformed_block_in_plane(xd, bsize, 0,
-                                     xform_quant, &arg);
+  foreach_transformed_block_in_plane(xd, bsize, 0, xform_quant, &arg);
 }
 
 void vp9_xform_quant_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                          BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
-  struct encode_b_args arg = {cm, x, NULL, NULL};
+  struct encode_b_args arg = {cm, x, NULL};
 
   foreach_transformed_block_uv(xd, bsize, xform_quant, &arg);
 }
@@ -581,61 +571,37 @@
                     BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  int wip_txfrm_size = 0;
-  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
+  struct encode_b_args arg = {cm, x, &ctx};
 
   vp9_subtract_sby(x, bsize);
   if (x->optimize)
     vp9_optimize_init(xd, bsize, &ctx);
 
-  foreach_transformed_block_in_plane(xd, bsize, 0,
-                                     encode_block, &arg);
-  if (wip_txfrm_size < 8)
-    vp9_recon_sby(xd, bsize);
+  foreach_transformed_block_in_plane(xd, bsize, 0, encode_block, &arg);
 }
 
 void vp9_encode_sbuv(VP9_COMMON *const cm, MACROBLOCK *x,
                      BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  int wip_txfrm_size = 0;
-  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
+  struct encode_b_args arg = {cm, x, &ctx};
 
   vp9_subtract_sbuv(x, bsize);
   if (x->optimize)
     vp9_optimize_init(xd, bsize, &ctx);
 
   foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
-
-  if (wip_txfrm_size < 8)
-    vp9_recon_sbuv(xd, bsize);
 }
 
 void vp9_encode_sb(VP9_COMMON *const cm, MACROBLOCK *x,
                    BLOCK_SIZE_TYPE bsize) {
   MACROBLOCKD* const xd = &x->e_mbd;
   struct optimize_ctx ctx;
-  int wip_txfrm_size = 0;
-  struct encode_b_args arg = {cm, x, &ctx, &wip_txfrm_size};
+  struct encode_b_args arg = {cm, x, &ctx};
 
   vp9_subtract_sb(x, bsize);
   if (x->optimize)
     vp9_optimize_init(xd, bsize, &ctx);
-#if 0
+
   foreach_transformed_block(xd, bsize, encode_block, &arg);
-
-  vp9_recon_sb(xd, bsize);
-#else
-  // wip version... will use foreach_transformed_block when done
-  foreach_transformed_block_in_plane(xd, bsize, 0,
-                                     encode_block, &arg);
-  if (wip_txfrm_size < 8)
-    vp9_recon_sby(xd, bsize);
-  wip_txfrm_size = 0;
-
-  foreach_transformed_block_uv(xd, bsize, encode_block, &arg);
-
-  if (wip_txfrm_size < 8)
-    vp9_recon_sbuv(xd, bsize);
-#endif
 }
diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index a144c1b..02d46cb 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c
@@ -1178,11 +1178,11 @@
 
   cpi->oxcf.lossless = oxcf->lossless;
   if (cpi->oxcf.lossless) {
-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_iwalsh4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4   = vp9_short_iwalsh4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_iwalsh4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_iwalsh4x4_add;
   } else {
-    cpi->mb.e_mbd.inv_txm4x4_1 = vp9_short_idct4x4_1;
-    cpi->mb.e_mbd.inv_txm4x4   = vp9_short_idct4x4;
+    cpi->mb.e_mbd.inv_txm4x4_1_add    = vp9_short_idct4x4_1_add;
+    cpi->mb.e_mbd.inv_txm4x4_add      = vp9_short_idct4x4_add;
   }
 
   cpi->baseline_gf_interval = DEFAULT_GF_INTERVAL;
diff --git a/vp9/encoder/vp9_onyx_int.h b/vp9/encoder/vp9_onyx_int.h
index b61472f..57d19ca 100644
--- a/vp9/encoder/vp9_onyx_int.h
+++ b/vp9/encoder/vp9_onyx_int.h
@@ -624,6 +624,16 @@
 #endif
 } VP9_COMP;
 
+static int get_ref_frame_idx(VP9_COMP *cpi, MV_REFERENCE_FRAME ref_frame) {
+  if (ref_frame == LAST_FRAME) {
+    return cpi->lst_fb_idx;
+  } else if (ref_frame == GOLDEN_FRAME) {
+    return cpi->gld_fb_idx;
+  } else {
+    return cpi->alt_fb_idx;
+  }
+}
+
 void vp9_encode_frame(VP9_COMP *cpi);
 
 void vp9_pack_bitstream(VP9_COMP *cpi, unsigned char *dest,
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index e633004..8c1ef49 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -627,11 +627,6 @@
                                 BLOCK_SIZE_SB8X8,
                                 0, ib,
                                 x->plane[0].src_diff);
-  int16_t* const diff =
-      raster_block_offset_int16(xd,
-                                BLOCK_SIZE_SB8X8,
-                                0, ib,
-                                xd->plane[0].diff);
   int16_t* const coeff = BLOCK_OFFSET(x->plane[0].coeff, ib, 16);
   uint8_t* const dst =
       raster_block_offset_uint8(xd,
@@ -703,18 +698,18 @@
   xd->mode_info_context->bmi[ib].as_mode.first =
     (B_PREDICTION_MODE)(*best_mode);
 
-  // inverse transform
-  if (best_tx_type != DCT_DCT)
-    vp9_short_iht4x4(best_dqcoeff, diff, 8, best_tx_type);
-  else
-    xd->inv_txm4x4(best_dqcoeff, diff, 16);
-
   vp9_intra4x4_predict(xd, ib,
                        BLOCK_SIZE_SB8X8,
                        *best_mode,
                        dst, xd->plane[0].dst.stride);
-  vp9_recon_b(dst, diff, 8,
-              dst, xd->plane[0].dst.stride);
+
+  // inverse transform
+  if (best_tx_type != DCT_DCT) {
+    vp9_short_iht4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride,
+                           best_tx_type);
+  } else {
+    xd->inv_txm4x4_add(best_dqcoeff, dst, xd->plane[0].dst.stride);
+  }
 
   return best_rd;
 }
@@ -3080,28 +3075,12 @@
       mbmi->mode = this_mode;
     } else {
       YV12_BUFFER_CONFIG *scaled_ref_frame[2] = {NULL, NULL};
-      int fb;
-
-      if (mbmi->ref_frame == LAST_FRAME) {
-        fb = cpi->lst_fb_idx;
-      } else if (mbmi->ref_frame == GOLDEN_FRAME) {
-        fb = cpi->gld_fb_idx;
-      } else {
-        fb = cpi->alt_fb_idx;
-      }
-
+      int fb = get_ref_frame_idx(cpi, mbmi->ref_frame);
       if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
         scaled_ref_frame[0] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
 
       if (comp_pred) {
-        if (mbmi->second_ref_frame == LAST_FRAME) {
-          fb = cpi->lst_fb_idx;
-        } else if (mbmi->second_ref_frame == GOLDEN_FRAME) {
-          fb = cpi->gld_fb_idx;
-        } else {
-          fb = cpi->alt_fb_idx;
-        }
-
+        fb = get_ref_frame_idx(cpi, mbmi->second_ref_frame);
         if (cpi->scaled_ref_idx[fb] != cm->ref_frame_map[fb])
           scaled_ref_frame[1] = &cm->yv12_fb[cpi->scaled_ref_idx[fb]];
       }
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index ace7e6f..0c5ce1d 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -70,7 +70,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.c
 VP9_COMMON_SRCS-yes += common/vp9_mvref_common.h
 VP9_COMMON_SRCS-yes += common/vp9_quant_common.c
-VP9_COMMON_SRCS-yes += common/vp9_recon.c
 VP9_COMMON_SRCS-yes += common/vp9_reconinter.c
 VP9_COMMON_SRCS-yes += common/vp9_reconintra.c
 VP9_COMMON_SRCS-$(CONFIG_POSTPROC_VISUALIZER) += common/vp9_textblit.c