Merge "Modified  mv prediction." into experimental
diff --git a/build/make/Makefile b/build/make/Makefile
index be9ee6d..b5d0d52 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -112,7 +112,7 @@
 	$(qexec)mkdir -p $$(dir $$@)
 	$(qexec)$(CC) $$(INTERNAL_CFLAGS) $$(CFLAGS) -M $$< | $(fmt_deps) > $$@
 
-$(BUILD_PFX)$(call xform_obj_path,$(1))%.c.o: $(1)%.c $(BUILD_PFX)$(call xform_obj_path,$(1))%.c.d
+$(BUILD_PFX)$(call xform_obj_path,$(1))%.c.o: $(1)%.c
 	$(if $(quiet),@echo "    [CC] $$@")
 	$(qexec)$(CC) $$(INTERNAL_CFLAGS) $$(CFLAGS) -c -o $$@ $$<
 
diff --git a/build/make/configure.sh b/build/make/configure.sh
index 05bbabe..d9b0fe7 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -435,10 +435,10 @@
 EOF
 
     if enabled rvct; then cat >> $1 << EOF
-fmt_deps = sed -e 's;^__image.axf;\$(dir \$@)\$(notdir \$<).o \$@;' #hide
+fmt_deps = sed -e 's;^__image.axf;\$\${@:.d=.o} \$\$@;' #hide
 EOF
     else cat >> $1 << EOF
-fmt_deps = sed -e 's;^\([a-zA-Z0-9_]*\)\.o;\$(dir \$@)\1\$(suffix \$<).o \$@;'
+fmt_deps = sed -e 's;^\([a-zA-Z0-9_]*\)\.o;\$\${@:.d=.o} \$\$@;'
 EOF
     fi
 
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index d857280..64bf0bb 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -16,7 +16,6 @@
 
 extern "C" {
 #include "vp9/common/entropy.h"
-#include "vp9/common/idct.h"
 #include "vp9_rtcd.h"
 }
 
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 3fe4774..ebec890 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -15,7 +15,6 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
-#include "vp9/common/idct.h"
 #include "vp9_rtcd.h"
 }
 
diff --git a/test/fdct8x8_test.cc b/test/fdct8x8_test.cc
index 729344d..d82f7c3 100644
--- a/test/fdct8x8_test.cc
+++ b/test/fdct8x8_test.cc
@@ -15,7 +15,6 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
-#include "vp9/common/idct.h"
 #include "vp9_rtcd.h"
 }
 
diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc
index 099efbf..7cf7aa0 100644
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -15,7 +15,6 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 extern "C" {
-#include "vp9/common/idct.h"
 #include "vp9_rtcd.h"
 }
 
diff --git a/vp8/common/ppc/systemdependent.c b/vp8/common/ppc/systemdependent.c
index 87f4cac..6899c0e 100644
--- a/vp8/common/ppc/systemdependent.c
+++ b/vp8/common/ppc/systemdependent.c
@@ -12,13 +12,8 @@
 #include "subpixel.h"
 #include "loopfilter.h"
 #include "recon.h"
-#include "idct.h"
 #include "onyxc_int.h"
 
-void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
-void (*vp8_short_idct4x4_1)(short *input, short *output, int pitch);
-void (*vp8_dc_only_idct)(short input_dc, short *output, int pitch);
-
 extern void (*vp8_post_proc_down_and_across_mb_row)(
     unsigned char *src_ptr,
     unsigned char *dst_ptr,
diff --git a/vp9/common/arm/arm_systemdependent.c b/vp9/common/arm/arm_systemdependent.c
index 19a01c6..0a0e809 100644
--- a/vp9/common/arm/arm_systemdependent.c
+++ b/vp9/common/arm/arm_systemdependent.c
@@ -15,7 +15,6 @@
 #include "vp9/common/subpixel.h"
 #include "vp9/common/loopfilter.h"
 #include "vp9/common/recon.h"
-#include "vp9/common/idct.h"
 #include "vp9/common/onyxc_int.h"
 
 void vp9_arch_arm_common_init(VP9_COMMON *ctx) {
diff --git a/vp9/common/blockd.h b/vp9/common/blockd.h
index 412d2bc..409c7b8 100644
--- a/vp9/common/blockd.h
+++ b/vp9/common/blockd.h
@@ -145,6 +145,11 @@
 
 #define VP9_MVREFS (1 + SPLITMV - NEARESTMV)
 
+#if CONFIG_LOSSLESS
+#define WHT_UPSCALE_FACTOR 3
+#define Y2_WHT_UPSCALE_FACTOR 2
+#endif
+
 typedef enum {
   B_DC_PRED,          /* average of above and left pixels */
   B_TM_PRED,
@@ -370,6 +375,14 @@
 
   unsigned int frames_since_golden;
   unsigned int frames_till_alt_ref_frame;
+
+  /* Inverse transform function pointers. */
+  void (*inv_xform4x4_1_x8)(short *input, short *output, int pitch);
+  void (*inv_xform4x4_x8)(short *input, short *output, int pitch);
+  void (*inv_walsh4x4_1)(short *in, short *out);
+  void (*inv_walsh4x4_lossless)(short *in, short *out);
+
+
   vp9_subpix_fn_t  subpixel_predict;
   vp9_subpix_fn_t  subpixel_predict8x4;
   vp9_subpix_fn_t  subpixel_predict8x8;
diff --git a/vp9/common/generic/systemdependent.c b/vp9/common/generic/systemdependent.c
index 51dfaea..6d1a271 100644
--- a/vp9/common/generic/systemdependent.c
+++ b/vp9/common/generic/systemdependent.c
@@ -13,7 +13,6 @@
 #include "vp9_rtcd.h"
 #include "vp9/common/subpixel.h"
 #include "vp9/common/loopfilter.h"
-#include "vp9/common/idct.h"
 #include "vp9/common/onyxc_int.h"
 
 extern void vp9_arch_x86_common_init(VP9_COMMON *ctx);
@@ -23,48 +22,6 @@
 #if CONFIG_RUNTIME_CPU_DETECT
   VP9_COMMON_RTCD *rtcd = &ctx->rtcd;
 
-  rtcd->idct.idct1        = vp9_short_idct4x4llm_1_c;
-  rtcd->idct.idct16       = vp9_short_idct4x4llm_c;
-  rtcd->idct.idct1_scalar_add = vp9_dc_only_idct_add_c;
-  rtcd->idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
-  rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_c;
-  rtcd->idct.idct8        = vp9_short_idct8x8_c;
-  rtcd->idct.idct10_8     = vp9_short_idct10_8x8_c;
-  rtcd->idct.idct1_scalar_add_8x8 = vp9_dc_only_idct_add_8x8_c;
-  rtcd->idct.ihaar2       = vp9_short_ihaar2x2_c;
-  rtcd->idct.idct16x16    = vp9_short_idct16x16_c;
-  rtcd->idct.idct10_16x16 = vp9_short_idct10_16x16_c;
-
-  rtcd->subpix.eighttap16x16       = vp9_eighttap_predict16x16_c;
-  rtcd->subpix.eighttap8x8         = vp9_eighttap_predict8x8_c;
-  rtcd->subpix.eighttap_avg16x16   = vp9_eighttap_predict_avg16x16_c;
-  rtcd->subpix.eighttap_avg8x8     = vp9_eighttap_predict_avg8x8_c;
-  rtcd->subpix.eighttap_avg4x4     = vp9_eighttap_predict_avg4x4_c;
-  rtcd->subpix.eighttap8x4         = vp9_eighttap_predict8x4_c;
-  rtcd->subpix.eighttap4x4         = vp9_eighttap_predict_c;
-  rtcd->subpix.eighttap16x16_sharp     = vp9_eighttap_predict16x16_sharp_c;
-  rtcd->subpix.eighttap8x8_sharp       = vp9_eighttap_predict8x8_sharp_c;
-  rtcd->subpix.eighttap_avg16x16_sharp = vp9_eighttap_predict_avg16x16_sharp_c;
-  rtcd->subpix.eighttap_avg8x8_sharp   = vp9_eighttap_predict_avg8x8_sharp_c;
-  rtcd->subpix.eighttap_avg4x4_sharp   = vp9_eighttap_predict_avg4x4_sharp_c;
-  rtcd->subpix.eighttap8x4_sharp       = vp9_eighttap_predict8x4_sharp_c;
-  rtcd->subpix.eighttap4x4_sharp       = vp9_eighttap_predict_sharp_c;
-
-  rtcd->subpix.sixtap16x16       = vp9_sixtap_predict16x16_c;
-  rtcd->subpix.sixtap8x8         = vp9_sixtap_predict8x8_c;
-  rtcd->subpix.sixtap_avg16x16   = vp9_sixtap_predict_avg16x16_c;
-  rtcd->subpix.sixtap_avg8x8     = vp9_sixtap_predict_avg8x8_c;
-  rtcd->subpix.sixtap8x4         = vp9_sixtap_predict8x4_c;
-  rtcd->subpix.sixtap4x4         = vp9_sixtap_predict_c;
-  rtcd->subpix.sixtap_avg4x4     = vp9_sixtap_predict_avg_c;
-  rtcd->subpix.bilinear16x16     = vp9_bilinear_predict16x16_c;
-  rtcd->subpix.bilinear8x8       = vp9_bilinear_predict8x8_c;
-  rtcd->subpix.bilinear_avg16x16 = vp9_bilinear_predict_avg16x16_c;
-  rtcd->subpix.bilinear_avg8x8   = vp9_bilinear_predict_avg8x8_c;
-  rtcd->subpix.bilinear8x4       = vp9_bilinear_predict8x4_c;
-  rtcd->subpix.bilinear4x4       = vp9_bilinear_predict4x4_c;
-  rtcd->subpix.bilinear_avg4x4   = vp9_bilinear_predict_avg4x4_c;
-
 #if CONFIG_POSTPROC || (CONFIG_VP9_ENCODER && CONFIG_INTERNAL_STATS)
   rtcd->postproc.down             = vp9_mbpost_proc_down_c;
   rtcd->postproc.across           = vp9_mbpost_proc_across_ip_c;
diff --git a/vp9/common/idct.h b/vp9/common/idct.h
deleted file mode 100644
index 0f0478c..0000000
--- a/vp9/common/idct.h
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#ifndef __INC_IDCT_H
-#define __INC_IDCT_H
-
-#include "vp9/common/blockd.h"
-
-#define prototype_second_order(sym) \
-  void sym(short *input, short *output)
-
-#define prototype_idct(sym) \
-  void sym(short *input, short *output, int pitch)
-
-#define prototype_idct_scalar_add(sym) \
-  void sym(short input, \
-           unsigned char *pred, unsigned char *output, \
-           int pitch, int stride)
-
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/idct_x86.h"
-#endif
-
-#ifdef _MSC_VER
-/* TODO: remove these after integer implmementations are done */
-#define M_PI       3.14159265358979323846
-#define round(x) (((x)>0)? floor((x)+0.5): ceil((x)-0.5))
-#endif
-
-
-#if ARCH_ARM
-#include "arm/idct_arm.h"
-#endif
-
-#if CONFIG_LOSSLESS
-#define WHT_UPSCALE_FACTOR 3
-#define Y2_WHT_UPSCALE_FACTOR 2
-#endif
-
-#ifndef vp9_idct_idct16x16
-#define vp9_idct_idct16x16 vp9_short_idct16x16_c
-#endif
-extern prototype_idct(vp9_idct_idct16x16);
-
-#ifndef vp9_idct_idct10_16x16
-#define vp9_idct_idct10_16x16 vp9_short_idct10_16x16_c
-#endif
-extern prototype_idct(vp9_idct_idct10_16x16);
-
-#ifndef vp9_idct_idct8
-#define vp9_idct_idct8 vp9_short_idct8x8_c
-#endif
-extern prototype_idct(vp9_idct_idct8);
-
-#ifndef vp9_idct_idct10_8
-#define vp9_idct_idct10_8 vp9_short_idct10_8x8_c
-#endif
-extern prototype_idct(vp9_idct_idct10_8);
-
-#ifndef vp9_idct_idct8_1
-#define vp9_idct_idct8_1 vp9_short_idct8x8_1_c
-#endif
-extern prototype_idct(vp9_idct_idct8_1);
-
-#ifndef vp9_idct_ihaar2
-#define vp9_idct_ihaar2 vp9_short_ihaar2x2_c
-#endif
-extern prototype_idct(vp9_idct_ihaar2);
-
-#ifndef vp9_idct_ihaar2_1
-#define vp9_idct_ihaar2_1 vp9_short_ihaar2x2_1_c
-#endif
-extern prototype_idct(vp9_idct_ihaar2_1);
-
-#ifndef vp9_idct_idct1_scalar_add_8x8
-#define vp9_idct_idct1_scalar_add_8x8 vp9_dc_only_idct_add_8x8_c
-#endif
-extern prototype_idct_scalar_add(vp9_idct_idct1_scalar_add_8x8);
-
-
-
-#ifndef vp9_idct_idct1
-#define vp9_idct_idct1 vp9_short_idct4x4llm_1_c
-#endif
-extern prototype_idct(vp9_idct_idct1);
-
-#ifndef vp9_idct_idct16
-#define vp9_idct_idct16 vp9_short_idct4x4llm_c
-#endif
-extern prototype_idct(vp9_idct_idct16);
-
-#ifndef vp9_idct_idct1_scalar_add
-#define vp9_idct_idct1_scalar_add vp9_dc_only_idct_add_c
-#endif
-extern prototype_idct_scalar_add(vp9_idct_idct1_scalar_add);
-
-
-#ifndef vp9_idct_iwalsh1
-#define vp9_idct_iwalsh1 vp9_short_inv_walsh4x4_1_c
-#endif
-extern prototype_second_order(vp9_idct_iwalsh1);
-
-#ifndef vp9_idct_iwalsh16
-#define vp9_idct_iwalsh16 vp9_short_inv_walsh4x4_c
-#endif
-extern prototype_second_order(vp9_idct_iwalsh16);
-
-#if CONFIG_LOSSLESS
-extern prototype_idct(vp9_short_inv_walsh4x4_x8_c);
-extern prototype_idct(vp9_short_inv_walsh4x4_1_x8_c);
-extern prototype_idct_scalar_add(vp9_dc_only_inv_walsh_add_c);
-extern prototype_second_order(vp9_short_inv_walsh4x4_lossless_c);
-extern prototype_second_order(vp9_short_inv_walsh4x4_1_lossless_c);
-#endif
-
-void vp9_ihtllm_c(const int16_t *input, int16_t *output, int pitch,
-                  TX_TYPE tx_type, int tx_dim);
-
-typedef prototype_idct((*vp9_idct_fn_t));
-typedef prototype_idct_scalar_add((*vp9_idct_scalar_add_fn_t));
-typedef prototype_second_order((*vp9_second_order_fn_t));
-
-typedef struct {
-  vp9_idct_fn_t            idct1;
-  vp9_idct_fn_t            idct16;
-  vp9_idct_scalar_add_fn_t idct1_scalar_add;
-
-  vp9_second_order_fn_t iwalsh1;
-  vp9_second_order_fn_t iwalsh16;
-
-  vp9_idct_fn_t            idct8;
-  vp9_idct_fn_t            idct10_8;
-  vp9_idct_fn_t            idct8_1;
-  vp9_idct_scalar_add_fn_t idct1_scalar_add_8x8;
-  vp9_idct_fn_t ihaar2;
-  vp9_idct_fn_t ihaar2_1;
-
-  vp9_idct_fn_t            idct16x16;
-  vp9_idct_fn_t            idct10_16x16;
-} vp9_idct_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IDCT_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define IDCT_INVOKE(ctx,fn) vp9_idct_##fn
-#endif
-
-#endif
diff --git a/vp9/common/idctllm.c b/vp9/common/idctllm.c
index aa56654..55b7a85 100644
--- a/vp9/common/idctllm.c
+++ b/vp9/common/idctllm.c
@@ -25,7 +25,6 @@
 #include <assert.h>
 #include <math.h>
 #include "vpx_ports/config.h"
-#include "vp9/common/idct.h"
 #include "vp9/common/systemdependent.h"
 
 #include "vp9/common/blockd.h"
diff --git a/vp9/common/invtrans.c b/vp9/common/invtrans.c
index f972afa..ac5553e 100644
--- a/vp9/common/invtrans.c
+++ b/vp9/common/invtrans.c
@@ -9,6 +9,7 @@
  */
 
 #include "invtrans.h"
+#include "./vp9_rtcd.h"
 
 static void recon_dcblock(MACROBLOCKD *xd) {
   BLOCKD *b = &xd->block[24];
@@ -28,108 +29,92 @@
   xd->block[12].dqcoeff[0] = b->diff[8];
 }
 
-void vp9_inverse_transform_b_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                 BLOCKD *b, int pitch) {
+void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch) {
+  BLOCKD *b = &xd->block[block];
   if (b->eob <= 1)
-    IDCT_INVOKE(rtcd, idct1)(b->dqcoeff, b->diff, pitch);
+    xd->inv_xform4x4_1_x8(b->dqcoeff, b->diff, pitch);
   else
-    IDCT_INVOKE(rtcd, idct16)(b->dqcoeff, b->diff, pitch);
+    xd->inv_xform4x4_x8(b->dqcoeff, b->diff, pitch);
 }
 
-void vp9_inverse_transform_mby_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                   MACROBLOCKD *xd) {
+void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd) {
   int i;
   BLOCKD *blockd = xd->block;
 
   if (xd->mode_info_context->mbmi.mode != SPLITMV) {
     /* do 2nd order transform on the dc block */
-    IDCT_INVOKE(rtcd, iwalsh16)(blockd[24].dqcoeff, blockd[24].diff);
+    vp9_short_inv_walsh4x4(blockd[24].dqcoeff, blockd[24].diff);
     recon_dcblock(xd);
   }
 
   for (i = 0; i < 16; i++) {
-    vp9_inverse_transform_b_4x4(rtcd, &blockd[i], 32);
+    vp9_inverse_transform_b_4x4(xd, i, 32);
   }
 }
 
-void vp9_inverse_transform_mbuv_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                    MACROBLOCKD *xd) {
+void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd) {
   int i;
-  BLOCKD *blockd = xd->block;
-
   for (i = 16; i < 24; i++) {
-    vp9_inverse_transform_b_4x4(rtcd, &blockd[i], 16);
+    vp9_inverse_transform_b_4x4(xd, i, 16);
   }
 }
 
-void vp9_inverse_transform_mb_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                  MACROBLOCKD *xd) {
-  vp9_inverse_transform_mby_4x4(rtcd, xd);
-  vp9_inverse_transform_mbuv_4x4(rtcd, xd);
+void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd) {
+  vp9_inverse_transform_mby_4x4(xd);
+  vp9_inverse_transform_mbuv_4x4(xd);
 }
 
-void vp9_inverse_transform_b_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                 short *input_dqcoeff, short *output_coeff,
+void vp9_inverse_transform_b_8x8(short *input_dqcoeff, short *output_coeff,
                                  int pitch) {
-  // int b,i;
-  // if (b->eob > 1)
-  IDCT_INVOKE(rtcd, idct8)(input_dqcoeff, output_coeff, pitch);
-  // else
-  // IDCT_INVOKE(rtcd, idct8_1)(b->dqcoeff, b->diff, pitch);//pitch
+  vp9_short_idct8x8(input_dqcoeff, output_coeff, pitch);
 }
 
-void vp9_inverse_transform_mby_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                   MACROBLOCKD *xd) {
+void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd) {
   int i;
   BLOCKD *blockd = xd->block;
 
   if (xd->mode_info_context->mbmi.mode != SPLITMV) {
     // do 2nd order transform on the dc block
-    IDCT_INVOKE(rtcd, ihaar2)(blockd[24].dqcoeff, blockd[24].diff, 8);
+    vp9_short_ihaar2x2(blockd[24].dqcoeff, blockd[24].diff, 8);
     recon_dcblock_8x8(xd); // need to change for 8x8
   }
 
   for (i = 0; i < 9; i += 8) {
-    vp9_inverse_transform_b_8x8(rtcd, &blockd[i].dqcoeff[0],
+    vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0],
                                 &blockd[i].diff[0], 32);
   }
   for (i = 2; i < 11; i += 8) {
-    vp9_inverse_transform_b_8x8(rtcd, &blockd[i + 2].dqcoeff[0],
+    vp9_inverse_transform_b_8x8(&blockd[i + 2].dqcoeff[0],
                                 &blockd[i].diff[0], 32);
   }
 }
 
-void vp9_inverse_transform_mbuv_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                    MACROBLOCKD *xd) {
+void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd) {
   int i;
   BLOCKD *blockd = xd->block;
 
   for (i = 16; i < 24; i += 4) {
-    vp9_inverse_transform_b_8x8(rtcd, &blockd[i].dqcoeff[0],
+    vp9_inverse_transform_b_8x8(&blockd[i].dqcoeff[0],
                                 &blockd[i].diff[0], 16);
   }
 }
 
-void vp9_inverse_transform_mb_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                  MACROBLOCKD *xd) {
-  vp9_inverse_transform_mby_8x8(rtcd, xd);
-  vp9_inverse_transform_mbuv_8x8(rtcd, xd);
+void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd) {
+  vp9_inverse_transform_mby_8x8(xd);
+  vp9_inverse_transform_mbuv_8x8(xd);
 }
 
-void vp9_inverse_transform_b_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                   short *input_dqcoeff,
+void vp9_inverse_transform_b_16x16(short *input_dqcoeff,
                                    short *output_coeff, int pitch) {
-  IDCT_INVOKE(rtcd, idct16x16)(input_dqcoeff, output_coeff, pitch);
+  vp9_short_idct16x16(input_dqcoeff, output_coeff, pitch);
 }
 
-void vp9_inverse_transform_mby_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                     MACROBLOCKD *xd) {
-  vp9_inverse_transform_b_16x16(rtcd, &xd->block[0].dqcoeff[0],
+void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd) {
+  vp9_inverse_transform_b_16x16(&xd->block[0].dqcoeff[0],
                                 &xd->block[0].diff[0], 32);
 }
 
-void vp9_inverse_transform_mb_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                    MACROBLOCKD *xd) {
-  vp9_inverse_transform_mby_16x16(rtcd, xd);
-  vp9_inverse_transform_mbuv_8x8(rtcd, xd);
+void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd) {
+  vp9_inverse_transform_mby_16x16(xd);
+  vp9_inverse_transform_mbuv_8x8(xd);
 }
diff --git a/vp9/common/invtrans.h b/vp9/common/invtrans.h
index 370964f..58dc4d7 100644
--- a/vp9/common/invtrans.h
+++ b/vp9/common/invtrans.h
@@ -12,42 +12,30 @@
 #define __INC_INVTRANS_H
 
 #include "vpx_ports/config.h"
-#include "idct.h"
 #include "blockd.h"
 
-extern void vp9_inverse_transform_b_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                        BLOCKD *b, int pitch);
+extern void vp9_inverse_transform_b_4x4(MACROBLOCKD *xd, int block, int pitch);
 
-extern void vp9_inverse_transform_mb_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                         MACROBLOCKD *xd);
+extern void vp9_inverse_transform_mb_4x4(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_mby_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                          MACROBLOCKD *xd);
+extern void vp9_inverse_transform_mby_4x4(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_mbuv_4x4(const vp9_idct_rtcd_vtable_t *rtcd,
-                                           MACROBLOCKD *xd);
+extern void vp9_inverse_transform_mbuv_4x4(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_b_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                        short *input_dqcoeff,
+extern void vp9_inverse_transform_b_8x8(short *input_dqcoeff,
                                         short *output_coeff, int pitch);
 
-extern void vp9_inverse_transform_mb_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                         MACROBLOCKD *xd);
+extern void vp9_inverse_transform_mb_8x8(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_mby_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                          MACROBLOCKD *xd);
+extern void vp9_inverse_transform_mby_8x8(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_mbuv_8x8(const vp9_idct_rtcd_vtable_t *rtcd,
-                                           MACROBLOCKD *xd);
+extern void vp9_inverse_transform_mbuv_8x8(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_b_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                          short *input_dqcoeff,
+extern void vp9_inverse_transform_b_16x16(short *input_dqcoeff,
                                           short *output_coeff, int pitch);
 
-extern void vp9_inverse_transform_mb_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                           MACROBLOCKD *xd);
+extern void vp9_inverse_transform_mb_16x16(MACROBLOCKD *xd);
 
-extern void vp9_inverse_transform_mby_16x16(const vp9_idct_rtcd_vtable_t *rtcd,
-                                            MACROBLOCKD *xd);
+extern void vp9_inverse_transform_mby_16x16(MACROBLOCKD *xd);
 
 #endif  // __INC_INVTRANS_H
diff --git a/vp9/common/onyxc_int.h b/vp9/common/onyxc_int.h
index beefdbc..27a6ca1 100644
--- a/vp9/common/onyxc_int.h
+++ b/vp9/common/onyxc_int.h
@@ -19,7 +19,6 @@
 #include "entropymv.h"
 #include "entropy.h"
 #include "entropymode.h"
-#include "idct.h"
 #if CONFIG_POSTPROC
 #include "postproc.h"
 #endif
@@ -148,8 +147,6 @@
 
 typedef struct VP9_COMMON_RTCD {
 #if CONFIG_RUNTIME_CPU_DETECT
-  vp9_idct_rtcd_vtable_t        idct;
-  vp9_subpix_rtcd_vtable_t      subpix;
 #if CONFIG_POSTPROC
   vp9_postproc_rtcd_vtable_t    postproc;
 #endif
diff --git a/vp9/common/onyxd.h b/vp9/common/onyxd.h
index 110c753..7b7662b 100644
--- a/vp9/common/onyxd.h
+++ b/vp9/common/onyxd.h
@@ -42,7 +42,7 @@
   void vp9_initialize_dec(void);
 
   int vp9_receive_compressed_data(VP9D_PTR comp, unsigned long size,
-                                  const unsigned char *dest,
+                                  const unsigned char **dest,
                                   int64_t time_stamp);
 
   int vp9_get_raw_frame(VP9D_PTR comp, YV12_BUFFER_CONFIG *sd,
diff --git a/vp9/common/ppc/systemdependent.c b/vp9/common/ppc/systemdependent.c
index ba4f236..941c11b 100644
--- a/vp9/common/ppc/systemdependent.c
+++ b/vp9/common/ppc/systemdependent.c
@@ -11,7 +11,6 @@
 #include "subpixel.h"
 #include "loopfilter.h"
 #include "recon.h"
-#include "idct.h"
 #include "onyxc_int.h"
 
 void (*vp8_short_idct4x4)(short *input, short *output, int pitch);
diff --git a/vp9/common/reconinter.c b/vp9/common/reconinter.c
index b67cec7..37478f7 100644
--- a/vp9/common/reconinter.c
+++ b/vp9/common/reconinter.c
@@ -11,7 +11,6 @@
 
 #include "vpx_ports/config.h"
 #include "vpx/vpx_integer.h"
-#include "subpixel.h"
 #include "blockd.h"
 #include "reconinter.h"
 #include "vp9/common/reconintra.h"
@@ -23,66 +22,38 @@
                               INTERPOLATIONFILTERTYPE mcomp_filter_type,
                               VP9_COMMON *cm) {
   if (mcomp_filter_type == SIXTAP) {
-    xd->subpixel_predict        = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap4x4);
-    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap8x4);
-    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap8x8);
-    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap16x16);
-    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap_avg4x4);
-    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap_avg8x8);
-    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, sixtap_avg16x16);
+    xd->subpixel_predict        = vp9_sixtap_predict;
+    xd->subpixel_predict8x4     = vp9_sixtap_predict8x4;
+    xd->subpixel_predict8x8     = vp9_sixtap_predict8x8;
+    xd->subpixel_predict16x16   = vp9_sixtap_predict16x16;
+    xd->subpixel_predict_avg    = vp9_sixtap_predict_avg;
+    xd->subpixel_predict_avg8x8 = vp9_sixtap_predict_avg8x8;
+    xd->subpixel_predict_avg16x16 = vp9_sixtap_predict_avg16x16;
   } else if (mcomp_filter_type == EIGHTTAP || mcomp_filter_type == SWITCHABLE) {
-    xd->subpixel_predict        = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap4x4);
-    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap8x4);
-    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap8x8);
-    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap16x16);
-    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg4x4);
-    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg8x8);
-    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg16x16);
+    xd->subpixel_predict        = vp9_eighttap_predict;
+    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4;
+    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8;
+    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16;
+    xd->subpixel_predict_avg    = vp9_eighttap_predict_avg4x4;
+    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8;
+    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16;
   } else if (mcomp_filter_type == EIGHTTAP_SHARP) {
-    xd->subpixel_predict        = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap4x4_sharp);
-    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap8x4_sharp);
-    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap8x8_sharp);
-    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap16x16_sharp);
-    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg4x4_sharp);
-    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg8x8_sharp);
-    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, eighttap_avg16x16_sharp);
+    xd->subpixel_predict        = vp9_eighttap_predict_sharp;
+    xd->subpixel_predict8x4     = vp9_eighttap_predict8x4_sharp;
+    xd->subpixel_predict8x8     = vp9_eighttap_predict8x8_sharp;
+    xd->subpixel_predict16x16   = vp9_eighttap_predict16x16_sharp;
+    xd->subpixel_predict_avg    = vp9_eighttap_predict_avg4x4_sharp;
+    xd->subpixel_predict_avg8x8 = vp9_eighttap_predict_avg8x8_sharp;
+    xd->subpixel_predict_avg16x16 = vp9_eighttap_predict_avg16x16_sharp;
   }
   else {
-    xd->subpixel_predict        = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear4x4);
-    xd->subpixel_predict8x4     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear8x4);
-    xd->subpixel_predict8x8     = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear8x8);
-    xd->subpixel_predict16x16   = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear16x16);
-    xd->subpixel_predict_avg    = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear_avg4x4);
-    xd->subpixel_predict_avg8x8 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear_avg8x8);
-    xd->subpixel_predict_avg16x16 = SUBPIX_INVOKE(
-        &cm->rtcd.subpix, bilinear_avg16x16);
+    xd->subpixel_predict        = vp9_bilinear_predict4x4;
+    xd->subpixel_predict8x4     = vp9_bilinear_predict8x4;
+    xd->subpixel_predict8x8     = vp9_bilinear_predict8x8;
+    xd->subpixel_predict16x16   = vp9_bilinear_predict16x16;
+    xd->subpixel_predict_avg    = vp9_bilinear_predict_avg4x4;
+    xd->subpixel_predict_avg8x8 = vp9_bilinear_predict_avg8x8;
+    xd->subpixel_predict_avg16x16 = vp9_bilinear_predict_avg16x16;
   }
 }
 
diff --git a/vp9/common/rtcd_defs.sh b/vp9/common/rtcd_defs.sh
index 813e93a..bbef1ec 100644
--- a/vp9/common/rtcd_defs.sh
+++ b/vp9/common/rtcd_defs.sh
@@ -227,6 +227,144 @@
 prototype unsigned int vp9_sad3x16 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
 specialize vp9_sad3x16 sse2
 
+#
+# Sub Pixel Filters
+#
+prototype void vp9_eighttap_predict16x16 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict16x16
+
+prototype void vp9_eighttap_predict8x8 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict8x8
+
+prototype void vp9_eighttap_predict_avg16x16 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict_avg16x16
+
+prototype void vp9_eighttap_predict_avg8x8 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict_avg8x8
+
+prototype void vp9_eighttap_predict_avg4x4 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict_avg4x4
+
+prototype void vp9_eighttap_predict8x4 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict8x4
+
+prototype void vp9_eighttap_predict "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict
+
+prototype void vp9_eighttap_predict16x16_sharp "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict16x16_sharp
+
+prototype void vp9_eighttap_predict8x8_sharp "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict8x8_sharp
+
+prototype void vp9_eighttap_predict_avg16x16_sharp "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict_avg16x16_sharp
+
+prototype void vp9_eighttap_predict_avg8x8_sharp "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict_avg8x8_sharp
+
+prototype void vp9_eighttap_predict_avg4x4_sharp "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict_avg4x4_sharp
+
+prototype void vp9_eighttap_predict8x4_sharp "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict8x4_sharp
+
+prototype void vp9_eighttap_predict_sharp "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_eighttap_predict_sharp
+
+prototype void vp9_sixtap_predict16x16 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_sixtap_predict16x16
+
+prototype void vp9_sixtap_predict8x8 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_sixtap_predict8x8
+
+prototype void vp9_sixtap_predict_avg16x16 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_sixtap_predict_avg16x16
+
+prototype void vp9_sixtap_predict_avg8x8 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_sixtap_predict_avg8x8
+
+prototype void vp9_sixtap_predict8x4 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_sixtap_predict8x4
+
+prototype void vp9_sixtap_predict "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_sixtap_predict
+
+prototype void vp9_sixtap_predict_avg "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_sixtap_predict_avg
+
+prototype void vp9_bilinear_predict16x16 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_bilinear_predict16x16 mmx sse2
+
+prototype void vp9_bilinear_predict8x8 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_bilinear_predict8x8 mmx sse2
+
+prototype void vp9_bilinear_predict_avg16x16 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_bilinear_predict_avg16x16
+
+prototype void vp9_bilinear_predict_avg8x8 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_bilinear_predict_avg8x8
+
+prototype void vp9_bilinear_predict8x4 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_bilinear_predict8x4 mmx
+
+prototype void vp9_bilinear_predict4x4 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_bilinear_predict4x4 mmx
+
+prototype void vp9_bilinear_predict_avg4x4 "unsigned char *src_ptr, int  src_pixels_per_line, int  xoffset, int  yoffset, unsigned char *dst_ptr, int  dst_pitch"
+specialize vp9_bilinear_predict_avg4x4
+
+#
+# dct
+#
+prototype void vp9_short_idct4x4llm_1 "short *input, short *output, int pitch"
+specialize vp9_short_idct4x4llm_1 mmx
+
+prototype void vp9_short_idct4x4llm "short *input, short *output, int pitch"
+specialize vp9_short_idct4x4llm mmx
+
+prototype void vp9_short_idct8x8 "short *input, short *output, int pitch"
+specialize vp9_short_idct8x8
+
+prototype void vp9_short_idct10_8x8 "short *input, short *output, int pitch"
+specialize vp9_short_idct10_8x8
+
+prototype void vp9_short_ihaar2x2 "short *input, short *output, int pitch"
+specialize vp9_short_ihaar2x2
+
+prototype void vp9_short_idct16x16 "short *input, short *output, int pitch"
+specialize vp9_short_idct16x16
+
+prototype void vp9_short_idct10_16x16 "short *input, short *output, int pitch"
+specialize vp9_short_idct10_16x16
+
+#
+# 2nd order
+#
+prototype void vp9_short_inv_walsh4x4_1 "short *in, short *out"
+specialize vp9_short_inv_walsh4x4_1
+
+prototype void vp9_short_inv_walsh4x4 "short *in, short *out"
+specialize vp9_short_inv_walsh4x4
+
+
+# dct and add
+prototype void vp9_dc_only_idct_add_8x8 "short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride"
+specialize vp9_dc_only_idct_add_8x8
+
+prototype void vp9_dc_only_idct_add "short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride"
+specialize vp9_dc_only_idct_add
+
+if [ "$CONFIG_LOSSLESS" = "yes" ]; then
+prototype void vp9_short_inv_walsh4x4_1_x8 "short *input, short *output, int pitch"
+prototype void vp9_short_inv_walsh4x4_x8 "short *input, short *output, int pitch"
+prototype void vp9_dc_only_inv_walsh_add "short input_dc, unsigned char *pred_ptr, unsigned char *dst_ptr, int pitch, int stride"
+prototype void vp9_short_inv_walsh4x4_1_lossless "short *in, short *out"
+prototype void vp9_short_inv_walsh4x4_lossless "short *in, short *out"
+fi
+
+
+
 if [ "$CONFIG_SUPERBLOCKS" = "yes" ]; then
 
 prototype unsigned int vp9_sad32x3 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int ref_stride, int max_sad"
@@ -378,22 +516,22 @@
 prototype void vp9_sad4x4x8 "const unsigned char *src_ptr, int  src_stride, const unsigned char *ref_ptr, int  ref_stride, unsigned short *sad_array"
 specialize vp9_sad4x4x8 sse4
 
-prototype void vp9_sad32x32x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad32x32x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char **ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad32x32x4d
 
-prototype void vp9_sad16x16x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad16x16x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char **ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad16x16x4d sse3
 
-prototype void vp9_sad16x8x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad16x8x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char **ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad16x8x4d sse3
 
-prototype void vp9_sad8x16x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad8x16x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char **ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad8x16x4d sse3
 
-prototype void vp9_sad8x8x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad8x8x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char **ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad8x8x4d sse3
 
-prototype void vp9_sad4x4x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char* const ref_ptr[], int  ref_stride, unsigned int *sad_array"
+prototype void vp9_sad4x4x4d "const unsigned char *src_ptr, int  src_stride, const unsigned char **ref_ptr, int  ref_stride, unsigned int *sad_array"
 specialize vp9_sad4x4x4d sse3
 
 #
@@ -507,6 +645,9 @@
 prototype void vp9_temporal_filter_apply "unsigned char *frame1, unsigned int stride, unsigned char *frame2, unsigned int block_size, int strength, int filter_weight, unsigned int *accumulator, unsigned short *count"
 specialize vp9_temporal_filter_apply sse2
 
+prototype void vp9_yv12_copy_partial_frame "struct yv12_buffer_config *src_ybc, struct yv12_buffer_config *dst_ybc, int fraction"
+specialize vp9_yv12_copy_partial_frame neon
+
 
 fi
 # end encoder functions
diff --git a/vp9/common/subpixel.h b/vp9/common/subpixel.h
index 03bb9f2..2b84291 100644
--- a/vp9/common/subpixel.h
+++ b/vp9/common/subpixel.h
@@ -16,189 +16,6 @@
   void sym(unsigned char *src, int src_pitch, int xofst, int yofst, \
            unsigned char *dst, int dst_pitch)
 
-#if ARCH_X86 || ARCH_X86_64
-#include "x86/subpixel_x86.h"
-#endif
-
-#if ARCH_ARM
-#include "arm/subpixel_arm.h"
-#endif
-
-#ifndef vp9_subpix_sixtap16x16
-#define vp9_subpix_sixtap16x16 vp9_sixtap_predict16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap16x16);
-
-#ifndef vp9_subpix_sixtap8x8
-#define vp9_subpix_sixtap8x8 vp9_sixtap_predict8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap8x8);
-
-#ifndef vp9_subpix_sixtap_avg16x16
-#define vp9_subpix_sixtap_avg16x16 vp9_sixtap_predict_avg16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap_avg16x16);
-
-#ifndef vp9_subpix_sixtap_avg8x8
-#define vp9_subpix_sixtap_avg8x8 vp9_sixtap_predict_avg8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap_avg8x8);
-#ifndef vp9_subpix_sixtap8x4
-#define vp9_subpix_sixtap8x4 vp9_sixtap_predict8x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap8x4);
-
-#ifndef vp9_subpix_sixtap4x4
-#define vp9_subpix_sixtap4x4 vp9_sixtap_predict_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap4x4);
-
-#ifndef vp9_subpix_sixtap_avg4x4
-#define vp9_subpix_sixtap_avg4x4 vp9_sixtap_predict_avg_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_sixtap_avg4x4);
-
-#ifndef vp9_subpix_eighttap16x16
-#define vp9_subpix_eighttap16x16 vp9_eighttap_predict16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap16x16);
-
-#ifndef vp9_subpix_eighttap8x8
-#define vp9_subpix_eighttap8x8 vp9_eighttap_predict8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap8x8);
-
-#ifndef vp9_subpix_eighttap_avg16x16
-#define vp9_subpix_eighttap_avg16x16 vp9_eighttap_predict_avg16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg16x16);
-
-#ifndef vp9_subpix_eighttap_avg8x8
-#define vp9_subpix_eighttap_avg8x8 vp9_eighttap_predict_avg8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg8x8);
-
-#ifndef vp9_subpix_eighttap8x4
-#define vp9_subpix_eighttap8x4 vp9_eighttap_predict8x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap8x4);
-
-#ifndef vp9_subpix_eighttap4x4
-#define vp9_subpix_eighttap4x4 vp9_eighttap_predict_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap4x4);
-
-#ifndef vp9_subpix_eighttap_avg4x4
-#define vp9_subpix_eighttap_avg4x4 vp9_eighttap_predict_avg4x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg4x4);
-
-#ifndef vp9_subpix_eighttap16x16_sharp
-#define vp9_subpix_eighttap16x16_sharp vp9_eighttap_predict16x16_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap16x16_sharp);
-
-#ifndef vp9_subpix_eighttap8x8_sharp
-#define vp9_subpix_eighttap8x8_sharp vp9_eighttap_predict8x8_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap8x8_sharp);
-
-#ifndef vp9_subpix_eighttap_avg16x16_sharp
-#define vp9_subpix_eighttap_avg16x16_sharp vp9_eighttap_predict_avg16x16_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg16x16_sharp);
-
-#ifndef vp9_subpix_eighttap_avg8x8_sharp
-#define vp9_subpix_eighttap_avg8x8_sharp vp9_eighttap_predict_avg8x8_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg8x8_sharp);
-
-#ifndef vp9_subpix_eighttap8x4_sharp
-#define vp9_subpix_eighttap8x4_sharp vp9_eighttap_predict8x4_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap8x4_sharp);
-
-#ifndef vp9_subpix_eighttap4x4_sharp
-#define vp9_subpix_eighttap4x4_sharp vp9_eighttap_predict_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap4x4_sharp);
-
-#ifndef vp9_subpix_eighttap_avg4x4_sharp
-#define vp9_subpix_eighttap_avg4x4_sharp vp9_eighttap_predict_avg4x4_sharp_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_eighttap_avg4x4_sharp);
-
-#ifndef vp9_subpix_bilinear16x16
-#define vp9_subpix_bilinear16x16 vp9_bilinear_predict16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear16x16);
-
-#ifndef vp9_subpix_bilinear8x8
-#define vp9_subpix_bilinear8x8 vp9_bilinear_predict8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear8x8);
-
-#ifndef vp9_subpix_bilinear_avg16x16
-#define vp9_subpix_bilinear_avg16x16 vp9_bilinear_predict_avg16x16_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear_avg16x16);
-
-#ifndef vp9_subpix_bilinear_avg8x8
-#define vp9_subpix_bilinear_avg8x8 vp9_bilinear_predict_avg8x8_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear_avg8x8);
-
-#ifndef vp9_subpix_bilinear8x4
-#define vp9_subpix_bilinear8x4 vp9_bilinear_predict8x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear8x4);
-
-#ifndef vp9_subpix_bilinear4x4
-#define vp9_subpix_bilinear4x4 vp9_bilinear_predict4x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear4x4);
-
-#ifndef vp9_subpix_bilinear_avg4x4
-#define vp9_subpix_bilinear_avg4x4 vp9_bilinear_predict_avg4x4_c
-#endif
-extern prototype_subpixel_predict(vp9_subpix_bilinear_avg4x4);
-
 typedef prototype_subpixel_predict((*vp9_subpix_fn_t));
-typedef struct {
-  vp9_subpix_fn_t  eighttap16x16;
-  vp9_subpix_fn_t  eighttap8x8;
-  vp9_subpix_fn_t  eighttap_avg16x16;
-  vp9_subpix_fn_t  eighttap_avg8x8;
-  vp9_subpix_fn_t  eighttap_avg4x4;
-  vp9_subpix_fn_t  eighttap8x4;
-  vp9_subpix_fn_t  eighttap4x4;
-  vp9_subpix_fn_t  eighttap16x16_sharp;
-  vp9_subpix_fn_t  eighttap8x8_sharp;
-  vp9_subpix_fn_t  eighttap_avg16x16_sharp;
-  vp9_subpix_fn_t  eighttap_avg8x8_sharp;
-  vp9_subpix_fn_t  eighttap_avg4x4_sharp;
-  vp9_subpix_fn_t  eighttap8x4_sharp;
-  vp9_subpix_fn_t  eighttap4x4_sharp;
-  vp9_subpix_fn_t  sixtap16x16;
-  vp9_subpix_fn_t  sixtap8x8;
-  vp9_subpix_fn_t  sixtap_avg16x16;
-  vp9_subpix_fn_t  sixtap_avg8x8;
-  vp9_subpix_fn_t  sixtap8x4;
-  vp9_subpix_fn_t  sixtap4x4;
-  vp9_subpix_fn_t  sixtap_avg4x4;
-  vp9_subpix_fn_t  bilinear16x16;
-  vp9_subpix_fn_t  bilinear8x8;
-  vp9_subpix_fn_t  bilinear_avg16x16;
-  vp9_subpix_fn_t  bilinear_avg8x8;
-  vp9_subpix_fn_t  bilinear8x4;
-  vp9_subpix_fn_t  bilinear4x4;
-  vp9_subpix_fn_t  bilinear_avg4x4;
-} vp9_subpix_rtcd_vtable_t;
-
-#if CONFIG_RUNTIME_CPU_DETECT
-#define SUBPIX_INVOKE(ctx,fn) (ctx)->fn
-#else
-#define SUBPIX_INVOKE(ctx,fn) vp9_subpix_##fn
-#endif
 
 #endif
diff --git a/vp9/common/x86/sadmxn_x86.c b/vp9/common/x86/sadmxn_x86.c
index 1ba2d48..77cd372 100644
--- a/vp9/common/x86/sadmxn_x86.c
+++ b/vp9/common/x86/sadmxn_x86.c
@@ -30,9 +30,9 @@
   s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride));
   s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride));
 
-  r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * src_stride));
-  r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * src_stride));
-  r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * src_stride));
+  r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * ref_stride));
+  r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * ref_stride));
+  r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride));
 
   sad = _mm_sad_epu8(s0, r0);
   sad = _mm_add_epi16(sad,  _mm_sad_epu8(s1, r1));
@@ -57,10 +57,10 @@
     s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride));
     s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride));
     s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride));
-    r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * src_stride));
-    r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * src_stride));
-    r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * src_stride));
-    r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * src_stride));
+    r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * ref_stride));
+    r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * ref_stride));
+    r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * ref_stride));
+    r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * ref_stride));
 
     s0 = _mm_unpacklo_epi8(s0, s1);
     r0 = _mm_unpacklo_epi8(r0, r1);
diff --git a/vp9/common/x86/x86_systemdependent.c b/vp9/common/x86/x86_systemdependent.c
index 6e8e9ad..62e75ff 100644
--- a/vp9/common/x86/x86_systemdependent.c
+++ b/vp9/common/x86/x86_systemdependent.c
@@ -10,9 +10,7 @@
 
 #include "vpx_config.h"
 #include "vpx_ports/x86.h"
-#include "vp9/common/subpixel.h"
 #include "vp9/common/loopfilter.h"
-#include "vp9/common/idct.h"
 #include "vp9/common/pragmas.h"
 #include "vp9/common/onyxc_int.h"
 
@@ -32,22 +30,6 @@
 #if HAVE_MMX
 // The commented functions need to be re-written for vpx.
   if (flags & HAS_MMX) {
-    rtcd->idct.idct1        = vp9_short_idct4x4llm_1_mmx;
-    rtcd->idct.idct16       = vp9_short_idct4x4llm_mmx;
-    rtcd->idct.idct1_scalar_add = vp9_dc_only_idct_add_mmx;
-    // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_mmx;
-    // rtcd->idct.iwalsh1     = vp9_short_inv_walsh4x4_1_mmx;
-
-    /* Disabled due to unsupported enhanced interpolation/high_prec mv
-    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_mmx;
-    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_mmx;
-    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_mmx;
-    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict4x4_mmx;
-    */
-    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_mmx;
-    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_mmx;
-    rtcd->subpix.bilinear8x4   = vp9_bilinear_predict8x4_mmx;
-    rtcd->subpix.bilinear4x4   = vp9_bilinear_predict4x4_mmx;
 
 #if CONFIG_POSTPROC
     rtcd->postproc.down        = vp9_mbpost_proc_down_mmx;
@@ -65,14 +47,6 @@
 
     // rtcd->idct.iwalsh16     = vp9_short_inv_walsh4x4_sse2;
 
-    /* Disabled due to unsupported enhanced interpolation/high_prec mv
-    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_sse2;
-    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_sse2;
-    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_sse2;
-    */
-    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_sse2;
-    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_sse2;
-
 #if CONFIG_POSTPROC
     rtcd->postproc.down        = vp9_mbpost_proc_down_xmm;
     rtcd->postproc.across      = vp9_mbpost_proc_across_ip_xmm;
@@ -86,14 +60,6 @@
 #if HAVE_SSSE3
 
   if (flags & HAS_SSSE3) {
-    /* Disabled due to unsupported enhanced interpolation/high_prec mv
-    rtcd->subpix.sixtap16x16   = vp9_sixtap_predict16x16_ssse3;
-    rtcd->subpix.sixtap8x8     = vp9_sixtap_predict8x8_ssse3;
-    rtcd->subpix.sixtap8x4     = vp9_sixtap_predict8x4_ssse3;
-    rtcd->subpix.sixtap4x4     = vp9_sixtap_predict4x4_ssse3;
-    rtcd->subpix.bilinear16x16 = vp9_bilinear_predict16x16_ssse3;
-    rtcd->subpix.bilinear8x8   = vp9_bilinear_predict8x8_ssse3;
-    */
 
     /* these are disable because of unsupported diagonal pred modes
     rtcd->recon.build_intra_predictors_mbuv =
diff --git a/vp9/decoder/arm/armv6/idct_blk_v6.c b/vp9/decoder/arm/armv6/idct_blk_v6.c
index e430f29..bf0c814 100644
--- a/vp9/decoder/arm/armv6/idct_blk_v6.c
+++ b/vp9/decoder/arm/armv6/idct_blk_v6.c
@@ -9,7 +9,7 @@
  */
 
 #include "vpx_ports/config.h"
-#include "vp9/common/idct.h"
+#include "vp9/common/blockd.h"
 #include "vp9/decoder/dequantize.h"
 
 void vp8_dequant_dc_idct_add_y_block_v6(short *q, short *dq,
diff --git a/vp9/decoder/arm/dequantize_arm.c b/vp9/decoder/arm/dequantize_arm.c
index cc2ff31..5ec78f3 100644
--- a/vp9/decoder/arm/dequantize_arm.c
+++ b/vp9/decoder/arm/dequantize_arm.c
@@ -11,7 +11,7 @@
 
 #include "vpx_ports/config.h"
 #include "vp9/decoder/dequantize.h"
-#include "vp9/common/idct.h"
+#include "vp9/common/blockd.h"
 #include "vpx_mem/vpx_mem.h"
 
 #if HAVE_ARMV7
diff --git a/vp9/decoder/arm/neon/idct_blk_neon.c b/vp9/decoder/arm/neon/idct_blk_neon.c
index 5711e86..cc68fd4 100644
--- a/vp9/decoder/arm/neon/idct_blk_neon.c
+++ b/vp9/decoder/arm/neon/idct_blk_neon.c
@@ -9,7 +9,7 @@
  */
 
 #include "vpx_ports/config.h"
-#include "vp9/common/idct.h"
+#include "vp9/common/blockd.h"
 #include "vp9/decoder/dequantize.h"
 
 /* place these declarations here because we don't want to maintain them
diff --git a/vp9/decoder/decodframe.c b/vp9/decoder/decodframe.c
index b884bad..625053e 100644
--- a/vp9/decoder/decodframe.c
+++ b/vp9/decoder/decodframe.c
@@ -27,7 +27,6 @@
 #include "vp9/common/extend.h"
 #include "vp9/common/modecont.h"
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/common/idct.h"
 #include "dboolhuff.h"
 
 #include "vp9/common/seg_common.h"
@@ -122,22 +121,20 @@
 
 #if CONFIG_LOSSLESS
   if (!QIndex) {
-    pbi->common.rtcd.idct.idct1        = vp9_short_inv_walsh4x4_1_x8_c;
-    pbi->common.rtcd.idct.idct16       = vp9_short_inv_walsh4x4_x8_c;
-    pbi->common.rtcd.idct.idct1_scalar_add  = vp9_dc_only_inv_walsh_add_c;
-    pbi->common.rtcd.idct.iwalsh1      = vp9_short_inv_walsh4x4_1_lossless_c;
-    pbi->common.rtcd.idct.iwalsh16     = vp9_short_inv_walsh4x4_lossless_c;
+    pbi->mb.inv_xform4x4_1_x8     = vp9_short_inv_walsh4x4_1_x8;
+    pbi->mb.inv_xform4x4_x8       = vp9_short_inv_walsh4x4_x8;
+    pbi->mb.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1_lossless;
+    pbi->mb.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4_lossless;
     pbi->idct_add            = vp9_dequant_idct_add_lossless_c;
     pbi->dc_idct_add         = vp9_dequant_dc_idct_add_lossless_c;
     pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block_lossless_c;
     pbi->idct_add_y_block    = vp9_dequant_idct_add_y_block_lossless_c;
     pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block_lossless_c;
   } else {
-    pbi->common.rtcd.idct.idct1        = vp9_short_idct4x4llm_1_c;
-    pbi->common.rtcd.idct.idct16       = vp9_short_idct4x4llm_c;
-    pbi->common.rtcd.idct.idct1_scalar_add  = vp9_dc_only_idct_add_c;
-    pbi->common.rtcd.idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
-    pbi->common.rtcd.idct.iwalsh16     = vp9_short_inv_walsh4x4_c;
+    pbi->mb.inv_xform4x4_1_x8     = vp9_short_idct4x4llm_1;
+    pbi->mb.inv_xform4x4_x8       = vp9_short_idct4x4llm;
+    pbi->mb.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1;
+    pbi->mb.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4;
     pbi->idct_add            = vp9_dequant_idct_add;
     pbi->dc_idct_add         = vp9_dequant_dc_idct_add;
     pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
@@ -145,6 +142,10 @@
     pbi->idct_add_uv_block   = vp9_dequant_idct_add_uv_block;
   }
 #else
+  pbi->mb.inv_xform4x4_1_x8     = vp9_short_idct4x4llm_1;
+  pbi->mb.inv_xform4x4_x8       = vp9_short_idct4x4llm;
+  pbi->mb.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1;
+  pbi->mb.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4;
   pbi->idct_add            = vp9_dequant_idct_add;
   pbi->dc_idct_add         = vp9_dequant_dc_idct_add;
   pbi->dc_idct_add_y_block = vp9_dequant_dc_idct_add_y_block;
@@ -287,13 +288,7 @@
       xd->eobs[i] = 0;
     }
 
-    if (tx_size == TX_16X16) {
-      eobtotal = vp9_decode_mb_tokens_16x16(pbi, xd, bc);
-    } else if (tx_size == TX_8X8) {
-      eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
-    } else {
-      eobtotal = vp9_decode_mb_tokens_4x4(pbi, xd, bc);
-    }
+    eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
     if (eobtotal == 0) {  // skip loopfilter
       xd->mode_info_context->mbmi.mb_skip_coeff = 1;
       continue;
@@ -311,7 +306,7 @@
           xd->dst.uv_stride, xd->eobs + 16, xd);
     } else if (tx_size == TX_8X8) {
       vp9_dequantize_b_2x2(b);
-      IDCT_INVOKE(RTCD_VTABLE(idct), ihaar2)(&b->dqcoeff[0], b->diff, 8);
+      vp9_short_ihaar2x2(&b->dqcoeff[0], b->diff, 8);
       ((int *)b->qcoeff)[0] = 0;  // 2nd order block are set to 0 after idct
       ((int *)b->qcoeff)[1] = 0;
       ((int *)b->qcoeff)[2] = 0;
@@ -332,7 +327,7 @@
     } else {
       vp9_dequantize_b(b);
       if (xd->eobs[24] > 1) {
-        IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
+        xd->inv_walsh4x4_lossless(&b->dqcoeff[0], b->diff);
         ((int *)b->qcoeff)[0] = 0;
         ((int *)b->qcoeff)[1] = 0;
         ((int *)b->qcoeff)[2] = 0;
@@ -342,7 +337,7 @@
         ((int *)b->qcoeff)[6] = 0;
         ((int *)b->qcoeff)[7] = 0;
       } else {
-        IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
+        xd->inv_walsh4x4_1(&b->dqcoeff[0], b->diff);
         ((int *)b->qcoeff)[0] = 0;
       }
 
@@ -391,12 +386,8 @@
       xd->block[i].eob = 0;
       xd->eobs[i] = 0;
     }
-    if (tx_size == TX_16X16) {
-      eobtotal = vp9_decode_mb_tokens_16x16(pbi, xd, bc);
-    } else if (tx_size == TX_8X8) {
-      eobtotal = vp9_decode_mb_tokens_8x8(pbi, xd, bc);
-    } else {
-      eobtotal = vp9_decode_mb_tokens_4x4(pbi, xd, bc);
+    if (mode != B_PRED) {
+      eobtotal = vp9_decode_mb_tokens(pbi, xd, bc);
     }
   }
 
@@ -491,6 +482,8 @@
       xd->mode_info_context->bmi[i].as_mode.context = b->bmi.as_mode.context =
           vp9_find_bpred_context(b);
 #endif
+      if (!xd->mode_info_context->mbmi.mb_skip_coeff)
+        eobtotal += vp9_decode_coefs_4x4(pbi, xd, bc, PLANE_TYPE_Y_WITH_DC, i);
 #if CONFIG_COMP_INTRA_PRED
       b_mode2 = xd->mode_info_context->bmi[i].as_mode.second;
 
@@ -513,6 +506,8 @@
                                *(b->base_dst) + b->dst, 16, b->dst_stride);
       }
     }
+    if (!xd->mode_info_context->mbmi.mb_skip_coeff)
+      vp9_decode_mb_tokens_4x4_uv(pbi, xd, bc);
   } else if (mode == SPLITMV) {
     if (tx_size == TX_8X8) {
       vp9_dequant_idct_add_y_block_8x8(xd->qcoeff, xd->block[0].dequant,
@@ -535,12 +530,12 @@
                                         xd->dst.y_buffer, 16, xd->dst.y_stride);
       } else {
         vp9_dequant_idct_add_16x16(xd->qcoeff, xd->block[0].dequant,
-                                     xd->predictor, xd->dst.y_buffer,
-                                     16, xd->dst.y_stride, xd->eobs[0]);
+                                   xd->predictor, xd->dst.y_buffer,
+                                   16, xd->dst.y_stride, xd->eobs[0]);
       }
     } else if (tx_size == TX_8X8) {
       vp9_dequantize_b_2x2(b);
-      IDCT_INVOKE(RTCD_VTABLE(idct), ihaar2)(&b->dqcoeff[0], b->diff, 8);
+      vp9_short_ihaar2x2(&b->dqcoeff[0], b->diff, 8);
       ((int *)b->qcoeff)[0] = 0;  // 2nd order block are set to 0 after idct
       ((int *)b->qcoeff)[1] = 0;
       ((int *)b->qcoeff)[2] = 0;
@@ -549,13 +544,13 @@
       ((int *)b->qcoeff)[5] = 0;
       ((int *)b->qcoeff)[6] = 0;
       ((int *)b->qcoeff)[7] = 0;
-        vp9_dequant_dc_idct_add_y_block_8x8(xd->qcoeff,
+      vp9_dequant_dc_idct_add_y_block_8x8(xd->qcoeff,
           xd->block[0].dequant, xd->predictor, xd->dst.y_buffer,
           xd->dst.y_stride, xd->eobs, xd->block[24].diff, xd);
     } else {
       vp9_dequantize_b(b);
       if (xd->eobs[24] > 1) {
-        IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh16)(&b->dqcoeff[0], b->diff);
+        xd->inv_walsh4x4_lossless(&b->dqcoeff[0], b->diff);
         ((int *)b->qcoeff)[0] = 0;
         ((int *)b->qcoeff)[1] = 0;
         ((int *)b->qcoeff)[2] = 0;
@@ -565,7 +560,7 @@
         ((int *)b->qcoeff)[6] = 0;
         ((int *)b->qcoeff)[7] = 0;
       } else {
-        IDCT_INVOKE(RTCD_VTABLE(idct), iwalsh1)(&b->dqcoeff[0], b->diff);
+        xd->inv_walsh4x4_1(&b->dqcoeff[0], b->diff);
         ((int *)b->qcoeff)[0] = 0;
       }
 
@@ -992,7 +987,7 @@
   }
 }
 
-int vp9_decode_frame(VP9D_COMP *pbi) {
+int vp9_decode_frame(VP9D_COMP *pbi, const unsigned char **p_data_end) {
   BOOL_DECODER header_bc, residual_bc;
   VP9_COMMON *const pc = &pbi->common;
   MACROBLOCKD *const xd  = &pbi->mb;
@@ -1456,5 +1451,12 @@
 #endif
   // printf("Frame %d Done\n", frame_count++);
 
+  /* Find the end of the coded buffer */
+  while (residual_bc.count > CHAR_BIT
+         && residual_bc.count < VP9_BD_VALUE_SIZE) {
+    residual_bc.count -= CHAR_BIT;
+    residual_bc.user_buffer--;
+  }
+  *p_data_end = residual_bc.user_buffer;
   return 0;
 }
diff --git a/vp9/decoder/dequantize.c b/vp9/decoder/dequantize.c
index 4f45af7..37e8e4c 100644
--- a/vp9/decoder/dequantize.c
+++ b/vp9/decoder/dequantize.c
@@ -11,7 +11,6 @@
 
 #include "vp9_rtcd.h"
 #include "dequantize.h"
-#include "vp9/common/idct.h"
 #include "vpx_mem/vpx_mem.h"
 #include "onyxd_int.h"
 
diff --git a/vp9/decoder/detokenize.c b/vp9/decoder/detokenize.c
index 16e85a0..b6823e7 100644
--- a/vp9/decoder/detokenize.c
+++ b/vp9/decoder/detokenize.c
@@ -124,14 +124,14 @@
                         PLANE_TYPE type,
                         TX_TYPE tx_type,
                         int seg_eob, INT16 *qcoeff_ptr,
-                        const int *const scan, int block_type,
+                        const int *const scan, TX_SIZE txfm_size,
                         const int *coef_bands) {
   FRAME_CONTEXT *const fc = &dx->common.fc;
   int pt, c = (type == PLANE_TYPE_Y_NO_DC);
   vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][ENTROPY_NODES], *prob;
   unsigned int (*coef_counts)[PREV_COEF_CONTEXTS][MAX_ENTROPY_TOKENS];
 
-  switch (block_type) {
+  switch (txfm_size) {
     default:
     case TX_4X4:
       if (tx_type == DCT_DCT) {
@@ -246,8 +246,7 @@
   return c;
 }
 
-
-int get_eob(MACROBLOCKD* const xd, int segment_id, int eob_max) {
+static int get_eob(MACROBLOCKD* const xd, int segment_id, int eob_max) {
   int active = vp9_segfeature_active(xd, segment_id, SEG_LVL_EOB);
   int eob = vp9_get_segdata(xd, segment_id, SEG_LVL_EOB);
 
@@ -257,141 +256,126 @@
 }
 
 
-int vp9_decode_mb_tokens_16x16(VP9D_COMP* const pbi,
-                               MACROBLOCKD* const xd,
-                               BOOL_DECODER* const bc) {
+static int vp9_decode_mb_tokens_16x16(VP9D_COMP* const pbi,
+                                      MACROBLOCKD* const xd,
+                                      BOOL_DECODER* const bc) {
   ENTROPY_CONTEXT* const A = (ENTROPY_CONTEXT *)xd->above_context;
   ENTROPY_CONTEXT* const L = (ENTROPY_CONTEXT *)xd->left_context;
-
   unsigned short* const eobs = xd->eobs;
-  PLANE_TYPE type;
-  int c, i, eobtotal = 0, seg_eob;
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  INT16 *qcoeff_ptr = &xd->qcoeff[0];
-  TX_TYPE tx_type = get_tx_type(xd, &xd->block[0]);
-
-  type = PLANE_TYPE_Y_WITH_DC;
-  seg_eob = get_eob(xd, segment_id, 256);
+  int c, i, eobtotal = 0, seg_eob;
 
   // Luma block
-  {
-    const int* const scan = vp9_default_zig_zag1d_16x16;
-    eobs[0] = c = decode_coefs(pbi, xd, bc, A, L, type,
-                               tx_type, seg_eob, qcoeff_ptr,
-                               scan, TX_16X16, vp9_coef_bands_16x16);
-    A[1] = A[2] = A[3] = A[0];
-    L[1] = L[2] = L[3] = L[0];
-    eobtotal += c;
-  }
+  eobs[0] = c = decode_coefs(pbi, xd, bc, A, L, PLANE_TYPE_Y_WITH_DC,
+                             get_tx_type(xd, &xd->block[0]),
+                             get_eob(xd, segment_id, 256),
+                             xd->qcoeff, vp9_default_zig_zag1d_16x16,
+                             TX_16X16, vp9_coef_bands_16x16);
+  A[1] = A[2] = A[3] = A[0];
+  L[1] = L[2] = L[3] = L[0];
+  eobtotal += c;
 
   // 8x8 chroma blocks
-  qcoeff_ptr += 256;
-  type = PLANE_TYPE_UV;
-  tx_type = DCT_DCT;
   seg_eob = get_eob(xd, segment_id, 64);
   for (i = 16; i < 24; i += 4) {
     ENTROPY_CONTEXT* const a = A + vp9_block2above_8x8[i];
     ENTROPY_CONTEXT* const l = L + vp9_block2left_8x8[i];
-    const int* const scan = vp9_default_zig_zag1d_8x8;
 
-    eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, type,
-                               tx_type, seg_eob, qcoeff_ptr,
-                               scan, TX_8X8, vp9_coef_bands_8x8);
+    eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_UV,
+                               DCT_DCT, seg_eob, xd->block[i].qcoeff,
+                               vp9_default_zig_zag1d_8x8,
+                               TX_8X8, vp9_coef_bands_8x8);
     a[1] = a[0];
     l[1] = l[0];
-
     eobtotal += c;
-    qcoeff_ptr += 64;
   }
+
+  // no Y2 block
   vpx_memset(&A[8], 0, sizeof(A[8]));
   vpx_memset(&L[8], 0, sizeof(L[8]));
+
   return eobtotal;
 }
 
-int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi,
-                             MACROBLOCKD* const xd,
-                             BOOL_DECODER* const bc) {
+static int vp9_decode_mb_tokens_8x8(VP9D_COMP* const pbi,
+                                    MACROBLOCKD* const xd,
+                                    BOOL_DECODER* const bc) {
   ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
   ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
-
   unsigned short *const eobs = xd->eobs;
   PLANE_TYPE type;
   int c, i, eobtotal = 0, seg_eob;
   const int segment_id = xd->mode_info_context->mbmi.segment_id;
-  INT16 *qcoeff_ptr = &xd->qcoeff[0];
-  TX_TYPE tx_type = DCT_DCT;
 
-  int bufthred = (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
-                  xd->mode_info_context->mbmi.mode == SPLITMV) ? 16 : 24;
+  // 2nd order DC block
   if (xd->mode_info_context->mbmi.mode != B_PRED &&
       xd->mode_info_context->mbmi.mode != SPLITMV &&
       xd->mode_info_context->mbmi.mode != I8X8_PRED) {
     ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[24];
     ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[24];
-    const int *const scan = vp9_default_zig_zag1d;
-    type = PLANE_TYPE_Y2;
 
-    seg_eob = get_eob(xd, segment_id, 4);
-    eobs[24] = c = decode_coefs(pbi, xd, bc, a, l, type,
-                                tx_type, seg_eob, qcoeff_ptr + 24 * 16,
-                                scan, TX_8X8, vp9_coef_bands);
-
+    eobs[24] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_Y2,
+                                DCT_DCT, get_eob(xd, segment_id, 4),
+                                xd->block[24].qcoeff,
+                                vp9_default_zig_zag1d, TX_8X8, vp9_coef_bands);
     eobtotal += c - 4;
-
     type = PLANE_TYPE_Y_NO_DC;
-  } else
+  } else {
     type = PLANE_TYPE_Y_WITH_DC;
-
-  seg_eob = get_eob(xd, segment_id, 64);
-
-  for (i = 0; i < bufthred ; i += 4) {
-    ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[i];
-    ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[i];
-    const int *const scan = vp9_default_zig_zag1d_8x8;
-    tx_type = DCT_DCT;
-
-    if (i == 16)
-      type = PLANE_TYPE_UV;
-    if (type == PLANE_TYPE_Y_WITH_DC) {
-      tx_type = get_tx_type(xd, xd->block + i);
-    }
-
-    eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, type,
-                               tx_type, seg_eob, qcoeff_ptr,
-                               scan, TX_8X8, vp9_coef_bands_8x8);
-    a[1] = a[0];
-    l[1] = l[0];
-
-    eobtotal += c;
-    qcoeff_ptr += 64;
   }
 
-  if (bufthred == 16) {
-    type = PLANE_TYPE_UV;
-    tx_type = DCT_DCT;
-    seg_eob = get_eob(xd, segment_id, 16);
+  // luma blocks
+  seg_eob = get_eob(xd, segment_id, 64);
+  for (i = 0; i < 16; i += 4) {
+    ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[i];
+    ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[i];
 
-    // use 4x4 transform for U, V components in I8X8 prediction mode
+    eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, type,
+                               type == PLANE_TYPE_Y_WITH_DC ?
+                                 get_tx_type(xd, xd->block + i) : DCT_DCT,
+                               seg_eob, xd->block[i].qcoeff,
+                               vp9_default_zig_zag1d_8x8,
+                               TX_8X8, vp9_coef_bands_8x8);
+    a[1] = a[0];
+    l[1] = l[0];
+    eobtotal += c;
+  }
+
+  // chroma blocks
+  if (xd->mode_info_context->mbmi.mode == I8X8_PRED ||
+      xd->mode_info_context->mbmi.mode == SPLITMV) {
+    // use 4x4 transform for U, V components in I8X8/splitmv prediction mode
+    seg_eob = get_eob(xd, segment_id, 16);
     for (i = 16; i < 24; i++) {
       ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
       ENTROPY_CONTEXT *const l = L + vp9_block2left[i];
-      const int *scan = vp9_default_zig_zag1d;
 
-      eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, type,
-                                 tx_type, seg_eob, qcoeff_ptr,
-                                 scan, TX_4X4, vp9_coef_bands);
-
+      eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_UV,
+                                 DCT_DCT, seg_eob, xd->block[i].qcoeff,
+                                 vp9_default_zig_zag1d, TX_4X4, vp9_coef_bands);
       eobtotal += c;
-      qcoeff_ptr += 16;
+    }
+  } else {
+    for (i = 16; i < 24; i += 4) {
+      ENTROPY_CONTEXT *const a = A + vp9_block2above_8x8[i];
+      ENTROPY_CONTEXT *const l = L + vp9_block2left_8x8[i];
+
+      eobs[i] = c = decode_coefs(pbi, xd, bc, a, l, PLANE_TYPE_UV,
+                                 DCT_DCT, seg_eob, xd->block[i].qcoeff,
+                                 vp9_default_zig_zag1d_8x8,
+                                 TX_8X8, vp9_coef_bands_8x8);
+      a[1] = a[0];
+      l[1] = l[0];
+      eobtotal += c;
     }
   }
 
   return eobtotal;
 }
 
-static int decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
-                            BOOL_DECODER* const bc,
-                            PLANE_TYPE type, int i) {
+int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
+                         BOOL_DECODER* const bc,
+                         PLANE_TYPE type, int i) {
   ENTROPY_CONTEXT *const A = (ENTROPY_CONTEXT *)xd->above_context;
   ENTROPY_CONTEXT *const L = (ENTROPY_CONTEXT *)xd->left_context;
   ENTROPY_CONTEXT *const a = A + vp9_block2above[i];
@@ -424,26 +408,53 @@
   return c;
 }
 
-int vp9_decode_mb_tokens_4x4(VP9D_COMP* const dx,
-                             MACROBLOCKD* const xd,
-                             BOOL_DECODER* const bc) {
+int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx,
+                                MACROBLOCKD* const xd,
+                                BOOL_DECODER* const bc) {
+  int eobtotal = 0, i;
+  // Blocks 16..23 go through vp9_decode_coefs_4x4 as PLANE_TYPE_UV; sum them.
+  for (i = 16; i < 24; i++)
+    eobtotal += vp9_decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_UV, i);
+
+  return eobtotal;
+}
+
+static int vp9_decode_mb_tokens_4x4(VP9D_COMP* const dx,
+                                    MACROBLOCKD* const xd,
+                                    BOOL_DECODER* const bc) {
   int i, eobtotal = 0;
   PLANE_TYPE type;
 
   if (xd->mode_info_context->mbmi.mode != B_PRED &&
       xd->mode_info_context->mbmi.mode != I8X8_PRED &&
       xd->mode_info_context->mbmi.mode != SPLITMV) {
-    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_Y2, 24) - 16;
+    eobtotal += vp9_decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_Y2, 24) - 16;
     type = PLANE_TYPE_Y_NO_DC;
   } else {
     type = PLANE_TYPE_Y_WITH_DC;
   }
 
   for (i = 0; i < 16; ++i) {
-    eobtotal += decode_coefs_4x4(dx, xd, bc, type, i);
+    eobtotal += vp9_decode_coefs_4x4(dx, xd, bc, type, i);
   }
-  do {
-    eobtotal += decode_coefs_4x4(dx, xd, bc, PLANE_TYPE_UV, i);
-  } while (++i < 24);
+
+  return eobtotal + vp9_decode_mb_tokens_4x4_uv(dx, xd, bc);
+}
+
+int vp9_decode_mb_tokens(VP9D_COMP* const dx,
+                         MACROBLOCKD* const xd,
+                         BOOL_DECODER* const bc) {
+  const TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
+  int eobtotal;
+  // Dispatch to the static per-size decoder selected by mbmi.txfm_size.
+  if (tx_size == TX_16X16) {
+    eobtotal = vp9_decode_mb_tokens_16x16(dx, xd, bc);
+  } else if (tx_size == TX_8X8) {
+    eobtotal = vp9_decode_mb_tokens_8x8(dx, xd, bc);
+  } else {
+    assert(tx_size == TX_4X4);  // any other size falls through to 4x4
+    eobtotal = vp9_decode_mb_tokens_4x4(dx, xd, bc);
+  }
+
+  return eobtotal;
+}
diff --git a/vp9/decoder/detokenize.h b/vp9/decoder/detokenize.h
index 5a7d354..9f00d29 100644
--- a/vp9/decoder/detokenize.h
+++ b/vp9/decoder/detokenize.h
@@ -16,13 +16,14 @@
 
 void vp9_reset_mb_tokens_context(MACROBLOCKD* const);
 
-int vp9_decode_mb_tokens_4x4(VP9D_COMP* const, MACROBLOCKD* const,
-                             BOOL_DECODER* const);
+int vp9_decode_coefs_4x4(VP9D_COMP *dx, MACROBLOCKD *xd,
+                         BOOL_DECODER* const bc,
+                         PLANE_TYPE type, int i);
 
-int vp9_decode_mb_tokens_8x8(VP9D_COMP* const, MACROBLOCKD* const,
-                             BOOL_DECODER* const);
+int vp9_decode_mb_tokens(VP9D_COMP* const, MACROBLOCKD* const,
+                         BOOL_DECODER* const);
 
-int vp9_decode_mb_tokens_16x16(VP9D_COMP* const, MACROBLOCKD* const,
-                               BOOL_DECODER* const);
+int vp9_decode_mb_tokens_4x4_uv(VP9D_COMP* const dx, MACROBLOCKD* const xd,
+                                BOOL_DECODER* const bc);
 
 #endif /* DETOKENIZE_H */
diff --git a/vp9/decoder/idct_blk.c b/vp9/decoder/idct_blk.c
index 7805aa0..98eb81c 100644
--- a/vp9/decoder/idct_blk.c
+++ b/vp9/decoder/idct_blk.c
@@ -9,7 +9,7 @@
  */
 
 #include "vp9_rtcd.h"
-#include "vp9/common/idct.h"
+#include "vp9/common/blockd.h"
 
 void vp9_dequant_dc_idct_add_y_block_c(short *q, const short *dq,
                                        unsigned char *pre,
diff --git a/vp9/decoder/onyxd_if.c b/vp9/decoder/onyxd_if.c
index 924395d..6c20223 100644
--- a/vp9/decoder/onyxd_if.c
+++ b/vp9/decoder/onyxd_if.c
@@ -315,13 +315,14 @@
 }
 
 int vp9_receive_compressed_data(VP9D_PTR ptr, unsigned long size,
-                                const unsigned char *source,
+                                const unsigned char **psource,
                                 int64_t time_stamp) {
 #if HAVE_ARMV7
   int64_t dx_store_reg[8];
 #endif
   VP9D_COMP *pbi = (VP9D_COMP *) ptr;
   VP9_COMMON *cm = &pbi->common;
+  const unsigned char *source = *psource;
   int retcode = 0;
 
   /*if(pbi->ready_for_new_data == 0)
@@ -380,7 +381,7 @@
 
   pbi->common.error.setjmp = 1;
 
-  retcode = vp9_decode_frame(pbi);
+  retcode = vp9_decode_frame(pbi, psource);
 
   if (retcode < 0) {
 #if HAVE_ARMV7
diff --git a/vp9/decoder/onyxd_int.h b/vp9/decoder/onyxd_int.h
index 2684c04..cbb13ff 100644
--- a/vp9/decoder/onyxd_int.h
+++ b/vp9/decoder/onyxd_int.h
@@ -83,7 +83,7 @@
 
 } VP9D_COMP;
 
-int vp9_decode_frame(VP9D_COMP *cpi);
+int vp9_decode_frame(VP9D_COMP *cpi, const unsigned char **p_data_end);
 
 
 #if CONFIG_DEBUG
diff --git a/vp9/decoder/x86/idct_blk_mmx.c b/vp9/decoder/x86/idct_blk_mmx.c
index 22f3e30..8445b28 100644
--- a/vp9/decoder/x86/idct_blk_mmx.c
+++ b/vp9/decoder/x86/idct_blk_mmx.c
@@ -9,7 +9,7 @@
  */
 
 #include "vpx_ports/config.h"
-#include "vp9/common/idct.h"
+#include "vp9/common/blockd.h"
 #include "vp9/decoder/dequantize.h"
 
 void vp9_dequant_dc_idct_add_y_block_mmx(short *q, const short *dq,
diff --git a/vp9/decoder/x86/idct_blk_sse2.c b/vp9/decoder/x86/idct_blk_sse2.c
index 93c2f3e..6e66f53 100644
--- a/vp9/decoder/x86/idct_blk_sse2.c
+++ b/vp9/decoder/x86/idct_blk_sse2.c
@@ -9,7 +9,7 @@
  */
 
 #include "vpx_ports/config.h"
-#include "vp9/common/idct.h"
+#include "vp9/common/blockd.h"
 #include "vp9/decoder/dequantize.h"
 
 void vp9_idct_dequant_dc_0_2x_sse2(short *q, const short *dq,
diff --git a/vp9/encoder/dct.c b/vp9/encoder/dct.c
index 5be2d76..108bdbe 100644
--- a/vp9/encoder/dct.c
+++ b/vp9/encoder/dct.c
@@ -12,7 +12,6 @@
 #include <assert.h>
 #include <math.h>
 #include "vpx_ports/config.h"
-#include "vp9/common/idct.h"
 #include "vp9/common/systemdependent.h"
 
 #include "vp9/common/blockd.h"
diff --git a/vp9/encoder/encodeframe.c b/vp9/encoder/encodeframe.c
index a3fe052..6ad9eae 100644
--- a/vp9/encoder/encodeframe.c
+++ b/vp9/encoder/encodeframe.c
@@ -33,7 +33,6 @@
 #include <stdio.h>
 #include <math.h>
 #include <limits.h>
-#include "vp9/common/subpixel.h"
 #include "vpx_ports/vpx_timer.h"
 #include "vp9/common/pred_common.h"
 #include "vp9/common/mvref_common.h"
@@ -2078,14 +2077,14 @@
 
   if (mbmi->ref_frame == INTRA_FRAME) {
     if (mbmi->mode == B_PRED) {
-      vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
-      vp9_encode_intra4x4mby(IF_RTCD(&cpi->rtcd), x);
+      vp9_encode_intra16x16mbuv(x);
+      vp9_encode_intra4x4mby(x);
     } else if (mbmi->mode == I8X8_PRED) {
-      vp9_encode_intra8x8mby(IF_RTCD(&cpi->rtcd), x);
-      vp9_encode_intra8x8mbuv(IF_RTCD(&cpi->rtcd), x);
+      vp9_encode_intra8x8mby(x);
+      vp9_encode_intra8x8mbuv(x);
     } else {
-      vp9_encode_intra16x16mbuv(IF_RTCD(&cpi->rtcd), x);
-      vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+      vp9_encode_intra16x16mbuv(x);
+      vp9_encode_intra16x16mby(x);
     }
 
     if (output_enabled)
@@ -2125,7 +2124,7 @@
     }
 
     if (!x->skip) {
-      vp9_encode_inter16x16(IF_RTCD(&cpi->rtcd), x);
+      vp9_encode_inter16x16(x);
 
       // Clear mb_skip_coeff if mb_no_coeff_skip is not set
       if (!cpi->common.mb_no_coeff_skip)
@@ -2227,7 +2226,6 @@
   uint8_t *vdst = xd->dst.v_buffer;
   int src_y_stride = x->src.y_stride, dst_y_stride = xd->dst.y_stride;
   int src_uv_stride = x->src.uv_stride, dst_uv_stride = xd->dst.uv_stride;
-  const VP9_ENCODER_RTCD *rtcd = IF_RTCD(&cpi->rtcd);
   int seg_ref_active;
   unsigned char ref_pred_flag;
   int n;
@@ -2344,7 +2342,7 @@
                           udst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                           vdst + x_idx * 8 + y_idx * 8 * dst_uv_stride,
                           dst_uv_stride);
-    vp9_fidct_mb(x, rtcd);
+    vp9_fidct_mb(x);
     vp9_recon_mby_s_c(&x->e_mbd,
                       dst + x_idx * 16 + y_idx * 16 * dst_y_stride);
     vp9_recon_mbuv_s_c(&x->e_mbd,
diff --git a/vp9/encoder/encodeintra.c b/vp9/encoder/encodeintra.c
index a6bc1c3..42e5e18 100644
--- a/vp9/encoder/encodeintra.c
+++ b/vp9/encoder/encodeintra.c
@@ -10,7 +10,6 @@
 
 #include "vpx_ports/config.h"
 #include "vp9_rtcd.h"
-#include "vp9/common/idct.h"
 #include "quantize.h"
 #include "vp9/common/reconintra.h"
 #include "vp9/common/reconintra4x4.h"
@@ -38,11 +37,11 @@
     mbmi->uv_mode = DC_PRED;
     mbmi->ref_frame = INTRA_FRAME;
 
-    vp9_encode_intra16x16mby(IF_RTCD(&cpi->rtcd), x);
+    vp9_encode_intra16x16mby(x);
   } else {
     for (i = 0; i < 16; i++) {
       x->e_mbd.block[i].bmi.as_mode.first = B_DC_PRED;
-      vp9_encode_intra4x4block(IF_RTCD(&cpi->rtcd), x, i);
+      vp9_encode_intra4x4block(x, i);
     }
   }
 
@@ -51,8 +50,7 @@
   return intra_pred_var;
 }
 
-void vp9_encode_intra4x4block(const VP9_ENCODER_RTCD *rtcd,
-                              MACROBLOCK *x, int ib) {
+void vp9_encode_intra4x4block(MACROBLOCK *x, int ib) {
   BLOCKD *b = &x->e_mbd.block[ib];
   BLOCK *be = &x->block[ib];
   TX_TYPE tx_type;
@@ -82,21 +80,21 @@
   } else {
     x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
     x->quantize_b_4x4(be, b) ;
-    vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 32);
+    vp9_inverse_transform_b_4x4(&x->e_mbd, ib, 32);
   }
 
   vp9_recon_b(b->predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
 }
 
-void vp9_encode_intra4x4mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *mb) {
+void vp9_encode_intra4x4mby(MACROBLOCK *mb) {
   int i;
 
   for (i = 0; i < 16; i++)
-    vp9_encode_intra4x4block(rtcd, mb, i);
+    vp9_encode_intra4x4block(mb, i);
   return;
 }
 
-void vp9_encode_intra16x16mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+void vp9_encode_intra16x16mby(MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK *b = &x->block[0];
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
@@ -120,33 +118,33 @@
       vp9_fht(b->src_diff, 32, b->coeff, tx_type, 16);
       vp9_quantize_mby_16x16(x);
       if (x->optimize)
-        vp9_optimize_mby_16x16(x, rtcd);
+        vp9_optimize_mby_16x16(x);
       vp9_ihtllm_c(bd->dqcoeff, bd->diff, 32, tx_type, 16);
     } else {
       vp9_transform_mby_16x16(x);
       vp9_quantize_mby_16x16(x);
       if (x->optimize)
-        vp9_optimize_mby_16x16(x, rtcd);
-      vp9_inverse_transform_mby_16x16(IF_RTCD(&rtcd->common->idct), xd);
+        vp9_optimize_mby_16x16(x);
+      vp9_inverse_transform_mby_16x16(xd);
     }
   } else if (tx_size == TX_8X8) {
     vp9_transform_mby_8x8(x);
     vp9_quantize_mby_8x8(x);
     if (x->optimize)
-      vp9_optimize_mby_8x8(x, rtcd);
-    vp9_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), xd);
+      vp9_optimize_mby_8x8(x);
+    vp9_inverse_transform_mby_8x8(xd);
   } else {
     vp9_transform_mby_4x4(x);
     vp9_quantize_mby_4x4(x);
     if (x->optimize)
-      vp9_optimize_mby_4x4(x, rtcd);
-    vp9_inverse_transform_mby_4x4(IF_RTCD(&rtcd->common->idct), xd);
+      vp9_optimize_mby_4x4(x);
+    vp9_inverse_transform_mby_4x4(xd);
   }
 
   vp9_recon_mby(xd);
 }
 
-void vp9_encode_intra16x16mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+void vp9_encode_intra16x16mbuv(MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
@@ -167,21 +165,20 @@
     vp9_transform_mbuv_4x4(x);
     vp9_quantize_mbuv_4x4(x);
     if (x->optimize)
-      vp9_optimize_mbuv_4x4(x, rtcd);
-    vp9_inverse_transform_mbuv_4x4(IF_RTCD(&rtcd->common->idct), xd);
+      vp9_optimize_mbuv_4x4(x);
+    vp9_inverse_transform_mbuv_4x4(xd);
   } else /* 16x16 or 8x8 */ {
     vp9_transform_mbuv_8x8(x);
     vp9_quantize_mbuv_8x8(x);
     if (x->optimize)
-      vp9_optimize_mbuv_8x8(x, rtcd);
-    vp9_inverse_transform_mbuv_8x8(IF_RTCD(&rtcd->common->idct), xd);
+      vp9_optimize_mbuv_8x8(x);
+    vp9_inverse_transform_mbuv_8x8(xd);
   }
 
   vp9_recon_intra_mbuv(xd);
 }
 
-void vp9_encode_intra8x8(const VP9_ENCODER_RTCD *rtcd,
-                         MACROBLOCK *x, int ib) {
+void vp9_encode_intra8x8(MACROBLOCK *x, int ib) {
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCKD *b = &xd->block[ib];
   BLOCK *be = &x->block[ib];
@@ -216,7 +213,7 @@
     } else {
       x->vp9_short_fdct8x8(be->src_diff, (x->block + idx)->coeff, 32);
       x->quantize_b_8x8(x->block + idx, xd->block + idx);
-      vp9_idct_idct8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
+      vp9_short_idct8x8(xd->block[idx].dqcoeff, xd->block[ib].diff, 32);
     }
   } else {
     for (i = 0; i < 4; i++) {
@@ -225,7 +222,7 @@
       vp9_subtract_b(be, b, 16);
       x->vp9_short_fdct4x4(be->src_diff, be->coeff, 32);
       x->quantize_b_4x4(be, b);
-      vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 32);
+      vp9_inverse_transform_b_4x4(xd, ib + iblock[i], 32);
     }
   }
 
@@ -237,17 +234,16 @@
   }
 }
 
-void vp9_encode_intra8x8mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+void vp9_encode_intra8x8mby(MACROBLOCK *x) {
   int i, ib;
 
   for (i = 0; i < 4; i++) {
     ib = vp9_i8x8_block[i];
-    vp9_encode_intra8x8(rtcd, x, ib);
+    vp9_encode_intra8x8(x, ib);
   }
 }
 
-void vp9_encode_intra_uv4x4(const VP9_ENCODER_RTCD *rtcd,
-                            MACROBLOCK *x, int ib,
+void vp9_encode_intra_uv4x4(MACROBLOCK *x, int ib,
                             int mode, int second) {
   BLOCKD *b = &x->e_mbd.block[ib];
   BLOCK *be = &x->block[ib];
@@ -266,13 +262,13 @@
 
   x->vp9_short_fdct4x4(be->src_diff, be->coeff, 16);
   x->quantize_b_4x4(be, b);
-  vp9_inverse_transform_b_4x4(IF_RTCD(&rtcd->common->idct), b, 16);
+  vp9_inverse_transform_b_4x4(&x->e_mbd, ib, 16);
 
   vp9_recon_uv_b_c(b->predictor, b->diff, *(b->base_dst) + b->dst,
                    b->dst_stride);
 }
 
-void vp9_encode_intra8x8mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+void vp9_encode_intra8x8mbuv(MACROBLOCK *x) {
   int i, ib, mode, second;
   BLOCKD *b;
 
@@ -286,8 +282,8 @@
     second = -1;
 #endif
     /*u */
-    vp9_encode_intra_uv4x4(rtcd, x, i + 16, mode, second);
+    vp9_encode_intra_uv4x4(x, i + 16, mode, second);
     /*v */
-    vp9_encode_intra_uv4x4(rtcd, x, i + 20, mode, second);
+    vp9_encode_intra_uv4x4(x, i + 20, mode, second);
   }
 }
diff --git a/vp9/encoder/encodeintra.h b/vp9/encoder/encodeintra.h
index 38b42b7..9326ad6 100644
--- a/vp9/encoder/encodeintra.h
+++ b/vp9/encoder/encodeintra.h
@@ -14,14 +14,12 @@
 #include "onyx_int.h"
 
 int vp9_encode_intra(VP9_COMP *cpi, MACROBLOCK *x, int use_16x16_pred);
-void vp9_encode_intra16x16mby(const VP9_ENCODER_RTCD *, MACROBLOCK *x);
-void vp9_encode_intra16x16mbuv(const VP9_ENCODER_RTCD *, MACROBLOCK *x);
-void vp9_encode_intra4x4mby(const VP9_ENCODER_RTCD *, MACROBLOCK *mb);
-void vp9_encode_intra4x4block(const VP9_ENCODER_RTCD *rtcd,
-                              MACROBLOCK *x, int ib);
-void vp9_encode_intra8x8mby(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
-void vp9_encode_intra8x8mbuv(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
-void vp9_encode_intra8x8(const VP9_ENCODER_RTCD *rtcd,
-                         MACROBLOCK *x, int ib);
+void vp9_encode_intra16x16mby(MACROBLOCK *x);
+void vp9_encode_intra16x16mbuv(MACROBLOCK *x);
+void vp9_encode_intra4x4mby(MACROBLOCK *mb);
+void vp9_encode_intra4x4block(MACROBLOCK *x, int ib);
+void vp9_encode_intra8x8mby(MACROBLOCK *x);
+void vp9_encode_intra8x8mbuv(MACROBLOCK *x);
+void vp9_encode_intra8x8(MACROBLOCK *x, int ib);
 
 #endif  // __ENCODEINTRA_H_
diff --git a/vp9/encoder/encodemb.c b/vp9/encoder/encodemb.c
index 6774a88..136a248 100644
--- a/vp9/encoder/encodemb.c
+++ b/vp9/encoder/encodemb.c
@@ -20,12 +20,6 @@
 #include "vp9/common/systemdependent.h"
 #include "vp9_rtcd.h"
 
-#if CONFIG_RUNTIME_CPU_DETECT
-#define IF_RTCD(x) (x)
-#else
-#define IF_RTCD(x) NULL
-#endif
-
 void vp9_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch) {
   unsigned char *src_ptr = (*(be->base_src) + be->src);
   short *diff_ptr = be->src_diff;
@@ -119,7 +113,7 @@
   vp9_subtract_mby_s_c(diff, src, stride, pred, 16);
 }
 
-static void subtract_mb(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+static void subtract_mb(MACROBLOCK *x) {
   BLOCK *b = &x->block[0];
 
   vp9_subtract_mby(x->src_diff, *(b->base_src), x->e_mbd.predictor,
@@ -265,7 +259,7 @@
 
 static void optimize_b(MACROBLOCK *mb, int i, PLANE_TYPE type,
                        ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                       const VP9_ENCODER_RTCD *rtcd, int tx_size) {
+                       int tx_size) {
   BLOCK *b;
   BLOCKD *d;
   vp9_token_state tokens[65][2];
@@ -567,7 +561,7 @@
   }
 }
 
-void vp9_optimize_mby_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+void vp9_optimize_mby_4x4(MACROBLOCK *x) {
   int b;
   PLANE_TYPE type;
   int has_2nd_order;
@@ -590,19 +584,19 @@
 
   for (b = 0; b < 16; b++) {
     optimize_b(x, b, type,
-               ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
+               ta + vp9_block2above[b], tl + vp9_block2left[b], TX_4X4);
   }
 
   if (has_2nd_order) {
     b = 24;
     optimize_b(x, b, PLANE_TYPE_Y2,
-               ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
+               ta + vp9_block2above[b], tl + vp9_block2left[b], TX_4X4);
     check_reset_2nd_coeffs(&x->e_mbd,
                            ta + vp9_block2above[b], tl + vp9_block2left[b]);
   }
 }
 
-void vp9_optimize_mbuv_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+void vp9_optimize_mbuv_4x4(MACROBLOCK *x) {
   int b;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta;
@@ -619,16 +613,16 @@
 
   for (b = 16; b < 24; b++) {
     optimize_b(x, b, PLANE_TYPE_UV,
-               ta + vp9_block2above[b], tl + vp9_block2left[b], rtcd, TX_4X4);
+               ta + vp9_block2above[b], tl + vp9_block2left[b], TX_4X4);
   }
 }
 
-static void optimize_mb_4x4(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  vp9_optimize_mby_4x4(x, rtcd);
-  vp9_optimize_mbuv_4x4(x, rtcd);
+static void optimize_mb_4x4(MACROBLOCK *x) {
+  vp9_optimize_mby_4x4(x);
+  vp9_optimize_mbuv_4x4(x);
 }
 
-void vp9_optimize_mby_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+void vp9_optimize_mby_8x8(MACROBLOCK *x) {
   int b;
   PLANE_TYPE type;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
@@ -648,7 +642,7 @@
   for (b = 0; b < 16; b += 4) {
     optimize_b(x, b, type,
                ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
-               rtcd, TX_8X8);
+               TX_8X8);
     ta[vp9_block2above_8x8[b] + 1] = ta[vp9_block2above_8x8[b]];
     tl[vp9_block2left_8x8[b] + 1]  = tl[vp9_block2left_8x8[b]];
   }
@@ -661,7 +655,7 @@
   }
 }
 
-void vp9_optimize_mbuv_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+void vp9_optimize_mbuv_8x8(MACROBLOCK *x) {
   int b;
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta;
@@ -679,20 +673,19 @@
   for (b = 16; b < 24; b += 4) {
     optimize_b(x, b, PLANE_TYPE_UV,
                ta + vp9_block2above_8x8[b], tl + vp9_block2left_8x8[b],
-               rtcd, TX_8X8);
+               TX_8X8);
     ta[vp9_block2above_8x8[b] + 1] = ta[vp9_block2above_8x8[b]];
     tl[vp9_block2left_8x8[b] + 1]  = tl[vp9_block2left_8x8[b]];
   }
 }
 
-static void optimize_mb_8x8(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  vp9_optimize_mby_8x8(x, rtcd);
-  vp9_optimize_mbuv_8x8(x, rtcd);
+static void optimize_mb_8x8(MACROBLOCK *x) {
+  vp9_optimize_mby_8x8(x);
+  vp9_optimize_mbuv_8x8(x);
 }
 
 static void optimize_b_16x16(MACROBLOCK *mb, int i, PLANE_TYPE type,
-                             ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l,
-                             const VP9_ENCODER_RTCD *rtcd) {
+                             ENTROPY_CONTEXT *a, ENTROPY_CONTEXT *l) {
   BLOCK *b = &mb->block[i];
   BLOCKD *d = &mb->e_mbd.block[i];
   vp9_token_state tokens[257][2];
@@ -864,7 +857,7 @@
   *a = *l = (d->eob != !type);
 }
 
-void vp9_optimize_mby_16x16(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+void vp9_optimize_mby_16x16(MACROBLOCK *x) {
   ENTROPY_CONTEXT_PLANES t_above, t_left;
   ENTROPY_CONTEXT *ta, *tl;
 
@@ -876,15 +869,15 @@
 
   ta = (ENTROPY_CONTEXT *)&t_above;
   tl = (ENTROPY_CONTEXT *)&t_left;
-  optimize_b_16x16(x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl, rtcd);
+  optimize_b_16x16(x, 0, PLANE_TYPE_Y_WITH_DC, ta, tl);
 }
 
-static void optimize_mb_16x16(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
-  vp9_optimize_mby_16x16(x, rtcd);
-  vp9_optimize_mbuv_8x8(x, rtcd);
+static void optimize_mb_16x16(MACROBLOCK *x) {
+  vp9_optimize_mby_16x16(x);
+  vp9_optimize_mbuv_8x8(x);
 }
 
-void vp9_fidct_mb(MACROBLOCK *x, const VP9_ENCODER_RTCD *rtcd) {
+void vp9_fidct_mb(MACROBLOCK *x) {
   MACROBLOCKD *const xd = &x->e_mbd;
   TX_SIZE tx_size = xd->mode_info_context->mbmi.txfm_size;
 
@@ -892,8 +885,8 @@
     vp9_transform_mb_16x16(x);
     vp9_quantize_mb_16x16(x);
     if (x->optimize)
-      optimize_mb_16x16(x, rtcd);
-    vp9_inverse_transform_mb_16x16(IF_RTCD(&rtcd->common->idct), xd);
+      optimize_mb_16x16(x);
+    vp9_inverse_transform_mb_16x16(xd);
   } else if (tx_size == TX_8X8) {
     if (xd->mode_info_context->mbmi.mode == SPLITMV) {
       assert(xd->mode_info_context->mbmi.partitioning != PARTITIONING_4X4);
@@ -902,38 +895,38 @@
       vp9_quantize_mby_8x8(x);
       vp9_quantize_mbuv_4x4(x);
       if (x->optimize) {
-        vp9_optimize_mby_8x8(x, rtcd);
-        vp9_optimize_mbuv_4x4(x, rtcd);
+        vp9_optimize_mby_8x8(x);
+        vp9_optimize_mbuv_4x4(x);
       }
-      vp9_inverse_transform_mby_8x8(IF_RTCD(&rtcd->common->idct), xd);
-      vp9_inverse_transform_mbuv_4x4(IF_RTCD(&rtcd->common->idct), xd);
+      vp9_inverse_transform_mby_8x8(xd);
+      vp9_inverse_transform_mbuv_4x4(xd);
     } else {
       vp9_transform_mb_8x8(x);
       vp9_quantize_mb_8x8(x);
       if (x->optimize)
-        optimize_mb_8x8(x, rtcd);
-      vp9_inverse_transform_mb_8x8(IF_RTCD(&rtcd->common->idct), xd);
+        optimize_mb_8x8(x);
+      vp9_inverse_transform_mb_8x8(xd);
     }
   } else {
     transform_mb_4x4(x);
     vp9_quantize_mb_4x4(x);
     if (x->optimize)
-      optimize_mb_4x4(x, rtcd);
-    vp9_inverse_transform_mb_4x4(IF_RTCD(&rtcd->common->idct), xd);
+      optimize_mb_4x4(x);
+    vp9_inverse_transform_mb_4x4(xd);
   }
 }
 
-void vp9_encode_inter16x16(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+void vp9_encode_inter16x16(MACROBLOCK *x) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
   vp9_build_inter_predictors_mb(xd);
-  subtract_mb(rtcd, x);
-  vp9_fidct_mb(x, rtcd);
+  subtract_mb(x);
+  vp9_fidct_mb(x);
   vp9_recon_mb(xd);
 }
 
 /* this function is used by first pass only */
-void vp9_encode_inter16x16y(const VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x) {
+void vp9_encode_inter16x16y(MACROBLOCK *x) {
   MACROBLOCKD *xd = &x->e_mbd;
   BLOCK *b = &x->block[0];
 
@@ -948,7 +941,7 @@
 
   vp9_transform_mby_4x4(x);
   vp9_quantize_mby_4x4(x);
-  vp9_inverse_transform_mby_4x4(IF_RTCD(&rtcd->common->idct), xd);
+  vp9_inverse_transform_mby_4x4(xd);
 
   vp9_recon_mby(xd);
 }
diff --git a/vp9/encoder/encodemb.h b/vp9/encoder/encodemb.h
index e59ed8a..905dad3 100644
--- a/vp9/encoder/encodemb.h
+++ b/vp9/encoder/encodemb.h
@@ -35,27 +35,27 @@
 
 #include "onyx_int.h"
 struct VP9_ENCODER_RTCD;
-void vp9_encode_inter16x16(const struct VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+void vp9_encode_inter16x16(MACROBLOCK *x);
 
 void vp9_transform_mbuv_4x4(MACROBLOCK *x);
 void vp9_transform_mby_4x4(MACROBLOCK *x);
 
-void vp9_optimize_mby_4x4(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-void vp9_optimize_mbuv_4x4(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-void vp9_encode_inter16x16y(const struct VP9_ENCODER_RTCD *rtcd, MACROBLOCK *x);
+void vp9_optimize_mby_4x4(MACROBLOCK *x);
+void vp9_optimize_mbuv_4x4(MACROBLOCK *x);
+void vp9_encode_inter16x16y(MACROBLOCK *x);
 
 void vp9_transform_mb_8x8(MACROBLOCK *mb);
 void vp9_transform_mby_8x8(MACROBLOCK *x);
 void vp9_transform_mbuv_8x8(MACROBLOCK *x);
 void vp9_build_dcblock_8x8(MACROBLOCK *b);
-void vp9_optimize_mby_8x8(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
-void vp9_optimize_mbuv_8x8(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+void vp9_optimize_mby_8x8(MACROBLOCK *x);
+void vp9_optimize_mbuv_8x8(MACROBLOCK *x);
 
 void vp9_transform_mb_16x16(MACROBLOCK *mb);
 void vp9_transform_mby_16x16(MACROBLOCK *x);
-void vp9_optimize_mby_16x16(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+void vp9_optimize_mby_16x16(MACROBLOCK *x);
 
-void vp9_fidct_mb(MACROBLOCK *x, const struct VP9_ENCODER_RTCD *rtcd);
+void vp9_fidct_mb(MACROBLOCK *x);
 
 void vp9_subtract_4b_c(BLOCK *be, BLOCKD *bd, int pitch);
 
diff --git a/vp9/encoder/firstpass.c b/vp9/encoder/firstpass.c
index db7c4d7..3e232ef 100644
--- a/vp9/encoder/firstpass.c
+++ b/vp9/encoder/firstpass.c
@@ -621,7 +621,7 @@
           this_error = motion_error;
           vp9_set_mbmode_and_mvs(x, NEWMV, &mv);
           xd->mode_info_context->mbmi.txfm_size = TX_4X4;
-          vp9_encode_inter16x16y(IF_RTCD(&cpi->rtcd), x);
+          vp9_encode_inter16x16y(x);
           sum_mvr += mv.as_mv.row;
           sum_mvr_abs += abs(mv.as_mv.row);
           sum_mvc += mv.as_mv.col;
diff --git a/vp9/encoder/generic/csystemdependent.c b/vp9/encoder/generic/csystemdependent.c
index d49b7d9..411408c 100644
--- a/vp9/encoder/generic/csystemdependent.c
+++ b/vp9/encoder/generic/csystemdependent.c
@@ -14,30 +14,8 @@
 #include "vp9/encoder/onyx_int.h"
 
 
-void vp9_arch_x86_encoder_init(VP9_COMP *cpi);
-void vp9_arch_arm_encoder_init(VP9_COMP *cpi);
-
-void (*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
-                                        YV12_BUFFER_CONFIG *dst_ybc,
-                                        int fraction);
-extern void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
-                                        YV12_BUFFER_CONFIG *dst_ybc,
-                                        int fraction);
-
 void vp9_cmachine_specific_config(VP9_COMP *cpi) {
 #if CONFIG_RUNTIME_CPU_DETECT
   cpi->rtcd.common                    = &cpi->common.rtcd;
 #endif
-
-  vp9_yv12_copy_partial_frame_ptr = vp9_yv12_copy_partial_frame;
-
-#if ARCH_X86 || ARCH_X86_64
-  vp9_arch_x86_encoder_init(cpi);
-#endif
-
-#if ARCH_ARM
-  vp9_arch_arm_encoder_init(cpi);
-#endif
-
-
 }
diff --git a/vp9/encoder/mcomp.c b/vp9/encoder/mcomp.c
index 90e944f..a4914d8 100644
--- a/vp9/encoder/mcomp.c
+++ b/vp9/encoder/mcomp.c
@@ -1442,7 +1442,7 @@
       unsigned int sad_array[4];
 
       for (j = 0; j < x->searches_per_step; j += 4) {
-        unsigned char *block_offset[4];
+        unsigned char const *block_offset[4];
 
         for (t = 0; t < 4; t++)
           block_offset[t] = ss[i + t].offset + best_address;
@@ -2070,7 +2070,7 @@
 
     if (all_in) {
       unsigned int sad_array[4];
-      unsigned char *block_offset[4];
+      unsigned char const *block_offset[4];
       block_offset[0] = best_address - in_what_stride;
       block_offset[1] = best_address - 1;
       block_offset[2] = best_address + 1;
diff --git a/vp9/encoder/onyx_if.c b/vp9/encoder/onyx_if.c
index 333a4f3..7d2a177 100644
--- a/vp9/encoder/onyx_if.c
+++ b/vp9/encoder/onyx_if.c
@@ -23,6 +23,7 @@
 #include "ratectrl.h"
 #include "vp9/common/quant_common.h"
 #include "segmentation.h"
+#include "./vp9_rtcd.h"
 #if CONFIG_POSTPROC
 #include "vp9/common/postproc.h"
 #endif
@@ -1267,8 +1268,6 @@
   }
 #endif
 
-
-
   cpi->mb.quantize_b_4x4      = vp9_regular_quantize_b_4x4;
   cpi->mb.quantize_b_4x4_pair = vp9_regular_quantize_b_4x4_pair;
   cpi->mb.quantize_b_8x8      = vp9_regular_quantize_b_8x8;
@@ -1584,14 +1583,18 @@
   cpi->oxcf.best_allowed_q = q_trans[oxcf->best_allowed_q];
   cpi->oxcf.cq_level = q_trans[cpi->oxcf.cq_level];
 
+  cpi->mb.e_mbd.inv_xform4x4_1_x8     = vp9_short_idct4x4llm_1;
+  cpi->mb.e_mbd.inv_xform4x4_x8       = vp9_short_idct4x4llm;
+  cpi->mb.e_mbd.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1;
+  cpi->mb.e_mbd.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4;
+
 #if CONFIG_LOSSLESS
   cpi->oxcf.lossless = oxcf->lossless;
   if (cpi->oxcf.lossless) {
-    cpi->common.rtcd.idct.idct1        = vp9_short_inv_walsh4x4_1_x8_c;
-    cpi->common.rtcd.idct.idct16       = vp9_short_inv_walsh4x4_x8_c;
-    cpi->common.rtcd.idct.idct1_scalar_add  = vp9_dc_only_inv_walsh_add_c;
-    cpi->common.rtcd.idct.iwalsh1      = vp9_short_inv_walsh4x4_1_c;
-    cpi->common.rtcd.idct.iwalsh16     = vp9_short_inv_walsh4x4_lossless_c;
+    cpi->mb.e_mbd.inv_xform4x4_1_x8     = vp9_short_inv_walsh4x4_1_x8;
+    cpi->mb.e_mbd.inv_xform4x4_x8       = vp9_short_inv_walsh4x4_x8;
+    cpi->mb.e_mbd.inv_walsh4x4_1        = vp9_short_inv_walsh4x4_1_lossless;
+    cpi->mb.e_mbd.inv_walsh4x4_lossless = vp9_short_inv_walsh4x4_lossless;
   }
 #endif
 
diff --git a/vp9/encoder/picklpf.c b/vp9/encoder/picklpf.c
index 0107fac..d5631bd 100644
--- a/vp9/encoder/picklpf.c
+++ b/vp9/encoder/picklpf.c
@@ -31,12 +31,8 @@
 #define IF_RTCD(x) NULL
 #endif
 
-extern void(*vp9_yv12_copy_partial_frame_ptr)(YV12_BUFFER_CONFIG *src_ybc,
-                                              YV12_BUFFER_CONFIG *dst_ybc,
-                                              int fraction);
-
-void vp9_yv12_copy_partial_frame(YV12_BUFFER_CONFIG *src_ybc,
-                                 YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
+void vp9_yv12_copy_partial_frame_c(YV12_BUFFER_CONFIG *src_ybc,
+                                   YV12_BUFFER_CONFIG *dst_ybc, int Fraction) {
   unsigned char *src_y, *dst_y;
   int yheight;
   int ystride;
@@ -147,7 +143,7 @@
   int best_filt_val = cm->filter_level;
 
   //  Make a copy of the unfiltered / processed recon buffer
-  vp9_yv12_copy_partial_frame_ptr(cm->frame_to_show, &cpi->last_frame_uf, 3);
+  vp9_yv12_copy_partial_frame(cm->frame_to_show, &cpi->last_frame_uf, 3);
 
   if (cm->frame_type == KEY_FRAME)
     cm->sharpness_level = 0;
@@ -174,7 +170,7 @@
   best_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
 
   //  Re-instate the unfiltered frame
-  vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
+  vp9_yv12_copy_partial_frame(&cpi->last_frame_uf, cm->frame_to_show, 3);
 
   filt_val -= (1 + ((filt_val > 10) ? 1 : 0));
 
@@ -187,7 +183,7 @@
     filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
 
     //  Re-instate the unfiltered frame
-    vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf, cm->frame_to_show, 3);
+    vp9_yv12_copy_partial_frame(&cpi->last_frame_uf, cm->frame_to_show, 3);
 
 
     // Update the best case record or exit loop.
@@ -216,7 +212,7 @@
       filt_err = calc_partial_ssl_err(sd, cm->frame_to_show, 3);
 
       //  Re-instate the unfiltered frame
-      vp9_yv12_copy_partial_frame_ptr(&cpi->last_frame_uf,
+      vp9_yv12_copy_partial_frame(&cpi->last_frame_uf,
                                       cm->frame_to_show, 3);
 
       // Update the best case record or exit loop.
diff --git a/vp9/encoder/rdopt.c b/vp9/encoder/rdopt.c
index 9e4519d..53a87a4 100644
--- a/vp9/encoder/rdopt.c
+++ b/vp9/encoder/rdopt.c
@@ -28,7 +28,6 @@
 #include "vp9/common/quant_common.h"
 #include "encodemb.h"
 #include "quantize.h"
-#include "vp9/common/idct.h"
 #include "variance.h"
 #include "mcomp.h"
 #include "rdopt.h"
@@ -680,7 +679,6 @@
 static void macro_block_yrd_4x4(MACROBLOCK *mb,
                                 int *Rate,
                                 int *Distortion,
-                                const VP9_ENCODER_RTCD *rtcd,
                                 int *skippable, int backup) {
   int b;
   MACROBLOCKD *const xd = &mb->e_mbd;
@@ -751,7 +749,6 @@
 static void macro_block_yrd_8x8(MACROBLOCK *mb,
                                 int *Rate,
                                 int *Distortion,
-                                const VP9_ENCODER_RTCD *rtcd,
                                 int *skippable, int backup) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   BLOCK   *const mb_y2 = mb->block + 24;
@@ -802,8 +799,7 @@
 }
 
 static void macro_block_yrd_16x16(MACROBLOCK *mb, int *Rate, int *Distortion,
-                                  const VP9_ENCODER_RTCD *rtcd, int *skippable,
-                                  int backup) {
+                                  int *skippable, int backup) {
   int d;
   MACROBLOCKD *xd = &mb->e_mbd;
   BLOCKD *b  = &mb->e_mbd.block[0];
@@ -821,7 +817,7 @@
   //                trailing coefficients to be zero, instead of running trellis
   //                optimization in the rate-distortion optimization loop?
   if (mb->e_mbd.mode_info_context->mbmi.mode < I8X8_PRED)
-    vp9_optimize_mby_16x16(mb, rtcd);
+    vp9_optimize_mby_16x16(mb);
 
   d = vp9_mbblock_error(mb, 0);
 
@@ -902,7 +898,6 @@
 static void macro_block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate,
                             int *distortion, int *skippable,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
-  VP9_COMMON *cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   int r[2][TX_SIZE_MAX], d[TX_SIZE_MAX], s[TX_SIZE_MAX];
 
@@ -910,11 +905,9 @@
                    x->block[0].src_stride);
 
   macro_block_yrd_16x16(x, &r[0][TX_16X16], &d[TX_16X16],
-                        IF_RTCD(&cpi->rtcd), &s[TX_16X16], 1);
-  macro_block_yrd_8x8(x, &r[0][TX_8X8], &d[TX_8X8],
-                      IF_RTCD(&cpi->rtcd), &s[TX_8X8], 1);
-  macro_block_yrd_4x4(x, &r[0][TX_4X4], &d[TX_4X4],
-                      IF_RTCD(&cpi->rtcd), &s[TX_4X4], 1);
+                        &s[TX_16X16], 1);
+  macro_block_yrd_8x8(x, &r[0][TX_8X8], &d[TX_8X8], &s[TX_8X8], 1);
+  macro_block_yrd_4x4(x, &r[0][TX_4X4], &d[TX_4X4], &s[TX_4X4], 1);
 
   choose_txfm_size_from_rd(cpi, x, r, rate, d, distortion, s, skippable,
                            txfm_cache);
@@ -932,7 +925,7 @@
 #if CONFIG_SUPERBLOCKS
 static void super_block_yrd(VP9_COMP *cpi,
                             MACROBLOCK *x, int *rate, int *distortion,
-                            const VP9_ENCODER_RTCD *rtcd, int *skip,
+                            int *skip,
                             int64_t txfm_cache[NB_TXFM_MODES]) {
   MACROBLOCKD *const xd = &x->e_mbd;
   int r[2][TX_SIZE_MAX], d[TX_SIZE_MAX], s[TX_SIZE_MAX], n;
@@ -961,21 +954,21 @@
 
     xd->above_context = &t_above[TX_16X16][x_idx];
     xd->left_context = &t_left[TX_16X16][y_idx];
-    macro_block_yrd_16x16(x, &r_tmp, &d_tmp, IF_RTCD(&cpi->rtcd), &s_tmp, 0);
+    macro_block_yrd_16x16(x, &r_tmp, &d_tmp, &s_tmp, 0);
     d[TX_16X16] += d_tmp;
     r[0][TX_16X16] += r_tmp;
     s[TX_16X16] = s[TX_16X16] && s_tmp;
 
     xd->above_context = &t_above[TX_4X4][x_idx];
     xd->left_context = &t_left[TX_4X4][y_idx];
-    macro_block_yrd_4x4(x, &r_tmp, &d_tmp, IF_RTCD(&cpi->rtcd), &s_tmp, 0);
+    macro_block_yrd_4x4(x, &r_tmp, &d_tmp, &s_tmp, 0);
     d[TX_4X4] += d_tmp;
     r[0][TX_4X4] += r_tmp;
     s[TX_4X4] = s[TX_4X4] && s_tmp;
 
     xd->above_context = &t_above[TX_8X8][x_idx];
     xd->left_context = &t_left[TX_8X8][y_idx];
-    macro_block_yrd_8x8(x, &r_tmp, &d_tmp, IF_RTCD(&cpi->rtcd), &s_tmp, 0);
+    macro_block_yrd_8x8(x, &r_tmp, &d_tmp, &s_tmp, 0);
     d[TX_8X8] += d_tmp;
     r[0][TX_8X8] += r_tmp;
     s[TX_8X8] = s[TX_8X8] && s_tmp;
@@ -1144,8 +1137,7 @@
   if (best_tx_type != DCT_DCT)
     vp9_ihtllm_c(best_dqcoeff, b->diff, 32, best_tx_type, 4);
   else
-    IDCT_INVOKE(IF_RTCD(&cpi->rtcd.common->idct), idct16)(
-        best_dqcoeff, b->diff, 32);
+    xd->inv_xform4x4_x8(best_dqcoeff, b->diff, 32);
 
   vp9_recon_b(best_predictor, b->diff, *(b->base_dst) + b->dst, b->dst_stride);
 
@@ -1261,7 +1253,7 @@
     vp9_build_intra_predictors_sby_s(&x->e_mbd);
 
     super_block_yrd(cpi, x, &this_rate_tokenonly,
-                    &this_distortion, IF_RTCD(&cpi->rtcd), &s, txfm_cache);
+                    &this_distortion, &s, txfm_cache);
     this_rate = this_rate_tokenonly +
                 x->mbmode_cost[x->e_mbd.frame_type]
                               [x->e_mbd.mode_info_context->mbmi.mode];
@@ -1509,7 +1501,7 @@
 #if CONFIG_COMP_INTRA_PRED
   b->bmi.as_mode.second = (*best_second_mode);
 #endif
-  vp9_encode_intra8x8(IF_RTCD(&cpi->rtcd), x, ib);
+  vp9_encode_intra8x8(x, ib);
 
   if (xd->mode_info_context->mbmi.txfm_size == TX_8X8) {
     a[vp9_block2above_8x8[idx]]     = besta0;
@@ -1846,7 +1838,6 @@
 static void super_block_uvrd_8x8(MACROBLOCK *x,
                                  int *rate,
                                  int *distortion,
-                                 const VP9_ENCODER_RTCD *rtcd,
                                  int *skippable) {
   MACROBLOCKD *const xd = &x->e_mbd;
   int d = 0, r = 0, n, s = 1;
@@ -1909,7 +1900,7 @@
     vp9_build_intra_predictors_sbuv_s(&x->e_mbd);
 
     super_block_uvrd_8x8(x, &this_rate_tokenonly,
-                         &this_distortion, IF_RTCD(&cpi->rtcd), &s);
+                         &this_distortion, &s);
     this_rate = this_rate_tokenonly +
                 x->intra_uv_mode_cost[x->e_mbd.frame_type][mode];
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
@@ -2072,8 +2063,7 @@
                                        int *labelyrate,
                                        int *distortion,
                                        ENTROPY_CONTEXT *ta,
-                                       ENTROPY_CONTEXT *tl,
-                                       const VP9_ENCODER_RTCD *rtcd) {
+                                       ENTROPY_CONTEXT *tl) {
   int i;
   MACROBLOCKD *xd = &x->e_mbd;
 
@@ -2109,8 +2099,7 @@
                                            int *distortion,
                                            int64_t *otherrd,
                                            ENTROPY_CONTEXT *ta,
-                                           ENTROPY_CONTEXT *tl,
-                                           const VP9_ENCODER_RTCD *rtcd) {
+                                           ENTROPY_CONTEXT *tl) {
   int i, j;
   MACROBLOCKD *xd = &x->e_mbd;
   const int iblock[4] = { 0, 1, 4, 5 };
@@ -2431,13 +2420,12 @@
 
       if (segmentation == PARTITIONING_4X4) {
         this_rd = encode_inter_mb_segment(x, labels, i, &labelyrate,
-                                          &distortion,
-                                          ta_s, tl_s, IF_RTCD(&cpi->rtcd));
+                                          &distortion, ta_s, tl_s);
         other_rd = this_rd;
       } else {
         this_rd = encode_inter_mb_segment_8x8(x, labels, i, &labelyrate,
                                               &distortion, &other_rd,
-                                              ta_s, tl_s, IF_RTCD(&cpi->rtcd));
+                                              ta_s, tl_s);
       }
       this_rd += RDCOST(x->rdmult, x->rddiv, rate, 0);
       rate += labelyrate;
@@ -3344,7 +3332,7 @@
 
       // Y cost and distortion
       super_block_yrd(cpi, x, rate_y, distortion_y,
-                      IF_RTCD(&cpi->rtcd), &skippable_y, txfm_cache);
+                      &skippable_y, txfm_cache);
       *rate2 += *rate_y;
       *distortion += *distortion_y;
 
@@ -4450,7 +4438,8 @@
 #endif
 
     // Test best rd so far against threshold for trying this mode.
-    if (best_rd <= cpi->rd_threshes[mode_index]) {
+    if (best_rd <= cpi->rd_threshes[mode_index] ||
+        cpi->rd_threshes[mode_index] == INT_MAX) {
       continue;
     }
 
@@ -4530,7 +4519,7 @@
     if (ref_frame == INTRA_FRAME) {
       vp9_build_intra_predictors_sby_s(xd);
       super_block_yrd(cpi, x, &rate_y, &distortion_y,
-                      IF_RTCD(&cpi->rtcd), &skippable, txfm_cache);
+                      &skippable, txfm_cache);
       if (mbmi->txfm_size == TX_4X4) {
         rate_uv = rate_uv_4x4;
         distortion_uv = dist_uv_4x4;
diff --git a/vp9/encoder/variance.h b/vp9/encoder/variance.h
index 6afbfb7..b504fbb 100644
--- a/vp9/encoder/variance.h
+++ b/vp9/encoder/variance.h
@@ -38,7 +38,7 @@
 
 typedef void (*vp9_sad_multi_d_fn_t)(const unsigned char *src_ptr,
                                      int source_stride,
-                                     const unsigned char * const ref_ptr[],
+                                     const unsigned char ** ref_ptr,
                                      int  ref_stride, unsigned int *sad_array);
 
 typedef unsigned int (*vp9_variance_fn_t)(const unsigned char *src_ptr,
diff --git a/vp9/encoder/x86/x86_csystemdependent.c b/vp9/encoder/x86/x86_csystemdependent.c
index 38ede74..5fc86c1 100644
--- a/vp9/encoder/x86/x86_csystemdependent.c
+++ b/vp9/encoder/x86/x86_csystemdependent.c
@@ -14,7 +14,8 @@
 #include "vp9/encoder/variance.h"
 #include "vp9/encoder/onyx_int.h"
 
-
+// TODO(jimbankoski) Consider rewriting the C code to take the same values
+// rather than going through these pointer conversions.
 #if HAVE_MMX
 void vp9_short_fdct8x4_mmx(short *input, short *output, int pitch) {
   vp9_short_fdct4x4_mmx(input,   output,    pitch);
@@ -75,24 +76,3 @@
 }
 
 #endif
-
-void vp9_arch_x86_encoder_init(VP9_COMP *cpi) {
-#if CONFIG_RUNTIME_CPU_DETECT
-  int flags = x86_simd_caps();
-
-  /* Note:
-   *
-   * This platform can be built without runtime CPU detection as well. If
-   * you modify any of the function mappings present in this file, be sure
-   * to also update them in static mapings (<arch>/filename_<arch>.h)
-   */
-
-  /* Override default functions with fastest ones for this CPU. */
-#if HAVE_SSE2
-  if (flags & HAS_SSE2) {
-  }
-#endif
-
-
-#endif
-}
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index d6c9b2b..7b77231 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -38,7 +38,6 @@
 VP9_COMMON_SRCS-yes += common/extend.h
 VP9_COMMON_SRCS-yes += common/findnearmv.h
 VP9_COMMON_SRCS-yes += common/header.h
-VP9_COMMON_SRCS-yes += common/idct.h
 VP9_COMMON_SRCS-yes += common/invtrans.h
 VP9_COMMON_SRCS-yes += common/loopfilter.h
 VP9_COMMON_SRCS-yes += common/modecont.h
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index f155319..ae60ae1 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -77,8 +77,8 @@
   VP9_PTR             cpi;
   unsigned char          *cx_data;
   unsigned int            cx_data_sz;
-  unsigned char           *altref_cx_data;
-  unsigned int            altref_size;
+  unsigned char          *pending_cx_data;
+  unsigned int            pending_cx_data_sz;
   vpx_image_t             preview_img;
   unsigned int            next_frame_flag;
   vp8_postproc_cfg_t      preview_ppcfg;
@@ -577,19 +577,6 @@
   }
 }
 
-static void append_length(unsigned char* cx_data, unsigned long int *cx_size) {
-  unsigned char chunk;
-  unsigned int offset = 0;
-  unsigned long int size = *cx_size;
-  do {
-    chunk = size & 0x7F;
-    size >>= 7;
-    chunk |= (offset == 0) << 7;
-    cx_data[offset] = chunk;
-    offset++;
-  } while (size);
-  *cx_size += offset;
-}
 
 static vpx_codec_err_t vp8e_encode(vpx_codec_alg_priv_t  *ctx,
                                    const vpx_image_t     *img,
@@ -693,14 +680,24 @@
       ctx->next_frame_flag = 0;
     }
 
+    cx_data = ctx->cx_data;
+    cx_data_sz = ctx->cx_data_sz;
     lib_flags = 0;
 
-    if (ctx->altref_size) {
-      cx_data = ctx->altref_cx_data + ctx->altref_size;
-      cx_data_sz = ctx->cx_data_sz - ctx->altref_size;
-    } else {
-      cx_data = ctx->cx_data;
-      cx_data_sz = ctx->cx_data_sz;
+    /* Any pending invisible frames? */
+    if (ctx->pending_cx_data) {
+      memmove(cx_data, ctx->pending_cx_data, ctx->pending_cx_data_sz);
+      ctx->pending_cx_data = cx_data;
+      cx_data += ctx->pending_cx_data_sz;
+      cx_data_sz -= ctx->pending_cx_data_sz;
+
+      /* TODO: this is a minimal check, the underlying codec doesn't respect
+       * the buffer size anyway.
+       */
+      if (cx_data_sz < ctx->cx_data_sz / 2) {
+        ctx->base.err_detail = "Compressed data buffer too small";
+        return VPX_CODEC_ERROR;
+      }
     }
 
     while (cx_data_sz >= ctx->cx_data_sz / 2 &&
@@ -712,13 +709,11 @@
         vpx_codec_cx_pkt_t pkt;
         VP9_COMP *cpi = (VP9_COMP *)ctx->cpi;
 
-        /* TODO(jkoleszar): for now we append lengths to all frames, revisit
-         * this later to ensure if this is necessary */
-        append_length(cx_data + size, &size);
-
+        /* Pack invisible frames with the next visible frame */
         if (!cpi->common.show_frame) {
-          ctx->altref_cx_data = cx_data;
-          ctx->altref_size = size;
+          if (!ctx->pending_cx_data)
+            ctx->pending_cx_data = cx_data;
+          ctx->pending_cx_data_sz += size;
           cx_data += size;
           cx_data_sz -= size;
           continue;
@@ -777,14 +772,14 @@
         }
         else*/
         {
-          if (ctx->altref_size) {
-            pkt.data.frame.sz = ctx->altref_size + size;
-            pkt.data.frame.buf = ctx->altref_cx_data;
-            ctx->altref_size = 0;
-            ctx->altref_cx_data = NULL;
+          if (ctx->pending_cx_data) {
+            pkt.data.frame.buf = ctx->pending_cx_data;
+            pkt.data.frame.sz  = ctx->pending_cx_data_sz + size;
+            ctx->pending_cx_data = NULL;
+            ctx->pending_cx_data_sz = 0;
           } else {
             pkt.data.frame.buf = cx_data;
-            pkt.data.frame.sz = size;
+            pkt.data.frame.sz  = size;
           }
           pkt.data.frame.partition_id = -1;
           vpx_codec_pkt_list_add(&ctx->pkt_list.head, &pkt);
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index c85b423..7432156 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -303,11 +303,11 @@
   img->self_allocd = 0;
 }
 
-static vpx_codec_err_t vp8_decode(vpx_codec_alg_priv_t  *ctx,
-                                  const uint8_t         *data,
-                                  unsigned int            data_sz,
-                                  void                    *user_priv,
-                                  long                    deadline) {
+static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t  *ctx,
+                                  const uint8_t        **data,
+                                  unsigned int           data_sz,
+                                  void                  *user_priv,
+                                  long                   deadline) {
   vpx_codec_err_t res = VPX_CODEC_OK;
 
   ctx->img_avail = 0;
@@ -317,7 +317,7 @@
    * of the heap.
    */
   if (!ctx->si.h)
-    res = ctx->base.iface->dec.peek_si(data, data_sz, &ctx->si);
+    res = ctx->base.iface->dec.peek_si(*data, data_sz, &ctx->si);
 
 
   /* Perform deferred allocations, if required */
@@ -424,6 +424,33 @@
   return res;
 }
 
+static vpx_codec_err_t vp9_decode(vpx_codec_alg_priv_t  *ctx,
+                                  const uint8_t         *data,
+                                  unsigned int           data_sz,
+                                  void                  *user_priv,
+                                  long                   deadline) {
+  const uint8_t *data_start = data;
+  const uint8_t *data_end = data + data_sz;
+  vpx_codec_err_t res;
+
+  do {
+    res = decode_one(ctx, &data_start, data_sz, user_priv, deadline);
+    assert(data_start >= data);
+    assert(data_start <= data_end);
+
+    /* Early exit if there was a decode error */
+    if (res)
+      break;
+
+    /* Account for suboptimal termination by the encoder. */
+    while (data_start < data_end && *data_start == 0)
+      data_start++;
+
+    data_sz = data_end - data_start;
+  } while (data_start < data_end);
+  return res;
+}
+
 static vpx_image_t *vp8_get_frame(vpx_codec_alg_priv_t  *ctx,
                                   vpx_codec_iter_t      *iter) {
   vpx_image_t *img = NULL;
@@ -672,7 +699,7 @@
   {
     vp8_peek_si,      /* vpx_codec_peek_si_fn_t    peek_si; */
     vp8_get_si,       /* vpx_codec_get_si_fn_t     get_si; */
-    vp8_decode,       /* vpx_codec_decode_fn_t     decode; */
+    vp9_decode,       /* vpx_codec_decode_fn_t     decode; */
     vp8_get_frame,    /* vpx_codec_frame_get_fn_t  frame_get; */
   },
   {
diff --git a/vpx/src/vpx_decoder.c b/vpx/src/vpx_decoder.c
index 4398d92..1f575e0 100644
--- a/vpx/src/vpx_decoder.c
+++ b/vpx/src/vpx_decoder.c
@@ -109,29 +109,6 @@
   return SAVE_STATUS(ctx, res);
 }
 
-static int read_frame_length(const uint8_t *data, uint64_t size,
-                             uint64_t *frame_length, int *size_length) {
-  uint64_t value = 0;
-  *size_length = 0;
-  do {
-    uint64_t index;
-    size -= value + *size_length;
-    index = size - 1;
-    value = 0;
-    do {
-      if (data + index < data) {
-          *frame_length = -1;
-          return -1;
-      }
-      value <<= 7;
-      value |= (data[index] & 0x7F);
-    } while (!(data[index--] >> 7));
-    *size_length = size - 1 - index;
-  } while (value + *size_length < size);
-  *frame_length = value;
-  return 0;
-}
-
 
 vpx_codec_err_t vpx_codec_decode(vpx_codec_ctx_t    *ctx,
                                  const uint8_t        *data,
@@ -139,11 +116,6 @@
                                  void       *user_priv,
                                  long        deadline) {
   vpx_codec_err_t res;
-  int offset = 0;
-  uint64_t length = 0;
-  unsigned char altref_frame;
-  unsigned int cx_size = data_sz;
-  uint8_t *cx_data = data;
 
   /* Sanity checks */
   /* NULL data ptr allowed if data_sz is 0 too */
@@ -152,18 +124,8 @@
   else if (!ctx->iface || !ctx->priv)
     res = VPX_CODEC_ERROR;
   else {
-    do {
-      altref_frame = !(*cx_data & 0x10);
-      res = read_frame_length(cx_data, cx_size, &length, &offset);
-      if (res != 0)
-        return SAVE_STATUS(ctx, VPX_CODEC_UNSUP_BITSTREAM);
-      res = ctx->iface->dec.decode(ctx->priv->alg_priv, cx_data,
-                                         length, user_priv, deadline);
-      if (res != 0)
-        return SAVE_STATUS(ctx, res);
-      cx_data += offset + length;
-      cx_size -= offset + length;
-    } while (cx_data - data <= data_sz && altref_frame);
+    res = ctx->iface->dec.decode(ctx->priv->alg_priv, data, data_sz,
+                                 user_priv, deadline);
   }
 
   return SAVE_STATUS(ctx, res);