Merge "Adds a fastssim metric to VPX internal stats."
diff --git a/.mailmap b/.mailmap
index fb82a24..0bfda12 100644
--- a/.mailmap
+++ b/.mailmap
@@ -1,18 +1,26 @@
Adrian Grange <agrange@google.com>
+Alex Converse <aconverse@google.com> <alex.converse@gmail.com>
Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
+Alpha Lam <hclam@google.com> <hclam@chromium.org>
+Deb Mukherjee <debargha@google.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com> <erik.a.niemeyer@gmail.com>
+Guillaume Martres <gmartres@google.com> <smarter3@gmail.com>
Hangyu Kuang <hkuang@google.com>
Jim Bankoski <jimbankoski@google.com>
-John Koleszar <jkoleszar@google.com>
Johann Koenig <johannkoenig@google.com>
Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
-Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
+John Koleszar <jkoleszar@google.com>
+Joshua Litt <joshualitt@google.com> <joshualitt@chromium.org>
+Marco Paniconi <marpan@google.com>
+Marco Paniconi <marpan@google.com> <marpan@chromium.org>
Pascal Massimino <pascal.massimino@gmail.com>
+Paul Wilkins <paulwilkins@google.com>
+Ralph Giles <giles@xiph.org> <giles@entropywave.com>
+Ralph Giles <giles@xiph.org> <giles@mozilla.com>
Sami Pietilä <samipietila@google.com>
+Tamar Levy <tamar.levy@intel.com>
+Tamar Levy <tamar.levy@intel.com> <levytamar82@gmail.com>
Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
Tom Finegan <tomfinegan@google.com>
-Ralph Giles <giles@xiph.org> <giles@entropywave.com>
-Ralph Giles <giles@xiph.org> <giles@mozilla.com>
-Alpha Lam <hclam@google.com> <hclam@chromium.org>
-Deb Mukherjee <debargha@google.com>
Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>
diff --git a/AUTHORS b/AUTHORS
index a9aa481..2f63d7c 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -3,10 +3,11 @@
Aaron Watry <awatry@gmail.com>
Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
+Adam Xu <adam@xuyaowu.com>
Adrian Grange <agrange@google.com>
Ahmad Sharif <asharif@google.com>
Alexander Voronov <avoronov@graphics.cs.msu.ru>
-Alex Converse <alex.converse@gmail.com>
+Alex Converse <aconverse@google.com>
Alexis Ballier <aballier@gentoo.org>
Alok Ahuja <waveletcoeff@gmail.com>
Alpha Lam <hclam@google.com>
@@ -14,44 +15,58 @@
Ami Fischman <fischman@chromium.org>
Andoni Morales Alastruey <ylatuya@gmail.com>
Andres Mejia <mcitadel@gmail.com>
+Andrew Russell <anrussell@google.com>
Aron Rosenberg <arosenberg@logitech.com>
Attila Nagy <attilanagy@google.com>
changjun.yang <changjun.yang@intel.com>
+Charles 'Buck' Krasic <ckrasic@google.com>
chm <chm@rock-chips.com>
Christian Duvivier <cduvivier@google.com>
Daniel Kang <ddkang@google.com>
Deb Mukherjee <debargha@google.com>
+Dim Temp <dimtemp0@gmail.com>
Dmitry Kovalev <dkovalev@google.com>
Dragan Mrdjan <dmrdjan@mips.com>
-Erik Niemeyer <erik.a.niemeyer@gmail.com>
+Ehsan Akhgari <ehsan.akhgari@gmail.com>
+Erik Niemeyer <erik.a.niemeyer@intel.com>
Fabio Pedretti <fabio.ped@libero.it>
Frank Galligan <fgalligan@google.com>
Fredrik Söderquist <fs@opera.com>
Fritz Koenig <frkoenig@google.com>
Gaute Strokkenes <gaute.strokkenes@broadcom.com>
Giuseppe Scrivano <gscrivano@gnu.org>
+Gordana Cmiljanovic <gordana.cmiljanovic@imgtec.com>
Guillaume Martres <gmartres@google.com>
Guillermo Ballester Valor <gbvalor@gmail.com>
Hangyu Kuang <hkuang@google.com>
+Hanno Böck <hanno@hboeck.de>
Henrik Lundin <hlundin@google.com>
Hui Su <huisu@google.com>
Ivan Maltz <ivanmaltz@google.com>
+Jacek Caban <cjacek@gmail.com>
+JackyChen <jackychen@google.com>
James Berry <jamesberry@google.com>
+James Yu <james.yu@linaro.org>
James Zern <jzern@google.com>
+Jan Gerber <j@mailb.org>
Jan Kratochvil <jan.kratochvil@redhat.com>
Janne Salonen <jsalonen@google.com>
Jeff Faust <jfaust@google.com>
Jeff Muizelaar <jmuizelaar@mozilla.com>
Jeff Petkau <jpet@chromium.org>
+Jia Jia <jia.jia@linaro.org>
Jim Bankoski <jimbankoski@google.com>
Jingning Han <jingning@google.com>
+Joey Parrish <joeyparrish@google.com>
Johann Koenig <johannkoenig@google.com>
John Koleszar <jkoleszar@google.com>
+John Stark <jhnstrk@gmail.com>
Joshua Bleecher Snyder <josh@treelinelabs.com>
Joshua Litt <joshualitt@google.com>
Justin Clift <justin@salasaga.org>
Justin Lebar <justin.lebar@gmail.com>
KO Myung-Hun <komh@chollian.net>
+Lawrence Velázquez <larryv@macports.org>
Lou Quillio <louquillio@google.com>
Luca Barbato <lu_zero@gentoo.org>
Makoto Kato <makoto.kt@gmail.com>
@@ -65,6 +80,7 @@
Mike Frysinger <vapier@chromium.org>
Mike Hommey <mhommey@mozilla.com>
Mikhal Shemer <mikhal@google.com>
+Minghai Shang <minghai@google.com>
Morton Jonuschat <yabawock@gmail.com>
Parag Salasakar <img.mips1@gmail.com>
Pascal Massimino <pascal.massimino@gmail.com>
@@ -72,6 +88,8 @@
Paul Wilkins <paulwilkins@google.com>
Pavol Rusnak <stick@gk2.sk>
Paweł Hajdan <phajdan@google.com>
+Pengchong Jin <pengchong@google.com>
+Peter de Rivaz <peter.derivaz@gmail.com>
Philip Jägenstedt <philipj@opera.com>
Priit Laes <plaes@plaes.org>
Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
@@ -79,22 +97,29 @@
Ralph Giles <giles@xiph.org>
Rob Bradford <rob@linux.intel.com>
Ronald S. Bultje <rbultje@google.com>
+Rui Ueyama <ruiu@google.com>
Sami Pietilä <samipietila@google.com>
Scott Graham <scottmg@chromium.org>
Scott LaVarnway <slavarnway@google.com>
+Sean McGovern <gseanmcg@gmail.com>
+Sergey Ulanov <sergeyu@chromium.org>
Shimon Doodkin <helpmepro1@gmail.com>
Stefan Holmer <holmer@google.com>
Suman Sunkara <sunkaras@google.com>
Taekhyun Kim <takim@nvidia.com>
Takanori MATSUURA <t.matsuu@gmail.com>
Tamar Levy <tamar.levy@intel.com>
+Tao Bai <michaelbai@chromium.org>
Tero Rintaluoma <teror@google.com>
Thijs Vermeir <thijsvermeir@gmail.com>
+Tim Kopp <tkopp@google.com>
Timothy B. Terriberry <tterribe@xiph.org>
Tom Finegan <tomfinegan@google.com>
Vignesh Venkatasubramanian <vigneshv@google.com>
Yaowu Xu <yaowu@google.com>
+Yongzhe Wang <yongzhe@google.com>
Yunqing Wang <yunqingwang@google.com>
+Zoe Liu <zoeliu@google.com>
Google Inc.
The Mozilla Foundation
The Xiph.Org Foundation
diff --git a/CHANGELOG b/CHANGELOG
index 97c9a7b..a318784 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -1,3 +1,26 @@
+2015-04-03 v1.4.0 "Indian Runner Duck"
+ This release includes significant improvements to the VP9 codec.
+
+ - Upgrading:
+ This release is ABI incompatible with 1.3.0. It drops the compatibility
+ layer, requiring VPX_IMG_FMT_* instead of IMG_FMT_*, and adds several codec
+ controls for VP9.
+
+ - Enhancements:
+ Faster VP9 encoding and decoding
+ Multithreaded VP9 decoding (tile and frame-based)
+ Multithreaded VP9 encoding - on by default
+ YUV 4:2:2 and 4:4:4 support in VP9
+ 10 and 12bit support in VP9
+ 64bit ARM support by replacing ARM assembly with intrinsics
+
+ - Bug Fixes:
+ Fixes a VP9 bitstream issue in Profile 1. This only affected non-YUV 4:2:0
+ files.
+
+ - Known Issues:
+ Frame Parallel decoding fails for segmented and non-420 files.
+
2013-11-15 v1.3.0 "Forest"
This release introduces the VP9 codec in a backward-compatible way.
All existing users of VP8 can continue to use the library without
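
For readers upgrading across the 1.4.0 ABI break noted above: the compatibility aliases such as IMG_FMT_I420 are gone, and only the VPX_IMG_FMT_* names remain. A minimal sketch of post-1.4.0 usage:

    #include "vpx/vpx_image.h"

    // Sketch: allocate an I420 image with the VPX_-prefixed format name;
    // the old unprefixed IMG_FMT_I420 alias no longer compiles.
    static vpx_image_t *alloc_i420(unsigned int w, unsigned int h) {
      vpx_image_t *img = vpx_img_alloc(NULL, VPX_IMG_FMT_I420, w, h, 16);
      return img;  // caller releases with vpx_img_free()
    }
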
diff --git a/README b/README
index 584a344..fcd1c2e 100644
--- a/README
+++ b/README
@@ -1,4 +1,4 @@
-README - 30 May 2014
+README - 23 March 2015
Welcome to the WebM VP8/VP9 Codec SDK!
@@ -62,12 +62,6 @@
armv7s-darwin-gcc
mips32-linux-gcc
mips64-linux-gcc
- ppc32-darwin8-gcc
- ppc32-darwin9-gcc
- ppc32-linux-gcc
- ppc64-darwin8-gcc
- ppc64-darwin9-gcc
- ppc64-linux-gcc
sparc-solaris-gcc
x86-android-gcc
x86-darwin8-gcc
@@ -78,6 +72,7 @@
x86-darwin11-gcc
x86-darwin12-gcc
x86-darwin13-gcc
+ x86-darwin14-gcc
x86-iphonesimulator-gcc
x86-linux-gcc
x86-linux-icc
@@ -95,6 +90,7 @@
x86_64-darwin11-gcc
x86_64-darwin12-gcc
x86_64-darwin13-gcc
+ x86_64-darwin14-gcc
x86_64-iphonesimulator-gcc
x86_64-linux-gcc
x86_64-linux-icc
@@ -111,6 +107,7 @@
universal-darwin11-gcc
universal-darwin12-gcc
universal-darwin13-gcc
+ universal-darwin14-gcc
generic-gnu
The generic-gnu target, in conjunction with the CROSS environment variable,
diff --git a/build/make/Android.mk b/build/make/Android.mk
index d897b44..20f670a 100644
--- a/build/make/Android.mk
+++ b/build/make/Android.mk
@@ -158,8 +158,6 @@
LOCAL_MODULE := libvpx
-LOCAL_LDLIBS := -llog
-
ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
LOCAL_STATIC_LIBRARIES := cpufeatures
endif
@@ -184,7 +182,11 @@
@$(RM) -r $(ASM_CNV_PATH)
@$(RM) $(CLEAN-OBJS)
-include $(BUILD_SHARED_LIBRARY)
+ifeq ($(ENABLE_SHARED),1)
+ include $(BUILD_SHARED_LIBRARY)
+else
+ include $(BUILD_STATIC_LIBRARY)
+endif
ifeq ($(CONFIG_RUNTIME_CPU_DETECT),yes)
$(call import-module,cpufeatures)
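
Assuming the standard ndk-build workflow, the new conditional means the packaging is chosen on the command line rather than hard-coded (a usage sketch, not part of the patch):

    ndk-build ENABLE_SHARED=1   # shared libvpx, as before
    ndk-build                   # static libvpx, the new default
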
diff --git a/build/make/Makefile b/build/make/Makefile
index e048e9c..fc7749a 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -383,8 +383,8 @@
.libs: $(LIBS)
@touch $@
$(foreach lib,$(filter %_g.a,$(LIBS)),$(eval $(call archive_template,$(lib))))
-$(foreach lib,$(filter %so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib))))
-$(foreach lib,$(filter %$(VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib))))
+$(foreach lib,$(filter %so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH),$(LIBS)),$(eval $(call so_template,$(lib))))
+$(foreach lib,$(filter %$(SO_VERSION_MAJOR).dylib,$(LIBS)),$(eval $(call dl_template,$(lib))))
INSTALL-LIBS=$(call cond_enabled,CONFIG_INSTALL_LIBS,INSTALL-LIBS)
ifeq ($(MAKECMDGOALS),dist)
diff --git a/build/make/configure.sh b/build/make/configure.sh
index e54de21..a51a847 100644
--- a/build/make/configure.sh
+++ b/build/make/configure.sh
@@ -640,12 +640,6 @@
*i[3456]86*)
tgt_isa=x86
;;
- *powerpc64*)
- tgt_isa=ppc64
- ;;
- *powerpc*)
- tgt_isa=ppc32
- ;;
*sparc*)
tgt_isa=sparc
;;
@@ -1042,25 +1036,30 @@
disable_feature fast_unaligned
fi
+ if enabled runtime_cpu_detect; then
+ disable_feature runtime_cpu_detect
+ fi
+
if [ -n "${tune_cpu}" ]; then
case ${tune_cpu} in
p5600)
- add_cflags -mips32r5 -funroll-loops -mload-store-pairs
- add_cflags -msched-weight -mhard-float
- add_asflags -mips32r5 -mhard-float
+ check_add_cflags -mips32r5 -funroll-loops -mload-store-pairs
+ check_add_cflags -msched-weight -mhard-float -mfp64
+ check_add_asflags -mips32r5 -mhard-float -mfp64
+ check_add_ldflags -mfp64
;;
i6400)
- add_cflags -mips64r6 -mabi=64 -funroll-loops -mload-store-pairs
- add_cflags -msched-weight -mhard-float
- add_asflags -mips64r6 -mabi=64 -mhard-float
- add_ldflags -mips64r6 -mabi=64
+ check_add_cflags -mips64r6 -mabi=64 -funroll-loops -msched-weight
+ check_add_cflags -mload-store-pairs -mhard-float -mfp64
+ check_add_asflags -mips64r6 -mabi=64 -mhard-float -mfp64
+ check_add_ldflags -mips64r6 -mabi=64 -mfp64
;;
esac
if enabled msa; then
- add_cflags -mmsa -mfp64 -flax-vector-conversions
- add_asflags -mmsa -mfp64 -flax-vector-conversions
- add_ldflags -mmsa -mfp64 -flax-vector-conversions
+ add_cflags -mmsa
+ add_asflags -mmsa
+ add_ldflags -mmsa
disable_feature fast_unaligned
fi
@@ -1070,29 +1069,6 @@
check_add_asflags -march=${tgt_isa}
check_add_asflags -KPIC
;;
- ppc*)
- enable_feature ppc
- bits=${tgt_isa##ppc}
- link_with_cc=gcc
- setup_gnu_toolchain
- add_asflags -force_cpusubtype_ALL -I"\$(dir \$<)darwin"
- soft_enable altivec
- enabled altivec && add_cflags -maltivec
-
- case "$tgt_os" in
- linux*)
- add_asflags -maltivec -mregnames -I"\$(dir \$<)linux"
- ;;
- darwin*)
- darwin_arch="-arch ppc"
- enabled ppc64 && darwin_arch="${darwin_arch}64"
- add_cflags ${darwin_arch} -m${bits} -fasm-blocks
- add_asflags ${darwin_arch} -force_cpusubtype_ALL -I"\$(dir \$<)darwin"
- add_ldflags ${darwin_arch} -m${bits}
- enabled altivec && add_cflags -faltivec
- ;;
- esac
- ;;
x86*)
case ${tgt_os} in
win*)
@@ -1329,11 +1305,15 @@
# only for MIPS platforms
case ${toolchain} in
mips*)
- if enabled dspr2; then
- if enabled big_endian; then
+ if enabled big_endian; then
+ if enabled dspr2; then
echo "dspr2 optimizations are available only for little endian platforms"
disable_feature dspr2
fi
+ if enabled msa; then
+ echo "msa optimizations are available only for little endian platforms"
+ disable_feature msa
+ fi
fi
;;
esac
diff --git a/configure b/configure
index 728521a..e05dd69 100755
--- a/configure
+++ b/configure
@@ -112,12 +112,6 @@
all_platforms="${all_platforms} armv7s-darwin-gcc"
all_platforms="${all_platforms} mips32-linux-gcc"
all_platforms="${all_platforms} mips64-linux-gcc"
-all_platforms="${all_platforms} ppc32-darwin8-gcc"
-all_platforms="${all_platforms} ppc32-darwin9-gcc"
-all_platforms="${all_platforms} ppc32-linux-gcc"
-all_platforms="${all_platforms} ppc64-darwin8-gcc"
-all_platforms="${all_platforms} ppc64-darwin9-gcc"
-all_platforms="${all_platforms} ppc64-linux-gcc"
all_platforms="${all_platforms} sparc-solaris-gcc"
all_platforms="${all_platforms} x86-android-gcc"
all_platforms="${all_platforms} x86-darwin8-gcc"
@@ -247,8 +241,6 @@
mips
x86
x86_64
- ppc32
- ppc64
"
ARCH_EXT_LIST="
edsp
@@ -269,8 +261,6 @@
sse4_1
avx
avx2
-
- altivec
"
HAVE_LIST="
${ARCH_EXT_LIST}
@@ -621,12 +611,6 @@
universal-darwin*)
darwin_ver=${tgt_os##darwin}
- # Snow Leopard (10.6/darwin10) dropped support for PPC
- # Include PPC support for all prior versions
- if [ $darwin_ver -lt 10 ]; then
- fat_bin_archs="$fat_bin_archs ppc32-${tgt_os}-gcc"
- fi
-
# Tiger (10.4/darwin8) brought support for x86
if [ $darwin_ver -ge 8 ]; then
fat_bin_archs="$fat_bin_archs x86-${tgt_os}-${tgt_cc}"
@@ -727,7 +711,7 @@
esac
# Other toolchain specific defaults
- case $toolchain in x86*|ppc*|universal*) soft_enable postproc;; esac
+ case $toolchain in x86*|universal*) soft_enable postproc;; esac
if enabled postproc_visualizer; then
enabled postproc || die "postproc_visualizer requires postproc to be enabled"
diff --git a/libs.mk b/libs.mk
index e48d55c..3046e1b 100644
--- a/libs.mk
+++ b/libs.mk
@@ -230,25 +230,27 @@
BUILD_LIBVPX_SO := $(if $(BUILD_LIBVPX),$(CONFIG_SHARED))
+SO_VERSION_MAJOR := 2
+SO_VERSION_MINOR := 0
+SO_VERSION_PATCH := 0
ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
-LIBVPX_SO := libvpx.$(VERSION_MAJOR).dylib
+LIBVPX_SO := libvpx.$(SO_VERSION_MAJOR).dylib
EXPORT_FILE := libvpx.syms
LIBVPX_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, \
libvpx.dylib )
else
-LIBVPX_SO := libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR).$(VERSION_PATCH)
+LIBVPX_SO := libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR).$(SO_VERSION_PATCH)
EXPORT_FILE := libvpx.ver
-SYM_LINK := libvpx.so
LIBVPX_SO_SYMLINKS := $(addprefix $(LIBSUBDIR)/, \
- libvpx.so libvpx.so.$(VERSION_MAJOR) \
- libvpx.so.$(VERSION_MAJOR).$(VERSION_MINOR))
+ libvpx.so libvpx.so.$(SO_VERSION_MAJOR) \
+ libvpx.so.$(SO_VERSION_MAJOR).$(SO_VERSION_MINOR))
endif
LIBS-$(BUILD_LIBVPX_SO) += $(BUILD_PFX)$(LIBVPX_SO)\
$(notdir $(LIBVPX_SO_SYMLINKS))
$(BUILD_PFX)$(LIBVPX_SO): $(LIBVPX_OBJS) $(EXPORT_FILE)
$(BUILD_PFX)$(LIBVPX_SO): extralibs += -lm
-$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(VERSION_MAJOR)
+$(BUILD_PFX)$(LIBVPX_SO): SONAME = libvpx.so.$(SO_VERSION_MAJOR)
$(BUILD_PFX)$(LIBVPX_SO): EXPORTS_FILE = $(EXPORT_FILE)
libvpx.ver: $(call enabled,CODEC_EXPORTS)
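
The shared-library version is now maintained independently of the release version: per the variables above, the 1.4.0 release installs libvpx.so.2.0.0 with soname libvpx.so.2. A small sketch for checking which release is loaded at run time, using the public version query:

    #include <cstdio>
    #include "vpx/vpx_codec.h"

    // Sketch: the release string (something like "v1.4.0") comes from the
    // VERSION_* numbers, not the SO_VERSION_* numbers that name the .so file.
    int main() {
      std::printf("libvpx: %s\n", vpx_codec_version_str());
      return 0;
    }
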
diff --git a/test/android/Android.mk b/test/android/Android.mk
index 4e750b2..af85634 100644
--- a/test/android/Android.mk
+++ b/test/android/Android.mk
@@ -40,7 +40,13 @@
LOCAL_ARM_MODE := arm
LOCAL_MODULE := libvpx_test
LOCAL_STATIC_LIBRARIES := gtest libwebm
-LOCAL_SHARED_LIBRARIES := vpx
+
+ifeq ($(ENABLE_SHARED),1)
+ LOCAL_SHARED_LIBRARIES := vpx
+else
+ LOCAL_STATIC_LIBRARIES += vpx
+endif
+
include $(LOCAL_PATH)/test/test.mk
LOCAL_C_INCLUDES := $(BINDINGS_DIR)
FILTERED_SRC := $(sort $(filter %.cc %.c, $(LIBVPX_TEST_SRCS-yes)))
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index bdd71c6..ff39f1a 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -29,8 +29,6 @@
cfg_.g_timebase = video->timebase();
cfg_.rc_twopass_stats_in = stats_->buf();
- // Default to 1 thread.
- cfg_.g_threads = 1;
res = vpx_codec_enc_init(&encoder_, CodecInterface(), &cfg_,
init_flags_);
ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
diff --git a/test/encode_test_driver.h b/test/encode_test_driver.h
index 7b7dd31..e16cf9c 100644
--- a/test/encode_test_driver.h
+++ b/test/encode_test_driver.h
@@ -183,7 +183,10 @@
protected:
explicit EncoderTest(const CodecFactory *codec)
: codec_(codec), abort_(false), init_flags_(0), frame_flags_(0),
- last_pts_(0) {}
+ last_pts_(0) {
+ // Default to 1 thread.
+ cfg_.g_threads = 1;
+ }
virtual ~EncoderTest() {}
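
Setting g_threads in the constructor makes it a true default that individual tests can override, instead of one that re-initialization resets to 1. Outside the harness, the equivalent pattern is to set the thread count on the config before encoder init; a minimal sketch, assuming VP9 and default usage parameters:

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    static int init_encoder(vpx_codec_ctx_t *codec, unsigned int threads) {
      vpx_codec_enc_cfg_t cfg;
      if (vpx_codec_enc_config_default(vpx_codec_vp9_cx(), &cfg, 0))
        return -1;
      cfg.g_threads = threads;  // must be set before init, like the default above
      return vpx_codec_enc_init(codec, vpx_codec_vp9_cx(), &cfg, 0) ? -1 : 0;
    }
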
diff --git a/test/lpf_8_test.cc b/test/lpf_8_test.cc
index c836fac..2c396ce 100644
--- a/test/lpf_8_test.cc
+++ b/test/lpf_8_test.cc
@@ -52,7 +52,7 @@
const uint8_t *thresh1);
#endif // CONFIG_VP9_HIGHBITDEPTH
-typedef std::tr1::tuple<loop_op_t, loop_op_t, int> loop8_param_t;
+typedef std::tr1::tuple<loop_op_t, loop_op_t, int, int> loop8_param_t;
typedef std::tr1::tuple<dual_loop_op_t, dual_loop_op_t, int> dualloop8_param_t;
#if HAVE_SSE2
@@ -144,6 +144,7 @@
loopfilter_op_ = GET_PARAM(0);
ref_loopfilter_op_ = GET_PARAM(1);
bit_depth_ = GET_PARAM(2);
+ count_ = GET_PARAM(3);
mask_ = (1 << bit_depth_) - 1;
}
@@ -151,6 +152,7 @@
protected:
int bit_depth_;
+ int count_;
int mask_;
loop_op_t loopfilter_op_;
loop_op_t ref_loopfilter_op_;
@@ -206,7 +208,6 @@
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
int32_t p = kNumCoeffs/32;
- int count = 1;
uint16_t tmp_s[kNumCoeffs];
int j = 0;
@@ -238,13 +239,13 @@
ref_s[j] = s[j];
}
#if CONFIG_VP9_HIGHBITDEPTH
- ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count, bd);
+ ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd);
ASM_REGISTER_STATE_CHECK(
- loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
+ loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd));
#else
- ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count);
+ ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_);
ASM_REGISTER_STATE_CHECK(
- loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
+ loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_));
#endif // CONFIG_VP9_HIGHBITDEPTH
for (int j = 0; j < kNumCoeffs; ++j) {
@@ -305,19 +306,18 @@
tmp, tmp, tmp, tmp, tmp, tmp, tmp, tmp
};
int32_t p = kNumCoeffs / 32;
- int count = 1;
for (int j = 0; j < kNumCoeffs; ++j) {
s[j] = rnd.Rand16() & mask_;
ref_s[j] = s[j];
}
#if CONFIG_VP9_HIGHBITDEPTH
- ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count, bd);
+ ref_loopfilter_op_(ref_s + 8 + p * 8, p, blimit, limit, thresh, count_, bd);
ASM_REGISTER_STATE_CHECK(
- loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count, bd));
+ loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_, bd));
#else
- ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count);
+ ref_loopfilter_op_(ref_s+8+p*8, p, blimit, limit, thresh, count_);
ASM_REGISTER_STATE_CHECK(
- loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count));
+ loopfilter_op_(s + 8 + p * 8, p, blimit, limit, thresh, count_));
#endif // CONFIG_VP9_HIGHBITDEPTH
for (int j = 0; j < kNumCoeffs; ++j) {
err_count += ref_s[j] != s[j];
@@ -521,55 +521,62 @@
SSE2, Loop8Test6Param,
::testing::Values(
make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
- &vp9_highbd_lpf_horizontal_4_c, 8),
+ &vp9_highbd_lpf_horizontal_4_c, 8, 1),
make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
- &vp9_highbd_lpf_vertical_4_c, 8),
+ &vp9_highbd_lpf_vertical_4_c, 8, 1),
make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
- &vp9_highbd_lpf_horizontal_8_c, 8),
+ &vp9_highbd_lpf_horizontal_8_c, 8, 1),
make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
- &vp9_highbd_lpf_horizontal_16_c, 8),
+ &vp9_highbd_lpf_horizontal_16_c, 8, 1),
+ make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+ &vp9_highbd_lpf_horizontal_16_c, 8, 2),
make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
- &vp9_highbd_lpf_vertical_8_c, 8),
+ &vp9_highbd_lpf_vertical_8_c, 8, 1),
make_tuple(&wrapper_vertical_16_sse2,
- &wrapper_vertical_16_c, 8),
+ &wrapper_vertical_16_c, 8, 1),
make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
- &vp9_highbd_lpf_horizontal_4_c, 10),
+ &vp9_highbd_lpf_horizontal_4_c, 10, 1),
make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
- &vp9_highbd_lpf_vertical_4_c, 10),
+ &vp9_highbd_lpf_vertical_4_c, 10, 1),
make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
- &vp9_highbd_lpf_horizontal_8_c, 10),
+ &vp9_highbd_lpf_horizontal_8_c, 10, 1),
make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
- &vp9_highbd_lpf_horizontal_16_c, 10),
+ &vp9_highbd_lpf_horizontal_16_c, 10, 1),
+ make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+ &vp9_highbd_lpf_horizontal_16_c, 10, 2),
make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
- &vp9_highbd_lpf_vertical_8_c, 10),
+ &vp9_highbd_lpf_vertical_8_c, 10, 1),
make_tuple(&wrapper_vertical_16_sse2,
- &wrapper_vertical_16_c, 10),
+ &wrapper_vertical_16_c, 10, 1),
make_tuple(&vp9_highbd_lpf_horizontal_4_sse2,
- &vp9_highbd_lpf_horizontal_4_c, 12),
+ &vp9_highbd_lpf_horizontal_4_c, 12, 1),
make_tuple(&vp9_highbd_lpf_vertical_4_sse2,
- &vp9_highbd_lpf_vertical_4_c, 12),
+ &vp9_highbd_lpf_vertical_4_c, 12, 1),
make_tuple(&vp9_highbd_lpf_horizontal_8_sse2,
- &vp9_highbd_lpf_horizontal_8_c, 12),
+ &vp9_highbd_lpf_horizontal_8_c, 12, 1),
make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
- &vp9_highbd_lpf_horizontal_16_c, 12),
+ &vp9_highbd_lpf_horizontal_16_c, 12, 1),
+ make_tuple(&vp9_highbd_lpf_horizontal_16_sse2,
+ &vp9_highbd_lpf_horizontal_16_c, 12, 2),
make_tuple(&vp9_highbd_lpf_vertical_8_sse2,
- &vp9_highbd_lpf_vertical_8_c, 12),
+ &vp9_highbd_lpf_vertical_8_c, 12, 1),
make_tuple(&wrapper_vertical_16_sse2,
- &wrapper_vertical_16_c, 12),
+ &wrapper_vertical_16_c, 12, 1),
make_tuple(&wrapper_vertical_16_dual_sse2,
- &wrapper_vertical_16_dual_c, 8),
+ &wrapper_vertical_16_dual_c, 8, 1),
make_tuple(&wrapper_vertical_16_dual_sse2,
- &wrapper_vertical_16_dual_c, 10),
+ &wrapper_vertical_16_dual_c, 10, 1),
make_tuple(&wrapper_vertical_16_dual_sse2,
- &wrapper_vertical_16_dual_c, 12)));
+ &wrapper_vertical_16_dual_c, 12, 1)));
#else
INSTANTIATE_TEST_CASE_P(
SSE2, Loop8Test6Param,
::testing::Values(
- make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c, 8),
- make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8),
- make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c, 8),
- make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8)));
+ make_tuple(&vp9_lpf_horizontal_8_sse2, &vp9_lpf_horizontal_8_c, 8, 1),
+ make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8, 1),
+ make_tuple(&vp9_lpf_horizontal_16_sse2, &vp9_lpf_horizontal_16_c, 8, 2),
+ make_tuple(&vp9_lpf_vertical_8_sse2, &vp9_lpf_vertical_8_c, 8, 1),
+ make_tuple(&wrapper_vertical_16_sse2, &wrapper_vertical_16_c, 8, 1)));
#endif // CONFIG_VP9_HIGHBITDEPTH
#endif
@@ -577,7 +584,9 @@
INSTANTIATE_TEST_CASE_P(
AVX2, Loop8Test6Param,
::testing::Values(
- make_tuple(&vp9_lpf_horizontal_16_avx2, &vp9_lpf_horizontal_16_c, 8)));
+ make_tuple(&vp9_lpf_horizontal_16_avx2, &vp9_lpf_horizontal_16_c, 8, 1),
+ make_tuple(&vp9_lpf_horizontal_16_avx2, &vp9_lpf_horizontal_16_c, 8,
+ 2)));
#endif
#if HAVE_SSE2
@@ -635,20 +644,22 @@
// Using #if inside the macro is unsupported on MSVS but the tests are not
// currently built for MSVS with ARM and NEON.
make_tuple(&vp9_lpf_horizontal_16_neon,
- &vp9_lpf_horizontal_16_c, 8),
+ &vp9_lpf_horizontal_16_c, 8, 1),
+ make_tuple(&vp9_lpf_horizontal_16_neon,
+ &vp9_lpf_horizontal_16_c, 8, 2),
make_tuple(&wrapper_vertical_16_neon,
- &wrapper_vertical_16_c, 8),
+ &wrapper_vertical_16_c, 8, 1),
make_tuple(&wrapper_vertical_16_dual_neon,
- &wrapper_vertical_16_dual_c, 8),
+ &wrapper_vertical_16_dual_c, 8, 1),
make_tuple(&vp9_lpf_horizontal_8_neon,
- &vp9_lpf_horizontal_8_c, 8),
+ &vp9_lpf_horizontal_8_c, 8, 1),
make_tuple(&vp9_lpf_vertical_8_neon,
- &vp9_lpf_vertical_8_c, 8),
+ &vp9_lpf_vertical_8_c, 8, 1),
#endif // HAVE_NEON_ASM
make_tuple(&vp9_lpf_horizontal_4_neon,
- &vp9_lpf_horizontal_4_c, 8),
+ &vp9_lpf_horizontal_4_c, 8, 1),
make_tuple(&vp9_lpf_vertical_4_neon,
- &vp9_lpf_vertical_4_c, 8)));
+ &vp9_lpf_vertical_4_c, 8, 1)));
INSTANTIATE_TEST_CASE_P(
NEON, Loop8Test9Param,
::testing::Values(
diff --git a/test/partial_idct_test.cc b/test/partial_idct_test.cc
index ba82da4..ba73f86 100644
--- a/test/partial_idct_test.cc
+++ b/test/partial_idct_test.cc
@@ -230,7 +230,7 @@
&vp9_idct4x4_1_add_c,
TX_4X4, 1)));
-#if HAVE_NEON
+#if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
NEON, PartialIDctTest,
::testing::Values(
@@ -258,7 +258,7 @@
&vp9_idct4x4_16_add_c,
&vp9_idct4x4_1_add_neon,
TX_4X4, 1)));
-#endif // HAVE_NEON
+#endif // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
#if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
INSTANTIATE_TEST_CASE_P(
diff --git a/test/svc_test.cc b/test/svc_test.cc
index 67e83e3..0af3005 100644
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -63,6 +63,9 @@
vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
VP9CodecFactory codec_factory;
decoder_ = codec_factory.CreateDecoder(dec_cfg, 0);
+
+ tile_columns_ = 0;
+ tile_rows_ = 0;
}
virtual void TearDown() {
@@ -75,6 +78,8 @@
vpx_svc_init(&svc_, &codec_, vpx_codec_vp9_cx(), &codec_enc_);
EXPECT_EQ(VPX_CODEC_OK, res);
vpx_codec_control(&codec_, VP8E_SET_CPUUSED, 4); // Make the test faster
+ vpx_codec_control(&codec_, VP9E_SET_TILE_COLUMNS, tile_columns_);
+ vpx_codec_control(&codec_, VP9E_SET_TILE_ROWS, tile_rows_);
codec_initialized_ = true;
}
@@ -108,7 +113,8 @@
codec_enc_.g_pass = VPX_RC_FIRST_PASS;
InitializeEncoder();
- libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
+ libvpx_test::I420VideoSource video(test_file_name_,
+ codec_enc_.g_w, codec_enc_.g_h,
codec_enc_.g_timebase.den,
codec_enc_.g_timebase.num, 0, 30);
video.Begin();
@@ -176,7 +182,8 @@
}
InitializeEncoder();
- libvpx_test::I420VideoSource video(test_file_name_, kWidth, kHeight,
+ libvpx_test::I420VideoSource video(test_file_name_,
+ codec_enc_.g_w, codec_enc_.g_h,
codec_enc_.g_timebase.den,
codec_enc_.g_timebase.num, 0, 30);
video.Begin();
@@ -310,6 +317,8 @@
std::string test_file_name_;
bool codec_initialized_;
Decoder *decoder_;
+ int tile_columns_;
+ int tile_rows_;
};
TEST_F(SvcTest, SvcInit) {
@@ -737,4 +746,51 @@
FreeBitstreamBuffers(&outputs[0], 10);
}
+TEST_F(SvcTest, TwoPassEncode2TemporalLayersWithTiles) {
+ // First pass encode
+ std::string stats_buf;
+ vpx_svc_set_options(&svc_, "scale-factors=1/1");
+ svc_.temporal_layers = 2;
+ Pass1EncodeNFrames(10, 1, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ svc_.temporal_layers = 2;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1");
+ codec_enc_.g_w = 704;
+ codec_enc_.g_h = 144;
+ tile_columns_ = 1;
+ tile_rows_ = 1;
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+ DecodeNFrames(&outputs[0], 10);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
+
+TEST_F(SvcTest,
+ TwoPassEncode2TemporalLayersWithMultipleFrameContextsAndTiles) {
+ // First pass encode
+ std::string stats_buf;
+ vpx_svc_set_options(&svc_, "scale-factors=1/1");
+ svc_.temporal_layers = 2;
+ Pass1EncodeNFrames(10, 1, &stats_buf);
+
+ // Second pass encode
+ codec_enc_.g_pass = VPX_RC_LAST_PASS;
+ svc_.temporal_layers = 2;
+ codec_enc_.g_error_resilient = 0;
+ codec_enc_.g_w = 704;
+ codec_enc_.g_h = 144;
+ tile_columns_ = 1;
+ tile_rows_ = 1;
+ vpx_svc_set_options(&svc_, "auto-alt-refs=1 scale-factors=1/1 "
+ "multi-frame-contexts=1");
+ vpx_fixed_buf outputs[10];
+ memset(&outputs[0], 0, sizeof(outputs));
+ Pass2EncodeNFrames(&stats_buf, 10, 1, &outputs[0]);
+ DecodeNFrames(&outputs[0], 10);
+ FreeBitstreamBuffers(&outputs[0], 10);
+}
+
} // namespace
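
A condensed sketch of the control sequence the new tile tests drive (the values are log2, so 1 requests two tile columns and two tile rows); it assumes an encoder context already initialized as in InitializeEncoder():

    #include "vpx/vpx_encoder.h"
    #include "vpx/vp8cx.h"

    static void request_tiles(vpx_codec_ctx_t *codec) {
      // log2 values: 1 => 2 tile columns, 1 => 2 tile rows
      vpx_codec_control(codec, VP9E_SET_TILE_COLUMNS, 1);
      vpx_codec_control(codec, VP9E_SET_TILE_ROWS, 1);
    }
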
diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc
index fc035af..30a5255 100644
--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc
@@ -15,10 +15,11 @@
extern "C" {
#if CONFIG_VP8
extern void vp8_rtcd();
-#endif
+#endif // CONFIG_VP8
#if CONFIG_VP9
extern void vp9_rtcd();
-#endif
+#endif // CONFIG_VP9
+extern void vpx_scale_rtcd();
}
#include "third_party/googletest/src/include/gtest/gtest.h"
@@ -59,11 +60,12 @@
#if CONFIG_VP8
vp8_rtcd();
-#endif
+#endif // CONFIG_VP8
#if CONFIG_VP9
vp9_rtcd();
-#endif
-#endif
+#endif // CONFIG_VP9
+ vpx_scale_rtcd();
+#endif // !CONFIG_SHARED
return RUN_ALL_TESTS();
}
diff --git a/test/test_vector_test.cc b/test/test_vector_test.cc
index fa264f2..5847074 100644
--- a/test/test_vector_test.cc
+++ b/test/test_vector_test.cc
@@ -29,7 +29,7 @@
enum DecodeMode {
kSerialMode,
- kFrameParallMode
+ kFrameParallelMode
};
const int kDecodeMode = 0;
@@ -95,7 +95,7 @@
vpx_codec_dec_cfg_t cfg = {0};
char str[256];
- if (mode == kFrameParallMode) {
+ if (mode == kFrameParallelMode) {
flags |= VPX_CODEC_USE_FRAME_THREADING;
}
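
Frame-parallel decoding, as selected by kFrameParallelMode above, is an init-time property of the decoder; a minimal standalone sketch, assuming VP9:

    #include "vpx/vpx_decoder.h"
    #include "vpx/vp8dx.h"

    static int init_fp_decoder(vpx_codec_ctx_t *decoder, unsigned int threads) {
      vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
      cfg.threads = threads;
      // The flag must be passed at init; it cannot be toggled afterwards.
      return vpx_codec_dec_init(decoder, vpx_codec_vp9_dx(), &cfg,
                                VPX_CODEC_USE_FRAME_THREADING) ? -1 : 0;
    }
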
diff --git a/test/tools_common.sh b/test/tools_common.sh
index 34c1516..0ae011e 100755
--- a/test/tools_common.sh
+++ b/test/tools_common.sh
@@ -402,6 +402,7 @@
VP9_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-00-quantizer-00.webm"
VP9_FPM_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-07-frame_parallel-1.webm"
+VP9_LT_50_FRAMES_WEBM_FILE="${LIBVPX_TEST_DATA_PATH}/vp90-2-02-size-32x08.webm"
YUV_RAW_INPUT="${LIBVPX_TEST_DATA_PATH}/hantro_collage_w352h288.yuv"
YUV_RAW_INPUT_WIDTH=352
diff --git a/test/vp9_frame_parallel_test.cc b/test/vp9_frame_parallel_test.cc
index 0594d75..f0df88a 100644
--- a/test/vp9_frame_parallel_test.cc
+++ b/test/vp9_frame_parallel_test.cc
@@ -29,7 +29,7 @@
#if CONFIG_WEBM_IO
-struct FileList {
+struct PauseFileList {
const char *name;
// md5 sum for decoded frames which does not include skipped frames.
const char *expected_md5;
@@ -39,7 +39,8 @@
// Decodes |filename| with |num_threads|. Pause at the specified frame_num,
// seek to next key frame and then continue decoding until the end. Return
// the md5 of the decoded frames which does not include skipped frames.
-string DecodeFile(const string &filename, int num_threads, int pause_num) {
+string DecodeFileWithPause(const string &filename, int num_threads,
+ int pause_num) {
libvpx_test::WebMVideoSource video(filename);
video.Init();
int in_frames = 0;
@@ -92,12 +93,12 @@
return string(md5.Get());
}
-void DecodeFiles(const FileList files[]) {
- for (const FileList *iter = files; iter->name != NULL; ++iter) {
+void DecodeFilesWithPause(const PauseFileList files[]) {
+ for (const PauseFileList *iter = files; iter->name != NULL; ++iter) {
SCOPED_TRACE(iter->name);
for (int t = 2; t <= 8; ++t) {
EXPECT_EQ(iter->expected_md5,
- DecodeFile(iter->name, t, iter->pause_frame_num))
+ DecodeFileWithPause(iter->name, t, iter->pause_frame_num))
<< "threads = " << t;
}
}
@@ -106,19 +107,19 @@
TEST(VP9MultiThreadedFrameParallel, PauseSeekResume) {
// vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
// one key frame for every ten frames.
- static const FileList files[] = {
+ static const PauseFileList files[] = {
{ "vp90-2-07-frame_parallel-1.webm",
- "6ea7c3875d67252e7caf2bc6e75b36b1", 6},
+ "6ea7c3875d67252e7caf2bc6e75b36b1", 6 },
{ "vp90-2-07-frame_parallel-1.webm",
- "4bb634160c7356a8d7d4299b6dc83a45", 12},
+ "4bb634160c7356a8d7d4299b6dc83a45", 12 },
{ "vp90-2-07-frame_parallel-1.webm",
- "89772591e6ef461f9fa754f916c78ed8", 26},
- { NULL, NULL, 0},
+ "89772591e6ef461f9fa754f916c78ed8", 26 },
+ { NULL, NULL, 0 },
};
- DecodeFiles(files);
+ DecodeFilesWithPause(files);
}
-struct InvalidFileList {
+struct FileList {
const char *name;
// md5 sum for decoded frames which does not include corrupted frames.
const char *expected_md5;
@@ -128,8 +129,8 @@
// Decodes |filename| with |num_threads|. Return the md5 of the decoded
// frames which does not include corrupted frames.
-string DecodeInvalidFile(const string &filename, int num_threads,
- int expected_frame_count) {
+string DecodeFile(const string &filename, int num_threads,
+ int expected_frame_count) {
libvpx_test::WebMVideoSource video(filename);
video.Init();
@@ -173,37 +174,47 @@
return string(md5.Get());
}
-void DecodeInvalidFiles(const InvalidFileList files[]) {
- for (const InvalidFileList *iter = files; iter->name != NULL; ++iter) {
+void DecodeFiles(const FileList files[]) {
+ for (const FileList *iter = files; iter->name != NULL; ++iter) {
SCOPED_TRACE(iter->name);
for (int t = 2; t <= 8; ++t) {
EXPECT_EQ(iter->expected_md5,
- DecodeInvalidFile(iter->name, t, iter->expected_frame_count))
+ DecodeFile(iter->name, t, iter->expected_frame_count))
<< "threads = " << t;
}
}
}
TEST(VP9MultiThreadedFrameParallel, InvalidFileTest) {
- static const InvalidFileList files[] = {
+ static const FileList files[] = {
// invalid-vp90-2-07-frame_parallel-1.webm is a 40 frame video file with
// one key frame for every ten frames. The 11th frame has corrupted data.
{ "invalid-vp90-2-07-frame_parallel-1.webm",
- "0549d0f45f60deaef8eb708e6c0eb6cb", 30},
+ "0549d0f45f60deaef8eb708e6c0eb6cb", 30 },
// invalid-vp90-2-07-frame_parallel-2.webm is a 40 frame video file with
// one key frame for every ten frames. The 1st and 31st frames have
// corrupted data.
{ "invalid-vp90-2-07-frame_parallel-2.webm",
- "6a1f3cf6f9e7a364212fadb9580d525e", 20},
+ "6a1f3cf6f9e7a364212fadb9580d525e", 20 },
// invalid-vp90-2-07-frame_parallel-3.webm is a 40 frame video file with
// one key frame for every ten frames. The 5th and 13th frames have
// corrupted data.
{ "invalid-vp90-2-07-frame_parallel-3.webm",
- "8256544308de926b0681e04685b98677", 27},
- { NULL, NULL, 0},
+ "8256544308de926b0681e04685b98677", 27 },
+ { NULL, NULL, 0 },
};
- DecodeInvalidFiles(files);
+ DecodeFiles(files);
}
+TEST(VP9MultiThreadedFrameParallel, ValidFileTest) {
+ static const FileList files[] = {
+#if CONFIG_VP9_HIGHBITDEPTH
+ { "vp92-2-20-10bit-yuv420.webm",
+ "a16b99df180c584e8db2ffeda987d293", 10 },
+#endif
+ { NULL, NULL, 0 },
+ };
+ DecodeFiles(files);
+}
#endif // CONFIG_WEBM_IO
} // namespace
diff --git a/test/vpxdec.sh b/test/vpxdec.sh
index d73a447..de51c80 100755
--- a/test/vpxdec.sh
+++ b/test/vpxdec.sh
@@ -17,7 +17,8 @@
# Environment check: Make sure input is available.
vpxdec_verify_environment() {
if [ ! -e "${VP8_IVF_FILE}" ] || [ ! -e "${VP9_WEBM_FILE}" ] || \
- [ ! -e "${VP9_FPM_WEBM_FILE}" ] ; then
+ [ ! -e "${VP9_FPM_WEBM_FILE}" ] || \
+ [ ! -e "${VP9_LT_50_FRAMES_WEBM_FILE}" ] ; then
elog "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
return 1
fi
@@ -87,12 +88,29 @@
--frame-parallel
done
fi
+}
+vpxdec_vp9_webm_less_than_50_frames() {
+ # ensure that reaching eof in webm_guess_framerate doesn't result in invalid
+ # frames in actual webm_read_frame calls.
+ if [ "$(vpxdec_can_decode_vp9)" = "yes" ] && \
+ [ "$(webm_io_available)" = "yes" ]; then
+ local readonly decoder="$(vpx_tool_path vpxdec)"
+ local readonly expected=10
+ local readonly num_frames=$(${VPX_TEST_PREFIX} "${decoder}" \
+ "${VP9_LT_50_FRAMES_WEBM_FILE}" --summary --noblit 2>&1 \
+ | awk '/^[0-9]+ decoded frames/ { print $1 }')
+ if [ "$num_frames" -ne "$expected" ]; then
+ elog "Output frames ($num_frames) != expected ($expected)"
+ return 1
+ fi
+ fi
}
vpxdec_tests="vpxdec_vp8_ivf
vpxdec_vp8_ivf_pipe_input
vpxdec_vp9_webm
- vpxdec_vp9_webm_frame_parallel"
+ vpxdec_vp9_webm_frame_parallel
+ vpxdec_vp9_webm_less_than_50_frames"
run_tests vpxdec_verify_environment "${vpxdec_tests}"
diff --git a/third_party/x86inc/README.libvpx b/third_party/x86inc/README.libvpx
index 02cd9ab..343bcf9 100644
--- a/third_party/x86inc/README.libvpx
+++ b/third_party/x86inc/README.libvpx
@@ -9,3 +9,4 @@
Local Modifications:
Some modifications to allow PIC to work with x86inc.
+Conditionally define program_name to allow overriding.
diff --git a/third_party/x86inc/x86inc.asm b/third_party/x86inc/x86inc.asm
index 9273fc9..bc81169 100644
--- a/third_party/x86inc/x86inc.asm
+++ b/third_party/x86inc/x86inc.asm
@@ -36,7 +36,9 @@
%include "vpx_config.asm"
+%ifndef program_name
%define program_name vp9
+%endif
%define UNIX64 0
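
With the %ifndef guard, an embedder can pick the symbol prefix before including the file; a two-line sketch in the same assembly dialect:

    %define program_name vp8                  ; override before inclusion
    %include "third_party/x86inc/x86inc.asm"  ; falls back to vp9 if undefined
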
diff --git a/vp8/common/debugmodes.c b/vp8/common/debugmodes.c
index 46064e6..159fddc 100644
--- a/vp8/common/debugmodes.c
+++ b/vp8/common/debugmodes.c
@@ -81,7 +81,6 @@
fprintf(mvs, "\n");
/* print out the block modes */
- mb_index = 0;
fprintf(mvs, "Mbs for Frame %d\n", frame);
{
int b_row;
@@ -129,7 +128,6 @@
/* print out the block modes */
- mb_index = 0;
fprintf(mvs, "MVs for Frame %d\n", frame);
{
int b_row;
diff --git a/vp8/common/mfqe.c b/vp8/common/mfqe.c
index 0693326..edd6ca0 100644
--- a/vp8/common/mfqe.c
+++ b/vp8/common/mfqe.c
@@ -153,11 +153,11 @@
actd = (vp8_variance16x16(yd, yd_stride, VP8_ZEROS, 0, &sse)+128)>>8;
act = (vp8_variance16x16(y, y_stride, VP8_ZEROS, 0, &sse)+128)>>8;
#ifdef USE_SSD
- sad = (vp8_variance16x16(y, y_stride, yd, yd_stride, &sse));
+ vp8_variance16x16(y, y_stride, yd, yd_stride, &sse);
sad = (sse + 128)>>8;
- usad = (vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse));
+ vp8_variance8x8(u, uv_stride, ud, uvd_stride, &sse);
usad = (sse + 32)>>6;
- vsad = (vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse));
+ vp8_variance8x8(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 32)>>6;
#else
sad = (vp8_sad16x16(y, y_stride, yd, yd_stride, UINT_MAX) + 128) >> 8;
@@ -170,11 +170,11 @@
actd = (vp8_variance8x8(yd, yd_stride, VP8_ZEROS, 0, &sse)+32)>>6;
act = (vp8_variance8x8(y, y_stride, VP8_ZEROS, 0, &sse)+32)>>6;
#ifdef USE_SSD
- sad = (vp8_variance8x8(y, y_stride, yd, yd_stride, &sse));
+ vp8_variance8x8(y, y_stride, yd, yd_stride, &sse);
sad = (sse + 32)>>6;
- usad = (vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse));
+ vp8_variance4x4(u, uv_stride, ud, uvd_stride, &sse);
usad = (sse + 8)>>4;
- vsad = (vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse));
+ vp8_variance4x4(v, uv_stride, vd, uvd_stride, &sse);
vsad = (sse + 8)>>4;
#else
sad = (vp8_sad8x8(y, y_stride, yd, yd_stride, UINT_MAX) + 32) >> 6;
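
The USE_SSD hunks above drop dead stores: vp8_variance16x16() returns the variance, but this path wants the sum of squared differences, which comes back through the sse out-parameter and is then averaged per pixel with rounding. The normalization, as a sketch:

    // (sse + 128) >> 8 is round-to-nearest division by 256, the pixel count
    // of a 16x16 block; the 8x8 path uses (sse + 32) >> 6 for 64 pixels.
    static unsigned int mean_sse_16x16(unsigned int sse) {
      return (sse + 128) >> 8;
    }
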
diff --git a/vp8/common/ppc/copy_altivec.asm b/vp8/common/ppc/copy_altivec.asm
deleted file mode 100644
index a4ce915..0000000
--- a/vp8/common/ppc/copy_altivec.asm
+++ /dev/null
@@ -1,47 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl copy_mem16x16_ppc
-
-;# r3 unsigned char *src
-;# r4 int src_stride
-;# r5 unsigned char *dst
-;# r6 int dst_stride
-
-;# Make the assumption that input will not be aligned,
-;# but the output will be. So two reads and a perm
-;# for the input, but only one store for the output.
-copy_mem16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xe000
- mtspr 256, r12 ;# set VRSAVE
-
- li r10, 16
- mtctr r10
-
-cp_16x16_loop:
- lvsl v0, 0, r3 ;# permutate value for alignment
-
- lvx v1, 0, r3
- lvx v2, r10, r3
-
- vperm v1, v1, v2, v0
-
- stvx v1, 0, r5
-
- add r3, r3, r4 ;# increment source pointer
- add r5, r5, r6 ;# increment destination pointer
-
- bdnz cp_16x16_loop
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
diff --git a/vp8/common/ppc/filter_altivec.asm b/vp8/common/ppc/filter_altivec.asm
deleted file mode 100644
index 4da2e94..0000000
--- a/vp8/common/ppc/filter_altivec.asm
+++ /dev/null
@@ -1,1013 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl sixtap_predict_ppc
- .globl sixtap_predict8x4_ppc
- .globl sixtap_predict8x8_ppc
- .globl sixtap_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-.macro load_hfilter V0, V1
- load_c \V0, HFilter, r5, r9, r10
-
- addi r5, r5, 16
- lvx \V1, r5, r10
-.endm
-
-;# Vertical filtering
-.macro Vprolog
- load_c v0, VFilter, r6, r3, r10
-
- vspltish v5, 8
- vspltish v6, 3
- vslh v6, v5, v6 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v1, v0, 1
- vspltb v2, v0, 2
- vspltb v3, v0, 3
- vspltb v4, v0, 4
- vspltb v5, v0, 5
- vspltb v0, v0, 0
-.endm
-
-.macro vpre_load
- Vprolog
- li r10, 16
- lvx v10, 0, r9 ;# v10..v14 = first 5 rows
- lvx v11, r10, r9
- addi r9, r9, 32
- lvx v12, 0, r9
- lvx v13, r10, r9
- addi r9, r9, 32
- lvx v14, 0, r9
-.endm
-
-.macro Msum Re, Ro, V, T, TMP
- ;# (Re,Ro) += (V*T)
- vmuleub \TMP, \V, \T ;# trashes v8
- vadduhm \Re, \Re, \TMP ;# Re = evens, saturation unnecessary
- vmuloub \TMP, \V, \T
- vadduhm \Ro, \Ro, \TMP ;# Ro = odds
-.endm
-
-.macro vinterp_no_store P0 P1 P2 P3 P4 P5
- vmuleub v8, \P0, v0 ;# 64 + 4 positive taps
- vadduhm v16, v6, v8
- vmuloub v8, \P0, v0
- vadduhm v17, v6, v8
- Msum v16, v17, \P2, v2, v8
- Msum v16, v17, \P3, v3, v8
- Msum v16, v17, \P5, v5, v8
-
- vmuleub v18, \P1, v1 ;# 2 negative taps
- vmuloub v19, \P1, v1
- Msum v18, v19, \P4, v4, v8
-
- vsubuhs v16, v16, v18 ;# subtract neg from pos
- vsubuhs v17, v17, v19
- vsrh v16, v16, v7 ;# divide by 128
- vsrh v17, v17, v7 ;# v16 v17 = evens, odds
- vmrghh v18, v16, v17 ;# v18 v19 = 16-bit result in order
- vmrglh v19, v16, v17
- vpkuhus \P0, v18, v19 ;# P0 = 8-bit result
-.endm
-
-.macro vinterp_no_store_8x8 P0 P1 P2 P3 P4 P5
- vmuleub v24, \P0, v13 ;# 64 + 4 positive taps
- vadduhm v21, v20, v24
- vmuloub v24, \P0, v13
- vadduhm v22, v20, v24
- Msum v21, v22, \P2, v15, v25
- Msum v21, v22, \P3, v16, v25
- Msum v21, v22, \P5, v18, v25
-
- vmuleub v23, \P1, v14 ;# 2 negative taps
- vmuloub v24, \P1, v14
- Msum v23, v24, \P4, v17, v25
-
- vsubuhs v21, v21, v23 ;# subtract neg from pos
- vsubuhs v22, v22, v24
- vsrh v21, v21, v19 ;# divide by 128
- vsrh v22, v22, v19 ;# v16 v17 = evens, odds
- vmrghh v23, v21, v22 ;# v18 v19 = 16-bit result in order
- vmrglh v24, v21, v22
- vpkuhus \P0, v23, v24 ;# P0 = 8-bit result
-.endm
-
-
-.macro Vinterp P0 P1 P2 P3 P4 P5
- vinterp_no_store \P0, \P1, \P2, \P3, \P4, \P5
- stvx \P0, 0, r7
- add r7, r7, r8 ;# 33 ops per 16 pels
-.endm
-
-
-.macro luma_v P0, P1, P2, P3, P4, P5
- addi r9, r9, 16 ;# P5 = newest input row
- lvx \P5, 0, r9
- Vinterp \P0, \P1, \P2, \P3, \P4, \P5
-.endm
-
-.macro luma_vtwo
- luma_v v10, v11, v12, v13, v14, v15
- luma_v v11, v12, v13, v14, v15, v10
-.endm
-
-.macro luma_vfour
- luma_vtwo
- luma_v v12, v13, v14, v15, v10, v11
- luma_v v13, v14, v15, v10, v11, v12
-.endm
-
-.macro luma_vsix
- luma_vfour
- luma_v v14, v15, v10, v11, v12, v13
- luma_v v15, v10, v11, v12, v13, v14
-.endm
-
-.macro Interp4 R I I4
- vmsummbm \R, v13, \I, v15
- vmsummbm \R, v14, \I4, \R
-.endm
-
-.macro Read8x8 VD, RS, RP, increment_counter
- lvsl v21, 0, \RS ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input will can span three vectors if not aligned correctly.
- lvx \VD, 0, \RS
- lvx v20, r10, \RS
-
-.if \increment_counter
- add \RS, \RS, \RP
-.endif
-
- vperm \VD, \VD, v20, v21
-.endm
-
-.macro interp_8x8 R
- vperm v20, \R, \R, v16 ;# v20 = 0123 1234 2345 3456
- vperm v21, \R, \R, v17 ;# v21 = 4567 5678 6789 789A
- Interp4 v20, v20, v21 ;# v20 = result 0 1 2 3
- vperm \R, \R, \R, v18 ;# R = 89AB 9ABC ABCx BCxx
- Interp4 v21, v21, \R ;# v21 = result 4 5 6 7
-
- vpkswus \R, v20, v21 ;# R = 0 1 2 3 4 5 6 7
- vsrh \R, \R, v19
-
- vpkuhus \R, \R, \R ;# saturate and pack
-
-.endm
-
-.macro Read4x4 VD, RS, RP, increment_counter
- lvsl v21, 0, \RS ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input will can span three vectors if not aligned correctly.
- lvx v20, 0, \RS
-
-.if \increment_counter
- add \RS, \RS, \RP
-.endif
-
- vperm \VD, v20, v20, v21
-.endm
- .text
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-sixtap_predict_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xff87
- ori r12, r12, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq- vertical_only_4x4
-
- ;# load up horizontal filter
- load_hfilter v13, v14
-
- ;# rounding added in on the multiply
- vspltisw v16, 8
- vspltisw v15, 3
- vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
-
- ;# Load up permutation constants
- load_c v16, B_0123, 0, r9, r10
- load_c v17, B_4567, 0, r9, r10
- load_c v18, B_89AB, 0, r9, r10
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- addi r9, r3, 0
- li r10, 16
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# filter a line
- interp_8x8 v2
- interp_8x8 v3
- interp_8x8 v4
- interp_8x8 v5
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional 5 lines that are needed
- ;# for the vertical filter.
- beq- store_4x4
-
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r9, r9, r4
- sub r9, r9, r4
-
- Read8x8 v0, r9, r4, 1
- Read8x8 v1, r9, r4, 0
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 0
-
- interp_8x8 v0
- interp_8x8 v1
- interp_8x8 v6
- interp_8x8 v7
- interp_8x8 v8
-
- b second_pass_4x4
-
-vertical_only_4x4:
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
- li r10, 16
-
- Read8x8 v0, r3, r4, 1
- Read8x8 v1, r3, r4, 1
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 0
-
- slwi r6, r6, 4 ;# index into vertical filter array
-
-second_pass_4x4:
- load_c v20, b_hilo_4x4, 0, r9, r10
- load_c v21, b_hilo, 0, r9, r10
-
- ;# reposition input so that it can go through the
- ;# filtering phase with one pass.
- vperm v0, v0, v1, v20 ;# 0 1 x x
- vperm v2, v2, v3, v20 ;# 2 3 x x
- vperm v4, v4, v5, v20 ;# 4 5 x x
- vperm v6, v6, v7, v20 ;# 6 7 x x
-
- vperm v0, v0, v2, v21 ;# 0 1 2 3
- vperm v4, v4, v6, v21 ;# 4 5 6 7
-
- vsldoi v1, v0, v4, 4
- vsldoi v2, v0, v4, 8
- vsldoi v3, v0, v4, 12
-
- vsldoi v5, v4, v8, 4
-
- load_c v13, VFilter, r6, r9, r10
-
- vspltish v15, 8
- vspltish v20, 3
- vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v14, v13, 1
- vspltb v15, v13, 2
- vspltb v16, v13, 3
- vspltb v17, v13, 4
- vspltb v18, v13, 5
- vspltb v13, v13, 0
-
- vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
-
- stvx v0, 0, r1
-
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- lwz r0, 4(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- lwz r0, 8(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- lwz r0, 12(r1)
- stw r0, 0(r7)
-
- b exit_4x4
-
-store_4x4:
-
- stvx v2, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v3, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v4, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v5, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
-
-exit_4x4:
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro w_8x8 V, D, R, P
- stvx \V, 0, r1
- lwz \R, 0(r1)
- stw \R, 0(r7)
- lwz \R, 4(r1)
- stw \R, 4(r7)
- add \D, \D, \P
-.endm
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-sixtap_predict8x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq- second_pass_pre_copy_8x4
-
- load_hfilter v13, v14
-
- ;# rounding added in on the multiply
- vspltisw v16, 8
- vspltisw v15, 3
- vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
-
- ;# Load up permutation constants
- load_c v16, B_0123, 0, r9, r10
- load_c v17, B_4567, 0, r9, r10
- load_c v18, B_89AB, 0, r9, r10
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- addi r9, r3, 0
- li r10, 16
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# filter a line
- interp_8x8 v2
- interp_8x8 v3
- interp_8x8 v4
- interp_8x8 v5
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional 5 lines that are needed
- ;# for the vertical filter.
- beq- store_8x4
-
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r9, r9, r4
- sub r9, r9, r4
-
- Read8x8 v0, r9, r4, 1
- Read8x8 v1, r9, r4, 0
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 0
-
- interp_8x8 v0
- interp_8x8 v1
- interp_8x8 v6
- interp_8x8 v7
- interp_8x8 v8
-
- b second_pass_8x4
-
-second_pass_pre_copy_8x4:
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
- li r10, 16
-
- Read8x8 v0, r3, r4, 1
- Read8x8 v1, r3, r4, 1
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 1
-
- slwi r6, r6, 4 ;# index into vertical filter array
-
-second_pass_8x4:
- load_c v13, VFilter, r6, r9, r10
-
- vspltish v15, 8
- vspltish v20, 3
- vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v14, v13, 1
- vspltb v15, v13, 2
- vspltb v16, v13, 3
- vspltb v17, v13, 4
- vspltb v18, v13, 5
- vspltb v13, v13, 0
-
- vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
- vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
- vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
- vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x4
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
-
- b exit_8x4
-
-store_aligned_8x4:
-
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
-
- b exit_8x4
-
-store_8x4:
- cmpi cr0, r8, 8
- beq cr0, store_aligned2_8x4
-
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
-
- b exit_8x4
-
-store_aligned2_8x4:
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
-
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
-
-exit_8x4:
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Because the width that needs to be filtered will fit in a single altivec
-;# register there is no need to loop. Everything can stay in registers.
-sixtap_predict8x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq- second_pass_pre_copy_8x8
-
- load_hfilter v13, v14
-
- ;# rounding added in on the multiply
- vspltisw v16, 8
- vspltisw v15, 3
- vslw v15, v16, v15 ;# 0x00000040000000400000004000000040
-
- ;# Load up permutation constants
- load_c v16, B_0123, 0, r9, r10
- load_c v17, B_4567, 0, r9, r10
- load_c v18, B_89AB, 0, r9, r10
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- addi r9, r3, 0
- li r10, 16
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 1
- Read8x8 v9, r3, r4, 1
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# filter a line
- interp_8x8 v2
- interp_8x8 v3
- interp_8x8 v4
- interp_8x8 v5
- interp_8x8 v6
- interp_8x8 v7
- interp_8x8 v8
- interp_8x8 v9
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional 5 lines that are needed
- ;# for the vertical filter.
- beq- store_8x8
-
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r9, r9, r4
- sub r9, r9, r4
-
- Read8x8 v0, r9, r4, 1
- Read8x8 v1, r9, r4, 0
- Read8x8 v10, r3, r4, 1
- Read8x8 v11, r3, r4, 1
- Read8x8 v12, r3, r4, 0
-
- interp_8x8 v0
- interp_8x8 v1
- interp_8x8 v10
- interp_8x8 v11
- interp_8x8 v12
-
- b second_pass_8x8
-
-second_pass_pre_copy_8x8:
- ;# only needed if there is a vertical filter present
- ;# if the second filter is not null then need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
- li r10, 16
-
- Read8x8 v0, r3, r4, 1
- Read8x8 v1, r3, r4, 1
- Read8x8 v2, r3, r4, 1
- Read8x8 v3, r3, r4, 1
- Read8x8 v4, r3, r4, 1
- Read8x8 v5, r3, r4, 1
- Read8x8 v6, r3, r4, 1
- Read8x8 v7, r3, r4, 1
- Read8x8 v8, r3, r4, 1
- Read8x8 v9, r3, r4, 1
- Read8x8 v10, r3, r4, 1
- Read8x8 v11, r3, r4, 1
- Read8x8 v12, r3, r4, 0
-
- slwi r6, r6, 4 ;# index into vertical filter array
-
-second_pass_8x8:
- load_c v13, VFilter, r6, r9, r10
-
- vspltish v15, 8
- vspltish v20, 3
- vslh v20, v15, v20 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- vspltb v14, v13, 1
- vspltb v15, v13, 2
- vspltb v16, v13, 3
- vspltb v17, v13, 4
- vspltb v18, v13, 5
- vspltb v13, v13, 0
-
- vinterp_no_store_8x8 v0, v1, v2, v3, v4, v5
- vinterp_no_store_8x8 v1, v2, v3, v4, v5, v6
- vinterp_no_store_8x8 v2, v3, v4, v5, v6, v7
- vinterp_no_store_8x8 v3, v4, v5, v6, v7, v8
- vinterp_no_store_8x8 v4, v5, v6, v7, v8, v9
- vinterp_no_store_8x8 v5, v6, v7, v8, v9, v10
- vinterp_no_store_8x8 v6, v7, v8, v9, v10, v11
- vinterp_no_store_8x8 v7, v8, v9, v10, v11, v12
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x8
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
- w_8x8 v6, r7, r0, r8
- w_8x8 v7, r7, r0, r8
-
- b exit_8x8
-
-store_aligned_8x8:
-
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
- vperm v6, v6, v7, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
- addi r7, r7, 16
- stvx v6, 0, r7
-
- b exit_8x8
-
-store_8x8:
- cmpi cr0, r8, 8
- beq cr0, store_aligned2_8x8
-
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
- w_8x8 v6, r7, r0, r8
- w_8x8 v7, r7, r0, r8
- w_8x8 v8, r7, r0, r8
- w_8x8 v9, r7, r0, r8
-
- b exit_8x8
-
-store_aligned2_8x8:
- load_c v10, b_hilo, 0, r9, r10
-
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
- vperm v6, v6, v7, v10
- vperm v8, v8, v9, v10
-
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
- addi r7, r7, 16
- stvx v6, 0, r7
- addi r7, r7, 16
- stvx v8, 0, r7
-
-exit_8x8:
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-
-;# Two pass filtering. First pass is Horizontal edges, second pass is vertical
-;# edges. One of the filters can be null, but both won't be. Needs to use a
-;# temporary buffer because the source buffer can't be modified and the buffer
-;# for the destination is not large enough to hold the temporary data.
-sixtap_predict16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xf000
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-416(r1) ;# create space on the stack
-
- ;# Three possiblities
- ;# 1. First filter is null. Don't use a temp buffer.
- ;# 2. Second filter is null. Don't use a temp buffer.
- ;# 3. Neither are null, use temp buffer.
-
- ;# First Pass (horizontal edge)
- ;# setup pointers for src
- ;# if possiblity (1) then setup the src pointer to be the orginal and jump
- ;# to second pass. this is based on if x_offset is 0.
-
- ;# load up horizontal filter
- slwi. r5, r5, 5 ;# index into horizontal filter array
-
- load_hfilter v4, v5
-
- beq- copy_horizontal_16x21
-
- ;# Back off input buffer by 2 bytes. Need 2 before and 3 after
- addi r3, r3, -2
-
- slwi. r6, r6, 4 ;# index into vertical filter array
-
- ;# setup constants
- ;# v14 permutation value for alignment
- load_c v14, b_hperm, 0, r9, r10
-
- ;# These statements assume there won't be a second pass;
- ;# if there is one, they are overridden on the fall-through path below
- li r0, 16 ;# prepare for no vertical filter
-
- ;# Change the output pointer and pitch to be the actual
- ;# destination instead of a temporary buffer.
- addi r9, r7, 0
- addi r5, r8, 0
-
- ;# no vertical filter, so write the output from the first pass
- ;# directly into the output buffer.
- beq- no_vertical_filter_bypass
-
- ;# if the second filter is not null then we need to back off by 2*pitch
- sub r3, r3, r4
- sub r3, r3, r4
-
- ;# setup counter for the number of lines that are going to be filtered
- li r0, 21
-
- ;# use the stack as temporary storage
- la r9, 48(r1)
- li r5, 16
-
-no_vertical_filter_bypass:
-
- mtctr r0
-
- ;# rounding added in on the multiply
- vspltisw v10, 8
- vspltisw v12, 3
- vslw v12, v10, v12 ;# 0x00000040000000400000004000000040
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v13, 7
-
- ;# index to the next set of vectors in the row.
- li r10, 16
- li r12, 32
-
-horizontal_loop_16x16:
-
- lvsl v15, 0, r3 ;# permute value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input can span three vectors if not aligned correctly.
- lvx v1, 0, r3
- lvx v2, r10, r3
- lvx v3, r12, r3
-
- vperm v8, v1, v2, v15
- vperm v9, v2, v3, v15 ;# v8 v9 = 21 input pixels left-justified
-
- vsldoi v11, v8, v9, 4
-
- ;# set 0
- vmsummbm v6, v4, v8, v12 ;# taps times elements
- vmsummbm v0, v5, v11, v6
-
- ;# set 1
- vsldoi v10, v8, v9, 1
- vsldoi v11, v8, v9, 5
-
- vmsummbm v6, v4, v10, v12
- vmsummbm v1, v5, v11, v6
-
- ;# set 2
- vsldoi v10, v8, v9, 2
- vsldoi v11, v8, v9, 6
-
- vmsummbm v6, v4, v10, v12
- vmsummbm v2, v5, v11, v6
-
- ;# set 3
- vsldoi v10, v8, v9, 3
- vsldoi v11, v8, v9, 7
-
- vmsummbm v6, v4, v10, v12
- vmsummbm v3, v5, v11, v6
-
- vpkswus v0, v0, v1 ;# v0 = 0 4 8 C 1 5 9 D (16-bit)
- vpkswus v1, v2, v3 ;# v1 = 2 6 A E 3 7 B F
-
- vsrh v0, v0, v13 ;# divide v0, v1 by 128
- vsrh v1, v1, v13
-
- vpkuhus v0, v0, v1 ;# v0 = scrambled 8-bit result
- vperm v0, v0, v0, v14 ;# v0 = correctly-ordered result
-
- stvx v0, 0, r9
- add r9, r9, r5
-
- add r3, r3, r4
-
- bdnz horizontal_loop_16x16
-
- ;# check again to see if vertical filter needs to be done.
- cmpi cr0, r6, 0
- beq cr0, end_16x16
-
- ;# yes there is, so go to the second pass
- b second_pass_16x16
-
-copy_horizontal_16x21:
- li r10, 21
- mtctr r10
-
- li r10, 16
-
- sub r3, r3, r4
- sub r3, r3, r4
-
- ;# this is done above if there is a horizontal filter;
- ;# if not, it needs to be done down here.
- slwi r6, r6, 4 ;# index into vertical filter array
-
- ;# always write to the stack when doing a horizontal copy
- la r9, 48(r1)
-
-copy_horizontal_loop_16x21:
- lvsl v15, 0, r3 ;# permute value for alignment
-
- lvx v1, 0, r3
- lvx v2, r10, r3
-
- vperm v8, v1, v2, v15
-
- stvx v8, 0, r9
- addi r9, r9, 16
-
- add r3, r3, r4
-
- bdnz copy_horizontal_loop_16x21
-
-second_pass_16x16:
-
- ;# always read from the stack when doing a vertical filter
- la r9, 48(r1)
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v7, 7
-
- vpre_load
-
- luma_vsix
- luma_vsix
- luma_vfour
-
-end_16x16:
-
- addi r1, r1, 416 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
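Before the data section, the control flow of sixtap_predict16x16_ppc above is worth restating in scalar terms. A minimal C sketch of the three cases; filter_row6() and filter_col6() are hypothetical helpers (one six-tap pass each, using the coefficient tables below), not functions from this file:

    #include <string.h>

    /* Hypothetical helpers: one 6-tap pass over 16 pixels.
     * filter_row6 reads src[0..20] (2 pels before, 3 after each output);
     * filter_col6 reads rows center-2 .. center+3. */
    void filter_row6(const unsigned char *src, unsigned char *dst, int x_offset);
    void filter_col6(const unsigned char *center, int pitch,
                     unsigned char *dst, int dst_pitch, int y_offset);

    void sixtap_predict16x16_sketch(const unsigned char *src, int src_pitch,
                                    int x_offset, int y_offset,
                                    unsigned char *dst, int dst_pitch) {
        unsigned char tmp[21 * 16]; /* 16 rows + 5 extra for the vertical taps */
        int r, c;

        if (x_offset == 0) {
            /* case 1: no horizontal filter.  The asm still stages rows
             * through the stack so the vertical pass gets aligned loads. */
            const unsigned char *s = src - 2 * src_pitch;
            for (r = 0; r < 21; r++)
                memcpy(tmp + r * 16, s + r * src_pitch, 16);
            for (c = 0; c < 16; c++)
                filter_col6(tmp + 2 * 16 + c, 16, dst + c, dst_pitch, y_offset);
        } else if (y_offset == 0) {
            /* case 2: no vertical filter; first-pass output goes to dst. */
            for (r = 0; r < 16; r++)
                filter_row6(src - 2 + r * src_pitch, dst + r * dst_pitch, x_offset);
        } else {
            /* case 3: both filters; horizontal pass into the temp buffer,
             * starting 2 rows above and covering 21 rows. */
            const unsigned char *s = src - 2 - 2 * src_pitch;
            for (r = 0; r < 21; r++)
                filter_row6(s + r * src_pitch, tmp + r * 16, x_offset);
            for (c = 0; c < 16; c++)
                filter_col6(tmp + 2 * 16 + c, 16, dst + c, dst_pitch, y_offset);
        }
    }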
- .data
-
- .align 4
-HFilter:
- .byte 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12, 0, -6,123, 12
- .byte -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0
- .byte 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36, 2,-11,108, 36
- .byte -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0, -8, 1, 0, 0
- .byte 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50, 0, -9, 93, 50
- .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
- .byte 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77, 3,-16, 77, 77
- .byte -16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0,-16, 3, 0, 0
- .byte 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93, 0, -6, 50, 93
- .byte -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0, -9, 0, 0, 0
- .byte 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108, 1, -8, 36,108
- .byte -11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0,-11, 2, 0, 0
- .byte 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123, 0, -1, 12,123
- .byte -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0, -6, 0, 0, 0
-
- .align 4
-VFilter:
- .byte 0, 0,128, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 6,123, 12, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 2, 11,108, 36, 8, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 9, 93, 50, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 3, 16, 77, 77, 16, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 6, 50, 93, 9, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 1, 8, 36,108, 11, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 0, 1, 12,123, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
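Each pair of HFilter rows above holds one subpel position's six taps, four in the first row and the remaining two in the second, each replicated four times so vmsummbm can form four partial sums per vector; VFilter carries the same magnitudes for the vertical pass. In scalar terms the kernel is a dot product with round-to-nearest in Q7 fixed point. A sketch (the saturation mirrors vpkswus/vpkuhus):

    /* One six-tap output pixel; src points 2 pels before the center.
     * The taps sum to 128, so +64 and >>7 round to nearest. */
    static unsigned char sixtap_pixel(const unsigned char *src,
                                      const signed char taps[6]) {
        int k, sum = 64;              /* rounding added in on the multiply */
        for (k = 0; k < 6; k++)
            sum += taps[k] * src[k];
        sum >>= 7;
        if (sum < 0)   sum = 0;       /* vpkswus/vpkuhus saturate to 0..255 */
        if (sum > 255) sum = 255;
        return (unsigned char)sum;
    }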
-
- .align 4
-b_hperm:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
- .align 4
-B_0123:
- .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-
- .align 4
-B_4567:
- .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-
- .align 4
-B_89AB:
- .byte 8, 9, 10, 11, 9, 10, 11, 12, 10, 11, 12, 13, 11, 12, 13, 14
-
- .align 4
-b_hilo:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-
- .align 4
-b_hilo_4x4:
- .byte 0, 1, 2, 3, 16, 17, 18, 19, 0, 0, 0, 0, 0, 0, 0, 0
diff --git a/vp8/common/ppc/filter_bilinear_altivec.asm b/vp8/common/ppc/filter_bilinear_altivec.asm
deleted file mode 100644
index fd8aa66..0000000
--- a/vp8/common/ppc/filter_bilinear_altivec.asm
+++ /dev/null
@@ -1,677 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl bilinear_predict4x4_ppc
- .globl bilinear_predict8x4_ppc
- .globl bilinear_predict8x8_ppc
- .globl bilinear_predict16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
- load_c \V0, vfilter_b, r6, r9, r10
-
- addi r6, r6, 16
- lvx \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
- ;# load up horizontal filter
- slwi. r5, r5, 4 ;# index into horizontal filter array
-
- ;# index to the next set of vectors in the row.
- li r10, 16
- li r12, 32
-
- ;# downshift by 7 ( divide by 128 ) at the end
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq \jump_label
-
- load_c v20, hfilter_b, r5, r9, r0
-
- ;# setup constants
- ;# v28 permutation value for alignment
- load_c v28, b_hperm_b, 0, r9, r0
-
- ;# rounding added in on the multiply
- vspltisw v21, 8
- vspltisw v18, 3
- vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
-
- slwi. r6, r6, 5 ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-.macro HFilter V
- vperm v24, v21, v21, v10 ;# v24 = 0123 1234 2345 3456
- vperm v25, v21, v21, v11 ;# v25 = 4567 5678 6789 789A
-
- vmsummbm v24, v20, v24, v18
- vmsummbm v25, v20, v25, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
- vsrh v24, v24, v19 ;# divide v24 by 128
-
- vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
-.endm
-
-.macro hfilter_8 V, increment_counter
- lvsl v17, 0, r3 ;# permute value for alignment
-
- ;# input to filter is 9 bytes wide, output is 8 bytes.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
-
- HFilter \V
-.endm
-
-
-.macro load_and_align_8 V, increment_counter
- lvsl v17, 0, r3 ;# permute value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
-
- vperm \V, v21, v22, v17
-.endm
-
-.macro write_aligned_8 V, increment_counter
- stvx \V, 0, r7
-
-.if \increment_counter
- add r7, r7, r8
-.endif
-.endm
-
-.macro vfilter_16 P0 P1
- vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
- vadduhm v22, v18, v22
- vmuloub v23, \P0, v20
- vadduhm v23, v18, v23
-
- vmuleub v24, \P1, v21
- vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
- vmuloub v25, \P1, v21
- vadduhm v23, v23, v25 ;# Ro = odds
-
- vsrh v22, v22, v19 ;# divide by 128
- vsrh v23, v23, v19 ;# v22 v23 = evens, odds
- vmrghh \P0, v22, v23 ;# \P0, v23 = 16-bit result in order
- vmrglh v23, v22, v23
- vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
-.endm
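vfilter_16 above blends two rows with a pair of Q7 weights: even/odd unsigned multiplies, the 0x0040 rounding bias added in, then a 7-bit shift and re-interleave. The per-pixel arithmetic, as a scalar model:

    /* Blend pels a (weight w0) and b (weight w1), where w0 + w1 == 128.
     * No clamp is needed: the result never exceeds 255. */
    static unsigned char bilinear_pixel(unsigned char a, unsigned char b,
                                        int w0, int w1) {
        return (unsigned char)((a * w0 + b * w1 + 64) >> 7);
    }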
-
-
-.macro w_8x8 V, D, R, P
- stvx \V, 0, r1
- lwz \R, 0(r1)
- stw \R, 0(r7)
- lwz \R, 4(r1)
- stw \R, 4(r7)
- add \D, \D, \P
-.endm
-
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict4x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf830
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_4x4_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r9, r12
- load_c v11, b_4567_b, 0, r9, r12
-
- hfilter_8 v0, 1
- hfilter_8 v1, 1
- hfilter_8 v2, 1
- hfilter_8 v3, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_4x4_b
-
- hfilter_8 v4, 0
-
- b second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_8 v0, 1
- load_and_align_8 v1, 1
- load_and_align_8 v2, 1
- load_and_align_8 v3, 1
- load_and_align_8 v4, 1
-
-second_pass_4x4_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
-
-store_out_4x4_b:
-
- stvx v0, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v1, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v2, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
- add r7, r7, r8
-
- stvx v3, 0, r1
- lwz r0, 0(r1)
- stw r0, 0(r7)
-
-exit_4x4:
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf830
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x4_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r9, r12
- load_c v11, b_4567_b, 0, r9, r12
-
- hfilter_8 v0, 1
- hfilter_8 v1, 1
- hfilter_8 v2, 1
- hfilter_8 v3, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_8x4_b
-
- hfilter_8 v4, 0
-
- b second_pass_8x4_b
-
-second_pass_8x4_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_8 v0, 1
- load_and_align_8 v1, 1
- load_and_align_8 v2, 1
- load_and_align_8 v3, 1
- load_and_align_8 v4, 1
-
-second_pass_8x4_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
-
-store_out_8x4_b:
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x4_b
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
-
- b exit_8x4
-
-store_aligned_8x4_b:
- load_c v10, b_hilo_b, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
-
-exit_8x4:
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict8x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfff0
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x8_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r9, r12
- load_c v11, b_4567_b, 0, r9, r12
-
- hfilter_8 v0, 1
- hfilter_8 v1, 1
- hfilter_8 v2, 1
- hfilter_8 v3, 1
- hfilter_8 v4, 1
- hfilter_8 v5, 1
- hfilter_8 v6, 1
- hfilter_8 v7, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_8x8_b
-
- hfilter_8 v8, 0
-
- b second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_8 v0, 1
- load_and_align_8 v1, 1
- load_and_align_8 v2, 1
- load_and_align_8 v3, 1
- load_and_align_8 v4, 1
- load_and_align_8 v5, 1
- load_and_align_8 v6, 1
- load_and_align_8 v7, 1
- load_and_align_8 v8, 0
-
-second_pass_8x8_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
-
-store_out_8x8_b:
-
- cmpi cr0, r8, 8
- beq cr0, store_aligned_8x8_b
-
- w_8x8 v0, r7, r0, r8
- w_8x8 v1, r7, r0, r8
- w_8x8 v2, r7, r0, r8
- w_8x8 v3, r7, r0, r8
- w_8x8 v4, r7, r0, r8
- w_8x8 v5, r7, r0, r8
- w_8x8 v6, r7, r0, r8
- w_8x8 v7, r7, r0, r8
-
- b exit_8x8
-
-store_aligned_8x8_b:
- load_c v10, b_hilo_b, 0, r9, r10
-
- vperm v0, v0, v1, v10
- vperm v2, v2, v3, v10
- vperm v4, v4, v5, v10
- vperm v6, v6, v7, v10
-
- stvx v0, 0, r7
- addi r7, r7, 16
- stvx v2, 0, r7
- addi r7, r7, 16
- stvx v4, 0, r7
- addi r7, r7, 16
- stvx v6, 0, r7
-
-exit_8x8:
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
- lvsl v17, 0, r3 ;# permute value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
- lvx v23, r12, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
- vperm v22, v22, v23, v17 ;# v21 v22 = 21 input pixels left-justified
-
- ;# set 0
- vmsummbm v24, v20, v21, v18 ;# taps times elements
-
- ;# set 1
- vsldoi v23, v21, v22, 1
- vmsummbm v25, v20, v23, v18
-
- ;# set 2
- vsldoi v23, v21, v22, 2
- vmsummbm v26, v20, v23, v18
-
- ;# set 3
- vsldoi v23, v21, v22, 3
- vmsummbm v27, v20, v23, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
- vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
-
- vsrh v24, v24, v19 ;# divide v24, v25 by 128
- vsrh v25, v25, v19
-
- vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
- vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
-.endm
-
-.macro load_and_align_16 V, increment_counter
- lvsl v17, 0, r3 ;# permute value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
-
- vperm \V, v21, v22, v17
-.endm
-
-.macro write_16 V, increment_counter
- stvx \V, 0, r7
-
-.if \increment_counter
- add r7, r7, r8
-.endif
-.endm
-
- .align 2
-;# r3 unsigned char * src
-;# r4 int src_pitch
-;# r5 int x_offset
-;# r6 int y_offset
-;# r7 unsigned char * dst
-;# r8 int dst_pitch
-bilinear_predict16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- HProlog second_pass_16x16_pre_copy_b
-
- hfilter_16 v0, 1
- hfilter_16 v1, 1
- hfilter_16 v2, 1
- hfilter_16 v3, 1
- hfilter_16 v4, 1
- hfilter_16 v5, 1
- hfilter_16 v6, 1
- hfilter_16 v7, 1
- hfilter_16 v8, 1
- hfilter_16 v9, 1
- hfilter_16 v10, 1
- hfilter_16 v11, 1
- hfilter_16 v12, 1
- hfilter_16 v13, 1
- hfilter_16 v14, 1
- hfilter_16 v15, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq store_out_16x16_b
-
- hfilter_16 v16, 0
-
- b second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, 1
- load_and_align_16 v1, 1
- load_and_align_16 v2, 1
- load_and_align_16 v3, 1
- load_and_align_16 v4, 1
- load_and_align_16 v5, 1
- load_and_align_16 v6, 1
- load_and_align_16 v7, 1
- load_and_align_16 v8, 1
- load_and_align_16 v9, 1
- load_and_align_16 v10, 1
- load_and_align_16 v11, 1
- load_and_align_16 v12, 1
- load_and_align_16 v13, 1
- load_and_align_16 v14, 1
- load_and_align_16 v15, 1
- load_and_align_16 v16, 0
-
-second_pass_16x16_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
- vfilter_16 v8, v9
- vfilter_16 v9, v10
- vfilter_16 v10, v11
- vfilter_16 v11, v12
- vfilter_16 v12, v13
- vfilter_16 v13, v14
- vfilter_16 v14, v15
- vfilter_16 v15, v16
-
-store_out_16x16_b:
-
- write_16 v0, 1
- write_16 v1, 1
- write_16 v2, 1
- write_16 v3, 1
- write_16 v4, 1
- write_16 v5, 1
- write_16 v6, 1
- write_16 v7, 1
- write_16 v8, 1
- write_16 v9, 1
- write_16 v10, 1
- write_16 v11, 1
- write_16 v12, 1
- write_16 v13, 1
- write_16 v14, 1
- write_16 v15, 0
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
- .align 4
-hfilter_b:
- .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
- .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
- .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
- .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
- .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
- .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
- .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
- .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
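The hfilter_b rows above (and the matching row pairs of vfilter_b below) are not hand-tuned: they are the Q7 bilinear weights 128 - 16*o and 16*o for subpel offset o = 0..7. A sketch that regenerates the table:

    /* Regenerate the bilinear tap table for subpel offsets 0..7. */
    static void make_bilinear_taps(unsigned char taps[8][2]) {
        int o;
        for (o = 0; o < 8; o++) {
            taps[o][0] = (unsigned char)(128 - 16 * o); /* 128, 112, ..., 16 */
            taps[o][1] = (unsigned char)(16 * o);       /*   0,  16, ..., 112 */
        }
    }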
-
- .align 4
-vfilter_b:
- .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
- .align 4
-b_hperm_b:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
- .align 4
-b_0123_b:
- .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-
- .align 4
-b_4567_b:
- .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-
-b_hilo_b:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/vp8/common/ppc/idctllm_altivec.asm b/vp8/common/ppc/idctllm_altivec.asm
deleted file mode 100644
index 117d9cf..0000000
--- a/vp8/common/ppc/idctllm_altivec.asm
+++ /dev/null
@@ -1,189 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl short_idct4x4llm_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
- .align 2
-short_idct4x4llm_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- load_c v8, sinpi8sqrt2, 0, r9, r10
- load_c v9, cospi8sqrt2minus1, 0, r9, r10
- load_c v10, hi_hi, 0, r9, r10
- load_c v11, lo_lo, 0, r9, r10
- load_c v12, shift_16, 0, r9, r10
-
- li r10, 16
- lvx v0, 0, r3 ;# input ip[0], ip[ 4]
- lvx v1, r10, r3 ;# input ip[8], ip[12]
-
- ;# first pass
- vupkhsh v2, v0
- vupkhsh v3, v1
- vaddsws v6, v2, v3 ;# a1 = ip[0]+ip[8]
- vsubsws v7, v2, v3 ;# b1 = ip[0]-ip[8]
-
- vupklsh v0, v0
- vmulosh v4, v0, v8
- vsraw v4, v4, v12
- vaddsws v4, v4, v0 ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
- vupklsh v1, v1
- vmulosh v5, v1, v9
- vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v1
-
- vsubsws v4, v4, v5 ;# c1
-
- vmulosh v3, v1, v8
- vsraw v3, v3, v12
- vaddsws v3, v3, v1 ;# ip[12] * sin(pi/8) * sqrt(2)
-
- vmulosh v5, v0, v9
- vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v0
-
- vaddsws v3, v3, v5 ;# d1
-
- vaddsws v0, v6, v3 ;# a1 + d1
- vsubsws v3, v6, v3 ;# a1 - d1
-
- vaddsws v1, v7, v4 ;# b1 + c1
- vsubsws v2, v7, v4 ;# b1 - c1
-
- ;# transpose input
- vmrghw v4, v0, v1 ;# a0 b0 a1 b1
- vmrghw v5, v2, v3 ;# c0 d0 c1 d1
-
- vmrglw v6, v0, v1 ;# a2 b2 a3 b3
- vmrglw v7, v2, v3 ;# c2 d2 c3 d3
-
- vperm v0, v4, v5, v10 ;# a0 b0 c0 d0
- vperm v1, v4, v5, v11 ;# a1 b1 c1 d1
-
- vperm v2, v6, v7, v10 ;# a2 b2 c2 d2
- vperm v3, v6, v7, v11 ;# a3 b3 c3 d3
-
- ;# second pass
- vaddsws v6, v0, v2 ;# a1 = ip[0]+ip[8]
- vsubsws v7, v0, v2 ;# b1 = ip[0]-ip[8]
-
- vmulosh v4, v1, v8
- vsraw v4, v4, v12
- vaddsws v4, v4, v1 ;# ip[ 4] * sin(pi/8) * sqrt(2)
-
- vmulosh v5, v3, v9
- vsraw v5, v5, v12 ;# ip[12] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v3
-
- vsubsws v4, v4, v5 ;# c1
-
- vmulosh v2, v3, v8
- vsraw v2, v2, v12
- vaddsws v2, v2, v3 ;# ip[12] * sin(pi/8) * sqrt(2)
-
- vmulosh v5, v1, v9
- vsraw v5, v5, v12 ;# ip[ 4] * cos(pi/8) * sqrt(2)
- vaddsws v5, v5, v1
-
- vaddsws v3, v2, v5 ;# d1
-
- vaddsws v0, v6, v3 ;# a1 + d1
- vsubsws v3, v6, v3 ;# a1 - d1
-
- vaddsws v1, v7, v4 ;# b1 + c1
- vsubsws v2, v7, v4 ;# b1 - c1
-
- vspltish v6, 4
- vspltish v7, 3
-
- vpkswss v0, v0, v1
- vpkswss v1, v2, v3
-
- vaddshs v0, v0, v6
- vaddshs v1, v1, v6
-
- vsrah v0, v0, v7
- vsrah v1, v1, v7
-
- ;# transpose output
- vmrghh v2, v0, v1 ;# a0 c0 a1 c1 a2 c2 a3 c3
- vmrglh v3, v0, v1 ;# b0 d0 b1 d1 b2 d2 b3 d3
-
- vmrghh v0, v2, v3 ;# a0 b0 c0 d0 a1 b1 c1 d1
- vmrglh v1, v2, v3 ;# a2 b2 c2 d2 a3 b3 c3 d3
-
- stwu r1,-416(r1) ;# create space on the stack
-
- stvx v0, 0, r1
- lwz r6, 0(r1)
- stw r6, 0(r4)
- lwz r6, 4(r1)
- stw r6, 4(r4)
-
- add r4, r4, r5
-
- lwz r6, 8(r1)
- stw r6, 0(r4)
- lwz r6, 12(r1)
- stw r6, 4(r4)
-
- add r4, r4, r5
-
- stvx v1, 0, r1
- lwz r6, 0(r1)
- stw r6, 0(r4)
- lwz r6, 4(r1)
- stw r6, 4(r4)
-
- add r4, r4, r5
-
- lwz r6, 8(r1)
- stw r6, 0(r4)
- lwz r6, 12(r1)
- stw r6, 4(r4)
-
- addi r1, r1, 416 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 4
-sinpi8sqrt2:
- .short 35468, 35468, 35468, 35468, 35468, 35468, 35468, 35468
-
- .align 4
-cospi8sqrt2minus1:
- .short 20091, 20091, 20091, 20091, 20091, 20091, 20091, 20091
-
- .align 4
-shift_16:
- .long 16, 16, 16, 16
-
- .align 4
-hi_hi:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
-
- .align 4
-lo_lo:
- .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
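The multiplies by 35468 and 20091 in the routine above are Q16 approximations of sqrt(2)*sin(pi/8) and sqrt(2)*cos(pi/8) - 1; the vsraw/vaddsws pairs are the >>16 and the "+x" that the minus-1 folding requires (and, for 35468, compensation for that constant being negative as a signed 16-bit multiplicand). A scalar sketch of one column of the same butterfly, modeled on the standard VP8 "llm" formulation:

    /* Q16 constants, matching the data section above. */
    #define SINPI8SQRT2        35468  /* sqrt(2) * sin(pi/8)     */
    #define COSPI8SQRT2MINUS1  20091  /* sqrt(2) * cos(pi/8) - 1 */

    /* One column of the 4x4 inverse transform; ip/op hold the block
     * in raster order, so a column is ip[0], ip[4], ip[8], ip[12]. */
    static void idct_col_sketch(const short *ip, short *op) {
        int a1 = ip[0] + ip[8];
        int b1 = ip[0] - ip[8];
        int c1 = ((ip[4] * SINPI8SQRT2) >> 16)
                 - (ip[12] + ((ip[12] * COSPI8SQRT2MINUS1) >> 16));
        int d1 = (ip[4] + ((ip[4] * COSPI8SQRT2MINUS1) >> 16))
                 + ((ip[12] * SINPI8SQRT2) >> 16);
        op[0]  = (short)(a1 + d1);
        op[4]  = (short)(b1 + c1);
        op[8]  = (short)(b1 - c1);
        op[12] = (short)(a1 - d1);
        /* The second pass repeats this on the rows of the transposed
         * result, then rounds each output with (x + 4) >> 3. */
    }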
diff --git a/vp8/common/ppc/loopfilter_altivec.c b/vp8/common/ppc/loopfilter_altivec.c
deleted file mode 100644
index 71bf6e2..0000000
--- a/vp8/common/ppc/loopfilter_altivec.c
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "loopfilter.h"
-#include "onyxc_int.h"
-
-typedef void loop_filter_function_y_ppc
-(
- unsigned char *s, // source pointer
- int p, // pitch
- const signed char *flimit,
- const signed char *limit,
- const signed char *thresh
-);
-
-typedef void loop_filter_function_uv_ppc
-(
- unsigned char *u, // source pointer
- unsigned char *v, // source pointer
- int p, // pitch
- const signed char *flimit,
- const signed char *limit,
- const signed char *thresh
-);
-
-typedef void loop_filter_function_s_ppc
-(
- unsigned char *s, // source pointer
- int p, // pitch
- const signed char *flimit
-);
-
-loop_filter_function_y_ppc mbloop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc mbloop_filter_vertical_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_horizontal_edge_y_ppc;
-loop_filter_function_y_ppc loop_filter_vertical_edge_y_ppc;
-
-loop_filter_function_uv_ppc mbloop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc mbloop_filter_vertical_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_horizontal_edge_uv_ppc;
-loop_filter_function_uv_ppc loop_filter_vertical_edge_uv_ppc;
-
-loop_filter_function_s_ppc loop_filter_simple_horizontal_edge_ppc;
-loop_filter_function_s_ppc loop_filter_simple_vertical_edge_ppc;
-
-// Horizontal MB filtering
-void loop_filter_mbh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- mbloop_filter_horizontal_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
- if (u_ptr)
- mbloop_filter_horizontal_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void)u_ptr;
- (void)v_ptr;
- (void)uv_stride;
- loop_filter_simple_horizontal_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Vertical MB Filtering
-void loop_filter_mbv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- mbloop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->mbflim, lfi->lim, lfi->thr);
-
- if (u_ptr)
- mbloop_filter_vertical_edge_uv_ppc(u_ptr, v_ptr, uv_stride, lfi->mbflim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_mbvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void)u_ptr;
- (void)v_ptr;
- (void)uv_stride;
- loop_filter_simple_vertical_edge_ppc(y_ptr, y_stride, lfi->mbflim);
-}
-
-// Horizontal B Filtering
-void loop_filter_bh_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- // These should all be done at once with one call, instead of 3
- loop_filter_horizontal_edge_y_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
- loop_filter_horizontal_edge_y_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
- loop_filter_horizontal_edge_y_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
- if (u_ptr)
- loop_filter_horizontal_edge_uv_ppc(u_ptr + 4 * uv_stride, v_ptr + 4 * uv_stride, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bhs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void)u_ptr;
- (void)v_ptr;
- (void)uv_stride;
- loop_filter_simple_horizontal_edge_ppc(y_ptr + 4 * y_stride, y_stride, lfi->flim);
- loop_filter_simple_horizontal_edge_ppc(y_ptr + 8 * y_stride, y_stride, lfi->flim);
- loop_filter_simple_horizontal_edge_ppc(y_ptr + 12 * y_stride, y_stride, lfi->flim);
-}
-
-// Vertical B Filtering
-void loop_filter_bv_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- loop_filter_vertical_edge_y_ppc(y_ptr, y_stride, lfi->flim, lfi->lim, lfi->thr);
-
- if (u_ptr)
- loop_filter_vertical_edge_uv_ppc(u_ptr + 4, v_ptr + 4, uv_stride, lfi->flim, lfi->lim, lfi->thr);
-}
-
-void loop_filter_bvs_ppc(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr,
- int y_stride, int uv_stride, loop_filter_info *lfi)
-{
- (void)u_ptr;
- (void)v_ptr;
- (void)uv_stride;
- loop_filter_simple_vertical_edge_ppc(y_ptr + 4, y_stride, lfi->flim);
- loop_filter_simple_vertical_edge_ppc(y_ptr + 8, y_stride, lfi->flim);
- loop_filter_simple_vertical_edge_ppc(y_ptr + 12, y_stride, lfi->flim);
-}
diff --git a/vp8/common/ppc/loopfilter_filters_altivec.asm b/vp8/common/ppc/loopfilter_filters_altivec.asm
deleted file mode 100644
index 61df4e9..0000000
--- a/vp8/common/ppc/loopfilter_filters_altivec.asm
+++ /dev/null
@@ -1,1253 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl mbloop_filter_horizontal_edge_y_ppc
- .globl loop_filter_horizontal_edge_y_ppc
- .globl mbloop_filter_vertical_edge_y_ppc
- .globl loop_filter_vertical_edge_y_ppc
-
- .globl mbloop_filter_horizontal_edge_uv_ppc
- .globl loop_filter_horizontal_edge_uv_ppc
- .globl mbloop_filter_vertical_edge_uv_ppc
- .globl loop_filter_vertical_edge_uv_ppc
-
- .globl loop_filter_simple_horizontal_edge_ppc
- .globl loop_filter_simple_vertical_edge_ppc
-
- .text
-;# We often need to perform transposes (and other transpose-like operations)
-;# on matrices of data. This is simplified by the fact that we usually
-;# operate on hunks of data whose dimensions are powers of 2, or at least
-;# divisible by highish powers of 2.
-;#
-;# These operations can be very confusing. They become more straightforward
-;# when we think of them as permutations of address bits: Concatenate a
-;# group of vector registers and think of it as occupying a block of
-;# memory beginning at address zero. The low four bits 0...3 of the
-;# address then correspond to position within a register, the higher-order
-;# address bits select the register.
-;#
-;# Although register selection, at the code level, is arbitrary, things
-;# are simpler if we use contiguous ranges of register numbers, simpler
-;# still if the low-order bits of the register number correspond to
-;# conceptual address bits. We do this whenever reasonable.
-;#
-;# A 16x16 transpose can then be thought of as an operation on
-;# a 256-element block of memory. It takes 8 bits 0...7 to address this
-;# memory and the effect of a transpose is to interchange address bit
-;# 0 with 4, 1 with 5, 2 with 6, and 3 with 7. Bits 0...3 index the
-;# column, which is interchanged with the row addressed by bits 4..7.
-;#
-;# The altivec merge instructions provide a rapid means of effecting
-;# many of these transforms. They operate at three widths (8,16,32).
-;# Writing V(x) for vector register #x, paired merges permute address
-;# indices as follows.
-;#
-;# 0->1 1->2 2->3 3->(4+d) (4+s)->0:
-;#
-;# vmrghb V( x), V( y), V( y + (1<<s))
-;# vmrglb V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;# =0= 1->2 2->3 3->(4+d) (4+s)->1:
-;#
-;# vmrghh V( x), V( y), V( y + (1<<s))
-;# vmrglh V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;# =0= =1= 2->3 3->(4+d) (4+s)->2:
-;#
-;# vmrghw V( x), V( y), V( y + (1<<s))
-;# vmrglw V( x + (1<<d)), V( y), V( y + (1<<s))
-;#
-;#
-;# Unfortunately, there is no doubleword merge instruction.
-;# The following sequence uses "vperm" as a substitute.
-;# Assuming that the selection masks b_hihi and b_lolo (defined in LFppc.c)
-;# are in registers Vhihi and Vlolo, we can also effect the permutation
-;#
-;# =0= =1= =2= 3->(4+d) (4+s)->3 by the sequence:
-;#
-;# vperm V( x), V( y), V( y + (1<<s)), Vhihi
-;# vperm V( x + (1<<d)), V( y), V( y + (1<<s)), Vlolo
-;#
-;#
-;# Except for bits s and d, the other relationships between register
-;# number (= high-order part of address) bits are at the disposal of
-;# the programmer.
-;#
-
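The address-bit view above makes the merge-based transpose almost mechanical: each vmrghb/vmrglb pair realizes the cyclic bit permutation 7->0, k->k+1, so four passes rotate the 8 address bits by 4 and swap rows with columns. A scalar demonstration, treating the 16 registers as one 256-byte block:

    #include <string.h>

    /* 16x16 byte transpose by four interleave ("merge") passes over a
     * 256-byte block (16 "registers" of 16 bytes).  One pass maps
     * in[i] -> out[2i] and in[i+128] -> out[2i+1]: address bits rotate
     * left by one (7 -> 0, k -> k+1).  Four passes rotate by four,
     * exchanging bits 0..3 (column) with bits 4..7 (row). */
    static void transpose16x16_by_merges(unsigned char m[256]) {
        unsigned char t[256];
        int pass, i;
        for (pass = 0; pass < 4; pass++) {
            for (i = 0; i < 128; i++) {
                t[2 * i]     = m[i];        /* byte from V(x)   */
                t[2 * i + 1] = m[i + 128];  /* byte from V(x+8) */
            }
            memcpy(m, t, sizeof(t));
        }
    }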
-;# To avoid excess transposes, we filter all 3 vertical luma subblock
-;# edges together. This requires a single 16x16 transpose, which, in
-;# the above language, amounts to the following permutation of address
-;# indices: 0<->4 1<->5 2<->6 3<->7, which we accomplish by
-;# 4 iterations of the cyclic transform 0->1->2->3->4->5->6->7->0.
-;#
-;# Except for the fact that the destination registers get written
-;# before we are done referencing the old contents, the cyclic transform
-;# is effected by
-;#
-;# x = 0; do {
-;# vmrghb V(2x), V(x), V(x+8);
-;# vmrglb V(2x+1), V(x), V(x+8);
-;# } while( ++x < 8);
-;#
-;# For clarity, and because we can afford it, we do this transpose
-;# using all 32 registers, alternating the banks 0..15 and 16 .. 31,
-;# leaving the final result in 16 .. 31, as the lower registers are
-;# used in the filtering itself.
-;#
-.macro Tpair A, B, X, Y
- vmrghb \A, \X, \Y
- vmrglb \B, \X, \Y
-.endm
-
-;# Each step takes 8*2 = 16 instructions
-
-.macro t16_even
- Tpair v16,v17, v0,v8
- Tpair v18,v19, v1,v9
- Tpair v20,v21, v2,v10
- Tpair v22,v23, v3,v11
- Tpair v24,v25, v4,v12
- Tpair v26,v27, v5,v13
- Tpair v28,v29, v6,v14
- Tpair v30,v31, v7,v15
-.endm
-
-.macro t16_odd
- Tpair v0,v1, v16,v24
- Tpair v2,v3, v17,v25
- Tpair v4,v5, v18,v26
- Tpair v6,v7, v19,v27
- Tpair v8,v9, v20,v28
- Tpair v10,v11, v21,v29
- Tpair v12,v13, v22,v30
- Tpair v14,v15, v23,v31
-.endm
-
-;# Whole transpose takes 4*16 = 64 instructions
-
-.macro t16_full
- t16_odd
- t16_even
- t16_odd
- t16_even
-.endm
-
-;# Vertical edge filtering requires transposes. For the simple filter,
-;# we need to convert 16 rows of 4 pels each into 4 registers of 16 pels
-;# each. Writing 0 ... 63 for the pixel indices, the desired result is:
-;#
-;# v0 = 0 1 ... 14 15
-;# v1 = 16 17 ... 30 31
-;# v2 = 32 33 ... 46 47
-;# v3 = 48 49 ... 62 63
-;#
-;# In frame-buffer memory, the layout is:
-;#
-;# 0 16 32 48
-;# 1 17 33 49
-;# ...
-;# 15 31 47 63.
-;#
-;# We begin by reading the data 32 bits at a time (using scalar operations)
-;# into a temporary array, reading the rows of the array into vector registers,
-;# with the following layout:
-;#
-;# v0 = 0 16 32 48 4 20 36 52 8 24 40 56 12 28 44 60
-;# v1 = 1 17 33 49 5 21 ... 45 61
-;# v2 = 2 18 ... 46 62
-;# v3 = 3 19 ... 47 63
-;#
-;# From the "address-bit" perspective discussed above, we simply need to
-;# interchange bits 0 <-> 4 and 1 <-> 5, leaving bits 2 and 3 alone.
-;# In other words, we transpose each of the four 4x4 submatrices.
-;#
-;# This transformation is its own inverse, and we need to perform it
-;# again before writing the pixels back into the frame buffer.
-;#
-;# It acts in place on registers v0...v3, uses v4...v7 as temporaries,
-;# and assumes that v14/v15 contain the b_hihi/b_lolo selectors
-;# defined above. We think of both groups of 4 registers as having
-;# "addresses" {0,1,2,3} * 16.
-;#
-.macro Transpose4times4x4 Vlo, Vhi
-
- ;# d=s=0 0->1 1->2 2->3 3->4 4->0 =5=
-
- vmrghb v4, v0, v1
- vmrglb v5, v0, v1
- vmrghb v6, v2, v3
- vmrglb v7, v2, v3
-
- ;# d=0 s=1 =0= 1->2 2->3 3->4 4->5 5->1
-
- vmrghh v0, v4, v6
- vmrglh v1, v4, v6
- vmrghh v2, v5, v7
- vmrglh v3, v5, v7
-
- ;# d=s=0 =0= =1= 2->3 3->4 4->2 =5=
-
- vmrghw v4, v0, v1
- vmrglw v5, v0, v1
- vmrghw v6, v2, v3
- vmrglw v7, v2, v3
-
- ;# d=0 s=1 =0= =1= =2= 3->4 4->5 5->3
-
- vperm v0, v4, v6, \Vlo
- vperm v1, v4, v6, \Vhi
- vperm v2, v5, v7, \Vlo
- vperm v3, v5, v7, \Vhi
-.endm
-;# end Transpose4times4x4
-
-
-;# Normal mb vertical edge filter transpose.
-;#
-;# We read 8 columns of data, initially in the following pattern:
-;#
-;# (0,0) (1,0) ... (7,0) (0,1) (1,1) ... (7,1)
-;# (0,2) (1,2) ... (7,2) (0,3) (1,3) ... (7,3)
-;# ...
-;# (0,14) (1,14) .. (7,14) (0,15) (1,15) .. (7,15)
-;#
-;# and wish to convert to:
-;#
-;# (0,0) ... (0,15)
-;# (1,0) ... (1,15)
-;# ...
-;# (7,0) ... (7,15).
-;#
-;# In "address bit" language, we wish to map
-;#
-;# 0->4 1->5 2->6 3->0 4->1 5->2 6->3, i.e., I -> (I+4) mod 7.
-;#
-;# This can be accomplished by 4 iterations of the cyclic transform
-;#
-;# I -> (I+1) mod 7;
-;#
-;# each iteration can be realized by (d=0, s=2):
-;#
-;# x = 0; do Tpair( V(2x),V(2x+1), V(x),V(x+4)) while( ++x < 4);
-;#
-;# The input/output is in registers v0...v7. We use v10...v17 as mirrors;
-;# preserving v8 = sign converter.
-;#
-;# Inverse transpose is similar, except here I -> (I+3) mod 7 and the
-;# result lands in the "mirror" registers v10...v17
-;#
-.macro t8x16_odd
- Tpair v10, v11, v0, v4
- Tpair v12, v13, v1, v5
- Tpair v14, v15, v2, v6
- Tpair v16, v17, v3, v7
-.endm
-
-.macro t8x16_even
- Tpair v0, v1, v10, v14
- Tpair v2, v3, v11, v15
- Tpair v4, v5, v12, v16
- Tpair v6, v7, v13, v17
-.endm
-
-.macro transpose8x16_fwd
- t8x16_odd
- t8x16_even
- t8x16_odd
- t8x16_even
-.endm
-
-.macro transpose8x16_inv
- t8x16_odd
- t8x16_even
- t8x16_odd
-.endm
-
-.macro Transpose16x16
- vmrghb v0, v16, v24
- vmrglb v1, v16, v24
- vmrghb v2, v17, v25
- vmrglb v3, v17, v25
- vmrghb v4, v18, v26
- vmrglb v5, v18, v26
- vmrghb v6, v19, v27
- vmrglb v7, v19, v27
- vmrghb v8, v20, v28
- vmrglb v9, v20, v28
- vmrghb v10, v21, v29
- vmrglb v11, v21, v29
- vmrghb v12, v22, v30
- vmrglb v13, v22, v30
- vmrghb v14, v23, v31
- vmrglb v15, v23, v31
- vmrghb v16, v0, v8
- vmrglb v17, v0, v8
- vmrghb v18, v1, v9
- vmrglb v19, v1, v9
- vmrghb v20, v2, v10
- vmrglb v21, v2, v10
- vmrghb v22, v3, v11
- vmrglb v23, v3, v11
- vmrghb v24, v4, v12
- vmrglb v25, v4, v12
- vmrghb v26, v5, v13
- vmrglb v27, v5, v13
- vmrghb v28, v6, v14
- vmrglb v29, v6, v14
- vmrghb v30, v7, v15
- vmrglb v31, v7, v15
- vmrghb v0, v16, v24
- vmrglb v1, v16, v24
- vmrghb v2, v17, v25
- vmrglb v3, v17, v25
- vmrghb v4, v18, v26
- vmrglb v5, v18, v26
- vmrghb v6, v19, v27
- vmrglb v7, v19, v27
- vmrghb v8, v20, v28
- vmrglb v9, v20, v28
- vmrghb v10, v21, v29
- vmrglb v11, v21, v29
- vmrghb v12, v22, v30
- vmrglb v13, v22, v30
- vmrghb v14, v23, v31
- vmrglb v15, v23, v31
- vmrghb v16, v0, v8
- vmrglb v17, v0, v8
- vmrghb v18, v1, v9
- vmrglb v19, v1, v9
- vmrghb v20, v2, v10
- vmrglb v21, v2, v10
- vmrghb v22, v3, v11
- vmrglb v23, v3, v11
- vmrghb v24, v4, v12
- vmrglb v25, v4, v12
- vmrghb v26, v5, v13
- vmrglb v27, v5, v13
- vmrghb v28, v6, v14
- vmrglb v29, v6, v14
- vmrghb v30, v7, v15
- vmrglb v31, v7, v15
-.endm
-
-;# load_g loads a global vector (whose address is in the local variable Gptr)
-;# into vector register Vreg. Trashes r0
-.macro load_g Vreg, Gptr
- lwz r0, \Gptr
- lvx \Vreg, 0, r0
-.endm
-
-;# Exploit the saturation here: if the answer is negative
-;# it will be clamped to 0. ORing 0 with a positive
-;# number gives the positive number (abs).
-;# RES = abs( A-B), trashes TMP
-.macro Abs RES, TMP, A, B
- vsububs \RES, \A, \B
- vsububs \TMP, \B, \A
- vor \RES, \RES, \TMP
-.endm
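The same trick in scalar form, for reference; with unsigned saturating subtraction the negative difference clamps to zero, so the OR of the two one-sided differences is the absolute difference:

    /* |a - b| for unsigned bytes via two saturating subtracts and an OR. */
    static unsigned char usub_sat(unsigned char a, unsigned char b) {
        return (unsigned char)(a > b ? a - b : 0);               /* vsububs */
    }
    static unsigned char abs_diff(unsigned char a, unsigned char b) {
        return (unsigned char)(usub_sat(a, b) | usub_sat(b, a)); /* vor */
    }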
-
-;# RES = Max( RES, abs( A-B)), trashes TMP
-.macro max_abs RES, TMP, A, B
- vsububs \TMP, \A, \B
- vmaxub \RES, \RES, \TMP
- vsububs \TMP, \B, \A
- vmaxub \RES, \RES, \TMP
-.endm
-
-.macro Masks
- ;# build masks
- ;# input is all 8 bit unsigned (0-255). We need
- ;# abs(vala - valb) > limit, but there is no need to compare each
- ;# value to the limit: find the max of the absolute differences
- ;# and compare that to the limit.
- ;# First hev
- Abs v14, v13, v2, v3 ;# |P1 - P0|
- max_abs v14, v13, v5, v4 ;# |Q1 - Q0|
-
- vcmpgtub v10, v14, v10 ;# HEV = true if thresh exceeded
-
- ;# Next limit
- max_abs v14, v13, v0, v1 ;# |P3 - P2|
- max_abs v14, v13, v1, v2 ;# |P2 - P1|
- max_abs v14, v13, v6, v5 ;# |Q2 - Q1|
- max_abs v14, v13, v7, v6 ;# |Q3 - Q2|
-
- vcmpgtub v9, v14, v9 ;# R = true if limit exceeded
-
- ;# flimit
- Abs v14, v13, v3, v4 ;# |P0 - Q0|
-
- vcmpgtub v8, v14, v8 ;# X = true if flimit exceeded
-
- vor v8, v8, v9 ;# R = true if flimit or limit exceeded
- ;# done building masks
-.endm
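A scalar model of Masks, reusing abs_diff() from the sketch above. Note the sense of the results: the compares are "true if exceeded", and the filter later masks with vandc, so a set lane suppresses filtering:

    static int max_u8(int a, int b) { return a > b ? a : b; }

    /* hev: thresh exceeded near the edge; suppress: flimit or limit exceeded. */
    static void build_masks(unsigned char p3, unsigned char p2,
                            unsigned char p1, unsigned char p0,
                            unsigned char q0, unsigned char q1,
                            unsigned char q2, unsigned char q3,
                            unsigned char flimit, unsigned char limit,
                            unsigned char thresh,
                            int *hev, int *suppress) {
        int m = max_u8(abs_diff(p1, p0), abs_diff(q1, q0));
        *hev = m > thresh;                /* high edge variance */
        m = max_u8(m, abs_diff(p3, p2));  /* one compare on the running  */
        m = max_u8(m, abs_diff(p2, p1));  /* max instead of one compare  */
        m = max_u8(m, abs_diff(q2, q1));  /* per difference              */
        m = max_u8(m, abs_diff(q3, q2));
        *suppress = (m > limit) || (abs_diff(p0, q0) > flimit);
    }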
-
-.macro build_constants RFL, RLI, RTH, FL, LI, TH
- ;# build constants
- lvx \FL, 0, \RFL ;# flimit
- lvx \LI, 0, \RLI ;# limit
- lvx \TH, 0, \RTH ;# thresh
-
- vspltisb v11, 8
- vspltisb v12, 4
- vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
-.endm
-
-.macro load_data_y
- ;# setup strides/pointers to be able to access
- ;# all of the data
- add r5, r4, r4 ;# r5 = 2 * stride
- sub r6, r3, r5 ;# r6 -> 2 rows back
- neg r7, r4 ;# r7 = -stride
-
- ;# load 16 pixels worth of data to work on
- sub r0, r6, r5 ;# r0 -> 4 rows back (temp)
- lvx v0, 0, r0 ;# P3 (read only)
- lvx v1, r7, r6 ;# P2
- lvx v2, 0, r6 ;# P1
- lvx v3, r7, r3 ;# P0
- lvx v4, 0, r3 ;# Q0
- lvx v5, r4, r3 ;# Q1
- lvx v6, r5, r3 ;# Q2
- add r0, r3, r5 ;# r0 -> 2 rows fwd (temp)
- lvx v7, r4, r0 ;# Q3 (read only)
-.endm
-
-;# Expects
-;# v10 == HEV
-;# v13 == tmp
-;# v14 == tmp
-.macro common_adjust P0, Q0, P1, Q1, HEV_PRESENT
- vxor \P1, \P1, v11 ;# SP1
- vxor \P0, \P0, v11 ;# SP0
- vxor \Q0, \Q0, v11 ;# SQ0
- vxor \Q1, \Q1, v11 ;# SQ1
-
- vsubsbs v13, \P1, \Q1 ;# f = c (P1 - Q1)
-.if \HEV_PRESENT
- vand v13, v13, v10 ;# f &= hev
-.endif
- vsubsbs v14, \Q0, \P0 ;# -126 <= X = Q0-P0 <= +126
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
-
- vandc v13, v13, v8 ;# f &= mask
-
- vspltisb v8, 3
- vspltisb v9, 4
-
- vaddsbs v14, v13, v9 ;# f1 = c (f+4)
- vaddsbs v15, v13, v8 ;# f2 = c (f+3)
-
- vsrab v13, v14, v8 ;# f1 >>= 3
- vsrab v15, v15, v8 ;# f2 >>= 3
-
- vsubsbs \Q0, \Q0, v13 ;# u1 = c (SQ0 - f1)
- vaddsbs \P0, \P0, v15 ;# u2 = c (SP0 + f2)
-.endm
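common_adjust is the core VP8 filter update on sign-biased pels (the vxor with v11 = 0x80 maps 0..255 to -128..127 first). A scalar sketch of the same saturating arithmetic:

    /* Clamp to a signed byte, as the saturating vector ops do. */
    static signed char csat(int v) {
        return (signed char)(v < -128 ? -128 : v > 127 ? 127 : v);
    }

    /* p1/p0/q0/q1 are already sign-biased; hev/suppress as built by Masks. */
    static void common_adjust_sketch(signed char p1, signed char *p0,
                                     signed char *q0, signed char q1,
                                     int hev_present, int hev, int suppress) {
        signed char d, f = csat(p1 - q1);
        if (hev_present && !hev)
            f = 0;                        /* vand with the hev mask */
        d = csat(*q0 - *p0);
        f = csat(f + d);                  /* three saturating adds:     */
        f = csat(f + d);                  /* f = c( c(P1-Q1)            */
        f = csat(f + d);                  /*        + 3*(Q0-P0) )       */
        if (suppress)
            f = 0;                        /* vandc with the mask */
        *q0 = csat(*q0 - (csat(f + 4) >> 3));  /* u1 = c (SQ0 - f1) */
        *p0 = csat(*p0 + (csat(f + 3) >> 3));  /* u2 = c (SP0 + f2) */
    }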
-
-.macro vp8_mbfilter
- Masks
-
- ;# start the filtering here
- vxor v1, v1, v11 ;# SP2
- vxor v2, v2, v11 ;# SP1
- vxor v3, v3, v11 ;# SP0
- vxor v4, v4, v11 ;# SQ0
- vxor v5, v5, v11 ;# SQ1
- vxor v6, v6, v11 ;# SQ2
-
- ;# add outer taps if we have high edge variance
- vsubsbs v13, v2, v5 ;# f = c (SP1-SQ1)
-
- vsubsbs v14, v4, v3 ;# SQ0-SP0
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14
- vaddsbs v13, v13, v14 ;# f = c( c(SP1-SQ1) + 3*(SQ0-SP0))
-
- vandc v13, v13, v8 ;# f &= mask
- vand v15, v13, v10 ;# f2 = f & hev
-
- ;# save bottom 3 bits so that we round one side +4 and the other +3
- vspltisb v8, 3
- vspltisb v9, 4
-
- vaddsbs v14, v15, v9 ;# f1 = c (f+4)
- vaddsbs v15, v15, v8 ;# f2 = c (f+3)
-
- vsrab v14, v14, v8 ;# f1 >>= 3
- vsrab v15, v15, v8 ;# f2 >>= 3
-
- vsubsbs v4, v4, v14 ;# u1 = c (SQ0 - f1)
- vaddsbs v3, v3, v15 ;# u2 = c (SP0 + f2)
-
- ;# only apply wider filter if not high edge variance
- vandc v13, v13, v10 ;# f &= ~hev
-
- vspltisb v9, 2
- vnor v8, v8, v8
- vsrb v9, v8, v9 ;# 0x3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f3f
- vupkhsb v9, v9 ;# 0x003f003f003f003f003f003f003f003f
- vspltisb v8, 9
-
- ;# roughly 1/7th difference across boundary
- vspltish v10, 7
- vmulosb v14, v8, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
- vmulesb v15, v8, v13
- vaddshs v14, v14, v9 ;# += 63
- vaddshs v15, v15, v9
- vsrah v14, v14, v10 ;# >>= 7
- vsrah v15, v15, v10
- vmrglh v10, v15, v14
- vmrghh v15, v15, v14
-
- vpkshss v10, v15, v10 ;# X = saturated down to bytes
-
- vsubsbs v6, v6, v10 ;# subtract from Q and add to P
- vaddsbs v1, v1, v10
-
- vxor v6, v6, v11
- vxor v1, v1, v11
-
- ;# roughly 2/7th difference across boundary
- vspltish v10, 7
- vaddubm v12, v8, v8
- vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
- vmulesb v15, v12, v13
- vaddshs v14, v14, v9
- vaddshs v15, v15, v9
- vsrah v14, v14, v10 ;# >>= 7
- vsrah v15, v15, v10
- vmrglh v10, v15, v14
- vmrghh v15, v15, v14
-
- vpkshss v10, v15, v10 ;# X = saturated down to bytes
-
- vsubsbs v5, v5, v10 ;# subtract from Q and add to P
- vaddsbs v2, v2, v10
-
- vxor v5, v5, v11
- vxor v2, v2, v11
-
- ;# roughly 3/7th difference across boundary
- vspltish v10, 7
- vaddubm v12, v12, v8
- vmulosb v14, v12, v13 ;# A = c( c(P1-Q1) + 3*(Q0-P0))
- vmulesb v15, v12, v13
- vaddshs v14, v14, v9
- vaddshs v15, v15, v9
- vsrah v14, v14, v10 ;# >>= 7
- vsrah v15, v15, v10
- vmrglh v10, v15, v14
- vmrghh v15, v15, v14
-
- vpkshss v10, v15, v10 ;# X = saturated down to bytes
-
- vsubsbs v4, v4, v10 ;# subtract from Q and add to P
- vaddsbs v3, v3, v10
-
- vxor v4, v4, v11
- vxor v3, v3, v11
-.endm
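The wide-filter tail of vp8_mbfilter applies decreasing fractions of the masked residual to three pel pairs; the multiplies by 9, 18 and 27, each biased by 63 and shifted by 7, are the "roughly 1/7th, 2/7th, 3/7th" steps the comments describe. A scalar model using csat() from the previous sketch (the asm performs the 1/7 step first; since each step touches a disjoint pel pair, the order does not matter):

    /* Scalar model of the wide-filter tail; f is the residual already
     * masked by ~hev, and all pels are sign-biased. */
    static void wide_filter_sketch(signed char f,
                                   signed char *p2, signed char *p1,
                                   signed char *p0, signed char *q0,
                                   signed char *q1, signed char *q2) {
        int u;
        u = csat((63 + f * 27) >> 7);   /* roughly 3/7th across boundary */
        *q0 = csat(*q0 - u);  *p0 = csat(*p0 + u);
        u = csat((63 + f * 18) >> 7);   /* roughly 2/7th */
        *q1 = csat(*q1 - u);  *p1 = csat(*p1 + u);
        u = csat((63 + f * 9) >> 7);    /* roughly 1/7th */
        *q2 = csat(*q2 - u);  *p2 = csat(*p2 + u);
    }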
-
-.macro SBFilter
- Masks
-
- common_adjust v3, v4, v2, v5, 1
-
- ;# outer tap adjustments
- vspltisb v8, 1
-
- vaddubm v13, v13, v8 ;# f += 1
- vsrab v13, v13, v8 ;# f >>= 1
-
- vandc v13, v13, v10 ;# f &= ~hev
-
- vsubsbs v5, v5, v13 ;# u1 = c (SQ1 - f)
- vaddsbs v2, v2, v13 ;# u2 = c (SP1 + f)
-
- vxor v2, v2, v11
- vxor v3, v3, v11
- vxor v4, v4, v11
- vxor v5, v5, v11
-.endm
-
- .align 2
-mbloop_filter_horizontal_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r5, r6, r7, v8, v9, v10
-
- load_data_y
-
- vp8_mbfilter
-
- stvx v1, r7, r6 ;# P2
- stvx v2, 0, r6 ;# P1
- stvx v3, r7, r3 ;# P0
- stvx v4, 0, r3 ;# Q0
- stvx v5, r4, r3 ;# Q1
- stvx v6, r5, r3 ;# Q2
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-;# r6 const signed char *limit
-;# r7 const signed char *thresh
-loop_filter_horizontal_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r5, r6, r7, v8, v9, v10
-
- load_data_y
-
- SBFilter
-
- stvx v2, 0, r6 ;# P1
- stvx v3, r7, r3 ;# P0
- stvx v4, 0, r3 ;# Q0
- stvx v5, r4, r3 ;# Q1
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# Filtering a vertical mb. Each mb is aligned on a 16 byte boundary,
-;# so an entire mb can be read with aligned loads. However, if we want to
-;# filter the mb edge we run into problems: the loopfilter requires 4 bytes
-;# before the mb and 4 after, for a total of 8 bytes. Reading 16 bytes in
-;# order to get 4 is a bit of a waste, so this is an even uglier way to get
-;# around that. Using the regular register file, words are read in and then
-;# saved back out to memory to align and order them. Then they are read in
-;# using the vector register file.
-.macro RLVmb V, R
- lwzux r0, r3, r4
- stw r0, 4(\R)
- lwz r0,-4(r3)
- stw r0, 0(\R)
- lwzux r0, r3, r4
- stw r0,12(\R)
- lwz r0,-4(r3)
- stw r0, 8(\R)
- lvx \V, 0, \R
-.endm
-
-.macro WLVmb V, R
- stvx \V, 0, \R
- lwz r0,12(\R)
- stwux r0, r3, r4
- lwz r0, 8(\R)
- stw r0,-4(r3)
- lwz r0, 4(\R)
- stwux r0, r3, r4
- lwz r0, 0(\R)
- stw r0,-4(r3)
-.endm
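The RLVmb/WLVmb staging can be modeled directly: scalar word accesses absorb the misalignment, and the aligned scratch line then becomes a single vector load (or store, on the way back out). A sketch of the read side, assuming the caller provides 16-byte-aligned scratch:

    #include <string.h>

    /* Gather the 8 pels straddling the vertical edge (4 before, 4 after)
     * from two consecutive rows into an aligned scratch line; the asm
     * then reads the whole line with one aligned lvx. */
    static void stage_two_rows(const unsigned char *edge, int pitch,
                               unsigned char scratch[16]) {
        memcpy(scratch + 0, edge - 4,         8);  /* row n   */
        memcpy(scratch + 8, edge - 4 + pitch, 8);  /* row n+1 */
    }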
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-;# r6 const signed char *limit
-;# r7 const signed char *thresh
-mbloop_filter_vertical_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- la r9, -48(r1) ;# temporary space for reading in vectors
- sub r3, r3, r4
-
- RLVmb v0, r9
- RLVmb v1, r9
- RLVmb v2, r9
- RLVmb v3, r9
- RLVmb v4, r9
- RLVmb v5, r9
- RLVmb v6, r9
- RLVmb v7, r9
-
- transpose8x16_fwd
-
- build_constants r5, r6, r7, v8, v9, v10
-
- vp8_mbfilter
-
- transpose8x16_inv
-
- add r3, r3, r4
- neg r4, r4
-
- WLVmb v17, r9
- WLVmb v16, r9
- WLVmb v15, r9
- WLVmb v14, r9
- WLVmb v13, r9
- WLVmb v12, r9
- WLVmb v11, r9
- WLVmb v10, r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro RL V, R, P
- lvx \V, 0, \R
- add \R, \R, \P
-.endm
-
-.macro WL V, R, P
- stvx \V, 0, \R
- add \R, \R, \P
-.endm
-
-.macro Fil P3, P2, P1, P0, Q0, Q1, Q2, Q3
- ;# K = |P0-P1| already
- Abs v14, v13, \Q0, \Q1 ;# M = |Q0-Q1|
- vmaxub v14, v14, v4 ;# M = max( |P0-P1|, |Q0-Q1|)
- vcmpgtub v10, v14, v0
-
- Abs v4, v5, \Q2, \Q3 ;# K = |Q2-Q3| = next |P0-P1|
-
- max_abs v14, v13, \Q1, \Q2 ;# M = max( M, |Q1-Q2|)
- max_abs v14, v13, \P1, \P2 ;# M = max( M, |P1-P2|)
- max_abs v14, v13, \P2, \P3 ;# M = max( M, |P2-P3|)
-
- vmaxub v14, v14, v4 ;# M = max interior abs diff
- vcmpgtub v9, v14, v2 ;# M = true if int_l exceeded
-
- Abs v14, v13, \P0, \Q0 ;# X = Abs( P0-Q0)
- vcmpgtub v8, v14, v3 ;# X = true if edge_l exceeded
- vor v8, v8, v9 ;# M = true if edge_l or int_l exceeded
-
- ;# replace P1,Q1 w/signed versions
- common_adjust \P0, \Q0, \P1, \Q1, 1
-
- vaddubm v13, v13, v1 ;# -16 <= M <= 15, saturation irrelevant
- vsrab v13, v13, v1
- vandc v13, v13, v10 ;# adjust P1,Q1 by (M+1)>>1 if ! hev
- vsubsbs \Q1, \Q1, v13
- vaddsbs \P1, \P1, v13
-
- vxor \P1, \P1, v11 ;# P1
- vxor \P0, \P0, v11 ;# P0
- vxor \Q0, \Q0, v11 ;# Q0
- vxor \Q1, \Q1, v11 ;# Q1
-.endm
-
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-;# r6 const signed char *limit
-;# r7 const signed char *thresh
-loop_filter_vertical_edge_y_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- addi r9, r3, 0
- RL v16, r9, r4
- RL v17, r9, r4
- RL v18, r9, r4
- RL v19, r9, r4
- RL v20, r9, r4
- RL v21, r9, r4
- RL v22, r9, r4
- RL v23, r9, r4
- RL v24, r9, r4
- RL v25, r9, r4
- RL v26, r9, r4
- RL v27, r9, r4
- RL v28, r9, r4
- RL v29, r9, r4
- RL v30, r9, r4
- lvx v31, 0, r9
-
- Transpose16x16
-
- vspltisb v1, 1
-
- build_constants r5, r6, r7, v3, v2, v0
-
- Abs v4, v5, v19, v18 ;# K(v14) = first |P0-P1|
-
- Fil v16, v17, v18, v19, v20, v21, v22, v23
- Fil v20, v21, v22, v23, v24, v25, v26, v27
- Fil v24, v25, v26, v27, v28, v29, v30, v31
-
- Transpose16x16
-
- addi r9, r3, 0
- WL v16, r9, r4
- WL v17, r9, r4
- WL v18, r9, r4
- WL v19, r9, r4
- WL v20, r9, r4
- WL v21, r9, r4
- WL v22, r9, r4
- WL v23, r9, r4
- WL v24, r9, r4
- WL v25, r9, r4
- WL v26, r9, r4
- WL v27, r9, r4
- WL v28, r9, r4
- WL v29, r9, r4
- WL v30, r9, r4
- stvx v31, 0, r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- UV FILTERING -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-
-.macro active_chroma_sel V
- andi. r7, r3, 8 ;# row origin modulo 16
- add r7, r7, r7 ;# selects selectors
- lis r12, _chromaSelectors@ha
- la r0, _chromaSelectors@l(r12)
- lwzux r0, r7, r0 ;# leave selector addr in r7
-
- lvx \V, 0, r0 ;# mask to concatenate active U,V pels
-.endm
-
-.macro hread_uv Dest, U, V, Offs, VMask
- lvx \U, \Offs, r3
- lvx \V, \Offs, r4
- vperm \Dest, \U, \V, \VMask ;# Dest = active part of U then V
-.endm
-
-.macro hwrite_uv New, U, V, Offs, Umask, Vmask
- vperm \U, \New, \U, \Umask ;# Combine new pels with siblings
- vperm \V, \New, \V, \Vmask
- stvx \U, \Offs, r3 ;# Write to frame buffer
- stvx \V, \Offs, r4
-.endm
-
-;# Process U,V in parallel.
-.macro load_chroma_h
- neg r9, r5 ;# r9 = -1 * stride
- add r8, r9, r9 ;# r8 = -2 * stride
- add r10, r5, r5 ;# r10 = 2 * stride
-
- active_chroma_sel v12
-
- ;# P3, Q3 are read-only; need not save addresses or sibling pels
- add r6, r8, r8 ;# r6 = -4 * stride
- hread_uv v0, v14, v15, r6, v12
- add r6, r10, r5 ;# r6 = 3 * stride
- hread_uv v7, v14, v15, r6, v12
-
- ;# Others are read/write; save addresses and sibling pels
-
- add r6, r8, r9 ;# r6 = -3 * stride
- hread_uv v1, v16, v17, r6, v12
- hread_uv v2, v18, v19, r8, v12
- hread_uv v3, v20, v21, r9, v12
- hread_uv v4, v22, v23, 0, v12
- hread_uv v5, v24, v25, r5, v12
- hread_uv v6, v26, v27, r10, v12
-.endm
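hread_uv packs the active half of a U row and the active half of a V row into one vector, so each filter invocation covers both chroma planes at once. A scalar picture of the packing (vperm with the chroma selector does this in a single instruction):

    /* Concatenate the 8 active U pels and 8 active V pels of one row. */
    static void pack_uv_row(const unsigned char *u, const unsigned char *v,
                            unsigned char row[16]) {
        int i;
        for (i = 0; i < 8; i++) {
            row[i]     = u[i];
            row[i + 8] = v[i];
        }
    }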
-
-.macro uresult_sel V
- load_g \V, 4(r7)
-.endm
-
-.macro vresult_sel V
- load_g \V, 8(r7)
-.endm
-
-;# always write P1,P0,Q0,Q1
-.macro store_chroma_h
- uresult_sel v11
- vresult_sel v12
- hwrite_uv v2, v18, v19, r8, v11, v12
- hwrite_uv v3, v20, v21, r9, v11, v12
- hwrite_uv v4, v22, v23, 0, v11, v12
- hwrite_uv v5, v24, v25, r5, v11, v12
-.endm
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-mbloop_filter_horizontal_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r6, r7, r8, v8, v9, v10
-
- load_chroma_h
-
- vp8_mbfilter
-
- store_chroma_h
-
- hwrite_uv v1, v16, v17, r6, v11, v12 ;# v1 == P2
- hwrite_uv v6, v26, v27, r10, v11, v12 ;# v6 == Q2
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-loop_filter_horizontal_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- build_constants r6, r7, r8, v8, v9, v10
-
- load_chroma_h
-
- SBFilter
-
- store_chroma_h
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro R V, R
- lwzux r0, r3, r5
- stw r0, 4(\R)
- lwz r0,-4(r3)
- stw r0, 0(\R)
- lwzux r0, r4, r5
- stw r0,12(\R)
- lwz r0,-4(r4)
- stw r0, 8(\R)
- lvx \V, 0, \R
-.endm
-
-
-.macro W V, R
- stvx \V, 0, \R
- lwz r0,12(\R)
- stwux r0, r4, r5
- lwz r0, 8(\R)
- stw r0,-4(r4)
- lwz r0, 4(\R)
- stwux r0, r3, r5
- lwz r0, 0(\R)
- stw r0,-4(r3)
-.endm
-
-.macro chroma_vread R
- sub r3, r3, r5 ;# back up one line for simplicity
- sub r4, r4, r5
-
- R v0, \R
- R v1, \R
- R v2, \R
- R v3, \R
- R v4, \R
- R v5, \R
- R v6, \R
- R v7, \R
-
- transpose8x16_fwd
-.endm
-
-.macro chroma_vwrite R
-
- transpose8x16_inv
-
- add r3, r3, r5
- add r4, r4, r5
- neg r5, r5 ;# Write rows back in reverse order
-
- W v17, \R
- W v16, \R
- W v15, \R
- W v14, \R
- W v13, \R
- W v12, \R
- W v11, \R
- W v10, \R
-.endm
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-mbloop_filter_vertical_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- la r9, -48(r1) ;# temporary space for reading in vectors
-
- chroma_vread r9
-
- build_constants r6, r7, r8, v8, v9, v10
-
- vp8_mbfilter
-
- chroma_vwrite r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *u
-;# r4 unsigned char *v
-;# r5 int p
-;# r6 const signed char *flimit
-;# r7 const signed char *limit
-;# r8 const signed char *thresh
-loop_filter_vertical_edge_uv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- la r9, -48(r1) ;# temporary space for reading in vectors
-
- chroma_vread r9
-
- build_constants r6, r7, r8, v8, v9, v10
-
- SBFilter
-
- chroma_vwrite r9
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# -=-=-=-=-=-=-=-=-=-=-=-=-=-= SIMPLE LOOP FILTER =-=-=-=-=-=-=-=-=-=-=-=-=-=-
-
-.macro vp8_simple_filter
- Abs v14, v13, v1, v2 ;# M = abs( P0 - Q0)
- vcmpgtub v8, v14, v8 ;# v8 = true if _over_ limit
-
- ;# preserve unsigned v0 and v3
- common_adjust v1, v2, v0, v3, 0
-
- vxor v1, v1, v11
- vxor v2, v2, v11 ;# cvt Q0, P0 back to pels
-.endm
-
-.macro simple_vertical
- addi r8, 0, 16
- addi r7, r5, 32
-
- lvx v0, 0, r5
- lvx v1, r8, r5
- lvx v2, 0, r7
- lvx v3, r8, r7
-
- lis r12, _B_hihi@ha
- la r0, _B_hihi@l(r12)
- lvx v16, 0, r0
-
- lis r12, _B_lolo@ha
- la r0, _B_lolo@l(r12)
- lvx v17, 0, r0
-
- Transpose4times4x4 v16, v17
- vp8_simple_filter
-
- vxor v0, v0, v11
- vxor v3, v3, v11 ;# cvt Q0, P0 back to pels
-
- Transpose4times4x4 v16, v17
-
- stvx v0, 0, r5
- stvx v1, r8, r5
- stvx v2, 0, r7
- stvx v3, r8, r7
-.endm
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-loop_filter_simple_horizontal_edge_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- ;# build constants
- lvx v8, 0, r5 ;# flimit
-
- vspltisb v11, 8
- vspltisb v12, 4
- vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
-
- neg r5, r4 ;# r5 = -1 * stride
- add r6, r5, r5 ;# r6 = -2 * stride
-
- lvx v0, r6, r3 ;# v0 = P1 = 16 pels two rows above edge
- lvx v1, r5, r3 ;# v1 = P0 = 16 pels one row above edge
- lvx v2, 0, r3 ;# v2 = Q0 = 16 pels one row below edge
- lvx v3, r4, r3 ;# v3 = Q1 = 16 pels two rows below edge
-
- vp8_simple_filter
-
- stvx v1, r5, r3 ;# store P0
- stvx v2, 0, r3 ;# store Q0
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-.macro RLV Offs
- stw r0, (\Offs*4)(r5)
- lwzux r0, r7, r4
-.endm
-
-.macro WLV Offs
- lwz r0, (\Offs*4)(r5)
- stwux r0, r7, r4
-.endm
-
- .align 2
-;# r3 unsigned char *s
-;# r4 int p
-;# r5 const signed char *flimit
-loop_filter_simple_vertical_edge_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xc000
- mtspr 256, r12 ;# set VRSAVE
-
- ;# build constants
- lvx v8, 0, r5 ;# flimit
-
- vspltisb v11, 8
- vspltisb v12, 4
- vslb v11, v11, v12 ;# 0x80808080808080808080808080808080
-
- la r5, -96(r1) ;# temporary space for reading in vectors
-
- ;# Store 4 pels at word "Offs" in temp array, then advance r7
- ;# to next row and read another 4 pels from the frame buffer.
-
- subi r7, r3, 2 ;# r7 -> 2 pels before start
- lwzx r0, 0, r7 ;# read first 4 pels
-
- ;# 16 unaligned word accesses
- RLV 0
- RLV 4
- RLV 8
- RLV 12
- RLV 1
- RLV 5
- RLV 9
- RLV 13
- RLV 2
- RLV 6
- RLV 10
- RLV 14
- RLV 3
- RLV 7
- RLV 11
-
- stw r0, (15*4)(r5) ;# write last 4 pels
-
- simple_vertical
-
- ;# Read temp array, write frame buffer.
- subi r7, r3, 2 ;# r7 -> 2 pels before start
- lwzx r0, 0, r5 ;# read/write first 4 pels
- stwx r0, 0, r7
-
- WLV 4
- WLV 8
- WLV 12
- WLV 1
- WLV 5
- WLV 9
- WLV 13
- WLV 2
- WLV 6
- WLV 10
- WLV 14
- WLV 3
- WLV 7
- WLV 11
- WLV 15
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
-_chromaSelectors:
- .long _B_hihi
- .long _B_Ures0
- .long _B_Vres0
- .long 0
- .long _B_lolo
- .long _B_Ures8
- .long _B_Vres8
- .long 0
-
- .align 4
-_B_Vres8:
- .byte 16, 17, 18, 19, 20, 21, 22, 23, 8, 9, 10, 11, 12, 13, 14, 15
-
- .align 4
-_B_Ures8:
- .byte 16, 17, 18, 19, 20, 21, 22, 23, 0, 1, 2, 3, 4, 5, 6, 7
-
- .align 4
-_B_lolo:
- .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
-
- .align 4
-_B_Vres0:
- .byte 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31
- .align 4
-_B_Ures0:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 24, 25, 26, 27, 28, 29, 30, 31
-
- .align 4
-_B_hihi:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/vp8/common/ppc/platform_altivec.asm b/vp8/common/ppc/platform_altivec.asm
deleted file mode 100644
index f81d86f..0000000
--- a/vp8/common/ppc/platform_altivec.asm
+++ /dev/null
@@ -1,59 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl save_platform_context
- .globl restore_platform_context
-
-.macro W V P
- stvx \V, 0, \P
- addi \P, \P, 16
-.endm
-
-.macro R V P
- lvx \V, 0, \P
- addi \P, \P, 16
-.endm
-
-;# r3 context_ptr
- .align 2
-save_platform_context:
- W v20, r3
- W v21, r3
- W v22, r3
- W v23, r3
- W v24, r3
- W v25, r3
- W v26, r3
- W v27, r3
- W v28, r3
- W v29, r3
- W v30, r3
- W v31, r3
-
- blr
-
-;# r3 context_ptr
- .align 2
-restore_platform_context:
- R v20, r3
- R v21, r3
- R v22, r3
- R v23, r3
- R v24, r3
- R v25, r3
- R v26, r3
- R v27, r3
- R v28, r3
- R v29, r3
- R v30, r3
- R v31, r3
-
- blr
diff --git a/vp8/common/ppc/recon_altivec.asm b/vp8/common/ppc/recon_altivec.asm
deleted file mode 100644
index dd39e05..0000000
--- a/vp8/common/ppc/recon_altivec.asm
+++ /dev/null
@@ -1,175 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl recon4b_ppc
- .globl recon2b_ppc
- .globl recon_b_ppc
-
-.macro row_of16 Diff Pred Dst Stride
- lvx v1, 0, \Pred ;# v1 = pred = p0..p15
- addi \Pred, \Pred, 16 ;# next pred
- vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
- lvx v3, 0, \Diff ;# v3 = d0..d7
- vaddshs v2, v2, v3 ;# v2 = r0..r7
- vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
- lvx v3, r8, \Diff ;# v3 = d8..d15
- addi \Diff, \Diff, 32 ;# next diff
- vaddshs v3, v3, v1 ;# v3 = r8..r15
- vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
- stvx v2, 0, \Dst ;# to dst
- add \Dst, \Dst, \Stride ;# next dst
-.endm
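
A scalar sketch of one row_of16 iteration, for orientation: reconstruction is clamp(pred + diff, 0, 255), where vaddshs supplies the saturating 16-bit add and vpkshus the saturating pack back to unsigned 8-bit pels. The helper name is illustrative.

    static void recon_row16(const short *diff, const unsigned char *pred,
                            unsigned char *dst)
    {
        int i;
        for (i = 0; i < 16; i++) {
            int v = pred[i] + diff[i];                  /* vaddshs */
            dst[i] = (unsigned char)(v < 0 ? 0 : (v > 255 ? 255 : v)); /* vpkshus */
        }
    }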
-
- .text
- .align 2
-;# r3 = short *diff_ptr,
-;# r4 = unsigned char *pred_ptr,
-;# r5 = unsigned char *dst_ptr,
-;# r6 = int stride
-recon4b_ppc:
- mfspr r0, 256 ;# get old VRSAVE
- stw r0, -8(r1) ;# save old VRSAVE to stack
- oris r0, r0, 0xf000
- mtspr 256,r0 ;# set VRSAVE
-
- vxor v0, v0, v0
- li r8, 16
-
- row_of16 r3, r4, r5, r6
- row_of16 r3, r4, r5, r6
- row_of16 r3, r4, r5, r6
- row_of16 r3, r4, r5, r6
-
- lwz r12, -8(r1) ;# restore old VRSAVE from stack
- mtspr 256, r12 ;# reset old VRSAVE
-
- blr
-
-.macro two_rows_of8 Diff Pred Dst Stride write_first_four_pels
- lvx v1, 0, \Pred ;# v1 = pred = p0..p15
- vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
- lvx v3, 0, \Diff ;# v3 = d0..d7
- vaddshs v2, v2, v3 ;# v2 = r0..r7
- vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
- lvx v3, r8, \Diff ;# v3 = d8..d15
- vaddshs v3, v3, v1 ;# v3 = r8..r15
- vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
- stvx v2, 0, r10 ;# 2 rows to dst from buf
- lwz r0, 0(r10)
-.if \write_first_four_pels
- stw r0, 0(\Dst)
-.else
- stwux r0, \Dst, \Stride
-.endif
- lwz r0, 4(r10)
- stw r0, 4(\Dst)
- lwz r0, 8(r10)
- stwux r0, \Dst, \Stride ;# advance dst to next row
- lwz r0, 12(r10)
- stw r0, 4(\Dst)
-.endm
-
- .align 2
-;# r3 = short *diff_ptr,
-;# r4 = unsigned char *pred_ptr,
-;# r5 = unsigned char *dst_ptr,
-;# r6 = int stride
-
-recon2b_ppc:
- mfspr r0, 256 ;# get old VRSAVE
- stw r0, -8(r1) ;# save old VRSAVE to stack
- oris r0, r0, 0xf000
- mtspr 256,r0 ;# set VRSAVE
-
- vxor v0, v0, v0
- li r8, 16
-
- la r10, -48(r1) ;# buf
-
- two_rows_of8 r3, r4, r5, r6, 1
-
- addi r4, r4, 16 ;# next pred
- addi r3, r3, 32 ;# next diff
-
- two_rows_of8 r3, r4, r5, r6, 0
-
- lwz r12, -8(r1) ;# restore old VRSAVE from stack
- mtspr 256, r12 ;# reset old VRSAVE
-
- blr
-
-.macro get_two_diff_rows
- stw r0, 0(r10)
- lwz r0, 4(r3)
- stw r0, 4(r10)
- lwzu r0, 32(r3)
- stw r0, 8(r10)
- lwz r0, 4(r3)
- stw r0, 12(r10)
- lvx v3, 0, r10
-.endm
-
- .align 2
-;# r3 = short *diff_ptr,
-;# r4 = unsigned char *pred_ptr,
-;# r5 = unsigned char *dst_ptr,
-;# r6 = int stride
-recon_b_ppc:
- mfspr r0, 256 ;# get old VRSAVE
- stw r0, -8(r1) ;# save old VRSAVE to stack
- oris r0, r0, 0xf000
- mtspr 256,r0 ;# set VRSAVE
-
- vxor v0, v0, v0
-
- la r10, -48(r1) ;# buf
-
- lwz r0, 0(r4)
- stw r0, 0(r10)
- lwz r0, 16(r4)
- stw r0, 4(r10)
- lwz r0, 32(r4)
- stw r0, 8(r10)
- lwz r0, 48(r4)
- stw r0, 12(r10)
-
- lvx v1, 0, r10 ;# v1 = pred = p0..p15
-
- lwz r0, 0(r3) ;# v3 = d0..d7
-
- get_two_diff_rows
-
- vmrghb v2, v0, v1 ;# v2 = 16-bit p0..p7
- vaddshs v2, v2, v3 ;# v2 = r0..r7
-
- lwzu r0, 32(r3) ;# v3 = d8..d15
-
- get_two_diff_rows
-
- vmrglb v1, v0, v1 ;# v1 = 16-bit p8..p15
- vaddshs v3, v3, v1 ;# v3 = r8..r15
-
- vpkshus v2, v2, v3 ;# v2 = 8-bit r0..r15
- stvx v2, 0, r10 ;# 16 pels to dst from buf
-
- lwz r0, 0(r10)
- stw r0, 0(r5)
- lwz r0, 4(r10)
- stwux r0, r5, r6
- lwz r0, 8(r10)
- stwux r0, r5, r6
- lwz r0, 12(r10)
- stwx r0, r5, r6
-
- lwz r12, -8(r1) ;# restore old VRSAVE from stack
- mtspr 256, r12 ;# reset old VRSAVE
-
- blr
diff --git a/vp8/common/ppc/sad_altivec.asm b/vp8/common/ppc/sad_altivec.asm
deleted file mode 100644
index e5f2638..0000000
--- a/vp8/common/ppc/sad_altivec.asm
+++ /dev/null
@@ -1,277 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_sad16x16_ppc
- .globl vp8_sad16x8_ppc
- .globl vp8_sad8x16_ppc
- .globl vp8_sad8x8_ppc
- .globl vp8_sad4x4_ppc
-
-.macro load_aligned_16 V R O
- lvsl v3, 0, \R ;# permutate value for alignment
-
- lvx v1, 0, \R
- lvx v2, \O, \R
-
- vperm \V, v1, v2, v3
-.endm
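
The lvsl/lvx/lvx/vperm sequence above is the standard AltiVec unaligned load: lvsl yields a permute mask encoding the pointer's misalignment, the two lvx loads fetch the aligned blocks bracketing the data, and vperm splices out the wanted 16 bytes. An intrinsics sketch of the same idiom (illustrative, not from the tree):

    #include <altivec.h>

    static vector unsigned char load_unaligned_16(const unsigned char *p)
    {
        vector unsigned char perm = vec_lvsl(0, p); /* misalignment mask  */
        vector unsigned char lo   = vec_ld(0, p);   /* aligned block at p */
        vector unsigned char hi   = vec_ld(16, p);  /* next aligned block */
        return vec_perm(lo, hi, perm);              /* splice the halves  */
    }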
-
-.macro prologue
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- li r10, 16 ;# load offset and loop counter
-
- vspltisw v8, 0 ;# zero out total to start
-.endm
-
-.macro epilogue
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-.endm
-
-.macro SAD_16
- ;# v6 = abs (v4 - v5)
- vsububs v6, v4, v5
- vsububs v7, v5, v4
- vor v6, v6, v7
-
- ;# v8 += abs (v4 - v5)
- vsum4ubs v8, v6, v8
-.endm
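
SAD_16 builds |a - b| without a vector abs: of the two saturating unsigned subtracts, the one that would go negative saturates to zero, so OR-ing them leaves the true difference, and vsum4ubs accumulates it. A scalar sketch of one 16-pel row (illustrative):

    static unsigned int sad_row16(const unsigned char *src,
                                  const unsigned char *ref)
    {
        unsigned int sad = 0;
        int i;
        for (i = 0; i < 16; i++)  /* abs diff via larger-minus-smaller */
            sad += src[i] > ref[i] ? src[i] - ref[i] : ref[i] - src[i];
        return sad;
    }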
-
-.macro sad_16_loop loop_label
- lvsl v3, 0, r5 ;# only needs to be done once per block
-
- ;# preload a line of data before getting into the loop
- lvx v4, 0, r3
- lvx v1, 0, r5
- lvx v2, r10, r5
-
- add r5, r5, r6
- add r3, r3, r4
-
- vperm v5, v1, v2, v3
-
- .align 4
-\loop_label:
- ;# compute difference on first row
- vsububs v6, v4, v5
- vsububs v7, v5, v4
-
- ;# load up next set of data
- lvx v9, 0, r3
- lvx v1, 0, r5
- lvx v2, r10, r5
-
- ;# perform abs() of difference
- vor v6, v6, v7
- add r3, r3, r4
-
- ;# add to the running tally
- vsum4ubs v8, v6, v8
-
- ;# now onto the next line
- vperm v5, v1, v2, v3
- add r5, r5, r6
- lvx v4, 0, r3
-
- ;# compute difference on second row
- vsububs v6, v9, v5
- lvx v1, 0, r5
- vsububs v7, v5, v9
- lvx v2, r10, r5
- vor v6, v6, v7
- add r3, r3, r4
- vsum4ubs v8, v6, v8
- vperm v5, v1, v2, v3
- add r5, r5, r6
-
- bdnz \loop_label
-
- vspltisw v7, 0
-
- vsumsws v8, v8, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-.endm
-
-.macro sad_8_loop loop_label
- .align 4
-\loop_label:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v6, r3, r10
- load_aligned_16 v7, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- vmrghb v4, v4, v6
- vmrghb v5, v5, v7
-
- SAD_16
-
- bdnz \loop_label
-
- vspltisw v7, 0
-
- vsumsws v8, v8, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad16x16_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- sad_16_loop sad16x16_loop
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad16x8_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- sad_16_loop sad16x8_loop
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad8x16_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- sad_8_loop sad8x16_loop
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad8x8_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- sad_8_loop sad8x8_loop
-
- epilogue
-
- blr
-
-.macro transfer_4x4 I P
- lwz r0, 0(\I)
- add \I, \I, \P
-
- lwz r7, 0(\I)
- add \I, \I, \P
-
- lwz r8, 0(\I)
- add \I, \I, \P
-
- lwz r9, 0(\I)
-
- stw r0, 0(r1)
- stw r7, 4(r1)
- stw r8, 8(r1)
- stw r9, 12(r1)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int ref_stride
-;#
-;# r3 return value
-vp8_sad4x4_ppc:
-
- prologue
-
- transfer_4x4 r3, r4
- lvx v4, 0, r1
-
- transfer_4x4 r5, r6
- lvx v5, 0, r1
-
- vspltisw v8, 0 ;# zero out total to start
-
- ;# v6 = abs (v4 - v5)
- vsububs v6, v4, v5
- vsububs v7, v5, v4
- vor v6, v6, v7
-
- ;# v8 += abs (v4 - v5)
- vsum4ubs v7, v6, v8
- vsumsws v7, v7, v8
-
- stvx v7, 0, r1
- lwz r3, 12(r1)
-
- epilogue
-
- blr
diff --git a/vp8/common/ppc/systemdependent.c b/vp8/common/ppc/systemdependent.c
deleted file mode 100644
index 6899c0e..0000000
--- a/vp8/common/ppc/systemdependent.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "subpixel.h"
-#include "loopfilter.h"
-#include "recon.h"
-#include "onyxc_int.h"
-
-extern void (*vp8_post_proc_down_and_across_mb_row)(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int cols,
- unsigned char *f,
- int size
-);
-
-extern void (*vp8_mbpost_proc_down)(unsigned char *dst, int pitch, int rows, int cols, int flimit);
-extern void vp8_mbpost_proc_down_c(unsigned char *dst, int pitch, int rows, int cols, int flimit);
-extern void (*vp8_mbpost_proc_across_ip)(unsigned char *src, int pitch, int rows, int cols, int flimit);
-extern void vp8_mbpost_proc_across_ip_c(unsigned char *src, int pitch, int rows, int cols, int flimit);
-
-extern void vp8_post_proc_down_and_across_mb_row_c
-(
- unsigned char *src_ptr,
- unsigned char *dst_ptr,
- int src_pixels_per_line,
- int dst_pixels_per_line,
- int cols,
- unsigned char *f,
- int size
-);
-void vp8_plane_add_noise_c(unsigned char *Start, unsigned int Width, unsigned int Height, int Pitch, int q, int a);
-
-extern copy_mem_block_function *vp8_copy_mem16x16;
-extern copy_mem_block_function *vp8_copy_mem8x8;
-extern copy_mem_block_function *vp8_copy_mem8x4;
-
-// PPC
-extern subpixel_predict_function sixtap_predict_ppc;
-extern subpixel_predict_function sixtap_predict8x4_ppc;
-extern subpixel_predict_function sixtap_predict8x8_ppc;
-extern subpixel_predict_function sixtap_predict16x16_ppc;
-extern subpixel_predict_function bilinear_predict4x4_ppc;
-extern subpixel_predict_function bilinear_predict8x4_ppc;
-extern subpixel_predict_function bilinear_predict8x8_ppc;
-extern subpixel_predict_function bilinear_predict16x16_ppc;
-
-extern copy_mem_block_function copy_mem16x16_ppc;
-
-void recon_b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void recon2b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void recon4b_ppc(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-
-extern void short_idct4x4llm_ppc(short *input, short *output, int pitch);
-
-// Generic C
-extern subpixel_predict_function vp8_sixtap_predict_c;
-extern subpixel_predict_function vp8_sixtap_predict8x4_c;
-extern subpixel_predict_function vp8_sixtap_predict8x8_c;
-extern subpixel_predict_function vp8_sixtap_predict16x16_c;
-extern subpixel_predict_function vp8_bilinear_predict4x4_c;
-extern subpixel_predict_function vp8_bilinear_predict8x4_c;
-extern subpixel_predict_function vp8_bilinear_predict8x8_c;
-extern subpixel_predict_function vp8_bilinear_predict16x16_c;
-
-extern copy_mem_block_function vp8_copy_mem16x16_c;
-extern copy_mem_block_function vp8_copy_mem8x8_c;
-extern copy_mem_block_function vp8_copy_mem8x4_c;
-
-void vp8_recon_b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void vp8_recon2b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-void vp8_recon4b_c(short *diff_ptr, unsigned char *pred_ptr, unsigned char *dst_ptr, int stride);
-
-extern void vp8_short_idct4x4llm_1_c(short *input, short *output, int pitch);
-extern void vp8_short_idct4x4llm_c(short *input, short *output, int pitch);
-extern void vp8_dc_only_idct_c(short input_dc, short *output, int pitch);
-
-// PPC
-extern loop_filter_block_function loop_filter_mbv_ppc;
-extern loop_filter_block_function loop_filter_bv_ppc;
-extern loop_filter_block_function loop_filter_mbh_ppc;
-extern loop_filter_block_function loop_filter_bh_ppc;
-
-extern loop_filter_block_function loop_filter_mbvs_ppc;
-extern loop_filter_block_function loop_filter_bvs_ppc;
-extern loop_filter_block_function loop_filter_mbhs_ppc;
-extern loop_filter_block_function loop_filter_bhs_ppc;
-
-// Generic C
-extern loop_filter_block_function vp8_loop_filter_mbv_c;
-extern loop_filter_block_function vp8_loop_filter_bv_c;
-extern loop_filter_block_function vp8_loop_filter_mbh_c;
-extern loop_filter_block_function vp8_loop_filter_bh_c;
-
-extern loop_filter_block_function vp8_loop_filter_mbvs_c;
-extern loop_filter_block_function vp8_loop_filter_bvs_c;
-extern loop_filter_block_function vp8_loop_filter_mbhs_c;
-extern loop_filter_block_function vp8_loop_filter_bhs_c;
-
-extern loop_filter_block_function *vp8_lf_mbvfull;
-extern loop_filter_block_function *vp8_lf_mbhfull;
-extern loop_filter_block_function *vp8_lf_bvfull;
-extern loop_filter_block_function *vp8_lf_bhfull;
-
-extern loop_filter_block_function *vp8_lf_mbvsimple;
-extern loop_filter_block_function *vp8_lf_mbhsimple;
-extern loop_filter_block_function *vp8_lf_bvsimple;
-extern loop_filter_block_function *vp8_lf_bhsimple;
-
-void vp8_clear_c(void)
-{
-}
-
-void vp8_machine_specific_config(void)
-{
- // Pure C:
- vp8_clear_system_state = vp8_clear_c;
- vp8_recon_b = vp8_recon_b_c;
- vp8_recon4b = vp8_recon4b_c;
- vp8_recon2b = vp8_recon2b_c;
-
- vp8_bilinear_predict16x16 = bilinear_predict16x16_ppc;
- vp8_bilinear_predict8x8 = bilinear_predict8x8_ppc;
- vp8_bilinear_predict8x4 = bilinear_predict8x4_ppc;
- vp8_bilinear_predict = bilinear_predict4x4_ppc;
-
- vp8_sixtap_predict16x16 = sixtap_predict16x16_ppc;
- vp8_sixtap_predict8x8 = sixtap_predict8x8_ppc;
- vp8_sixtap_predict8x4 = sixtap_predict8x4_ppc;
- vp8_sixtap_predict = sixtap_predict_ppc;
-
- vp8_short_idct4x4_1 = vp8_short_idct4x4llm_1_c;
- vp8_short_idct4x4 = short_idct4x4llm_ppc;
- vp8_dc_only_idct = vp8_dc_only_idct_c;
-
- vp8_lf_mbvfull = loop_filter_mbv_ppc;
- vp8_lf_bvfull = loop_filter_bv_ppc;
- vp8_lf_mbhfull = loop_filter_mbh_ppc;
- vp8_lf_bhfull = loop_filter_bh_ppc;
-
- vp8_lf_mbvsimple = loop_filter_mbvs_ppc;
- vp8_lf_bvsimple = loop_filter_bvs_ppc;
- vp8_lf_mbhsimple = loop_filter_mbhs_ppc;
- vp8_lf_bhsimple = loop_filter_bhs_ppc;
-
- vp8_post_proc_down_and_across_mb_row = vp8_post_proc_down_and_across_mb_row_c;
- vp8_mbpost_proc_down = vp8_mbpost_proc_down_c;
- vp8_mbpost_proc_across_ip = vp8_mbpost_proc_across_ip_c;
- vp8_plane_add_noise = vp8_plane_add_noise_c;
-
- vp8_copy_mem16x16 = copy_mem16x16_ppc;
- vp8_copy_mem8x8 = vp8_copy_mem8x8_c;
- vp8_copy_mem8x4 = vp8_copy_mem8x4_c;
-
-}
diff --git a/vp8/common/ppc/variance_altivec.asm b/vp8/common/ppc/variance_altivec.asm
deleted file mode 100644
index fb8d5bb..0000000
--- a/vp8/common/ppc/variance_altivec.asm
+++ /dev/null
@@ -1,375 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_get8x8var_ppc
- .globl vp8_get16x16var_ppc
- .globl vp8_mse16x16_ppc
- .globl vp8_variance16x16_ppc
- .globl vp8_variance16x8_ppc
- .globl vp8_variance8x16_ppc
- .globl vp8_variance8x8_ppc
- .globl vp8_variance4x4_ppc
-
-.macro load_aligned_16 V R O
- lvsl v3, 0, \R ;# permutate value for alignment
-
- lvx v1, 0, \R
- lvx v2, \O, \R
-
- vperm \V, v1, v2, v3
-.endm
-
-.macro prologue
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffc0
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- li r10, 16 ;# load offset and loop counter
-
- vspltisw v7, 0 ;# zero for merging
- vspltisw v8, 0 ;# zero out total to start
- vspltisw v9, 0 ;# zero out total for dif^2
-.endm
-
-.macro epilogue
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-.endm
-
-.macro compute_sum_sse
- ;# Compute sum first. Unpack so that a signed
- ;# subtract can be used. Only have a halfword
- ;# signed subtract. Do high, then low.
- vmrghb v2, v7, v4
- vmrghb v3, v7, v5
- vsubshs v2, v2, v3
- vsum4shs v8, v2, v8
-
- vmrglb v2, v7, v4
- vmrglb v3, v7, v5
- vsubshs v2, v2, v3
- vsum4shs v8, v2, v8
-
- ;# Now compute sse.
- vsububs v2, v4, v5
- vsububs v3, v5, v4
- vor v2, v2, v3
-
- vmsumubm v9, v2, v2, v9
-.endm
-
-.macro variance_16 DS loop_label store_sum
-\loop_label:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- compute_sum_sse
-
- bdnz \loop_label
-
- vsumsws v8, v8, v7
- vsumsws v9, v9, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r4, 12(r1)
-
-.if \store_sum
- stw r3, 0(r8) ;# sum
-.endif
- stw r4, 0(r7) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
- srlwi r3, r3, \DS ;# (sum*sum) >> DS
- subf r3, r3, r4 ;# sse - ((sum*sum) >> DS)
-.endm
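
The mullw/srlwi/subf tail is the usual shortcut variance = sse - sum*sum/N, with the divide done as a shift because the block holds a power-of-two number of pels (DS = log2(N)). A scalar sketch (illustrative; the stw to 0(r7) is the *SSE out-parameter, and mullw is modular 32-bit, hence the unsigned multiply):

    static unsigned int variance_tail(int sum, unsigned int sse,
                                      unsigned int *sse_out, int ds)
    {
        unsigned int usum = (unsigned int)sum;  /* mullw wraps mod 2^32  */
        *sse_out = sse;                         /* stw r4, 0(r7): *SSE   */
        return sse - ((usum * usum) >> ds);     /* sse - (sum*sum >> DS) */
    }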
-
-.macro variance_8 DS loop_label store_sum
-\loop_label:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v6, r3, r10
- load_aligned_16 v0, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- vmrghb v4, v4, v6
- vmrghb v5, v5, v0
-
- compute_sum_sse
-
- bdnz \loop_label
-
- vsumsws v8, v8, v7
- vsumsws v9, v9, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r4, 12(r1)
-
-.if \store_sum
- stw r3, 0(r8) ;# sum
-.endif
- stw r4, 0(r7) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
- srlwi r3, r3, \DS ;# (sum*sum) >> \DS
- subf r3, r3, r4 ;# sse - ((sum*sum) >> \DS)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get8x8var_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- variance_8 6, get8x8var_loop, 1
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *SSE
-;# r8 int *Sum
-;#
-;# r3 return value
-vp8_get16x16var_ppc:
-
- prologue
-
- mtctr r10
-
- variance_16 8, get16x16var_loop, 1
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_mse16x16_ppc:
- prologue
-
- mtctr r10
-
-mse16x16_loop:
- ;# only one of the inputs should need to be aligned.
- load_aligned_16 v4, r3, r10
- load_aligned_16 v5, r5, r10
-
- ;# move onto the next line
- add r3, r3, r4
- add r5, r5, r6
-
- ;# Now compute sse.
- vsububs v2, v4, v5
- vsububs v3, v5, v4
- vor v2, v2, v3
-
- vmsumubm v9, v2, v2, v9
-
- bdnz mse16x16_loop
-
- vsumsws v9, v9, v7
-
- stvx v9, 0, r1
- lwz r3, 12(r1)
-
- stw r3, 0(r7) ;# sse
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance16x16_ppc:
-
- prologue
-
- mtctr r10
-
- variance_16 8, variance16x16_loop, 0
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance16x8_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- variance_16 7, variance16x8_loop, 0
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance8x16_ppc:
-
- prologue
-
- li r9, 8
- mtctr r9
-
- variance_8 7, variance8x16_loop, 0
-
- epilogue
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance8x8_ppc:
-
- prologue
-
- li r9, 4
- mtctr r9
-
- variance_8 6, variance8x8_loop, 0
-
- epilogue
-
- blr
-
-.macro transfer_4x4 I P
- lwz r0, 0(\I)
- add \I, \I, \P
-
- lwz r10,0(\I)
- add \I, \I, \P
-
- lwz r8, 0(\I)
- add \I, \I, \P
-
- lwz r9, 0(\I)
-
- stw r0, 0(r1)
- stw r10, 4(r1)
- stw r8, 8(r1)
- stw r9, 12(r1)
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int source_stride
-;# r5 unsigned char *ref_ptr
-;# r6 int recon_stride
-;# r7 unsigned int *sse
-;#
-;# r3 return value
-vp8_variance4x4_ppc:
-
- prologue
-
- transfer_4x4 r3, r4
- lvx v4, 0, r1
-
- transfer_4x4 r5, r6
- lvx v5, 0, r1
-
- compute_sum_sse
-
- vsumsws v8, v8, v7
- vsumsws v9, v9, v7
-
- stvx v8, 0, r1
- lwz r3, 12(r1)
-
- stvx v9, 0, r1
- lwz r4, 12(r1)
-
- stw r4, 0(r7) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
- srlwi r3, r3, 4 ;# (sum*sum) >> 4
- subf r3, r3, r4 ;# sse - ((sum*sum) >> 4)
-
- epilogue
-
- blr
diff --git a/vp8/common/ppc/variance_subpixel_altivec.asm b/vp8/common/ppc/variance_subpixel_altivec.asm
deleted file mode 100644
index 2308373..0000000
--- a/vp8/common/ppc/variance_subpixel_altivec.asm
+++ /dev/null
@@ -1,865 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_sub_pixel_variance4x4_ppc
- .globl vp8_sub_pixel_variance8x8_ppc
- .globl vp8_sub_pixel_variance8x16_ppc
- .globl vp8_sub_pixel_variance16x8_ppc
- .globl vp8_sub_pixel_variance16x16_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-.macro load_vfilter V0, V1
- load_c \V0, vfilter_b, r6, r12, r10
-
- addi r6, r6, 16
- lvx \V1, r6, r10
-.endm
-
-.macro HProlog jump_label
- ;# load up horizontal filter
- slwi. r5, r5, 4 ;# index into horizontal filter array
-
- ;# index to the next set of vectors in the row.
- li r10, 16
-
- ;# downshift by 7 (divide by 128) at the end
- vspltish v19, 7
-
- ;# If there isn't any filtering to be done for the horizontal, then
- ;# just skip to the second pass.
- beq \jump_label
-
- load_c v20, hfilter_b, r5, r12, r0
-
- ;# setup constants
- ;# v28 permutation value for the output reordering
- load_c v28, b_hperm_b, 0, r12, r0
-
- ;# index to the next set of vectors in the row.
- li r12, 32
-
- ;# rounding added in on the multiply
- vspltisw v21, 8
- vspltisw v18, 3
- vslw v18, v21, v18 ;# 0x00000040000000400000004000000040
-
- slwi. r6, r6, 5 ;# index into vertical filter array
-.endm
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-
-.macro hfilter_8 V, hp, lp, increment_counter
- lvsl v17, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 9 bytes wide, output is 8 bytes.
- lvx v21, 0, r3
- lvx v22, r10, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
-
- vperm v24, v21, v21, \hp ;# v24 = 0123 1234 2345 3456
- vperm v25, v21, v21, \lp ;# v25 = 4567 5678 6789 789A
-
- vmsummbm v24, v20, v24, v18
- vmsummbm v25, v20, v25, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
-
- vsrh v24, v24, v19 ;# divide v0, v1 by 128
-
- vpkuhus \V, v24, v24 ;# \V = scrambled 8-bit result
-.endm
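
hfilter_8 is the two-tap bilinear first pass: each output pel mixes two horizontally adjacent pels with taps from hfilter_b (every tap pair sums to 128), the rounding constant 64 rides into vmsummbm through v18, and vsrh shifts right by 7 (v19), i.e. divides by 128. A scalar sketch of one 8-pel row, reading src[0..8] (illustrative):

    static void hfilter_row8(const unsigned char *src, unsigned char *dst,
                             int f0, int f1)
    {
        int i;
        for (i = 0; i < 8; i++)  /* (p0*f0 + p1*f1 + 64) >> 7, f0+f1 == 128 */
            dst[i] = (unsigned char)((src[i] * f0 + src[i + 1] * f1 + 64) >> 7);
    }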
-
-.macro vfilter_16 P0 P1
- vmuleub v22, \P0, v20 ;# 64 + 4 positive taps
- vadduhm v22, v18, v22
- vmuloub v23, \P0, v20
- vadduhm v23, v18, v23
-
- vmuleub v24, \P1, v21
- vadduhm v22, v22, v24 ;# Re = evens, saturation unnecessary
- vmuloub v25, \P1, v21
- vadduhm v23, v23, v25 ;# Ro = odds
-
- vsrh v22, v22, v19 ;# divide by 128
- vsrh v23, v23, v19 ;# v22 v23 = evens, odds
- vmrghh \P0, v22, v23 ;# interleave to 16-bit result in order
- vmrglh v23, v22, v23
- vpkuhus \P0, \P0, v23 ;# P0 = 8-bit result
-.endm
-
-.macro compute_sum_sse src, ref, sum, sse, t1, t2, z0
- ;# Compute sum first. Unpack so that a signed
- ;# subtract can be used. Only have a halfword
- ;# signed subtract. Do high, then low.
- vmrghb \t1, \z0, \src
- vmrghb \t2, \z0, \ref
- vsubshs \t1, \t1, \t2
- vsum4shs \sum, \t1, \sum
-
- vmrglb \t1, \z0, \src
- vmrglb \t2, \z0, \ref
- vsubshs \t1, \t1, \t2
- vsum4shs \sum, \t1, \sum
-
- ;# Now compute sse.
- vsububs \t1, \src, \ref
- vsububs \t2, \ref, \src
- vor \t1, \t1, \t2
-
- vmsumubm \sse, \t1, \t1, \sse
-.endm
-
-.macro variance_final sum, sse, z0, DS
- vsumsws \sum, \sum, \z0
- vsumsws \sse, \sse, \z0
-
- stvx \sum, 0, r1
- lwz r3, 12(r1)
-
- stvx \sse, 0, r1
- lwz r4, 12(r1)
-
- stw r4, 0(r9) ;# sse
-
- mullw r3, r3, r3 ;# sum*sum
- srlwi r3, r3, \DS ;# (sum*sum) >> \DS
- subf r3, r3, r4 ;# sse - ((sum*sum) >> \DS)
-.endm
-
-.macro compute_sum_sse_16 V, increment_counter
- load_and_align_16 v16, r7, r8, \increment_counter
- compute_sum_sse \V, v16, v18, v19, v20, v21, v23
-.endm
-
-.macro load_and_align_16 V, R, P, increment_counter
- lvsl v17, 0, \R ;# permutate value for alignment
-
- ;# loads 16 unaligned bytes; the input can span
- ;# two vectors if not aligned correctly.
- lvx v21, 0, \R
- lvx v22, r10, \R
-
-.if \increment_counter
- add \R, \R, \P
-.endif
-
- vperm \V, v21, v22, v17
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance4x4_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf830
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_4x4_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r12, r0
- load_c v11, b_4567_b, 0, r12, r0
-
- hfilter_8 v0, v10, v11, 1
- hfilter_8 v1, v10, v11, 1
- hfilter_8 v2, v10, v11, 1
- hfilter_8 v3, v10, v11, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq compute_sum_sse_4x4_b
-
- hfilter_8 v4, v10, v11, 0
-
- b second_pass_4x4_b
-
-second_pass_4x4_pre_copy_b:
- slwi r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 0
-
-second_pass_4x4_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
-
-compute_sum_sse_4x4_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- load_and_align_16 v4, r7, r8, 1
- load_and_align_16 v5, r7, r8, 1
- load_and_align_16 v6, r7, r8, 1
- load_and_align_16 v7, r7, r8, 1
-
- vmrghb v0, v0, v1
- vmrghb v1, v2, v3
-
- vmrghb v2, v4, v5
- vmrghb v3, v6, v7
-
- load_c v10, b_hilo_b, 0, r12, r0
-
- vperm v0, v0, v1, v10
- vperm v1, v2, v3, v10
-
- compute_sum_sse v0, v1, v18, v19, v20, v21, v23
-
- variance_final v18, v19, v23, 4
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance8x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfff0
- ori r12, r12, 0xffff
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x8_pre_copy_b
-
- ;# Load up permutation constants
- load_c v10, b_0123_b, 0, r12, r0
- load_c v11, b_4567_b, 0, r12, r0
-
- hfilter_8 v0, v10, v11, 1
- hfilter_8 v1, v10, v11, 1
- hfilter_8 v2, v10, v11, 1
- hfilter_8 v3, v10, v11, 1
- hfilter_8 v4, v10, v11, 1
- hfilter_8 v5, v10, v11, 1
- hfilter_8 v6, v10, v11, 1
- hfilter_8 v7, v10, v11, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq compute_sum_sse_8x8_b
-
- hfilter_8 v8, v10, v11, 0
-
- b second_pass_8x8_b
-
-second_pass_8x8_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 0
-
- beq compute_sum_sse_8x8_b
-
-second_pass_8x8_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
-
-compute_sum_sse_8x8_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- vmrghb v0, v0, v1
- vmrghb v1, v2, v3
- vmrghb v2, v4, v5
- vmrghb v3, v6, v7
-
- load_and_align_16 v4, r7, r8, 1
- load_and_align_16 v5, r7, r8, 1
- load_and_align_16 v6, r7, r8, 1
- load_and_align_16 v7, r7, r8, 1
- load_and_align_16 v8, r7, r8, 1
- load_and_align_16 v9, r7, r8, 1
- load_and_align_16 v10, r7, r8, 1
- load_and_align_16 v11, r7, r8, 0
-
- vmrghb v4, v4, v5
- vmrghb v5, v6, v7
- vmrghb v6, v8, v9
- vmrghb v7, v10, v11
-
- compute_sum_sse v0, v4, v18, v19, v20, v21, v23
- compute_sum_sse v1, v5, v18, v19, v20, v21, v23
- compute_sum_sse v2, v6, v18, v19, v20, v21, v23
- compute_sum_sse v3, v7, v18, v19, v20, v21, v23
-
- variance_final v18, v19, v23, 6
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance8x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfffc
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- HProlog second_pass_8x16_pre_copy_b
-
- ;# Load up permutation constants
- load_c v29, b_0123_b, 0, r12, r0
- load_c v30, b_4567_b, 0, r12, r0
-
- hfilter_8 v0, v29, v30, 1
- hfilter_8 v1, v29, v30, 1
- hfilter_8 v2, v29, v30, 1
- hfilter_8 v3, v29, v30, 1
- hfilter_8 v4, v29, v30, 1
- hfilter_8 v5, v29, v30, 1
- hfilter_8 v6, v29, v30, 1
- hfilter_8 v7, v29, v30, 1
- hfilter_8 v8, v29, v30, 1
- hfilter_8 v9, v29, v30, 1
- hfilter_8 v10, v29, v30, 1
- hfilter_8 v11, v29, v30, 1
- hfilter_8 v12, v29, v30, 1
- hfilter_8 v13, v29, v30, 1
- hfilter_8 v14, v29, v30, 1
- hfilter_8 v15, v29, v30, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq compute_sum_sse_8x16_b
-
- hfilter_8 v16, v29, v30, 0
-
- b second_pass_8x16_b
-
-second_pass_8x16_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 1
- load_and_align_16 v9, r3, r4, 1
- load_and_align_16 v10, r3, r4, 1
- load_and_align_16 v11, r3, r4, 1
- load_and_align_16 v12, r3, r4, 1
- load_and_align_16 v13, r3, r4, 1
- load_and_align_16 v14, r3, r4, 1
- load_and_align_16 v15, r3, r4, 1
- load_and_align_16 v16, r3, r4, 0
-
- beq compute_sum_sse_8x16_b
-
-second_pass_8x16_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
- vfilter_16 v8, v9
- vfilter_16 v9, v10
- vfilter_16 v10, v11
- vfilter_16 v11, v12
- vfilter_16 v12, v13
- vfilter_16 v13, v14
- vfilter_16 v14, v15
- vfilter_16 v15, v16
-
-compute_sum_sse_8x16_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- vmrghb v0, v0, v1
- vmrghb v1, v2, v3
- vmrghb v2, v4, v5
- vmrghb v3, v6, v7
- vmrghb v4, v8, v9
- vmrghb v5, v10, v11
- vmrghb v6, v12, v13
- vmrghb v7, v14, v15
-
- load_and_align_16 v8, r7, r8, 1
- load_and_align_16 v9, r7, r8, 1
- load_and_align_16 v10, r7, r8, 1
- load_and_align_16 v11, r7, r8, 1
- load_and_align_16 v12, r7, r8, 1
- load_and_align_16 v13, r7, r8, 1
- load_and_align_16 v14, r7, r8, 1
- load_and_align_16 v15, r7, r8, 1
-
- vmrghb v8, v8, v9
- vmrghb v9, v10, v11
- vmrghb v10, v12, v13
- vmrghb v11, v14, v15
-
- compute_sum_sse v0, v8, v18, v19, v20, v21, v23
- compute_sum_sse v1, v9, v18, v19, v20, v21, v23
- compute_sum_sse v2, v10, v18, v19, v20, v21, v23
- compute_sum_sse v3, v11, v18, v19, v20, v21, v23
-
- load_and_align_16 v8, r7, r8, 1
- load_and_align_16 v9, r7, r8, 1
- load_and_align_16 v10, r7, r8, 1
- load_and_align_16 v11, r7, r8, 1
- load_and_align_16 v12, r7, r8, 1
- load_and_align_16 v13, r7, r8, 1
- load_and_align_16 v14, r7, r8, 1
- load_and_align_16 v15, r7, r8, 0
-
- vmrghb v8, v8, v9
- vmrghb v9, v10, v11
- vmrghb v10, v12, v13
- vmrghb v11, v14, v15
-
- compute_sum_sse v4, v8, v18, v19, v20, v21, v23
- compute_sum_sse v5, v9, v18, v19, v20, v21, v23
- compute_sum_sse v6, v10, v18, v19, v20, v21, v23
- compute_sum_sse v7, v11, v18, v19, v20, v21, v23
-
- variance_final v18, v19, v23, 7
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
- blr
-
-;# Filters a horizontal line
-;# expects:
-;# r3 src_ptr
-;# r4 pitch
-;# r10 16
-;# r12 32
-;# v17 perm input
-;# v18 rounding
-;# v19 shift
-;# v20 filter taps
-;# v21 tmp
-;# v22 tmp
-;# v23 tmp
-;# v24 tmp
-;# v25 tmp
-;# v26 tmp
-;# v27 tmp
-;# v28 perm output
-;#
-.macro hfilter_16 V, increment_counter
-
- lvsl v17, 0, r3 ;# permutate value for alignment
-
- ;# input to filter is 21 bytes wide, output is 16 bytes.
- ;# input can span three vectors if not aligned correctly.
- lvx v21, 0, r3
- lvx v22, r10, r3
- lvx v23, r12, r3
-
-.if \increment_counter
- add r3, r3, r4
-.endif
- vperm v21, v21, v22, v17
- vperm v22, v22, v23, v17 ;# v21 v22 = 21 input pixels left-justified
-
- ;# set 0
- vmsummbm v24, v20, v21, v18 ;# taps times elements
-
- ;# set 1
- vsldoi v23, v21, v22, 1
- vmsummbm v25, v20, v23, v18
-
- ;# set 2
- vsldoi v23, v21, v22, 2
- vmsummbm v26, v20, v23, v18
-
- ;# set 3
- vsldoi v23, v21, v22, 3
- vmsummbm v27, v20, v23, v18
-
- vpkswus v24, v24, v25 ;# v24 = 0 4 8 C 1 5 9 D (16-bit)
- vpkswus v25, v26, v27 ;# v25 = 2 6 A E 3 7 B F
-
- vsrh v24, v24, v19 ;# divide v0, v1 by 128
- vsrh v25, v25, v19
-
- vpkuhus \V, v24, v25 ;# \V = scrambled 8-bit result
- vperm \V, \V, v0, v28 ;# \V = correctly-ordered result
-.endm
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance16x8_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- HProlog second_pass_16x8_pre_copy_b
-
- hfilter_16 v0, 1
- hfilter_16 v1, 1
- hfilter_16 v2, 1
- hfilter_16 v3, 1
- hfilter_16 v4, 1
- hfilter_16 v5, 1
- hfilter_16 v6, 1
- hfilter_16 v7, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq compute_sum_sse_16x8_b
-
- hfilter_16 v8, 0
-
- b second_pass_16x8_b
-
-second_pass_16x8_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 1
-
- beq compute_sum_sse_16x8_b
-
-second_pass_16x8_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
-
-compute_sum_sse_16x8_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- compute_sum_sse_16 v0, 1
- compute_sum_sse_16 v1, 1
- compute_sum_sse_16 v2, 1
- compute_sum_sse_16 v3, 1
- compute_sum_sse_16 v4, 1
- compute_sum_sse_16 v5, 1
- compute_sum_sse_16 v6, 1
- compute_sum_sse_16 v7, 0
-
- variance_final v18, v19, v23, 7
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .align 2
-;# r3 unsigned char *src_ptr
-;# r4 int src_pixels_per_line
-;# r5 int xoffset
-;# r6 int yoffset
-;# r7 unsigned char *dst_ptr
-;# r8 int dst_pixels_per_line
-;# r9 unsigned int *sse
-;#
-;# r3 return value
-vp8_sub_pixel_variance16x16_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xffff
- ori r12, r12, 0xfff8
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1, -32(r1) ;# create space on the stack
-
- HProlog second_pass_16x16_pre_copy_b
-
- hfilter_16 v0, 1
- hfilter_16 v1, 1
- hfilter_16 v2, 1
- hfilter_16 v3, 1
- hfilter_16 v4, 1
- hfilter_16 v5, 1
- hfilter_16 v6, 1
- hfilter_16 v7, 1
- hfilter_16 v8, 1
- hfilter_16 v9, 1
- hfilter_16 v10, 1
- hfilter_16 v11, 1
- hfilter_16 v12, 1
- hfilter_16 v13, 1
- hfilter_16 v14, 1
- hfilter_16 v15, 1
-
- ;# Finished filtering main horizontal block. If there is no
- ;# vertical filtering, jump to storing the data. Otherwise
- ;# load up and filter the additional line that is needed
- ;# for the vertical filter.
- beq compute_sum_sse_16x16_b
-
- hfilter_16 v16, 0
-
- b second_pass_16x16_b
-
-second_pass_16x16_pre_copy_b:
- slwi. r6, r6, 5 ;# index into vertical filter array
-
- load_and_align_16 v0, r3, r4, 1
- load_and_align_16 v1, r3, r4, 1
- load_and_align_16 v2, r3, r4, 1
- load_and_align_16 v3, r3, r4, 1
- load_and_align_16 v4, r3, r4, 1
- load_and_align_16 v5, r3, r4, 1
- load_and_align_16 v6, r3, r4, 1
- load_and_align_16 v7, r3, r4, 1
- load_and_align_16 v8, r3, r4, 1
- load_and_align_16 v9, r3, r4, 1
- load_and_align_16 v10, r3, r4, 1
- load_and_align_16 v11, r3, r4, 1
- load_and_align_16 v12, r3, r4, 1
- load_and_align_16 v13, r3, r4, 1
- load_and_align_16 v14, r3, r4, 1
- load_and_align_16 v15, r3, r4, 1
- load_and_align_16 v16, r3, r4, 0
-
- beq compute_sum_sse_16x16_b
-
-second_pass_16x16_b:
- vspltish v20, 8
- vspltish v18, 3
- vslh v18, v20, v18 ;# 0x0040 0040 0040 0040 0040 0040 0040 0040
-
- load_vfilter v20, v21
-
- vfilter_16 v0, v1
- vfilter_16 v1, v2
- vfilter_16 v2, v3
- vfilter_16 v3, v4
- vfilter_16 v4, v5
- vfilter_16 v5, v6
- vfilter_16 v6, v7
- vfilter_16 v7, v8
- vfilter_16 v8, v9
- vfilter_16 v9, v10
- vfilter_16 v10, v11
- vfilter_16 v11, v12
- vfilter_16 v12, v13
- vfilter_16 v13, v14
- vfilter_16 v14, v15
- vfilter_16 v15, v16
-
-compute_sum_sse_16x16_b:
- vspltish v18, 0 ;# sum
- vspltish v19, 0 ;# sse
- vspltish v23, 0 ;# unpack
- li r10, 16
-
- compute_sum_sse_16 v0, 1
- compute_sum_sse_16 v1, 1
- compute_sum_sse_16 v2, 1
- compute_sum_sse_16 v3, 1
- compute_sum_sse_16 v4, 1
- compute_sum_sse_16 v5, 1
- compute_sum_sse_16 v6, 1
- compute_sum_sse_16 v7, 1
- compute_sum_sse_16 v8, 1
- compute_sum_sse_16 v9, 1
- compute_sum_sse_16 v10, 1
- compute_sum_sse_16 v11, 1
- compute_sum_sse_16 v12, 1
- compute_sum_sse_16 v13, 1
- compute_sum_sse_16 v14, 1
- compute_sum_sse_16 v15, 0
-
- variance_final v18, v19, v23, 8
-
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
- .data
-
- .align 4
-hfilter_b:
- .byte 128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0,128, 0, 0, 0
- .byte 112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0,112, 16, 0, 0
- .byte 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0, 96, 32, 0, 0
- .byte 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0, 80, 48, 0, 0
- .byte 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0, 64, 64, 0, 0
- .byte 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0, 48, 80, 0, 0
- .byte 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0, 32, 96, 0, 0
- .byte 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0, 16,112, 0, 0
-
- .align 4
-vfilter_b:
- .byte 128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128
- .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
- .byte 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48, 48
- .byte 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80, 80
- .byte 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32
- .byte 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96, 96
- .byte 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
- .byte 112,112,112,112,112,112,112,112,112,112,112,112,112,112,112,112
-
- .align 4
-b_hperm_b:
- .byte 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15
-
- .align 4
-b_0123_b:
- .byte 0, 1, 2, 3, 1, 2, 3, 4, 2, 3, 4, 5, 3, 4, 5, 6
-
- .align 4
-b_4567_b:
- .byte 4, 5, 6, 7, 5, 6, 7, 8, 6, 7, 8, 9, 7, 8, 9, 10
-
-b_hilo_b:
- .byte 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23
diff --git a/vp8/common/rtcd.c b/vp8/common/rtcd.c
index 0b371b0..ab0e9b4 100644
--- a/vp8/common/rtcd.c
+++ b/vp8/common/rtcd.c
@@ -7,15 +7,13 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#define RTCD_C
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
#include "vpx_ports/vpx_once.h"
-extern void vpx_scale_rtcd(void);
void vp8_rtcd()
{
- vpx_scale_rtcd();
once(setup_rtcd_internal);
}
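
vp8_rtcd() now only guards setup_rtcd_internal with once(); the nested vpx_scale_rtcd() call and its extern are gone. Roughly, once() behaves like the sketch below; the real vpx_ports/vpx_once.h selects a thread-safe, platform-specific implementation, so this simplified version is illustrative only:

    static void once_sketch(void (*func)(void))
    {
        static volatile int done;
        if (!done) {     /* first caller runs the init exactly once */
            func();
            done = 1;
        }
    }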
diff --git a/vp8/decoder/decodeframe.c b/vp8/decoder/decodeframe.c
index ab03c90..4f404bd 100644
--- a/vp8/decoder/decodeframe.c
+++ b/vp8/decoder/decodeframe.c
@@ -1069,7 +1069,6 @@
pc->vert_scale = clear[6] >> 6;
}
data += 7;
- clear += 7;
}
else
{
diff --git a/vp8/encoder/firstpass.c b/vp8/encoder/firstpass.c
index 85767ef..84b9fae 100644
--- a/vp8/encoder/firstpass.c
+++ b/vp8/encoder/firstpass.c
@@ -2969,7 +2969,6 @@
*/
decay_accumulator = 1.0;
boost_score = 0.0;
- loop_decay_rate = 1.00; /* Starting decay rate */
for (i = 0 ; i < cpi->twopass.frames_to_key ; i++)
{
@@ -3213,7 +3212,7 @@
int new_width = cpi->oxcf.Width;
int new_height = cpi->oxcf.Height;
- int projected_buffer_level = (int)cpi->buffer_level;
+ int projected_buffer_level;
int tmp_q;
double projected_bits_perframe;
diff --git a/vp8/encoder/picklpf.c b/vp8/encoder/picklpf.c
index f0c8f28..a55a1ea 100644
--- a/vp8/encoder/picklpf.c
+++ b/vp8/encoder/picklpf.c
@@ -142,7 +142,7 @@
int min_filter_level = get_min_filter_level(cpi, cm->base_qindex);
int max_filter_level = get_max_filter_level(cpi, cm->base_qindex);
int filt_val;
- int best_filt_val = cm->filter_level;
+ int best_filt_val;
YV12_BUFFER_CONFIG * saved_frame = cm->frame_to_show;
/* Replace unfiltered frame buffer with a new one */
@@ -274,8 +274,7 @@
int filter_step;
int filt_high = 0;
- /* Start search at previous frame filter level */
- int filt_mid = cm->filter_level;
+ int filt_mid;
int filt_low = 0;
int filt_best;
int filt_direction = 0;
diff --git a/vp8/encoder/ppc/csystemdependent.c b/vp8/encoder/ppc/csystemdependent.c
deleted file mode 100644
index 63f2357..0000000
--- a/vp8/encoder/ppc/csystemdependent.c
+++ /dev/null
@@ -1,160 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-#include "vp8/encoder/variance.h"
-#include "vp8/encoder/onyx_int.h"
-
-SADFunction *vp8_sad16x16;
-SADFunction *vp8_sad16x8;
-SADFunction *vp8_sad8x16;
-SADFunction *vp8_sad8x8;
-SADFunction *vp8_sad4x4;
-
-variance_function *vp8_variance4x4;
-variance_function *vp8_variance8x8;
-variance_function *vp8_variance8x16;
-variance_function *vp8_variance16x8;
-variance_function *vp8_variance16x16;
-
-variance_function *vp8_mse16x16;
-
-sub_pixel_variance_function *vp8_sub_pixel_variance4x4;
-sub_pixel_variance_function *vp8_sub_pixel_variance8x8;
-sub_pixel_variance_function *vp8_sub_pixel_variance8x16;
-sub_pixel_variance_function *vp8_sub_pixel_variance16x8;
-sub_pixel_variance_function *vp8_sub_pixel_variance16x16;
-
-int (*vp8_block_error)(short *coeff, short *dqcoeff);
-int (*vp8_mbblock_error)(MACROBLOCK *mb, int dc);
-
-int (*vp8_mbuverror)(MACROBLOCK *mb);
-unsigned int (*vp8_get_mb_ss)(short *);
-void (*vp8_short_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_short_fdct8x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct4x4)(short *input, short *output, int pitch);
-void (*vp8_fast_fdct8x4)(short *input, short *output, int pitch);
-void (*short_walsh4x4)(short *input, short *output, int pitch);
-
-void (*vp8_subtract_b)(BLOCK *be, BLOCKD *bd, int pitch);
-void (*vp8_subtract_mby)(short *diff, unsigned char *src, unsigned char *pred, int stride);
-void (*vp8_subtract_mbuv)(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-void (*vp8_fast_quantize_b)(BLOCK *b, BLOCKD *d);
-
-unsigned int (*vp8_get4x4sse_cs)(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
-
-// C imports
-extern int block_error_c(short *coeff, short *dqcoeff);
-extern int vp8_mbblock_error_c(MACROBLOCK *mb, int dc);
-
-extern int vp8_mbuverror_c(MACROBLOCK *mb);
-extern unsigned int vp8_get8x8var_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern void short_fdct4x4_c(short *input, short *output, int pitch);
-extern void short_fdct8x4_c(short *input, short *output, int pitch);
-extern void vp8_short_walsh4x4_c(short *input, short *output, int pitch);
-
-extern void vp8_subtract_b_c(BLOCK *be, BLOCKD *bd, int pitch);
-extern void subtract_mby_c(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void subtract_mbuv_c(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-extern void vp8_fast_quantize_b_c(BLOCK *b, BLOCKD *d);
-
-extern SADFunction sad16x16_c;
-extern SADFunction sad16x8_c;
-extern SADFunction sad8x16_c;
-extern SADFunction sad8x8_c;
-extern SADFunction sad4x4_c;
-
-extern variance_function variance16x16_c;
-extern variance_function variance8x16_c;
-extern variance_function variance16x8_c;
-extern variance_function variance8x8_c;
-extern variance_function variance4x4_c;
-extern variance_function mse16x16_c;
-
-extern sub_pixel_variance_function sub_pixel_variance4x4_c;
-extern sub_pixel_variance_function sub_pixel_variance8x8_c;
-extern sub_pixel_variance_function sub_pixel_variance8x16_c;
-extern sub_pixel_variance_function sub_pixel_variance16x8_c;
-extern sub_pixel_variance_function sub_pixel_variance16x16_c;
-
-extern unsigned int vp8_get_mb_ss_c(short *);
-extern unsigned int vp8_get4x4sse_cs_c(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride);
-
-// PPC
-extern int vp8_block_error_ppc(short *coeff, short *dqcoeff);
-
-extern void vp8_short_fdct4x4_ppc(short *input, short *output, int pitch);
-extern void vp8_short_fdct8x4_ppc(short *input, short *output, int pitch);
-
-extern void vp8_subtract_mby_ppc(short *diff, unsigned char *src, unsigned char *pred, int stride);
-extern void vp8_subtract_mbuv_ppc(short *diff, unsigned char *usrc, unsigned char *vsrc, unsigned char *pred, int stride);
-
-extern SADFunction vp8_sad16x16_ppc;
-extern SADFunction vp8_sad16x8_ppc;
-extern SADFunction vp8_sad8x16_ppc;
-extern SADFunction vp8_sad8x8_ppc;
-extern SADFunction vp8_sad4x4_ppc;
-
-extern variance_function vp8_variance16x16_ppc;
-extern variance_function vp8_variance8x16_ppc;
-extern variance_function vp8_variance16x8_ppc;
-extern variance_function vp8_variance8x8_ppc;
-extern variance_function vp8_variance4x4_ppc;
-extern variance_function vp8_mse16x16_ppc;
-
-extern sub_pixel_variance_function vp8_sub_pixel_variance4x4_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x8_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance8x16_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x8_ppc;
-extern sub_pixel_variance_function vp8_sub_pixel_variance16x16_ppc;
-
-extern unsigned int vp8_get8x8var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-extern unsigned int vp8_get16x16var_ppc(unsigned char *src_ptr, int source_stride, unsigned char *ref_ptr, int recon_stride, unsigned int *SSE, int *Sum);
-
-void vp8_cmachine_specific_config(void)
-{
- // Pure C:
- vp8_mbuverror = vp8_mbuverror_c;
- vp8_fast_quantize_b = vp8_fast_quantize_b_c;
- vp8_short_fdct4x4 = vp8_short_fdct4x4_ppc;
- vp8_short_fdct8x4 = vp8_short_fdct8x4_ppc;
- vp8_fast_fdct4x4 = vp8_short_fdct4x4_ppc;
- vp8_fast_fdct8x4 = vp8_short_fdct8x4_ppc;
- short_walsh4x4 = vp8_short_walsh4x4_c;
-
- vp8_variance4x4 = vp8_variance4x4_ppc;
- vp8_variance8x8 = vp8_variance8x8_ppc;
- vp8_variance8x16 = vp8_variance8x16_ppc;
- vp8_variance16x8 = vp8_variance16x8_ppc;
- vp8_variance16x16 = vp8_variance16x16_ppc;
- vp8_mse16x16 = vp8_mse16x16_ppc;
-
- vp8_sub_pixel_variance4x4 = vp8_sub_pixel_variance4x4_ppc;
- vp8_sub_pixel_variance8x8 = vp8_sub_pixel_variance8x8_ppc;
- vp8_sub_pixel_variance8x16 = vp8_sub_pixel_variance8x16_ppc;
- vp8_sub_pixel_variance16x8 = vp8_sub_pixel_variance16x8_ppc;
- vp8_sub_pixel_variance16x16 = vp8_sub_pixel_variance16x16_ppc;
-
- vp8_get_mb_ss = vp8_get_mb_ss_c;
- vp8_get4x4sse_cs = vp8_get4x4sse_cs_c;
-
- vp8_sad16x16 = vp8_sad16x16_ppc;
- vp8_sad16x8 = vp8_sad16x8_ppc;
- vp8_sad8x16 = vp8_sad8x16_ppc;
- vp8_sad8x8 = vp8_sad8x8_ppc;
- vp8_sad4x4 = vp8_sad4x4_ppc;
-
- vp8_block_error = vp8_block_error_ppc;
- vp8_mbblock_error = vp8_mbblock_error_c;
-
- vp8_subtract_b = vp8_subtract_b_c;
- vp8_subtract_mby = vp8_subtract_mby_ppc;
- vp8_subtract_mbuv = vp8_subtract_mbuv_ppc;
-}
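
The file above wired VP8's hot paths to PPC routines through global function pointers assigned once at startup; the build now generates this dispatch instead. A minimal sketch of that pattern, using illustrative names rather than libvpx's actual symbols:

#include <stdio.h>

typedef unsigned int (*sad_fn)(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride);

static unsigned int sad16x16_c(const unsigned char *src, int src_stride,
                               const unsigned char *ref, int ref_stride) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c) {
      const int d = src[c] - ref[c];
      sad += (unsigned int)(d < 0 ? -d : d);
    }
    src += src_stride;
    ref += ref_stride;
  }
  return sad;
}

/* One global per hot function, assigned once at startup. */
static sad_fn sad16x16 = sad16x16_c;

static void machine_specific_config(int have_simd) {
  if (!have_simd)
    sad16x16 = sad16x16_c;  /* a SIMD build would pick an optimized routine */
}

int main(void) {
  unsigned char src[16 * 16] = {0}, ref[16 * 16] = {0};
  ref[0] = 5;
  machine_specific_config(0);
  printf("%u\n", sad16x16(src, 16, ref, 16)); /* prints 5 */
  return 0;
}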
diff --git a/vp8/encoder/ppc/encodemb_altivec.asm b/vp8/encoder/ppc/encodemb_altivec.asm
deleted file mode 100644
index 6e0099d..0000000
--- a/vp8/encoder/ppc/encodemb_altivec.asm
+++ /dev/null
@@ -1,153 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_subtract_mbuv_ppc
- .globl vp8_subtract_mby_ppc
-
-;# r3 short *diff
-;# r4 unsigned char *usrc
-;# r5 unsigned char *vsrc
-;# r6 unsigned char *pred
-;# r7 int stride
-vp8_subtract_mbuv_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf000
- mtspr 256, r12 ;# set VRSAVE
-
- li r9, 256
- add r3, r3, r9
- add r3, r3, r9
- add r6, r6, r9
-
- li r10, 16
- li r9, 4
- mtctr r9
-
- vspltisw v0, 0
-
-mbu_loop:
- lvsl v5, 0, r4 ;# permutate value for alignment
- lvx v1, 0, r4 ;# src
- lvx v2, 0, r6 ;# pred
-
- add r4, r4, r7
- addi r6, r6, 16
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrghb v4, v0, v2 ;# unpack high pred to short
-
- lvsl v5, 0, r4 ;# permutate value for alignment
- lvx v1, 0, r4 ;# src
-
- add r4, r4, r7
-
- vsubshs v3, v3, v4
-
- stvx v3, 0, r3 ;# store out diff
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrglb v4, v0, v2 ;# unpack high pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, r10, r3 ;# store out diff
-
- addi r3, r3, 32
-
- bdnz mbu_loop
-
- mtctr r9
-
-mbv_loop:
- lvsl v5, 0, r5 ;# permutate value for alignment
- lvx v1, 0, r5 ;# src
- lvx v2, 0, r6 ;# pred
-
- add r5, r5, r7
- addi r6, r6, 16
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrghb v4, v0, v2 ;# unpack high pred to short
-
- lvsl v5, 0, r5 ;# permutate value for alignment
- lvx v1, 0, r5 ;# src
-
- add r5, r5, r7
-
- vsubshs v3, v3, v4
-
- stvx v3, 0, r3 ;# store out diff
-
- vperm v1, v1, v0, v5
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrglb v4, v0, v2 ;# unpack high pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, r10, r3 ;# store out diff
-
- addi r3, r3, 32
-
- bdnz mbv_loop
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
-
-;# r3 short *diff
-;# r4 unsigned char *src
-;# r5 unsigned char *pred
-;# r6 int stride
-vp8_subtract_mby_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf800
- mtspr 256, r12 ;# set VRSAVE
-
- li r10, 16
- mtctr r10
-
- vspltisw v0, 0
-
-mby_loop:
- lvx v1, 0, r4 ;# src
- lvx v2, 0, r5 ;# pred
-
- add r4, r4, r6
- addi r5, r5, 16
-
- vmrghb v3, v0, v1 ;# unpack high src to short
- vmrghb v4, v0, v2 ;# unpack high pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, 0, r3 ;# store out diff
-
- vmrglb v3, v0, v1 ;# unpack low src to short
- vmrglb v4, v0, v2 ;# unpack low pred to short
-
- vsubshs v3, v3, v4
-
- stvx v3, r10, r3 ;# store out diff
-
- addi r3, r3, 32
-
- bdnz mby_loop
-
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
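
For reference, the deleted vp8_subtract_mby_ppc computes the 16x16 luma residual, source minus prediction, sixteen bytes per AltiVec load. A plain-C sketch of the same operation (the signature details here are assumed, not copied from libvpx's C fallback):

#include <stdio.h>

static void subtract_mby(short *diff, const unsigned char *src,
                         const unsigned char *pred, int stride) {
  int r, c;
  for (r = 0; r < 16; ++r) {
    for (c = 0; c < 16; ++c)
      diff[c] = (short)(src[c] - pred[c]);
    diff += 16;
    src += stride;  /* source rows are 'stride' bytes apart */
    pred += 16;     /* prediction buffer is packed 16 bytes per row */
  }
}

int main(void) {
  unsigned char src[16 * 16], pred[16 * 16];
  short diff[256];
  int i;
  for (i = 0; i < 256; ++i) { src[i] = 128; pred[i] = 120; }
  subtract_mby(diff, src, pred, 16);
  printf("diff[0] = %d\n", diff[0]); /* prints 8 */
  return 0;
}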
diff --git a/vp8/encoder/ppc/fdct_altivec.asm b/vp8/encoder/ppc/fdct_altivec.asm
deleted file mode 100644
index 935d0cb..0000000
--- a/vp8/encoder/ppc/fdct_altivec.asm
+++ /dev/null
@@ -1,205 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_short_fdct4x4_ppc
- .globl vp8_short_fdct8x4_ppc
-
-.macro load_c V, LABEL, OFF, R0, R1
- lis \R0, \LABEL@ha
- la \R1, \LABEL@l(\R0)
- lvx \V, \OFF, \R1
-.endm
-
-;# Forward and inverse DCTs are nearly identical; only differences are
-;# in normalization (fwd is twice unitary, inv is half unitary)
-;# and that they are of course transposes of each other.
-;#
-;# The following three accomplish most of the implementation and
-;# are used only by ppc_idct.c and ppc_fdct.c.
-.macro prologue
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xfffc
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- li r6, 16
-
- load_c v0, dct_tab, 0, r9, r10
- lvx v1, r6, r10
- addi r10, r10, 32
- lvx v2, 0, r10
- lvx v3, r6, r10
-
- load_c v4, ppc_dctperm_tab, 0, r9, r10
- load_c v5, ppc_dctperm_tab, r6, r9, r10
-
- load_c v6, round_tab, 0, r10, r9
-.endm
-
-.macro epilogue
- addi r1, r1, 32 ;# recover stack
-
- mtspr 256, r11 ;# reset old VRSAVE
-.endm
-
-;# Do horiz xf on two rows of coeffs v8 = a0 a1 a2 a3 b0 b1 b2 b3.
-;# a/A are the even rows 0,2 b/B are the odd rows 1,3
-;# For fwd transform, indices are horizontal positions, then frequencies.
-;# For inverse transform, frequencies then positions.
-;# The two resulting A0..A3 B0..B3 are later combined
-;# and vertically transformed.
-
-.macro two_rows_horiz Dst
- vperm v9, v8, v8, v4 ;# v9 = a2 a3 a0 a1 b2 b3 b0 b1
-
- vmsumshm v10, v0, v8, v6
- vmsumshm v10, v1, v9, v10
- vsraw v10, v10, v7 ;# v10 = A0 A1 B0 B1
-
- vmsumshm v11, v2, v8, v6
- vmsumshm v11, v3, v9, v11
- vsraw v11, v11, v7 ;# v11 = A2 A3 B2 B3
-
- vpkuwum v10, v10, v11 ;# v10 = A0 A1 B0 B1 A2 A3 B2 B3
- vperm \Dst, v10, v10, v5 ;# Dest = A0 B0 A1 B1 A2 B2 A3 B3
-.endm
-
-;# Vertical xf on two rows. DCT values in comments are for inverse transform;
-;# forward transform uses transpose.
-
-.macro two_rows_vert Ceven, Codd
- vspltw v8, \Ceven, 0 ;# v8 = c00 c10 or c02 c12 four times
- vspltw v9, \Codd, 0 ;# v9 = c20 c30 or c22 c32 ""
- vmsumshm v8, v8, v12, v6
- vmsumshm v8, v9, v13, v8
- vsraw v10, v8, v7
-
- vspltw v8, \Codd, 1 ;# v8 = c01 c11 or c03 c13
- vspltw v9, \Ceven, 1 ;# v9 = c21 c31 or c23 c33
- vmsumshm v8, v8, v12, v6
- vmsumshm v8, v9, v13, v8
- vsraw v8, v8, v7
-
- vpkuwum v8, v10, v8 ;# v8 = rows 0,1 or 2,3
-.endm
-
-.macro two_rows_h Dest
- stw r0, 0(r8)
- lwz r0, 4(r3)
- stw r0, 4(r8)
- lwzux r0, r3,r5
- stw r0, 8(r8)
- lwz r0, 4(r3)
- stw r0, 12(r8)
- lvx v8, 0,r8
- two_rows_horiz \Dest
-.endm
-
- .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct4x4_ppc:
-
- prologue
-
- vspltisw v7, 14 ;# == 14, fits in 5 signed bits
- addi r8, r1, 0
-
-
- lwz r0, 0(r3)
- two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
-
- lwzux r0, r3, r5
- two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
-
- lvx v6, r6, r9 ;# v6 = Vround
- vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
-
- two_rows_vert v0, v1
- stvx v8, 0, r4
- two_rows_vert v2, v3
- stvx v8, r6, r4
-
- epilogue
-
- blr
-
- .align 2
-;# r3 short *input
-;# r4 short *output
-;# r5 int pitch
-vp8_short_fdct8x4_ppc:
- prologue
-
- vspltisw v7, 14 ;# == 14, fits in 5 signed bits
- addi r8, r1, 0
- addi r10, r3, 0
-
- lwz r0, 0(r3)
- two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
-
- lwzux r0, r3, r5
- two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
-
- lvx v6, r6, r9 ;# v6 = Vround
- vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
-
- two_rows_vert v0, v1
- stvx v8, 0, r4
- two_rows_vert v2, v3
- stvx v8, r6, r4
-
- ;# Next block
- addi r3, r10, 8
- addi r4, r4, 32
- lvx v6, 0, r9 ;# v6 = Hround
-
- vspltisw v7, 14 ;# == 14, fits in 5 signed bits
- addi r8, r1, 0
-
- lwz r0, 0(r3)
- two_rows_h v12 ;# v12 = H00 H10 H01 H11 H02 H12 H03 H13
-
- lwzux r0, r3, r5
- two_rows_h v13 ;# v13 = H20 H30 H21 H31 H22 H32 H23 H33
-
- lvx v6, r6, r9 ;# v6 = Vround
- vspltisw v7, -16 ;# == 16 == -16, only low 5 bits matter
-
- two_rows_vert v0, v1
- stvx v8, 0, r4
- two_rows_vert v2, v3
- stvx v8, r6, r4
-
- epilogue
-
- blr
-
- .data
- .align 4
-ppc_dctperm_tab:
- .byte 4,5,6,7, 0,1,2,3, 12,13,14,15, 8,9,10,11
- .byte 0,1,4,5, 2,3,6,7, 8,9,12,13, 10,11,14,15
-
- .align 4
-dct_tab:
- .short 23170, 23170,-12540,-30274, 23170, 23170,-12540,-30274
- .short 23170, 23170, 30274, 12540, 23170, 23170, 30274, 12540
-
- .short 23170,-23170, 30274,-12540, 23170,-23170, 30274,-12540
- .short -23170, 23170, 12540,-30274,-23170, 23170, 12540,-30274
-
- .align 4
-round_tab:
- .long (1 << (14-1)), (1 << (14-1)), (1 << (14-1)), (1 << (14-1))
- .long (1 << (16-1)), (1 << (16-1)), (1 << (16-1)), (1 << (16-1))
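
The round_tab constants are the usual add-half-before-shift rounding biases: the horizontal pass shifts by 14 and the vertical pass by 16, so each adds 1 << (n-1) first to round to nearest rather than truncate. A small worked example:

#include <stdio.h>

static int round_shift(int v, int n) {
  return (v + (1 << (n - 1))) >> n;  /* add half, then shift */
}

int main(void) {
  printf("%d vs %d\n", round_shift(6, 2), 6 >> 2);  /* 2 vs 1 */
  /* 23170 ~= sqrt(2) * 2^14; one fixed-point butterfly product: */
  printf("%d\n", round_shift(100 * 23170, 14));     /* 141 */
  return 0;
}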
diff --git a/vp8/encoder/ppc/rdopt_altivec.asm b/vp8/encoder/ppc/rdopt_altivec.asm
deleted file mode 100644
index ba48230..0000000
--- a/vp8/encoder/ppc/rdopt_altivec.asm
+++ /dev/null
@@ -1,51 +0,0 @@
-;
-; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-; Use of this source code is governed by a BSD-style license
-; that can be found in the LICENSE file in the root of the source
-; tree. An additional intellectual property rights grant can be found
-; in the file PATENTS. All contributing project authors may
-; be found in the AUTHORS file in the root of the source tree.
-;
-
-
- .globl vp8_block_error_ppc
-
- .align 2
-;# r3 short *Coeff
-;# r4 short *dqcoeff
-vp8_block_error_ppc:
- mfspr r11, 256 ;# get old VRSAVE
- oris r12, r11, 0xf800
- mtspr 256, r12 ;# set VRSAVE
-
- stwu r1,-32(r1) ;# create space on the stack
-
- stw r5, 12(r1) ;# transfer dc to vector register
-
- lvx v0, 0, r3 ;# Coeff
- lvx v1, 0, r4 ;# dqcoeff
-
- li r10, 16
-
- vspltisw v3, 0
-
- vsubshs v0, v0, v1
-
- vmsumshm v2, v0, v0, v3 ;# multiply differences
-
- lvx v0, r10, r3 ;# Coeff
- lvx v1, r10, r4 ;# dqcoeff
-
- vsubshs v0, v0, v1
-
- vmsumshm v1, v0, v0, v2 ;# multiply differences
- vsumsws v1, v1, v3 ;# sum up
-
- stvx v1, 0, r1
- lwz r3, 12(r1) ;# return value
-
- addi r1, r1, 32 ;# recover stack
- mtspr 256, r11 ;# reset old VRSAVE
-
- blr
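
The deleted vp8_block_error_ppc is a vectorized sum of squared differences between the original and dequantized coefficients of one block (16 shorts, two AltiVec loads). A scalar sketch of the same computation, with made-up test values:

#include <stdio.h>

static int block_error(const short *coeff, const short *dqcoeff) {
  int i, err = 0;
  for (i = 0; i < 16; ++i) {
    const int d = coeff[i] - dqcoeff[i];
    err += d * d;
  }
  return err;
}

int main(void) {
  const short c[16] = {10, -4, 2, 1}, dq[16] = {8, -4, 0, 0};
  printf("%d\n", block_error(c, dq)); /* 4 + 0 + 4 + 1 = 9 */
  return 0;
}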
diff --git a/vp8/encoder/rdopt.c b/vp8/encoder/rdopt.c
index 29da926..582c8bc 100644
--- a/vp8/encoder/rdopt.c
+++ b/vp8/encoder/rdopt.c
@@ -1661,7 +1661,6 @@
mv.as_mv.row = mvx[vcnt/2];
mv.as_mv.col = mvy[vcnt/2];
- find = 1;
/* sr is set to 0 to allow calling function to decide the search
* range.
*/
@@ -2293,7 +2292,6 @@
mode_mv[NEWMV].as_int = d->bmi.mv.as_int;
/* Further step/diamond searches as necessary */
- n = 0;
further_steps = (cpi->sf.max_step_search_steps - 1) - step_param;
n = num00;
@@ -2560,8 +2558,6 @@
intra_rd_penalty, cpi, x);
if (this_rd < best_mode.rd || x->skip)
{
- /* Note index of best mode so far */
- best_mode_index = mode_index;
*returnrate = rd.rate2;
*returndistortion = rd.distortion2;
update_best_mode(&best_mode, this_rd, &rd, other_cost, x);
diff --git a/vp8/encoder/x86/quantize_sse2.c b/vp8/encoder/x86/quantize_sse2.c
index 291d219..f56e646 100644
--- a/vp8/encoder/x86/quantize_sse2.c
+++ b/vp8/encoder/x86/quantize_sse2.c
@@ -35,7 +35,7 @@
void vp8_regular_quantize_b_sse2(BLOCK *b, BLOCKD *d)
{
char eob = 0;
- short *zbin_boost_ptr = b->zrun_zbin_boost;
+ short *zbin_boost_ptr;
short *qcoeff_ptr = d->qcoeff;
DECLARE_ALIGNED_ARRAY(16, short, x, 16);
DECLARE_ALIGNED_ARRAY(16, short, y, 16);
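
The hunk defers initializing zbin_boost_ptr, which tracks the zero-run boost: the quantizer's dead zone widens as the run of preceding zeroed coefficients grows, biasing late, isolated values toward zero. A toy sketch of that idea, with illustrative thresholds rather than VP8's actual tables:

#include <stdlib.h>
#include <stdio.h>

int main(void) {
  const short zbin = 24;                     /* base dead-zone */
  const short zrun_zbin_boost[4] = {0, 8, 16, 32};
  const short coeff[6] = {40, 0, 0, 27, 0, 26};
  int i, zero_run = 0;
  for (i = 0; i < 6; ++i) {
    const int boost = zrun_zbin_boost[zero_run < 3 ? zero_run : 3];
    if (abs(coeff[i]) >= zbin + boost) {
      printf("keep coeff[%d]=%d (threshold %d)\n", i, coeff[i], zbin + boost);
      zero_run = 0;
    } else {
      ++zero_run;                            /* quantized to zero */
    }
  }
  return 0;
}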
diff --git a/vp8/vp8_cx_iface.c b/vp8/vp8_cx_iface.c
index 6768ffd..a5dfd07 100644
--- a/vp8/vp8_cx_iface.c
+++ b/vp8/vp8_cx_iface.c
@@ -10,7 +10,8 @@
#include "./vpx_config.h"
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_scale_rtcd.h"
#include "vpx/vpx_codec.h"
#include "vpx/internal/vpx_codec_internal.h"
#include "vpx_version.h"
@@ -649,6 +650,7 @@
vp8_rtcd();
+ vpx_scale_rtcd();
if (!ctx->priv)
{
@@ -861,9 +863,6 @@
if (!ctx->cfg.rc_target_bitrate)
return res;
- if (!ctx->cfg.rc_target_bitrate)
- return res;
-
if (img)
res = validate_img(ctx, img);
diff --git a/vp8/vp8_dx_iface.c b/vp8/vp8_dx_iface.c
index 67a0fef..e0eb30a 100644
--- a/vp8/vp8_dx_iface.c
+++ b/vp8/vp8_dx_iface.c
@@ -11,7 +11,8 @@
#include <stdlib.h>
#include <string.h>
-#include "vp8_rtcd.h"
+#include "./vp8_rtcd.h"
+#include "./vpx_scale_rtcd.h"
#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"
#include "vpx/internal/vpx_codec_internal.h"
@@ -106,6 +107,7 @@
(void) data;
vp8_rtcd();
+ vpx_scale_rtcd();
/* This function only allocates space for the vpx_codec_alg_priv_t
* structure. More memory may be required at the time the stream
diff --git a/vp9/common/mips/msa/vp9_macros_msa.h b/vp9/common/mips/msa/vp9_macros_msa.h
new file mode 100644
index 0000000..17ad33a
--- /dev/null
+++ b/vp9/common/mips/msa/vp9_macros_msa.h
@@ -0,0 +1,692 @@
+/*
+ * Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
+#define VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_
+
+#include <msa.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#if HAVE_MSA
+/* load macros */
+#define LOAD_UB(psrc) *((const v16u8 *)(psrc))
+#define LOAD_SB(psrc) *((const v16i8 *)(psrc))
+#define LOAD_UH(psrc) *((const v8u16 *)(psrc))
+#define LOAD_SH(psrc) *((const v8i16 *)(psrc))
+#define LOAD_UW(psrc) *((const v4u32 *)(psrc))
+#define LOAD_SW(psrc) *((const v4i32 *)(psrc))
+#define LOAD_UD(psrc) *((const v2u64 *)(psrc))
+#define LOAD_SD(psrc) *((const v2i64 *)(psrc))
+
+/* store macros */
+#define STORE_UB(vec, pdest) *((v16u8 *)(pdest)) = (vec)
+#define STORE_SB(vec, pdest) *((v16i8 *)(pdest)) = (vec)
+#define STORE_UH(vec, pdest) *((v8u16 *)(pdest)) = (vec)
+#define STORE_SH(vec, pdest) *((v8i16 *)(pdest)) = (vec)
+#define STORE_UW(vec, pdest) *((v4u32 *)(pdest)) = (vec)
+#define STORE_SW(vec, pdest) *((v4i32 *)(pdest)) = (vec)
+#define STORE_UD(vec, pdest) *((v2u64 *)(pdest)) = (vec)
+#define STORE_SD(vec, pdest) *((v2i64 *)(pdest)) = (vec)
+
+#if (__mips_isa_rev >= 6)
+#define LOAD_WORD(psrc) ({ \
+ const uint8_t *src_m = (const uint8_t *)(psrc); \
+ uint32_t val_m; \
+ \
+ __asm__ __volatile__ ( \
+ "lw %[val_m], %[src_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [src_m] "m" (*src_m) \
+ ); \
+ \
+ val_m; \
+})
+
+#if (__mips == 64)
+#define LOAD_DWORD(psrc) ({ \
+ const uint8_t *src_m = (const uint8_t *)(psrc); \
+ uint64_t val_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "ld %[val_m], %[src_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [src_m] "m" (*src_m) \
+ ); \
+ \
+ val_m; \
+})
+#else // !(__mips == 64)
+#define LOAD_DWORD(psrc) ({ \
+ const uint8_t *src1_m = (const uint8_t *)(psrc); \
+ const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \
+ uint32_t val0_m, val1_m; \
+ uint64_t genval_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "lw %[val0_m], %[src1_m] \n\t" \
+ \
+ : [val0_m] "=r" (val0_m) \
+ : [src1_m] "m" (*src1_m) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "lw %[val1_m], %[src2_m] \n\t" \
+ \
+ : [val1_m] "=r" (val1_m) \
+ : [src2_m] "m" (*src2_m) \
+ ); \
+ \
+ genval_m = (uint64_t)(val1_m); \
+ genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \
+ genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \
+ \
+ genval_m; \
+})
+#endif // (__mips == 64)
+#define STORE_WORD_WITH_OFFSET_1(pdst, val) { \
+ uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "sw %[val_m], %[dst_ptr_m] \n\t" \
+ \
+ : [dst_ptr_m] "=m" (*dst_ptr_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
+#define STORE_WORD(pdst, val) { \
+ uint8_t *dst_ptr_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "sw %[val_m], %[dst_ptr_m] \n\t" \
+ \
+ : [dst_ptr_m] "=m" (*dst_ptr_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
+#define STORE_DWORD(pdst, val) { \
+ uint8_t *dst_ptr_m = (uint8_t *)(pdst); \
+ const uint64_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "sd %[val_m], %[dst_ptr_m] \n\t" \
+ \
+ : [dst_ptr_m] "=m" (*dst_ptr_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+#else // !(__mips_isa_rev >= 6)
+#define LOAD_WORD(psrc) ({ \
+ const uint8_t *src_m = (const uint8_t *)(psrc); \
+ uint32_t val_m; \
+ \
+ __asm__ __volatile__ ( \
+ "ulw %[val_m], %[src_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [src_m] "m" (*src_m) \
+ ); \
+ \
+ val_m; \
+})
+
+#if (__mips == 64)
+#define LOAD_DWORD(psrc) ({ \
+ const uint8_t *src_m = (const uint8_t *)(psrc); \
+ uint64_t val_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "uld %[val_m], %[src_m] \n\t" \
+ \
+ : [val_m] "=r" (val_m) \
+ : [src_m] "m" (*src_m) \
+ ); \
+ \
+ val_m; \
+})
+#else // !(__mips == 64)
+#define LOAD_DWORD(psrc) ({ \
+ const uint8_t *src1_m = (const uint8_t *)(psrc); \
+ const uint8_t *src2_m = ((const uint8_t *)(psrc)) + 4; \
+ uint32_t val0_m, val1_m; \
+ uint64_t genval_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "ulw %[val0_m], %[src1_m] \n\t" \
+ \
+ : [val0_m] "=r" (val0_m) \
+ : [src1_m] "m" (*src1_m) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "ulw %[val1_m], %[src2_m] \n\t" \
+ \
+ : [val1_m] "=r" (val1_m) \
+ : [src2_m] "m" (*src2_m) \
+ ); \
+ \
+ genval_m = (uint64_t)(val1_m); \
+ genval_m = (uint64_t)((genval_m << 32) & 0xFFFFFFFF00000000); \
+ genval_m = (uint64_t)(genval_m | (uint64_t)val0_m); \
+ \
+ genval_m; \
+})
+#endif // (__mips == 64)
+
+#define STORE_WORD_WITH_OFFSET_1(pdst, val) { \
+ uint8_t *dst_ptr_m = ((uint8_t *)(pdst)) + 1; \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "usw %[val_m], %[dst_ptr_m] \n\t" \
+ \
+ : [dst_ptr_m] "=m" (*dst_ptr_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
+#define STORE_WORD(pdst, val) { \
+ uint8_t *dst_ptr_m = (uint8_t *)(pdst); \
+ const uint32_t val_m = (val); \
+ \
+ __asm__ __volatile__ ( \
+ "usw %[val_m], %[dst_ptr_m] \n\t" \
+ \
+ : [dst_ptr_m] "=m" (*dst_ptr_m) \
+ : [val_m] "r" (val_m) \
+ ); \
+}
+
+#define STORE_DWORD(pdst, val) { \
+ uint8_t *dst1_m = (uint8_t *)(pdst); \
+ uint8_t *dst2_m = ((uint8_t *)(pdst)) + 4; \
+ uint32_t val0_m, val1_m; \
+ \
+ val0_m = (uint32_t)((val) & 0x00000000FFFFFFFF); \
+ val1_m = (uint32_t)(((val) >> 32) & 0x00000000FFFFFFFF); \
+ \
+ __asm__ __volatile__ ( \
+ "usw %[val0_m], %[dst1_m] \n\t" \
+ "usw %[val1_m], %[dst2_m] \n\t" \
+ \
+ : [dst1_m] "=m" (*dst1_m), [dst2_m] "=m" (*dst2_m) \
+ : [val0_m] "r" (val0_m), [val1_m] "r" (val1_m) \
+ ); \
+}
+#endif // (__mips_isa_rev >= 6)
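
The two halves of this conditional differ only in opcode: MIPS r6 lw/ld/sw/sd tolerate unaligned addresses directly, while pre-r6 cores need the ulw/usw forms. A portable C equivalent of LOAD_WORD/LOAD_DWORD is simply memcpy, which compilers lower to whichever instruction the target supports; a sketch:

#include <stdint.h>
#include <string.h>
#include <stdio.h>

static uint32_t load_word(const void *p) {
  uint32_t v;
  memcpy(&v, p, sizeof(v));  /* safe for any alignment */
  return v;
}

static uint64_t load_dword(const void *p) {
  uint64_t v;
  memcpy(&v, p, sizeof(v));
  return v;
}

int main(void) {
  uint8_t buf[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
  printf("%08x\n", load_word(buf + 1));  /* unaligned by one byte */
  printf("%016llx\n", (unsigned long long)load_dword(buf + 3));
  return 0;
}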
+
+#define LOAD_2VECS_UB(psrc, stride, \
+ val0, val1) { \
+ val0 = LOAD_UB(psrc + 0 * stride); \
+ val1 = LOAD_UB(psrc + 1 * stride); \
+}
+
+#define LOAD_4VECS_UB(psrc, stride, \
+ val0, val1, val2, val3) { \
+ val0 = LOAD_UB(psrc + 0 * stride); \
+ val1 = LOAD_UB(psrc + 1 * stride); \
+ val2 = LOAD_UB(psrc + 2 * stride); \
+ val3 = LOAD_UB(psrc + 3 * stride); \
+}
+
+#define LOAD_4VECS_SB(psrc, stride, \
+ val0, val1, val2, val3) { \
+ val0 = LOAD_SB(psrc + 0 * stride); \
+ val1 = LOAD_SB(psrc + 1 * stride); \
+ val2 = LOAD_SB(psrc + 2 * stride); \
+ val3 = LOAD_SB(psrc + 3 * stride); \
+}
+
+#define LOAD_5VECS_UB(psrc, stride, \
+ out0, out1, out2, out3, out4) { \
+ LOAD_4VECS_UB((psrc), (stride), \
+ (out0), (out1), (out2), (out3)); \
+ out4 = LOAD_UB(psrc + 4 * stride); \
+}
+
+#define LOAD_5VECS_SB(psrc, stride, \
+ out0, out1, out2, out3, out4) { \
+ LOAD_4VECS_SB((psrc), (stride), \
+ (out0), (out1), (out2), (out3)); \
+ out4 = LOAD_SB(psrc + 4 * stride); \
+}
+
+#define LOAD_7VECS_SB(psrc, stride, \
+ val0, val1, val2, val3, \
+ val4, val5, val6) { \
+ val0 = LOAD_SB((psrc) + 0 * (stride)); \
+ val1 = LOAD_SB((psrc) + 1 * (stride)); \
+ val2 = LOAD_SB((psrc) + 2 * (stride)); \
+ val3 = LOAD_SB((psrc) + 3 * (stride)); \
+ val4 = LOAD_SB((psrc) + 4 * (stride)); \
+ val5 = LOAD_SB((psrc) + 5 * (stride)); \
+ val6 = LOAD_SB((psrc) + 6 * (stride)); \
+}
+
+#define LOAD_8VECS_UB(psrc, stride, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7) { \
+ LOAD_4VECS_UB((psrc), (stride), \
+ (out0), (out1), (out2), (out3)); \
+ LOAD_4VECS_UB((psrc + 4 * stride), (stride), \
+ (out4), (out5), (out6), (out7)); \
+}
+
+#define LOAD_8VECS_SB(psrc, stride, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7) { \
+ LOAD_4VECS_SB((psrc), (stride), \
+ (out0), (out1), (out2), (out3)); \
+ LOAD_4VECS_SB((psrc + 4 * stride), (stride), \
+ (out4), (out5), (out6), (out7)); \
+}
+
+#define STORE_4VECS_UB(dst_out, pitch, \
+ in0, in1, in2, in3) { \
+ STORE_UB((in0), (dst_out)); \
+ STORE_UB((in1), ((dst_out) + (pitch))); \
+ STORE_UB((in2), ((dst_out) + 2 * (pitch))); \
+ STORE_UB((in3), ((dst_out) + 3 * (pitch))); \
+}
+
+#define STORE_8VECS_UB(dst_out, pitch_in, \
+ in0, in1, in2, in3, \
+ in4, in5, in6, in7) { \
+ STORE_4VECS_UB(dst_out, pitch_in, \
+ in0, in1, in2, in3); \
+ STORE_4VECS_UB((dst_out + 4 * (pitch_in)), pitch_in, \
+ in4, in5, in6, in7); \
+}
+
+#define VEC_INSERT_4W_UB(src, src0, src1, src2, src3) { \
+ src = (v16u8)__msa_insert_w((v4i32)(src), 0, (src0)); \
+ src = (v16u8)__msa_insert_w((v4i32)(src), 1, (src1)); \
+ src = (v16u8)__msa_insert_w((v4i32)(src), 2, (src2)); \
+ src = (v16u8)__msa_insert_w((v4i32)(src), 3, (src3)); \
+}
+
+#define VEC_INSERT_2DW_UB(src, src0, src1) { \
+ src = (v16u8)__msa_insert_d((v2i64)(src), 0, (src0)); \
+ src = (v16u8)__msa_insert_d((v2i64)(src), 1, (src1)); \
+}
+
+/* interleave macros */
+/* no in-place support */
+#define ILV_B_LRLR_UB(in0, in1, in2, in3, \
+ out0, out1, out2, out3) { \
+ out0 = (v16u8)__msa_ilvl_b((v16i8)(in1), (v16i8)(in0)); \
+ out1 = (v16u8)__msa_ilvr_b((v16i8)(in1), (v16i8)(in0)); \
+ out2 = (v16u8)__msa_ilvl_b((v16i8)(in3), (v16i8)(in2)); \
+ out3 = (v16u8)__msa_ilvr_b((v16i8)(in3), (v16i8)(in2)); \
+}
+
+#define ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1) { \
+ out0 = (v16u8)__msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \
+ out1 = (v16u8)__msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \
+}
+
+#define ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1) { \
+ out0 = __msa_ilvr_b((v16i8)(in0_l), (v16i8)(in0_r)); \
+ out1 = __msa_ilvr_b((v16i8)(in1_l), (v16i8)(in1_r)); \
+}
+
+#define ILVR_B_4VECS_UB(in0_r, in1_r, in2_r, in3_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ out0, out1, out2, out3) { \
+ ILVR_B_2VECS_UB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVR_B_2VECS_UB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+}
+
+#define ILVR_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ out0, out1, out2, out3) { \
+ ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+}
+
+#define ILVR_B_6VECS_SB(in0_r, in1_r, in2_r, \
+ in3_r, in4_r, in5_r, \
+ in0_l, in1_l, in2_l, \
+ in3_l, in4_l, in5_l, \
+ out0, out1, out2, \
+ out3, out4, out5) { \
+ ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+ ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
+ out4, out5); \
+}
+
+#define ILVR_B_8VECS_SB(in0_r, in1_r, in2_r, in3_r, \
+ in4_r, in5_r, in6_r, in7_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ in4_l, in5_l, in6_l, in7_l, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7) { \
+ ILVR_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVR_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+ ILVR_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
+ out4, out5); \
+ ILVR_B_2VECS_SB(in6_r, in7_r, in6_l, in7_l, \
+ out6, out7); \
+}
+
+#define ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1) { \
+ out0 = __msa_ilvl_b((v16i8)(in0_l), (v16i8)(in0_r)); \
+ out1 = __msa_ilvl_b((v16i8)(in1_l), (v16i8)(in1_r)); \
+}
+
+#define ILVL_B_4VECS_SB(in0_r, in1_r, in2_r, in3_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ out0, out1, out2, out3) { \
+ ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+}
+
+#define ILVL_B_6VECS_SB(in0_r, in1_r, in2_r, \
+ in3_r, in4_r, in5_r, \
+ in0_l, in1_l, in2_l, \
+ in3_l, in4_l, in5_l, \
+ out0, out1, out2, \
+ out3, out4, out5) { \
+ ILVL_B_2VECS_SB(in0_r, in1_r, in0_l, in1_l, \
+ out0, out1); \
+ ILVL_B_2VECS_SB(in2_r, in3_r, in2_l, in3_l, \
+ out2, out3); \
+ ILVL_B_2VECS_SB(in4_r, in5_r, in4_l, in5_l, \
+ out4, out5); \
+}
+
+#define ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
+ out1, in1_l, in1_r) { \
+ out0 = (v16i8)__msa_ilvr_d((v2i64)(in0_l), (v2i64)(in0_r)); \
+ out1 = (v16i8)__msa_ilvr_d((v2i64)(in1_l), (v2i64)(in1_r)); \
+}
+
+#define ILVR_D_3VECS_SB(out0, in0_l, in0_r, \
+ out1, in1_l, in1_r, \
+ out2, in2_l, in2_r) { \
+ ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
+ out1, in1_l, in1_r); \
+ out2 = (v16i8)__msa_ilvr_d((v2i64)(in2_l), (v2i64)(in2_r)); \
+}
+
+#define ILVR_D_4VECS_SB(out0, in0_l, in0_r, \
+ out1, in1_l, in1_r, \
+ out2, in2_l, in2_r, \
+ out3, in3_l, in3_r) { \
+ ILVR_D_2VECS_SB(out0, in0_l, in0_r, \
+ out1, in1_l, in1_r); \
+ ILVR_D_2VECS_SB(out2, in2_l, in2_r, \
+ out3, in3_l, in3_r); \
+}
+
+#define XORI_B_2VECS_UB(val0, val1, \
+ out0, out1, xor_val) { \
+ out0 = __msa_xori_b((v16u8)(val0), (xor_val)); \
+ out1 = __msa_xori_b((v16u8)(val1), (xor_val)); \
+}
+
+#define XORI_B_2VECS_SB(val0, val1, \
+ out0, out1, xor_val) { \
+ out0 = (v16i8)__msa_xori_b((v16u8)(val0), (xor_val)); \
+ out1 = (v16i8)__msa_xori_b((v16u8)(val1), (xor_val)); \
+}
+
+#define XORI_B_3VECS_SB(val0, val1, val2, \
+ out0, out1, out2, xor_val) { \
+ XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \
+ out2 = (v16i8)__msa_xori_b((v16u8)(val2), (xor_val)); \
+}
+
+#define XORI_B_4VECS_UB(val0, val1, val2, val3, \
+ out0, out1, out2, out3, \
+ xor_val) { \
+ XORI_B_2VECS_UB(val0, val1, out0, out1, xor_val); \
+ XORI_B_2VECS_UB(val2, val3, out2, out3, xor_val); \
+}
+
+#define XORI_B_4VECS_SB(val0, val1, val2, val3, \
+ out0, out1, out2, out3, \
+ xor_val) { \
+ XORI_B_2VECS_SB(val0, val1, out0, out1, xor_val); \
+ XORI_B_2VECS_SB(val2, val3, out2, out3, xor_val); \
+}
+
+#define XORI_B_7VECS_SB(val0, val1, val2, val3, \
+ val4, val5, val6, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, \
+ xor_val) { \
+ XORI_B_4VECS_SB(val0, val1, val2, val3, \
+ out0, out1, out2, out3, xor_val); \
+ XORI_B_3VECS_SB(val4, val5, val6, \
+ out4, out5, out6, xor_val); \
+}
+
+#define SRARI_H_4VECS_UH(val0, val1, val2, val3, \
+ out0, out1, out2, out3, \
+ shift_right_val) { \
+ out0 = (v8u16)__msa_srari_h((v8i16)(val0), (shift_right_val)); \
+ out1 = (v8u16)__msa_srari_h((v8i16)(val1), (shift_right_val)); \
+ out2 = (v8u16)__msa_srari_h((v8i16)(val2), (shift_right_val)); \
+ out3 = (v8u16)__msa_srari_h((v8i16)(val3), (shift_right_val)); \
+}
+
+#define SRARI_SATURATE_UNSIGNED_H(input, right_shift_val, sat_val) ({ \
+ v8u16 out_m; \
+ \
+ out_m = (v8u16)__msa_srari_h((v8i16)(input), (right_shift_val)); \
+ out_m = __msa_sat_u_h(out_m, (sat_val)); \
+ out_m; \
+})
+
+#define SRARI_SATURATE_SIGNED_H(input, right_shift_val, sat_val) ({ \
+ v8i16 out_m; \
+ \
+ out_m = __msa_srari_h((v8i16)(input), (right_shift_val)); \
+ out_m = __msa_sat_s_h(out_m, (sat_val)); \
+ out_m; \
+})
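
The SRARI_SATURATE_* macros pair a rounding arithmetic right shift (srari) with a saturation (sat_u/sat_s). A plain-C model of the unsigned form; the exact bit-width convention of the MSA saturate immediate is an assumption here, not taken from the ISA manual:

#include <stdio.h>
#include <stdint.h>

static uint16_t srari_sat_u(int32_t v, int shift, int sat_bits) {
  int32_t r = (v + (1 << (shift - 1))) >> shift;  /* rounded shift */
  const int32_t max = (1 << sat_bits) - 1;
  if (r < 0) r = 0;
  if (r > max) r = max;
  return (uint16_t)r;
}

int main(void) {
  printf("%u\n", srari_sat_u(1000, 2, 7));  /* 250, clamped to 127 */
  printf("%u\n", srari_sat_u(97, 2, 7));    /* (97 + 2) >> 2 = 24 */
  return 0;
}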
+
+#define PCKEV_2B_XORI128_STORE_4_BYTES_4(in1, in2, \
+ pdst, stride) { \
+ uint32_t out0_m, out1_m, out2_m, out3_m; \
+ v16i8 tmp0_m; \
+ uint8_t *dst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
+ tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \
+ \
+ out0_m = __msa_copy_u_w((v4i32)tmp0_m, 0); \
+ out1_m = __msa_copy_u_w((v4i32)tmp0_m, 1); \
+ out2_m = __msa_copy_u_w((v4i32)tmp0_m, 2); \
+ out3_m = __msa_copy_u_w((v4i32)tmp0_m, 3); \
+ \
+ STORE_WORD(dst_m, out0_m); \
+ dst_m += stride; \
+ STORE_WORD(dst_m, out1_m); \
+ dst_m += stride; \
+ STORE_WORD(dst_m, out2_m); \
+ dst_m += stride; \
+ STORE_WORD(dst_m, out3_m); \
+}
+
+#define PCKEV_B_4_XORI128_STORE_8_BYTES_4(in1, in2, \
+ in3, in4, \
+ pdst, stride) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ v16i8 tmp0_m, tmp1_m; \
+ uint8_t *dst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
+ tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
+ \
+ tmp0_m = (v16i8)__msa_xori_b((v16u8)tmp0_m, 128); \
+ tmp1_m = (v16i8)__msa_xori_b((v16u8)tmp1_m, 128); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
+ out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
+ out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
+ out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
+ \
+ STORE_DWORD(dst_m, out0_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out1_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out2_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out3_m); \
+}
+
+/* Only for signed vecs */
+#define PCKEV_B_XORI128_STORE_VEC(in1, in2, pdest) { \
+ v16i8 tmp_m; \
+ \
+ tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
+ tmp_m = (v16i8)__msa_xori_b((v16u8)tmp_m, 128); \
+ STORE_SB(tmp_m, (pdest)); \
+}
+
+/* Only for signed vecs */
+#define PCKEV_B_4_XORI128_AVG_STORE_8_BYTES_4(in1, dst0, \
+ in2, dst1, \
+ in3, dst2, \
+ in4, dst3, \
+ pdst, stride) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ uint8_t *dst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
+ tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
+ \
+ tmp2_m = (v16u8)__msa_ilvr_d((v2i64)(dst1), (v2i64)(dst0)); \
+ tmp3_m = (v16u8)__msa_ilvr_d((v2i64)(dst3), (v2i64)(dst2)); \
+ \
+ tmp0_m = __msa_xori_b(tmp0_m, 128); \
+ tmp1_m = __msa_xori_b(tmp1_m, 128); \
+ \
+ tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \
+ tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
+ out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
+ out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
+ out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
+ \
+ STORE_DWORD(dst_m, out0_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out1_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out2_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out3_m); \
+}
+
+/* Only for signed vecs */
+#define PCKEV_B_XORI128_AVG_STORE_VEC(in1, in2, dst, pdest) { \
+ v16u8 tmp_m; \
+ \
+ tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
+ tmp_m = __msa_xori_b(tmp_m, 128); \
+ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \
+ STORE_UB(tmp_m, (pdest)); \
+}
+
+#define PCKEV_B_STORE_8_BYTES_4(in1, in2, in3, in4, \
+ pdst, stride) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ v16i8 tmp0_m, tmp1_m; \
+ uint8_t *dst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = __msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
+ tmp1_m = __msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
+ out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
+ out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
+ out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
+ \
+ STORE_DWORD(dst_m, out0_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out1_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out2_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out3_m); \
+}
+
+/* Only for unsigned vecs */
+#define PCKEV_B_STORE_VEC(in1, in2, pdest) { \
+ v16i8 tmp_m; \
+ \
+ tmp_m = __msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
+ STORE_SB(tmp_m, (pdest)); \
+}
+
+#define PCKEV_B_AVG_STORE_8_BYTES_4(in1, dst0, in2, dst1, \
+ in3, dst2, in4, dst3, \
+ pdst, stride) { \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ uint8_t *dst_m = (uint8_t *)(pdst); \
+ \
+ tmp0_m = (v16u8)__msa_pckev_b((v16i8)(in2), (v16i8)(in1)); \
+ tmp1_m = (v16u8)__msa_pckev_b((v16i8)(in4), (v16i8)(in3)); \
+ \
+ tmp2_m = (v16u8)__msa_pckev_d((v2i64)(dst1), (v2i64)(dst0)); \
+ tmp3_m = (v16u8)__msa_pckev_d((v2i64)(dst3), (v2i64)(dst2)); \
+ \
+ tmp0_m = __msa_aver_u_b(tmp0_m, tmp2_m); \
+ tmp1_m = __msa_aver_u_b(tmp1_m, tmp3_m); \
+ \
+ out0_m = __msa_copy_u_d((v2i64)tmp0_m, 0); \
+ out1_m = __msa_copy_u_d((v2i64)tmp0_m, 1); \
+ out2_m = __msa_copy_u_d((v2i64)tmp1_m, 0); \
+ out3_m = __msa_copy_u_d((v2i64)tmp1_m, 1); \
+ \
+ STORE_DWORD(dst_m, out0_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out1_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out2_m); \
+ dst_m += stride; \
+ STORE_DWORD(dst_m, out3_m); \
+}
+
+#define PCKEV_B_AVG_STORE_VEC(in1, in2, dst, pdest) { \
+ v16u8 tmp_m; \
+ \
+ tmp_m = (v16u8)__msa_pckev_b((v16i8)(in1), (v16i8)(in2)); \
+ tmp_m = __msa_aver_u_b(tmp_m, (v16u8)(dst)); \
+ STORE_UB(tmp_m, (pdest)); \
+}
+#endif /* HAVE_MSA */
+#endif /* VP9_COMMON_MIPS_MSA_VP9_MACROS_MSA_H_ */
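
Most of the pack/interleave macros in this header bottom out in __msa_ilvr_b, the "interleave right" of two 16-byte vectors. For intuition, a plain-C model of that one instruction; the little-endian lane ordering used below is the usual MSA convention and an assumption here:

#include <stdio.h>

static void ilvr_b(const unsigned char *l, const unsigned char *r,
                   unsigned char *out) {
  int i;
  for (i = 0; i < 8; ++i) {
    out[2 * i] = r[i];      /* right operand fills even lanes */
    out[2 * i + 1] = l[i];  /* left operand fills odd lanes */
  }
}

int main(void) {
  unsigned char l[16], r[16], out[16];
  int i;
  for (i = 0; i < 16; ++i) {
    l[i] = (unsigned char)(i + 100);
    r[i] = (unsigned char)i;
  }
  ilvr_b(l, r, out);
  for (i = 0; i < 16; ++i)
    printf("%d ", out[i]);  /* 0 100 1 101 2 102 ... */
  printf("\n");
  return 0;
}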
diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 77a8709..7cdfaec 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c
@@ -15,6 +15,18 @@
#include "vpx_mem/vpx_mem.h"
#include "vpx/vpx_integer.h"
+// Unconstrained Node Tree
+const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
+ 2, 6, // 0 = LOW_VAL
+ -TWO_TOKEN, 4, // 1 = TWO
+ -THREE_TOKEN, -FOUR_TOKEN, // 2 = THREE
+ 8, 10, // 3 = HIGH_LOW
+ -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 4 = CAT_ONE
+ 12, 14, // 5 = CAT_THREEFOUR
+ -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 6 = CAT_THREE
+ -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE
+};
+
const vp9_prob vp9_cat1_prob[] = { 159 };
const vp9_prob vp9_cat2_prob[] = { 165, 145 };
const vp9_prob vp9_cat3_prob[] = { 173, 148, 140 };
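
The vp9_coef_con_tree added above uses libvpx's implicit binary-tree encoding: entries come in (left, right) pairs, a non-negative entry is the offset of the next pair, and a negative entry is minus the token at a leaf. A sketch of the walk that vp9_read_tree performs, with a canned bit source standing in for the arithmetic decoder and small token ids in place of the CATEGORY tokens:

#include <stdint.h>
#include <stdio.h>

typedef int16_t tree_index;  /* same width as vp9_tree_index */

static int read_tree(const tree_index *tree, const int *bits, int *pos) {
  tree_index i = 0;
  do {
    i = tree[i + bits[(*pos)++]];  /* take left (0) or right (1) child */
  } while (i > 0);
  return -i;                        /* leaf stores the negated token */
}

int main(void) {
  /* Same shape as vp9_coef_con_tree, with toy token ids 2..10. */
  static const tree_index tree[16] = {
    2, 6, -2, 4, -3, -4, 8, 10, -5, -6, 12, 14, -7, -8, -9, -10
  };
  const int bits[3] = {1, 0, 0};  /* right, left, left */
  int pos = 0;
  printf("token %d\n", read_tree(tree, bits, &pos)); /* token 5 */
  return 0;
}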
diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 4eb2e64..eee096f 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h
@@ -173,6 +173,7 @@
#define PIVOT_NODE 2 // which node is pivot
#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
+extern const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
extern const vp9_prob vp9_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
diff --git a/vp9/common/vp9_postproc.c b/vp9/common/vp9_postproc.c
index bb21ade..0fbc2d2 100644
--- a/vp9/common/vp9_postproc.c
+++ b/vp9/common/vp9_postproc.c
@@ -91,10 +91,7 @@
int flimit) {
uint8_t const *p_src;
uint8_t *p_dst;
- int row;
- int col;
- int i;
- int v;
+ int row, col, i, v, kernel;
int pitch = src_pixels_per_line;
uint8_t d[8];
(void)dst_pixels_per_line;
@@ -105,8 +102,8 @@
p_dst = dst_ptr;
for (col = 0; col < cols; col++) {
- int kernel = 4;
- int v = p_src[col];
+ kernel = 4;
+ v = p_src[col];
for (i = -2; i <= 2; i++) {
if (abs(v - p_src[col + i * pitch]) > flimit)
@@ -128,7 +125,7 @@
d[i] = p_src[i];
for (col = 0; col < cols; col++) {
- int kernel = 4;
+ kernel = 4;
v = p_src[col];
d[col & 7] = v;
@@ -168,10 +165,7 @@
int flimit) {
uint16_t const *p_src;
uint16_t *p_dst;
- int row;
- int col;
- int i;
- int v;
+ int row, col, i, v, kernel;
int pitch = src_pixels_per_line;
uint16_t d[8];
@@ -181,8 +175,8 @@
p_dst = dst_ptr;
for (col = 0; col < cols; col++) {
- int kernel = 4;
- int v = p_src[col];
+ kernel = 4;
+ v = p_src[col];
for (i = -2; i <= 2; i++) {
if (abs(v - p_src[col + i * pitch]) > flimit)
@@ -205,7 +199,7 @@
d[i] = p_src[i];
for (col = 0; col < cols; col++) {
- int kernel = 4;
+ kernel = 4;
v = p_src[col];
d[col & 7] = v;
@@ -518,22 +512,24 @@
assert((src->flags & YV12_FLAG_HIGHBITDEPTH) ==
(dst->flags & YV12_FLAG_HIGHBITDEPTH));
if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
- const uint16_t *const src = CONVERT_TO_SHORTPTR(srcs[i] + 2 * src_stride
- + 2);
- uint16_t *const dst = CONVERT_TO_SHORTPTR(dsts[i] + 2 * dst_stride + 2);
- vp9_highbd_post_proc_down_and_across(src, dst, src_stride, dst_stride,
- src_height, src_width, ppl);
+ const uint16_t *const src_plane = CONVERT_TO_SHORTPTR(
+ srcs[i] + 2 * src_stride + 2);
+ uint16_t *const dst_plane = CONVERT_TO_SHORTPTR(
+ dsts[i] + 2 * dst_stride + 2);
+ vp9_highbd_post_proc_down_and_across(src_plane, dst_plane, src_stride,
+ dst_stride, src_height, src_width,
+ ppl);
} else {
- const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
- uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
+ const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2;
+ uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2;
- vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
- src_height, src_width, ppl);
+ vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride,
+ dst_stride, src_height, src_width, ppl);
}
#else
- const uint8_t *const src = srcs[i] + 2 * src_stride + 2;
- uint8_t *const dst = dsts[i] + 2 * dst_stride + 2;
- vp9_post_proc_down_and_across(src, dst, src_stride, dst_stride,
+ const uint8_t *const src_plane = srcs[i] + 2 * src_stride + 2;
+ uint8_t *const dst_plane = dsts[i] + 2 * dst_stride + 2;
+ vp9_post_proc_down_and_across(src_plane, dst_plane, src_stride, dst_stride,
src_height, src_width, ppl);
#endif
}
@@ -558,16 +554,15 @@
* a gaussian distribution with sigma determined by q.
*/
{
- double i;
int next, j;
next = 0;
for (i = -32; i < 32; i++) {
- int a = (int)(0.5 + 256 * gaussian(sigma, 0, i));
+ int a_i = (int)(0.5 + 256 * gaussian(sigma, 0, i));
- if (a) {
- for (j = 0; j < a; j++) {
+ if (a_i) {
+ for (j = 0; j < a_i; j++) {
char_dist[next + j] = (char) i;
}
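
This loop builds a lookup-table sampler: each value i in [-32, 32) is written into char_dist a number of times proportional to a Gaussian pdf scaled by 256, so later code can draw noise with one uniform random index. A self-contained sketch; sigma and the table size are assumed for illustration:

#include <math.h>
#include <stdio.h>

static double gaussian(double sigma, double mu, double x) {
  return 1 / (sigma * sqrt(2.0 * 3.14159265358979323846)) *
         exp(-(x - mu) * (x - mu) / (2 * sigma * sigma));
}

int main(void) {
  char char_dist[300];
  const double sigma = 4.0;
  int i, j, next = 0;
  for (i = -32; i < 32; ++i) {
    const int a = (int)(0.5 + 256 * gaussian(sigma, 0, i));
    for (j = 0; j < a && next < (int)sizeof(char_dist); ++j)
      char_dist[next++] = (char)i;
  }
  printf("entries: %d, middle value: %d\n", next, char_dist[next / 2]);
  return 0;
}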
diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 1be358e..c496299 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c
@@ -285,10 +285,10 @@
void vp9_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
const YV12_BUFFER_CONFIG *src,
int mi_row, int mi_col) {
- uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
- src->alpha_buffer};
- const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
- src->alpha_stride};
+ uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
+ src->v_buffer};
+ const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
+ src->uv_stride};
int i;
for (i = 0; i < MAX_MB_PLANE; ++i) {
@@ -304,11 +304,10 @@
const struct scale_factors *sf) {
if (src != NULL) {
int i;
- uint8_t *const buffers[4] = {src->y_buffer, src->u_buffer, src->v_buffer,
- src->alpha_buffer};
- const int strides[4] = {src->y_stride, src->uv_stride, src->uv_stride,
- src->alpha_stride};
-
+ uint8_t *const buffers[MAX_MB_PLANE] = { src->y_buffer, src->u_buffer,
+ src->v_buffer};
+ const int strides[MAX_MB_PLANE] = { src->y_stride, src->uv_stride,
+ src->uv_stride};
for (i = 0; i < MAX_MB_PLANE; ++i) {
struct macroblockd_plane *const pd = &xd->plane[i];
setup_pred_plane(&pd->pre[idx], buffers[i], strides[i], mi_row, mi_col,
diff --git a/vp9/common/vp9_rtcd.c b/vp9/common/vp9_rtcd.c
index c777bc8..2dfa09f 100644
--- a/vp9/common/vp9_rtcd.c
+++ b/vp9/common/vp9_rtcd.c
@@ -12,10 +12,7 @@
#include "./vp9_rtcd.h"
#include "vpx_ports/vpx_once.h"
-void vpx_scale_rtcd(void);
-
void vp9_rtcd() {
- vpx_scale_rtcd();
// TODO(JBB): Remove this once, by ensuring that both the encoder and
// decoder setup functions are protected by once();
once(setup_rtcd_internal);
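
vp9_rtcd no longer chains vpx_scale_rtcd; each module now runs its own once-guarded setup, and callers invoke both explicitly, as the vp8 iface hunks above and the vp9_decoder.c hunk below do. A sketch of the once pattern using pthread_once, which has the same contract as libvpx's portable vpx_once wrapper:

#include <pthread.h>
#include <stdio.h>

static pthread_once_t scale_once = PTHREAD_ONCE_INIT;

static void setup_scale_internal(void) {
  puts("vpx_scale function pointers resolved");
}

void my_vpx_scale_rtcd(void) {  /* illustrative stand-in */
  pthread_once(&scale_once, setup_scale_internal);
}

int main(void) {
  my_vpx_scale_rtcd();
  my_vpx_scale_rtcd();          /* second call is a no-op */
  return 0;
}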
diff --git a/vp9/common/vp9_rtcd_defs.pl b/vp9/common/vp9_rtcd_defs.pl
index 1344813..537cc6c 100644
--- a/vp9/common/vp9_rtcd_defs.pl
+++ b/vp9/common/vp9_rtcd_defs.pl
@@ -499,7 +499,7 @@
specialize qw/vp9_highbd_d153_predictor_4x4/;
add_proto qw/void vp9_highbd_v_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vp9_highbd_v_predictor_4x4 neon/, "$sse_x86inc";
+ specialize qw/vp9_highbd_v_predictor_4x4/, "$sse_x86inc";
add_proto qw/void vp9_highbd_tm_predictor_4x4/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_tm_predictor_4x4/, "$sse_x86inc";
@@ -577,7 +577,7 @@
specialize qw/vp9_highbd_d153_predictor_16x16/;
add_proto qw/void vp9_highbd_v_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
- specialize qw/vp9_highbd_v_predictor_16x16 neon/, "$sse2_x86inc";
+ specialize qw/vp9_highbd_v_predictor_16x16/, "$sse2_x86inc";
add_proto qw/void vp9_highbd_tm_predictor_16x16/, "uint16_t *dst, ptrdiff_t y_stride, const uint16_t *above, const uint16_t *left, int bd";
specialize qw/vp9_highbd_tm_predictor_16x16/, "$sse2_x86_64";
@@ -1028,16 +1028,20 @@
specialize qw/vp9_sad32x32x8/;
add_proto qw/void vp9_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad16x16x8 sse4/;
+specialize qw/vp9_sad16x16x8 sse4_1/;
+$vp9_sad16x16x8_sse4_1=vp9_sad16x16x8_sse4;
add_proto qw/void vp9_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad16x8x8 sse4/;
+specialize qw/vp9_sad16x8x8 sse4_1/;
+$vp9_sad16x8x8_sse4_1=vp9_sad16x8x8_sse4;
add_proto qw/void vp9_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad8x16x8 sse4/;
+specialize qw/vp9_sad8x16x8 sse4_1/;
+$vp9_sad8x16x8_sse4_1=vp9_sad8x16x8_sse4;
add_proto qw/void vp9_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad8x8x8 sse4/;
+specialize qw/vp9_sad8x8x8 sse4_1/;
+$vp9_sad8x8x8_sse4_1=vp9_sad8x8x8_sse4;
add_proto qw/void vp9_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
specialize qw/vp9_sad8x4x8/;
@@ -1046,7 +1050,8 @@
specialize qw/vp9_sad4x8x8/;
add_proto qw/void vp9_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vp9_sad4x4x8 sse4/;
+specialize qw/vp9_sad4x4x8 sse4_1/;
+$vp9_sad4x4x8_sse4_1=vp9_sad4x4x8_sse4;
add_proto qw/void vp9_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, unsigned int *sad_array";
specialize qw/vp9_sad64x64x4d sse2 avx2 neon/;
@@ -1109,6 +1114,18 @@
add_proto qw/unsigned int vp9_avg_4x4/, "const uint8_t *, int p";
specialize qw/vp9_avg_4x4 sse2/;
+add_proto qw/void vp9_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+specialize qw/vp9_minmax_8x8 sse2/;
+
+add_proto qw/void vp9_hadamard_8x8/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+specialize qw/vp9_hadamard_8x8 sse2/, "$ssse3_x86_64";
+
+add_proto qw/void vp9_hadamard_16x16/, "int16_t const *src_diff, int src_stride, int16_t *coeff";
+specialize qw/vp9_hadamard_16x16 sse2/;
+
+add_proto qw/int16_t vp9_satd/, "const int16_t *coeff, int length";
+specialize qw/vp9_satd sse2/;
+
add_proto qw/void vp9_int_pro_row/, "int16_t *hbuf, uint8_t const *ref, const int ref_stride, const int height";
specialize qw/vp9_int_pro_row sse2/;
@@ -1123,6 +1140,8 @@
specialize qw/vp9_highbd_avg_8x8/;
add_proto qw/unsigned int vp9_highbd_avg_4x4/, "const uint8_t *, int p";
specialize qw/vp9_highbd_avg_4x4/;
+ add_proto qw/unsigned int vp9_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+ specialize qw/vp9_highbd_minmax_8x8/;
}
# ENCODEMB INVOKE
@@ -1162,6 +1181,9 @@
add_proto qw/int64_t vp9_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz";
specialize qw/vp9_block_error avx2/, "$sse2_x86inc";
+ add_proto qw/int64_t vp9_block_error_fp/, "const int16_t *coeff, const int16_t *dqcoeff, int block_size";
+ specialize qw/vp9_block_error_fp sse2/;
+
add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
specialize qw/vp9_quantize_fp neon sse2/, "$ssse3_x86_64";
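
Among the new prototypes, vp9_satd sums the absolute values of Hadamard-transformed residual coefficients, a cheap transform-domain distortion proxy. A plain-C model of the prototype's contract; the wired-up sse2 version is what the encoder actually calls:

#include <stdlib.h>
#include <stdio.h>
#include <stdint.h>

static int16_t satd(const int16_t *coeff, int length) {
  int i, total = 0;
  for (i = 0; i < length; ++i)
    total += abs(coeff[i]);
  return (int16_t)total;
}

int main(void) {
  const int16_t coeff[8] = {12, -7, 3, 0, -2, 5, -1, 4};
  printf("satd = %d\n", satd(coeff, 8)); /* prints 34 */
  return 0;
}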
diff --git a/vp9/decoder/vp9_decodeframe.c b/vp9/decoder/vp9_decodeframe.c
index 8840750..1a3b946 100644
--- a/vp9/decoder/vp9_decodeframe.c
+++ b/vp9/decoder/vp9_decodeframe.c
@@ -1807,162 +1807,134 @@
#endif // CONFIG_VP9_HIGHBITDEPTH
void dec_build_inter_predictors(VP9Decoder *const pbi, MACROBLOCKD *xd,
- int plane, int block, int bw, int bh, int x,
- int y, int w, int h, int mi_x, int mi_y) {
+ int plane, int bw, int bh, int x,
+ int y, int w, int h, int mi_x, int mi_y,
+ const InterpKernel *kernel,
+ const struct scale_factors *sf,
+ struct buf_2d *pre_buf, struct buf_2d *dst_buf,
+ const MV* mv, RefCntBuffer *ref_frame_buf,
+ int is_scaled, int ref) {
struct macroblockd_plane *const pd = &xd->plane[plane];
- const MODE_INFO *mi = xd->mi[0].src_mi;
- const int is_compound = has_second_ref(&mi->mbmi);
- const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
- int ref;
+ uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+ MV32 scaled_mv;
+ int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height,
+ buf_stride, subpel_x, subpel_y;
+ uint8_t *ref_frame, *buf_ptr;
- for (ref = 0; ref < 1 + is_compound; ++ref) {
- const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
- struct buf_2d *const pre_buf = &pd->pre[ref];
- struct buf_2d *const dst_buf = &pd->dst;
- uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
- const MV mv = mi->mbmi.sb_type < BLOCK_8X8
- ? average_split_mvs(pd, mi, ref, block)
- : mi->mbmi.mv[ref].as_mv;
+ // Get reference frame pointer, width and height.
+ if (plane == 0) {
+ frame_width = ref_frame_buf->buf.y_crop_width;
+ frame_height = ref_frame_buf->buf.y_crop_height;
+ ref_frame = ref_frame_buf->buf.y_buffer;
+ } else {
+ frame_width = ref_frame_buf->buf.uv_crop_width;
+ frame_height = ref_frame_buf->buf.uv_crop_height;
+ ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
+ : ref_frame_buf->buf.v_buffer;
+ }
- const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+ if (is_scaled) {
+ const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, mv, bw, bh,
pd->subsampling_x,
pd->subsampling_y);
+ // Co-ordinate of containing block to pixel precision.
+ int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
+ int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
- MV32 scaled_mv;
- int xs, ys, x0, y0, x0_16, y0_16, y1, frame_width, frame_height,
- buf_stride, subpel_x, subpel_y;
- uint8_t *ref_frame, *buf_ptr;
- const int idx = xd->block_refs[ref]->idx;
- BufferPool *const pool = pbi->common.buffer_pool;
- RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
- const int is_scaled = vp9_is_scaled(sf);
+ // Co-ordinate of the block to 1/16th pixel precision.
+ x0_16 = (x_start + x) << SUBPEL_BITS;
+ y0_16 = (y_start + y) << SUBPEL_BITS;
- // Get reference frame pointer, width and height.
- if (plane == 0) {
- frame_width = ref_frame_buf->buf.y_crop_width;
- frame_height = ref_frame_buf->buf.y_crop_height;
- ref_frame = ref_frame_buf->buf.y_buffer;
- } else {
- frame_width = ref_frame_buf->buf.uv_crop_width;
- frame_height = ref_frame_buf->buf.uv_crop_height;
- ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
- : ref_frame_buf->buf.v_buffer;
+ // Co-ordinate of current block in reference frame
+ // to 1/16th pixel precision.
+ x0_16 = sf->scale_value_x(x0_16, sf);
+ y0_16 = sf->scale_value_y(y0_16, sf);
+
+ // Map the top left corner of the block into the reference frame.
+ x0 = sf->scale_value_x(x_start + x, sf);
+ y0 = sf->scale_value_y(y_start + y, sf);
+
+ // Scale the MV and incorporate the sub-pixel offset of the block
+ // in the reference frame.
+ scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+ xs = sf->x_step_q4;
+ ys = sf->y_step_q4;
+ } else {
+ // Co-ordinate of containing block to pixel precision.
+ x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
+ y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
+
+ // Co-ordinate of the block to 1/16th pixel precision.
+ x0_16 = x0 << SUBPEL_BITS;
+ y0_16 = y0 << SUBPEL_BITS;
+
+ scaled_mv.row = mv->row * (1 << (1 - pd->subsampling_y));
+ scaled_mv.col = mv->col * (1 << (1 - pd->subsampling_x));
+ xs = ys = 16;
+ }
+ subpel_x = scaled_mv.col & SUBPEL_MASK;
+ subpel_y = scaled_mv.row & SUBPEL_MASK;
+
+ // Calculate the top left corner of the best matching block in the
+ // reference frame.
+ x0 += scaled_mv.col >> SUBPEL_BITS;
+ y0 += scaled_mv.row >> SUBPEL_BITS;
+ x0_16 += scaled_mv.col;
+ y0_16 += scaled_mv.row;
+
+ // Get reference block pointer.
+ buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
+ buf_stride = pre_buf->stride;
+
+ // Do border extension if there is motion or the
+ // width/height is not a multiple of 8 pixels.
+ if (is_scaled || scaled_mv.col || scaled_mv.row ||
+ (frame_width & 0x7) || (frame_height & 0x7)) {
+ int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
+
+ // Get reference block bottom right horizontal coordinate.
+ int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
+ int x_pad = 0, y_pad = 0;
+
+ if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
+ x0 -= VP9_INTERP_EXTEND - 1;
+ x1 += VP9_INTERP_EXTEND;
+ x_pad = 1;
}
- if (is_scaled) {
- // Co-ordinate of containing block to pixel precision.
- int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
- int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
-
- // Co-ordinate of the block to 1/16th pixel precision.
- x0_16 = (x_start + x) << SUBPEL_BITS;
- y0_16 = (y_start + y) << SUBPEL_BITS;
-
- // Co-ordinate of current block in reference frame
- // to 1/16th pixel precision.
- x0_16 = sf->scale_value_x(x0_16, sf);
- y0_16 = sf->scale_value_y(y0_16, sf);
-
- // Map the top left corner of the block into the reference frame.
- x0 = sf->scale_value_x(x_start + x, sf);
- y0 = sf->scale_value_y(y_start + y, sf);
-
- // Scale the MV and incorporate the sub-pixel offset of the block
- // in the reference frame.
- scaled_mv = vp9_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
- xs = sf->x_step_q4;
- ys = sf->y_step_q4;
- } else {
- // Co-ordinate of containing block to pixel precision.
- x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
- y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
-
- // Co-ordinate of the block to 1/16th pixel precision.
- x0_16 = x0 << SUBPEL_BITS;
- y0_16 = y0 << SUBPEL_BITS;
-
- scaled_mv.row = mv_q4.row;
- scaled_mv.col = mv_q4.col;
- xs = ys = 16;
+ if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
+ y0 -= VP9_INTERP_EXTEND - 1;
+ y1 += VP9_INTERP_EXTEND;
+ y_pad = 1;
}
- subpel_x = scaled_mv.col & SUBPEL_MASK;
- subpel_y = scaled_mv.row & SUBPEL_MASK;
- // Calculate the top left corner of the best matching block in the
- // reference frame.
- x0 += scaled_mv.col >> SUBPEL_BITS;
- y0 += scaled_mv.row >> SUBPEL_BITS;
- x0_16 += scaled_mv.col;
- y0_16 += scaled_mv.row;
+ // Wait until reference block is ready. Pad 7 more pixels as last 7
+ // pixels of each superblock row can be changed by next superblock row.
+ if (pbi->frame_parallel_decode)
+ vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+ MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
- // Get reference block pointer.
- buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
- buf_stride = pre_buf->stride;
-
- // Get reference block bottom right vertical coordinate.
- y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
-
- // Do border extension if there is motion or the
- // width/height is not a multiple of 8 pixels.
- if (is_scaled || scaled_mv.col || scaled_mv.row ||
- (frame_width & 0x7) || (frame_height & 0x7)) {
- // Get reference block bottom right horizontal coordinate.
- int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
- int x_pad = 0, y_pad = 0;
-
- if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
- x0 -= VP9_INTERP_EXTEND - 1;
- x1 += VP9_INTERP_EXTEND;
- x_pad = 1;
- }
-
- if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
- y0 -= VP9_INTERP_EXTEND - 1;
- y1 += VP9_INTERP_EXTEND;
- y_pad = 1;
- }
-
- // Wait until reference block is ready. Pad 7 more pixels as last 7
- // pixels of each superblock row can be changed by next superblock row.
- if (pbi->frame_parallel_decode)
- vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
- MAX(0, (y1 + 7) << (plane == 0 ? 0 : 1)));
-
- // Skip border extension if block is inside the frame.
- if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
- y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
- uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
- // Extend the border.
+ // Skip border extension if block is inside the frame.
+ if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
+ y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
+ uint8_t *buf_ptr1 = ref_frame + y0 * pre_buf->stride + x0;
+ // Extend the border.
#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- high_build_mc_border(buf_ptr1,
- pre_buf->stride,
- xd->mc_buf_high,
- x1 - x0 + 1,
- x0,
- y0,
- x1 - x0 + 1,
- y1 - y0 + 1,
- frame_width,
- frame_height);
- buf_stride = x1 - x0 + 1;
- buf_ptr = CONVERT_TO_BYTEPTR(xd->mc_buf_high) +
- y_pad * 3 * buf_stride + x_pad * 3;
- } else {
- build_mc_border(buf_ptr1,
- pre_buf->stride,
- xd->mc_buf,
- x1 - x0 + 1,
- x0,
- y0,
- x1 - x0 + 1,
- y1 - y0 + 1,
- frame_width,
- frame_height);
- buf_stride = x1 - x0 + 1;
- buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
- }
-#else
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ high_build_mc_border(buf_ptr1,
+ pre_buf->stride,
+ xd->mc_buf_high,
+ x1 - x0 + 1,
+ x0,
+ y0,
+ x1 - x0 + 1,
+ y1 - y0 + 1,
+ frame_width,
+ frame_height);
+ buf_stride = x1 - x0 + 1;
+ buf_ptr = CONVERT_TO_BYTEPTR(xd->mc_buf_high) +
+ y_pad * 3 * buf_stride + x_pad * 3;
+ } else {
build_mc_border(buf_ptr1,
pre_buf->stride,
xd->mc_buf,
@@ -1975,28 +1947,43 @@
frame_height);
buf_stride = x1 - x0 + 1;
buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
-#endif // CONFIG_VP9_HIGHBITDEPTH
}
- } else {
- // Wait until reference block is ready. Pad 7 more pixels as last 7
- // pixels of each superblock row can be changed by next superblock row.
- if (pbi->frame_parallel_decode)
- vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
- MAX(0, (y1 + 7) << (plane == 0 ? 0 : 1)));
- }
-#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
- } else {
- inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
- subpel_y, sf, w, h, ref, kernel, xs, ys);
- }
#else
+ build_mc_border(buf_ptr1,
+ pre_buf->stride,
+ xd->mc_buf,
+ x1 - x0 + 1,
+ x0,
+ y0,
+ x1 - x0 + 1,
+ y1 - y0 + 1,
+ frame_width,
+ frame_height);
+ buf_stride = x1 - x0 + 1;
+ buf_ptr = xd->mc_buf + y_pad * 3 * buf_stride + x_pad * 3;
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+ } else {
+ // Wait until reference block is ready. Pad 7 more pixels as last 7
+ // pixels of each superblock row can be changed by next superblock row.
+ if (pbi->frame_parallel_decode) {
+ const int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
+ vp9_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
+ MAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
+ }
+ }
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
+ } else {
inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
subpel_y, sf, w, h, ref, kernel, xs, ys);
-#endif // CONFIG_VP9_HIGHBITDEPTH
}
+#else
+ inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
+ subpel_y, sf, w, h, ref, kernel, xs, ys);
+#endif // CONFIG_VP9_HIGHBITDEPTH
}
void vp9_dec_build_inter_predictors_sb(VP9Decoder *const pbi, MACROBLOCKD *xd,
@@ -2005,24 +1992,50 @@
int plane;
const int mi_x = mi_col * MI_SIZE;
const int mi_y = mi_row * MI_SIZE;
+ const MODE_INFO *mi = xd->mi[0].src_mi;
+ const InterpKernel *kernel = vp9_get_interp_kernel(mi->mbmi.interp_filter);
+ const BLOCK_SIZE sb_type = mi->mbmi.sb_type;
+ const int is_compound = has_second_ref(&mi->mbmi);
+
for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
&xd->plane[plane]);
+ struct macroblockd_plane *const pd = &xd->plane[plane];
+ struct buf_2d *const dst_buf = &pd->dst;
const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+
const int bw = 4 * num_4x4_w;
const int bh = 4 * num_4x4_h;
+ int ref;
- if (xd->mi[0].src_mi->mbmi.sb_type < BLOCK_8X8) {
- int i = 0, x, y;
- assert(bsize == BLOCK_8X8);
- for (y = 0; y < num_4x4_h; ++y)
- for (x = 0; x < num_4x4_w; ++x)
- dec_build_inter_predictors(pbi, xd, plane, i++, bw, bh,
- 4 * x, 4 * y, 4, 4, mi_x, mi_y);
- } else {
- dec_build_inter_predictors(pbi, xd, plane, 0, bw, bh,
- 0, 0, bw, bh, mi_x, mi_y);
+ for (ref = 0; ref < 1 + is_compound; ++ref) {
+ const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+ struct buf_2d *const pre_buf = &pd->pre[ref];
+ const int idx = xd->block_refs[ref]->idx;
+ BufferPool *const pool = pbi->common.buffer_pool;
+ RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
+ const int is_scaled = vp9_is_scaled(sf);
+
+ if (sb_type < BLOCK_8X8) {
+ int i = 0, x, y;
+ assert(bsize == BLOCK_8X8);
+ for (y = 0; y < num_4x4_h; ++y) {
+ for (x = 0; x < num_4x4_w; ++x) {
+ const MV mv = average_split_mvs(pd, mi, ref, i++);
+ dec_build_inter_predictors(pbi, xd, plane, bw, bh,
+ 4 * x, 4 * y, 4, 4, mi_x, mi_y, kernel,
+ sf, pre_buf, dst_buf, &mv,
+ ref_frame_buf, is_scaled, ref);
+ }
+ }
+ } else {
+ const MV mv = mi->mbmi.mv[ref].as_mv;
+ dec_build_inter_predictors(pbi, xd, plane, bw, bh,
+ 0, 0, bw, bh, mi_x, mi_y, kernel,
+ sf, pre_buf, dst_buf, &mv, ref_frame_buf,
+ is_scaled, ref);
+ }
}
}
}
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 358f22a..5480222 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -12,6 +12,7 @@
#include <limits.h>
#include <stdio.h>
+#include "./vp9_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx_mem/vpx_mem.h"
@@ -39,6 +40,7 @@
if (!init_done) {
vp9_rtcd();
+ vpx_scale_rtcd();
vp9_init_intra_predictors();
init_done = 1;
}
diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index 35690b8..fd40875 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c
@@ -45,17 +45,6 @@
return val;
}
-static const vp9_tree_index coeff_subtree_high[TREE_SIZE(ENTROPY_TOKENS)] = {
- 2, 6, /* 0 = LOW_VAL */
- -TWO_TOKEN, 4, /* 1 = TWO */
- -THREE_TOKEN, -FOUR_TOKEN, /* 2 = THREE */
- 8, 10, /* 3 = HIGH_LOW */
- -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, /* 4 = CAT_ONE */
- 12, 14, /* 5 = CAT_THREEFOUR */
- -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, /* 6 = CAT_THREE */
- -CATEGORY5_TOKEN, -CATEGORY6_TOKEN /* 7 = CAT_FIVE */
-};
-
static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
FRAME_COUNTS *counts, PLANE_TYPE type,
tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
@@ -147,7 +136,7 @@
val = 1;
} else {
INCREMENT_COUNT(TWO_TOKEN);
- token = vp9_read_tree(r, coeff_subtree_high,
+ token = vp9_read_tree(r, vp9_coef_con_tree,
vp9_pareto8_full[prob[PIVOT_NODE] - 1]);
switch (token) {
case TWO_TOKEN:
diff --git a/vp9/decoder/vp9_dthread.c b/vp9/decoder/vp9_dthread.c
index be2e6cd..96a63bd 100644
--- a/vp9/decoder/vp9_dthread.c
+++ b/vp9/decoder/vp9_dthread.c
@@ -155,6 +155,10 @@
dst_worker_data->pbi->need_resync = src_worker_data->pbi->need_resync;
vp9_frameworker_unlock_stats(src_worker);
+ dst_cm->bit_depth = src_cm->bit_depth;
+#if CONFIG_VP9_HIGHBITDEPTH
+ dst_cm->use_highbitdepth = src_cm->use_highbitdepth;
+#endif
dst_cm->prev_frame = src_cm->show_existing_frame ?
src_cm->prev_frame : src_cm->cur_frame;
dst_cm->last_width = !src_cm->show_existing_frame ?
diff --git a/vp9/encoder/vp9_avg.c b/vp9/encoder/vp9_avg.c
index dc5cfe2..e26a03c 100644
--- a/vp9/encoder/vp9_avg.c
+++ b/vp9/encoder/vp9_avg.c
@@ -28,6 +28,94 @@
return (sum + 8) >> 4;
}
+static void hadamard_col8(const int16_t *src_diff, int src_stride,
+ int16_t *coeff) {
+ int16_t b0 = src_diff[0 * src_stride] + src_diff[1 * src_stride];
+ int16_t b1 = src_diff[0 * src_stride] - src_diff[1 * src_stride];
+ int16_t b2 = src_diff[2 * src_stride] + src_diff[3 * src_stride];
+ int16_t b3 = src_diff[2 * src_stride] - src_diff[3 * src_stride];
+ int16_t b4 = src_diff[4 * src_stride] + src_diff[5 * src_stride];
+ int16_t b5 = src_diff[4 * src_stride] - src_diff[5 * src_stride];
+ int16_t b6 = src_diff[6 * src_stride] + src_diff[7 * src_stride];
+ int16_t b7 = src_diff[6 * src_stride] - src_diff[7 * src_stride];
+
+ int16_t c0 = b0 + b2;
+ int16_t c1 = b1 + b3;
+ int16_t c2 = b0 - b2;
+ int16_t c3 = b1 - b3;
+ int16_t c4 = b4 + b6;
+ int16_t c5 = b5 + b7;
+ int16_t c6 = b4 - b6;
+ int16_t c7 = b5 - b7;
+
+ coeff[0] = c0 + c4;
+ coeff[7] = c1 + c5;
+ coeff[3] = c2 + c6;
+ coeff[4] = c3 + c7;
+ coeff[2] = c0 - c4;
+ coeff[6] = c1 - c5;
+ coeff[1] = c2 - c6;
+ coeff[5] = c3 - c7;
+}
+
+void vp9_hadamard_8x8_c(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ int idx;
+ int16_t buffer[64];
+ int16_t *tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ hadamard_col8(src_diff, src_stride, tmp_buf);
+ tmp_buf += 8;
+ ++src_diff;
+ }
+
+ tmp_buf = &buffer[0];
+ for (idx = 0; idx < 8; ++idx) {
+ hadamard_col8(tmp_buf, 8, coeff);
+ coeff += 8;
+ ++tmp_buf;
+ }
+}
+
+// In place 16x16 2D Hadamard transform
+void vp9_hadamard_16x16_c(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+ + (idx & 0x01) * 8;
+ vp9_hadamard_8x8_c(src_ptr, src_stride, coeff + idx * 64);
+ }
+
+ for (idx = 0; idx < 64; ++idx) {
+ int16_t a0 = coeff[0];
+ int16_t a1 = coeff[64];
+ int16_t a2 = coeff[128];
+ int16_t a3 = coeff[192];
+
+ int16_t b0 = a0 + a1;
+ int16_t b1 = a0 - a1;
+ int16_t b2 = a2 + a3;
+ int16_t b3 = a2 - a3;
+
+ coeff[0] = (b0 + b2) >> 1;
+ coeff[64] = (b1 + b3) >> 1;
+ coeff[128] = (b0 - b2) >> 1;
+ coeff[192] = (b1 - b3) >> 1;
+
+ ++coeff;
+ }
+}
+
+int16_t vp9_satd_c(const int16_t *coeff, int length) {
+ int i;
+ int satd = 0;
+ for (i = 0; i < length; ++i)
+ satd += abs(coeff[i]);
+
+ return (int16_t)satd;
+}
+
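Note on the Hadamard/SATD additions above: vp9_satd_c() sums absolute Hadamard coefficients as a cheap transform-domain rate proxy, and the 2D transform is separable into the column/row butterflies of hadamard_col8(). A minimal standalone sketch of the same butterfly on a 4-point vector; hadamard4() and the sample values are illustrative only, not libvpx API:

#include <stdio.h>
#include <stdlib.h>

/* 4-point analogue of the hadamard_col8() butterfly above. */
static void hadamard4(const int in[4], int out[4]) {
  const int b0 = in[0] + in[1];
  const int b1 = in[0] - in[1];
  const int b2 = in[2] + in[3];
  const int b3 = in[2] - in[3];
  out[0] = b0 + b2;  /* [ 1  1  1  1 ] row: DC */
  out[1] = b1 + b3;  /* [ 1 -1  1 -1 ] */
  out[2] = b0 - b2;  /* [ 1  1 -1 -1 ] */
  out[3] = b1 - b3;  /* [ 1 -1 -1  1 ] */
}

int main(void) {
  const int diff[4] = {3, -1, 2, 0};  /* toy residual */
  int coeff[4], i, satd = 0;
  hadamard4(diff, coeff);
  for (i = 0; i < 4; ++i)
    satd += abs(coeff[i]);            /* cf. vp9_satd_c() */
  printf("satd = %d\n", satd);        /* 4 + 6 + 0 + 2 = 12 */
  return 0;
}

The permuted coefficient order in hadamard_col8() (coeff[0], coeff[7], coeff[3], ...) does not affect SATD, since only magnitudes are summed.
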
// Integer projection onto row vectors.
void vp9_int_pro_row_c(int16_t *hbuf, uint8_t const *ref,
const int ref_stride, const int height) {
@@ -67,6 +155,20 @@
return var;
}
+void vp9_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) {
+ int i, j;
+ *min = 255;
+ *max = 0;
+ for (i = 0; i < 8; ++i, s += p, d += dp) {
+ for (j = 0; j < 8; ++j) {
+ int diff = abs(s[j]-d[j]);
+ *min = diff < *min ? diff : *min;
+ *max = diff > *max ? diff : *max;
+ }
+ }
+}
+
#if CONFIG_VP9_HIGHBITDEPTH
unsigned int vp9_highbd_avg_8x8_c(const uint8_t *s8, int p) {
int i, j;
@@ -87,6 +189,22 @@
return (sum + 8) >> 4;
}
+
+void vp9_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
+ int dp, int *min, int *max) {
+  int i, j;
+  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
+  const uint16_t* d = CONVERT_TO_SHORTPTR(d8);
+  *min = 255;
+  *max = 0;
+ for (i = 0; i < 8; ++i, s += p, d += dp) {
+ for (j = 0; j < 8; ++j) {
+ int diff = abs(s[j]-d[j]);
+ *min = diff < *min ? diff : *min;
+ *max = diff > *max ? diff : *max;
+ }
+ }
+}
#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_bitstream.c b/vp9/encoder/vp9_bitstream.c
index b24fe29..d67d1f4 100644
--- a/vp9/encoder/vp9_bitstream.c
+++ b/vp9/encoder/vp9_bitstream.c
@@ -268,8 +268,7 @@
vp9_write(w, is_inter, vp9_get_intra_inter_prob(cm, xd));
if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
- !(is_inter &&
- (skip || vp9_segfeature_active(seg, segment_id, SEG_LVL_SKIP)))) {
+ !(is_inter && skip)) {
write_selected_tx_size(cm, xd, w);
}
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 10a62ef..d428175 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -165,21 +165,6 @@
return BLOCK_8X8;
}
-static BLOCK_SIZE get_nonrd_var_based_fixed_partition(VP9_COMP *cpi,
- MACROBLOCK *x,
- int mi_row,
- int mi_col) {
- unsigned int var = get_sby_perpixel_diff_variance(cpi, &x->plane[0].src,
- mi_row, mi_col,
- BLOCK_64X64);
- if (var < 4)
- return BLOCK_64X64;
- else if (var < 10)
- return BLOCK_32X32;
- else
- return BLOCK_16X16;
-}
-
// Lighter version of set_offsets that only sets the mode info
// pointers.
static INLINE void set_mode_info_offsets(VP9_COMMON *const cm,
@@ -405,18 +390,21 @@
variance_node vt;
const int block_width = num_8x8_blocks_wide_lookup[bsize];
const int block_height = num_8x8_blocks_high_lookup[bsize];
+ const int low_res = (cm->width <= 352 && cm->height <= 288);
assert(block_height == block_width);
tree_to_node(data, bsize, &vt);
- if (force_split)
+ if (force_split == 1)
return 0;
// For bsize=bsize_min (16x16/8x8 for 8x8/4x4 downsampling), select if
// variance is below threshold, otherwise split will be selected.
// No check for vert/horiz split as too few samples for variance.
if (bsize == bsize_min) {
- get_variance(&vt.part_variances->none);
+ // Variance already computed to set the force_split.
+ if (low_res || cm->frame_type == KEY_FRAME)
+ get_variance(&vt.part_variances->none);
if (mi_col + block_width / 2 < cm->mi_cols &&
mi_row + block_height / 2 < cm->mi_rows &&
vt.part_variances->none.variance < threshold) {
@@ -425,11 +413,10 @@
}
return 0;
} else if (bsize > bsize_min) {
- // Variance is already computed for 32x32 blocks to set the force_split.
- if (bsize != BLOCK_32X32)
+ // Variance already computed to set the force_split.
+ if (low_res || cm->frame_type == KEY_FRAME)
get_variance(&vt.part_variances->none);
- // For key frame or low_res: for bsize above 32X32 or very high variance,
- // take split.
+ // For key frame: take split for bsize above 32X32 or very high variance.
if (cm->frame_type == KEY_FRAME &&
(bsize > BLOCK_32X32 ||
vt.part_variances->none.variance > (threshold << 4))) {
@@ -473,7 +460,6 @@
return 0;
}
-
void vp9_set_vbp_thresholds(VP9_COMP *cpi, int q) {
SPEED_FEATURES *const sf = &cpi->sf;
if (sf->partition_search_type != VAR_BASED_PARTITION &&
@@ -482,39 +468,183 @@
} else {
VP9_COMMON *const cm = &cpi->common;
const int is_key_frame = (cm->frame_type == KEY_FRAME);
- const int threshold_multiplier = is_key_frame ? 80 : 4;
+ const int threshold_multiplier = is_key_frame ? 20 : 1;
const int64_t threshold_base = (int64_t)(threshold_multiplier *
- vp9_convert_qindex_to_q(q, cm->bit_depth));
+ cpi->y_dequant[q][1]);
// TODO(marpan): Allow 4x4 partitions for inter-frames.
// use_4x4_partition = (variance4x4downsample[i2 + j] == 1);
// If 4x4 partition is not used, then 8x8 partition will be selected
// if variance of 16x16 block is very high, so use larger threshold
// for 16x16 (threshold_bsize_min) in that case.
+
+ // Array index: 0 - threshold_64x64; 1 - threshold_32x32;
+ // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
if (is_key_frame) {
- cpi->vbp_threshold = threshold_base >> 2;
- cpi->vbp_threshold_bsize_max = threshold_base;
- cpi->vbp_threshold_bsize_min = threshold_base << 2;
- cpi->vbp_threshold_16x16 = cpi->vbp_threshold;
+ cpi->vbp_thresholds[0] = threshold_base;
+ cpi->vbp_thresholds[1] = threshold_base >> 2;
+ cpi->vbp_thresholds[2] = threshold_base >> 2;
+ cpi->vbp_thresholds[3] = threshold_base << 2;
+ cpi->vbp_threshold_sad = 0;
cpi->vbp_bsize_min = BLOCK_8X8;
} else {
- cpi->vbp_threshold = threshold_base;
+ cpi->vbp_thresholds[1] = threshold_base;
if (cm->width <= 352 && cm->height <= 288) {
- cpi->vbp_threshold_bsize_max = threshold_base >> 2;
- cpi->vbp_threshold_bsize_min = threshold_base << 3;
+ cpi->vbp_thresholds[0] = threshold_base >> 2;
+ cpi->vbp_thresholds[2] = threshold_base << 3;
+ cpi->vbp_threshold_sad = 100;
} else {
- cpi->vbp_threshold_bsize_max = threshold_base;
- cpi->vbp_threshold_bsize_min = threshold_base << cpi->oxcf.speed;
+ cpi->vbp_thresholds[0] = threshold_base;
+ cpi->vbp_thresholds[1] = (5 * threshold_base) >> 2;
+ cpi->vbp_thresholds[2] = threshold_base << cpi->oxcf.speed;
+ cpi->vbp_threshold_sad = 1000;
}
- cpi->vbp_threshold_16x16 = cpi->vbp_threshold_bsize_min;
cpi->vbp_bsize_min = BLOCK_16X16;
}
+ cpi->vbp_threshold_minmax = 15 + (q >> 3);
+ }
+}
+
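The vbp_thresholds[] set above are ordered from largest to smallest partition (64x64, 32x32, 16x16, 8x8), and set_vt_partitioning() keeps a block at a given size only when its variance is below the matching entry; otherwise it splits and the children are tested against the next one. A toy standalone sketch of that cascade; pick_size() and the threshold values are hypothetical, and it reuses one variance per level where the real code recomputes per child:

#include <stdint.h>
#include <stdio.h>

/* Toy cascade mirroring the vbp_thresholds[] ordering:
 * index 0 -> 64x64, 1 -> 32x32, 2 -> 16x16, 3 -> 8x8. */
static const char *pick_size(int64_t var, const int64_t thr[4]) {
  if (var < thr[0]) return "64x64";
  if (var < thr[1]) return "32x32";  /* split once, child kept */
  if (var < thr[2]) return "16x16";
  if (var < thr[3]) return "8x8";
  return "4x4";
}

int main(void) {
  const int64_t thr[4] = {100, 125, 800, 3200};  /* hypothetical */
  printf("%s\n", pick_size(90, thr));    /* 64x64 */
  printf("%s\n", pick_size(500, thr));   /* 16x16 */
  printf("%s\n", pick_size(5000, thr));  /* 4x4 */
  return 0;
}
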
+// Compute the minmax over the 8x8 subblocks.
+static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x16_idx, int y16_idx,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide,
+ int pixels_high) {
+ int k;
+ int minmax_max = 0;
+ int minmax_min = 255;
+ // Loop over the 4 8x8 subblocks.
+ for (k = 0; k < 4; k++) {
+ int x8_idx = x16_idx + ((k & 1) << 3);
+ int y8_idx = y16_idx + ((k >> 1) << 3);
+ int min = 0;
+ int max = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+ d + y8_idx * dp + x8_idx, dp,
+ &min, &max);
+ } else {
+ vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+ d + y8_idx * dp + x8_idx, dp,
+ &min, &max);
+ }
+#else
+ vp9_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
+ d + y8_idx * dp + x8_idx, dp,
+ &min, &max);
+#endif
+ if ((max - min) > minmax_max)
+ minmax_max = (max - min);
+ if ((max - min) < minmax_min)
+ minmax_min = (max - min);
+ }
+ }
+ return (minmax_max - minmax_min);
+}
+
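compute_minmax_8x8() below reduces the four 8x8 sub-blocks of a 16x16 to the spread between their largest and smallest |src - dst| ranges; a large spread flags heterogeneous content and, when above vbp_threshold_minmax (15 + (q >> 3)), forces the split. A tiny standalone illustration, with made-up per-sub-block ranges:

#include <stdio.h>

int main(void) {
  /* Hypothetical (max - min) absolute-difference range of each of the
   * four 8x8 sub-blocks of one 16x16 block. */
  const int range[4] = {3, 5, 40, 4};
  int k, minmax_max = 0, minmax_min = 255;
  for (k = 0; k < 4; ++k) {
    if (range[k] > minmax_max) minmax_max = range[k];
    if (range[k] < minmax_min) minmax_min = range[k];
  }
  /* cf. the return (minmax_max - minmax_min) in compute_minmax_8x8() */
  printf("spread = %d\n", minmax_max - minmax_min);  /* 37 */
  return 0;
}
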
+static void modify_vbp_thresholds(VP9_COMP *cpi, int64_t thresholds[], int q) {
+ VP9_COMMON *const cm = &cpi->common;
+ const int64_t threshold_base = (int64_t)(cpi->y_dequant[q][1]);
+
+ // Array index: 0 - threshold_64x64; 1 - threshold_32x32;
+ // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
+ thresholds[1] = threshold_base;
+ if (cm->width <= 352 && cm->height <= 288) {
+ thresholds[0] = threshold_base >> 2;
+ thresholds[2] = threshold_base << 3;
+ } else {
+ thresholds[0] = threshold_base;
+ thresholds[1] = (5 * threshold_base) >> 2;
+ thresholds[2] = threshold_base << cpi->oxcf.speed;
+ }
+}
+
+static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x8_idx, int y8_idx, v8x8 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide,
+ int pixels_high,
+ int is_key_frame) {
+ int k;
+ for (k = 0; k < 4; k++) {
+ int x4_idx = x8_idx + ((k & 1) << 2);
+ int y4_idx = y8_idx + ((k >> 1) << 2);
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x4_idx < pixels_wide && y4_idx < pixels_high) {
+ int s_avg;
+ int d_avg = 128;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+ } else {
+ s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+ }
+#else
+ s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+#endif
+ sum = s_avg - d_avg;
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+ }
+}
+
+static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
+ int dp, int x16_idx, int y16_idx, v16x16 *vst,
+#if CONFIG_VP9_HIGHBITDEPTH
+ int highbd_flag,
+#endif
+ int pixels_wide,
+ int pixels_high,
+ int is_key_frame) {
+ int k;
+ for (k = 0; k < 4; k++) {
+ int x8_idx = x16_idx + ((k & 1) << 3);
+ int y8_idx = y16_idx + ((k >> 1) << 3);
+ unsigned int sse = 0;
+ int sum = 0;
+ if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+ int s_avg;
+ int d_avg = 128;
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
+ s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+ } else {
+ s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+ }
+#else
+ s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
+ if (!is_key_frame)
+ d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+#endif
+ sum = s_avg - d_avg;
+ sse = sum * sum;
+ }
+ fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
}
}
// This function chooses partitioning based on the variance between source and
// reconstructed last, where variance is computed for down-sampled inputs.
-static void choose_partitioning(VP9_COMP *cpi,
+static int choose_partitioning(VP9_COMP *cpi,
const TileInfo *const tile,
MACROBLOCK *x,
int mi_row, int mi_col) {
@@ -523,12 +653,14 @@
int i, j, k, m;
v64x64 vt;
v16x16 vt2[16];
- int force_split[5];
+ int force_split[21];
uint8_t *s;
const uint8_t *d;
int sp;
int dp;
int pixels_wide = 64, pixels_high = 64;
+ int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
+ cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};
// Always use 4x4 partition for key frame.
const int is_key_frame = (cm->frame_type == KEY_FRAME);
@@ -541,6 +673,11 @@
const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map :
cm->last_frame_seg_map;
segment_id = vp9_get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+
+ if (cyclic_refresh_segment_id_boosted(segment_id)) {
+ int q = vp9_get_qindex(&cm->seg, segment_id, cm->base_qindex);
+ modify_vbp_thresholds(cpi, thresholds, q);
+ }
}
set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
@@ -560,18 +697,10 @@
const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
unsigned int y_sad, y_sad_g;
- BLOCK_SIZE bsize;
- if (mi_row + 4 < cm->mi_rows && mi_col + 4 < cm->mi_cols)
- bsize = BLOCK_64X64;
- else if (mi_row + 4 < cm->mi_rows && mi_col + 4 >= cm->mi_cols)
- bsize = BLOCK_32X64;
- else if (mi_row + 4 >= cm->mi_rows && mi_col + 4 < cm->mi_cols)
- bsize = BLOCK_64X32;
- else
- bsize = BLOCK_32X32;
+ const BLOCK_SIZE bsize = BLOCK_32X32
+ + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows);
assert(yv12 != NULL);
-
if (yv12_g && yv12_g != yv12) {
vp9_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
&cm->frame_refs[GOLDEN_FRAME - 1].sf);
@@ -620,6 +749,19 @@
d = xd->plane[0].dst.buf;
dp = xd->plane[0].dst.stride;
+
+ // If the y_sad is very small, take 64x64 as partition and exit.
+ // Don't check on boosted segment for now, as 64x64 is suppressed there.
+ if (segment_id == CR_SEGMENT_ID_BASE &&
+ y_sad < cpi->vbp_threshold_sad) {
+ const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+ const int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64];
+ if (mi_col + block_width / 2 < cm->mi_cols &&
+ mi_row + block_height / 2 < cm->mi_rows) {
+ set_block_size(cpi, xd, mi_row, mi_col, BLOCK_64X64);
+ return 0;
+ }
+ }
} else {
d = VP9_VAR_OFFS;
dp = 0;
@@ -642,6 +784,7 @@
}
// Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
+ // 5-20 for the 16x16 blocks.
force_split[0] = 0;
// Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
// for splits.
@@ -653,46 +796,49 @@
for (j = 0; j < 4; j++) {
const int x16_idx = x32_idx + ((j & 1) << 4);
const int y16_idx = y32_idx + ((j >> 1) << 4);
+ const int split_index = 5 + i2 + j;
v16x16 *vst = &vt.split[i].split[j];
+ force_split[split_index] = 0;
variance4x4downsample[i2 + j] = 0;
if (!is_key_frame) {
- for (k = 0; k < 4; k++) {
- int x8_idx = x16_idx + ((k & 1) << 3);
- int y8_idx = y16_idx + ((k >> 1) << 3);
- unsigned int sse = 0;
- int sum = 0;
- if (x8_idx < pixels_wide && y8_idx < pixels_high) {
- int s_avg, d_avg;
+ fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- s_avg = vp9_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
- d_avg = vp9_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
- } else {
- s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
- d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
- }
-#else
- s_avg = vp9_avg_8x8(s + y8_idx * sp + x8_idx, sp);
- d_avg = vp9_avg_8x8(d + y8_idx * dp + x8_idx, dp);
+ xd->cur_buf->flags,
#endif
- sum = s_avg - d_avg;
- sse = sum * sum;
- }
- // If variance is based on 8x8 downsampling, we stop here and have
- // one sample for 8x8 block (so use 1 for count in fill_variance),
- // which of course means variance = 0 for 8x8 block.
- fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
- }
+ pixels_wide,
+ pixels_high,
+ is_key_frame);
fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
- // For low-resolution, compute the variance based on 8x8 down-sampling,
- // and if it is large (above the threshold) we go down for 4x4.
- // For key frame we always go down to 4x4.
- if (low_res)
- get_variance(&vt.split[i].split[j].part_variances.none);
+ get_variance(&vt.split[i].split[j].part_variances.none);
+ if (vt.split[i].split[j].part_variances.none.variance >
+ thresholds[2]) {
+ // 16X16 variance is above threshold for split, so force split to 8x8
+ // for this 16x16 block (this also forces splits for upper levels).
+ force_split[split_index] = 1;
+ force_split[i + 1] = 1;
+ force_split[0] = 1;
+ } else if (vt.split[i].split[j].part_variances.none.variance >
+ thresholds[1] &&
+ !cyclic_refresh_segment_id_boosted(segment_id)) {
+ // We have some nominal amount of 16x16 variance (based on average),
+ // compute the minmax over the 8x8 sub-blocks, and if above threshold,
+ // force split to 8x8 block for this 16x16 block.
+ int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
+#if CONFIG_VP9_HIGHBITDEPTH
+ xd->cur_buf->flags,
+#endif
+ pixels_wide, pixels_high);
+ if (minmax > cpi->vbp_threshold_minmax) {
+ force_split[split_index] = 1;
+ force_split[i + 1] = 1;
+ force_split[0] = 1;
+ }
+ }
      }
if (is_key_frame || (low_res &&
vt.split[i].split[j].part_variances.none.variance >
- (cpi->vbp_threshold << 1))) {
+ (thresholds[1] << 1))) {
+ force_split[split_index] = 0;
// Go down to 4x4 down-sampling for variance.
variance4x4downsample[i2 + j] = 1;
for (k = 0; k < 4; k++) {
@@ -700,47 +846,18 @@
int y8_idx = y16_idx + ((k >> 1) << 3);
v8x8 *vst2 = is_key_frame ? &vst->split[k] :
&vt2[i2 + j].split[k];
- for (m = 0; m < 4; m++) {
- int x4_idx = x8_idx + ((m & 1) << 2);
- int y4_idx = y8_idx + ((m >> 1) << 2);
- unsigned int sse = 0;
- int sum = 0;
- if (x4_idx < pixels_wide && y4_idx < pixels_high) {
- int d_avg = 128;
+ fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
#if CONFIG_VP9_HIGHBITDEPTH
- int s_avg;
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- s_avg = vp9_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
- if (cm->frame_type != KEY_FRAME)
- d_avg = vp9_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
- } else {
- s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
- if (cm->frame_type != KEY_FRAME)
- d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
- }
-#else
- int s_avg = vp9_avg_4x4(s + y4_idx * sp + x4_idx, sp);
- if (!is_key_frame)
- d_avg = vp9_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+ xd->cur_buf->flags,
#endif
- sum = s_avg - d_avg;
- sse = sum * sum;
- }
- // If variance is based on 4x4 down-sampling, we stop here and have
- // one sample for 4x4 block (so use 1 for count in fill_variance),
- // which of course means variance = 0 for 4x4 block.
- fill_variance(sse, sum, 0, &vst2->split[m].part_variances.none);
- }
+ pixels_wide,
+ pixels_high,
+ is_key_frame);
}
}
}
}
- // No 64x64 blocks on segments other than base (un-boosted) segment,
- // so force split.
- if (cyclic_refresh_segment_id_boosted(segment_id))
- force_split[0] = 1;
-
// Fill the rest of the variance tree by summing split partition values.
for (i = 0; i < 4; i++) {
const int i2 = i << 2;
@@ -756,29 +873,32 @@
fill_variance_tree(&vt.split[i], BLOCK_32X32);
// If variance of this 32x32 block is above the threshold, force the block
// to split. This also forces a split on the upper (64x64) level.
- get_variance(&vt.split[i].part_variances.none);
- if (vt.split[i].part_variances.none.variance > cpi->vbp_threshold) {
- force_split[i + 1] = 1;
- force_split[0] = 1;
+ if (!force_split[i + 1]) {
+ get_variance(&vt.split[i].part_variances.none);
+ if (vt.split[i].part_variances.none.variance > thresholds[1]) {
+ force_split[i + 1] = 1;
+ force_split[0] = 1;
+ }
}
}
- if (!force_split[0])
+ if (!force_split[0]) {
fill_variance_tree(&vt, BLOCK_64X64);
+ get_variance(&vt.part_variances.none);
+ }
- // Now go through the entire structure, splitting every block size until
+ // Now go through the entire structure, splitting every block size until
// we get to one that's got a variance lower than our threshold.
if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
!set_vt_partitioning(cpi, xd, &vt, BLOCK_64X64, mi_row, mi_col,
- cpi->vbp_threshold_bsize_max, BLOCK_16X16,
- force_split[0])) {
+ thresholds[0], BLOCK_16X16, force_split[0])) {
for (i = 0; i < 4; ++i) {
const int x32_idx = ((i & 1) << 2);
const int y32_idx = ((i >> 1) << 2);
const int i2 = i << 2;
if (!set_vt_partitioning(cpi, xd, &vt.split[i], BLOCK_32X32,
(mi_row + y32_idx), (mi_col + x32_idx),
- cpi->vbp_threshold,
- BLOCK_16X16, force_split[i + 1])) {
+ thresholds[1], BLOCK_16X16,
+ force_split[i + 1])) {
for (j = 0; j < 4; ++j) {
const int x16_idx = ((j & 1) << 1);
const int y16_idx = ((j >> 1) << 1);
@@ -791,8 +911,9 @@
if (!set_vt_partitioning(cpi, xd, vtemp, BLOCK_16X16,
mi_row + y32_idx + y16_idx,
mi_col + x32_idx + x16_idx,
- cpi->vbp_threshold_16x16,
- cpi->vbp_bsize_min, 0)) {
+ thresholds[2],
+ cpi->vbp_bsize_min,
+ force_split[5 + i2 + j])) {
for (k = 0; k < 4; ++k) {
const int x8_idx = (k & 1);
const int y8_idx = (k >> 1);
@@ -801,8 +922,7 @@
BLOCK_8X8,
mi_row + y32_idx + y16_idx + y8_idx,
mi_col + x32_idx + x16_idx + x8_idx,
- cpi->vbp_threshold_bsize_min,
- BLOCK_8X8, 0)) {
+ thresholds[3], BLOCK_8X8, 0)) {
set_block_size(cpi, xd,
(mi_row + y32_idx + y16_idx + y8_idx),
(mi_col + x32_idx + x16_idx + x8_idx),
@@ -820,6 +940,7 @@
}
}
}
+ return 0;
}
static void update_state(VP9_COMP *cpi, ThreadData *td,
@@ -3842,6 +3963,9 @@
}
vp9_zero(x->zcoeff_blk);
+ if (cm->frame_type != KEY_FRAME && cpi->rc.frames_since_golden == 0)
+ cpi->ref_frame_flags &= (~VP9_GOLD_FLAG);
+
if (sf->partition_search_type == SOURCE_VAR_BASED_PARTITION)
source_var_based_partition_search_method(cpi);
}
diff --git a/vp9/encoder/vp9_encodeframe.h b/vp9/encoder/vp9_encodeframe.h
index 8d545b6..1acde02 100644
--- a/vp9/encoder/vp9_encodeframe.h
+++ b/vp9/encoder/vp9_encodeframe.h
@@ -12,6 +12,8 @@
#ifndef VP9_ENCODER_VP9_ENCODEFRAME_H_
#define VP9_ENCODER_VP9_ENCODEFRAME_H_
+#include "vpx/vpx_integer.h"
+
#ifdef __cplusplus
extern "C" {
#endif
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 32b28d7..6da237d 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -13,6 +13,7 @@
#include <limits.h>
#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
#include "./vpx_scale_rtcd.h"
#include "vpx/internal/vpx_psnr.h"
#include "vpx_ports/vpx_timer.h"
@@ -183,6 +184,33 @@
}
}
+int vp9_get_active_map(VP9_COMP* cpi,
+ unsigned char* new_map_16x16,
+ int rows,
+ int cols) {
+ if (rows == cpi->common.mb_rows && cols == cpi->common.mb_cols &&
+ new_map_16x16) {
+ unsigned char* const seg_map_8x8 = cpi->segmentation_map;
+ const int mi_rows = cpi->common.mi_rows;
+ const int mi_cols = cpi->common.mi_cols;
+ vpx_memset(new_map_16x16, !cpi->active_map.enabled, rows * cols);
+ if (cpi->active_map.enabled) {
+ int r, c;
+ for (r = 0; r < mi_rows; ++r) {
+ for (c = 0; c < mi_cols; ++c) {
+ // Cyclic refresh segments are considered active despite not having
+ // AM_SEGMENT_ID_ACTIVE
+ new_map_16x16[(r >> 1) * cols + (c >> 1)] |=
+ seg_map_8x8[r * mi_cols + c] != AM_SEGMENT_ID_INACTIVE;
+ }
+ }
+ }
+ return 0;
+ } else {
+ return -1;
+ }
+}
+
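vp9_get_active_map() collapses the 8x8-unit segmentation map to 16x16 macroblock cells via (r >> 1, c >> 1), OR-ing so a macroblock counts as active if any of its four units is not AM_SEGMENT_ID_INACTIVE. A toy standalone version of that reduction; the dimensions and the 0/1 encoding here are made up for illustration:

#include <stdio.h>
#include <string.h>

int main(void) {
  /* 4x4 grid of 8x8 units -> 2x2 grid of 16x16 macroblocks.
   * 1 stands in for AM_SEGMENT_ID_INACTIVE; anything else is active. */
  enum { MI_ROWS = 4, MI_COLS = 4, ROWS = 2, COLS = 2 };
  const unsigned char seg8x8[MI_ROWS][MI_COLS] = {
    {1, 1, 0, 1},
    {1, 1, 1, 1},
    {1, 1, 1, 1},
    {1, 1, 1, 1},
  };
  unsigned char map16x16[ROWS * COLS];
  int r, c;
  memset(map16x16, 0, sizeof(map16x16));  /* active_map.enabled case */
  for (r = 0; r < MI_ROWS; ++r)
    for (c = 0; c < MI_COLS; ++c)
      map16x16[(r >> 1) * COLS + (c >> 1)] |= (seg8x8[r][c] != 1);
  for (r = 0; r < ROWS; ++r)
    printf("%d %d\n", map16x16[r * COLS], map16x16[r * COLS + 1]);
  /* prints "0 1" then "0 0": only the top-right MB holds an active unit */
  return 0;
}
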
void vp9_set_high_precision_mv(VP9_COMP *cpi, int allow_high_precision_mv) {
MACROBLOCK *const mb = &cpi->td.mb;
cpi->common.allow_high_precision_mv = allow_high_precision_mv;
@@ -266,6 +294,7 @@
if (!init_done) {
vp9_rtcd();
+ vpx_scale_rtcd();
vp9_init_intra_predictors();
vp9_init_me_luts();
vp9_rc_init_minq_luts();
@@ -665,9 +694,14 @@
int min_log2_tile_cols, max_log2_tile_cols;
vp9_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
- cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns,
- min_log2_tile_cols, max_log2_tile_cols);
- cm->log2_tile_rows = cpi->oxcf.tile_rows;
+ if (is_two_pass_svc(cpi) && cpi->svc.encode_empty_frame_state == ENCODING) {
+ cm->log2_tile_cols = 0;
+ cm->log2_tile_rows = 0;
+ } else {
+ cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns,
+ min_log2_tile_cols, max_log2_tile_cols);
+ cm->log2_tile_rows = cpi->oxcf.tile_rows;
+ }
}
static void init_buffer_indices(VP9_COMP *cpi) {
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index 635ef6d..e0ed608 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -465,10 +465,11 @@
int resize_pending;
// VAR_BASED_PARTITION thresholds
- int64_t vbp_threshold;
- int64_t vbp_threshold_bsize_min;
- int64_t vbp_threshold_bsize_max;
- int64_t vbp_threshold_16x16;
+ // 0 - threshold_64x64; 1 - threshold_32x32;
+ // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
+ int64_t vbp_thresholds[4];
+ int64_t vbp_threshold_minmax;
+ int64_t vbp_threshold_sad;
BLOCK_SIZE vbp_bsize_min;
// Multi-threading
@@ -513,6 +514,8 @@
int vp9_set_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
+int vp9_get_active_map(VP9_COMP *cpi, unsigned char *map, int rows, int cols);
+
int vp9_set_internal_size(VP9_COMP *cpi,
VPX_SCALING horiz_mode, VPX_SCALING vert_mode);
diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 380f0b7..bf9e500 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c
@@ -2612,6 +2612,7 @@
}
#define MINQ_ADJ_LIMIT 48
+#define MINQ_ADJ_LIMIT_CQ 20
void vp9_twopass_postencode_update(VP9_COMP *cpi) {
TWO_PASS *const twopass = &cpi->twopass;
RATE_CONTROL *const rc = &cpi->rc;
@@ -2651,7 +2652,7 @@
const int maxq_adj_limit =
rc->worst_quality - twopass->active_worst_quality;
const int minq_adj_limit =
- (cpi->oxcf.rc_mode == VPX_CQ) ? 0 : MINQ_ADJ_LIMIT;
+ (cpi->oxcf.rc_mode == VPX_CQ ? MINQ_ADJ_LIMIT_CQ : MINQ_ADJ_LIMIT);
// Undershoot.
if (rc->rate_error_estimate > cpi->oxcf.under_shoot_pct) {
diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index 9602eb5..1f5f08a 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c
@@ -2017,7 +2017,7 @@
if (fn_ptr->sdx3f != NULL) {
while ((c + 2) < col_max) {
int i;
- unsigned int sads[3];
+ DECLARE_ALIGNED(16, uint32_t, sads[3]);
fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
sads);
@@ -2082,7 +2082,7 @@
if (fn_ptr->sdx8f != NULL) {
while ((c + 7) < col_max) {
int i;
- unsigned int sads[8];
+ DECLARE_ALIGNED(16, uint32_t, sads[8]);
fn_ptr->sdx8f(what->buf, what->stride, check_here, in_what->stride,
sads);
@@ -2106,7 +2106,7 @@
if (fn_ptr->sdx3f != NULL) {
while ((c + 2) < col_max) {
int i;
- unsigned int sads[3];
+ DECLARE_ALIGNED(16, uint32_t, sads[3]);
fn_ptr->sdx3f(what->buf, what->stride, check_here, in_what->stride,
sads);
diff --git a/vp9/encoder/vp9_pickmode.c b/vp9/encoder/vp9_pickmode.c
index ad895e7..416f679 100644
--- a/vp9/encoder/vp9_pickmode.c
+++ b/vp9/encoder/vp9_pickmode.c
@@ -20,9 +20,11 @@
#include "vp9/common/vp9_blockd.h"
#include "vp9/common/vp9_common.h"
#include "vp9/common/vp9_mvref_common.h"
+#include "vp9/common/vp9_pred_common.h"
#include "vp9/common/vp9_reconinter.h"
#include "vp9/common/vp9_reconintra.h"
+#include "vp9/encoder/vp9_cost.h"
#include "vp9/encoder/vp9_encoder.h"
#include "vp9/encoder/vp9_pickmode.h"
#include "vp9/encoder/vp9_ratectrl.h"
@@ -188,6 +190,8 @@
cond_cost_list(cpi, cost_list),
x->nmvjointcost, x->mvcost,
&dis, &x->pred_sse[ref], NULL, 0, 0);
+ *rate_mv = vp9_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
+ x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
if (scaled_ref_frame) {
@@ -198,6 +202,250 @@
return rv;
}
+static void block_variance(const uint8_t *src, int src_stride,
+ const uint8_t *ref, int ref_stride,
+ int w, int h, unsigned int *sse, int *sum,
+ int block_size, unsigned int *sse8x8,
+ int *sum8x8, unsigned int *var8x8) {
+ int i, j, k = 0;
+
+ *sse = 0;
+ *sum = 0;
+
+ for (i = 0; i < h; i += block_size) {
+ for (j = 0; j < w; j += block_size) {
+ vp9_get8x8var(src + src_stride * i + j, src_stride,
+ ref + ref_stride * i + j, ref_stride,
+ &sse8x8[k], &sum8x8[k]);
+ *sse += sse8x8[k];
+ *sum += sum8x8[k];
+ var8x8[k] = sse8x8[k] - (((unsigned int)sum8x8[k] * sum8x8[k]) >> 6);
+ k++;
+ }
+ }
+}
+
+static void calculate_variance(int bw, int bh, TX_SIZE tx_size,
+ unsigned int *sse_i, int *sum_i,
+ unsigned int *var_o, unsigned int *sse_o,
+ int *sum_o) {
+ const BLOCK_SIZE unit_size = txsize_to_bsize[tx_size];
+ const int nw = 1 << (bw - b_width_log2_lookup[unit_size]);
+ const int nh = 1 << (bh - b_height_log2_lookup[unit_size]);
+ int i, j, k = 0;
+
+ for (i = 0; i < nh; i += 2) {
+ for (j = 0; j < nw; j += 2) {
+ sse_o[k] = sse_i[i * nw + j] + sse_i[i * nw + j + 1] +
+ sse_i[(i + 1) * nw + j] + sse_i[(i + 1) * nw + j + 1];
+ sum_o[k] = sum_i[i * nw + j] + sum_i[i * nw + j + 1] +
+ sum_i[(i + 1) * nw + j] + sum_i[(i + 1) * nw + j + 1];
+ var_o[k] = sse_o[k] - (((unsigned int)sum_o[k] * sum_o[k]) >>
+ (b_width_log2_lookup[unit_size] +
+ b_height_log2_lookup[unit_size] + 6));
+ k++;
+ }
+ }
+}
+
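block_variance() and calculate_variance() use the one-pass identity N*Var = sum(x^2) - (sum(x))^2 / N, and because the block areas are powers of two the division becomes a shift (>> 6 for an 8x8, i.e. 64 samples); note the var values in this code stay unnormalized, N times the per-pixel variance. A standalone check of the identity on 64 samples; the 0..63 ramp is just a convenient test signal:

#include <stdio.h>

int main(void) {
  unsigned int sse = 0;  /* sum of squares */
  int sum = 0, i;
  for (i = 0; i < 64; ++i) {  /* one 8x8 block, values 0..63 */
    sse += (unsigned int)(i * i);
    sum += i;
  }
  /* Same expression as var8x8[k] above: N * variance, N = 64. */
  printf("%u\n", sse - (((unsigned int)sum * sum) >> 6));
  /* prints 21840 = 64 * 341.25, the true variance of 0..63 */
  return 0;
}
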
+static void model_rd_for_sb_y_large(VP9_COMP *cpi, BLOCK_SIZE bsize,
+ MACROBLOCK *x, MACROBLOCKD *xd,
+ int *out_rate_sum, int64_t *out_dist_sum,
+ unsigned int *var_y, unsigned int *sse_y,
+ int mi_row, int mi_col, int *early_term) {
+ // Note our transform coeffs are 8 times an orthogonal transform.
+ // Hence quantizer step is also 8 times. To get effective quantizer
+ // we need to divide by 8 before sending to modeling function.
+ unsigned int sse;
+ int rate;
+ int64_t dist;
+ struct macroblock_plane *const p = &x->plane[0];
+ struct macroblockd_plane *const pd = &xd->plane[0];
+ const uint32_t dc_quant = pd->dequant[0];
+ const uint32_t ac_quant = pd->dequant[1];
+ const int64_t dc_thr = dc_quant * dc_quant >> 6;
+ const int64_t ac_thr = ac_quant * ac_quant >> 6;
+ unsigned int var;
+ int sum;
+ int skip_dc = 0;
+
+ const int bw = b_width_log2_lookup[bsize];
+ const int bh = b_height_log2_lookup[bsize];
+ const int num8x8 = 1 << (bw + bh - 2);
+ unsigned int sse8x8[64] = {0};
+ int sum8x8[64] = {0};
+ unsigned int var8x8[64] = {0};
+ TX_SIZE tx_size;
+ int i, k;
+
+ // Calculate variance for whole partition, and also save 8x8 blocks' variance
+ // to be used in following transform skipping test.
+ block_variance(p->src.buf, p->src.stride, pd->dst.buf, pd->dst.stride,
+ 4 << bw, 4 << bh, &sse, &sum, 8, sse8x8, sum8x8, var8x8);
+ var = sse - (((int64_t)sum * sum) >> (bw + bh + 4));
+
+ *var_y = var;
+ *sse_y = sse;
+
+ if (cpi->common.tx_mode == TX_MODE_SELECT) {
+ if (sse > (var << 2))
+ tx_size = MIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ else
+ tx_size = TX_8X8;
+
+ if (cpi->sf.partition_search_type == VAR_BASED_PARTITION) {
+ if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
+ cyclic_refresh_segment_id_boosted(xd->mi[0].src_mi->mbmi.segment_id))
+ tx_size = TX_8X8;
+ else if (tx_size > TX_16X16)
+ tx_size = TX_16X16;
+ }
+ } else {
+ tx_size = MIN(max_txsize_lookup[bsize],
+ tx_mode_to_biggest_tx_size[cpi->common.tx_mode]);
+ }
+
+ assert(tx_size >= TX_8X8);
+ xd->mi[0].src_mi->mbmi.tx_size = tx_size;
+
+ // Evaluate if the partition block is a skippable block in Y plane.
+ {
+ unsigned int sse16x16[16] = {0};
+ int sum16x16[16] = {0};
+ unsigned int var16x16[16] = {0};
+ const int num16x16 = num8x8 >> 2;
+
+ unsigned int sse32x32[4] = {0};
+ int sum32x32[4] = {0};
+ unsigned int var32x32[4] = {0};
+ const int num32x32 = num8x8 >> 4;
+
+ int ac_test = 1;
+ int dc_test = 1;
+ const int num = (tx_size == TX_8X8) ? num8x8 :
+ ((tx_size == TX_16X16) ? num16x16 : num32x32);
+ const unsigned int *sse_tx = (tx_size == TX_8X8) ? sse8x8 :
+ ((tx_size == TX_16X16) ? sse16x16 : sse32x32);
+ const unsigned int *var_tx = (tx_size == TX_8X8) ? var8x8 :
+ ((tx_size == TX_16X16) ? var16x16 : var32x32);
+
+ // Calculate variance if tx_size > TX_8X8
+ if (tx_size >= TX_16X16)
+ calculate_variance(bw, bh, TX_8X8, sse8x8, sum8x8, var16x16, sse16x16,
+ sum16x16);
+ if (tx_size == TX_32X32)
+ calculate_variance(bw, bh, TX_16X16, sse16x16, sum16x16, var32x32,
+ sse32x32, sum32x32);
+
+ // Skipping test
+ x->skip_txfm[0] = 0;
+ for (k = 0; k < num; k++)
+ // Check if all ac coefficients can be quantized to zero.
+ if (!(var_tx[k] < ac_thr || var == 0)) {
+ ac_test = 0;
+ break;
+ }
+
+ for (k = 0; k < num; k++)
+ // Check if dc coefficient can be quantized to zero.
+ if (!(sse_tx[k] - var_tx[k] < dc_thr || sse == var)) {
+ dc_test = 0;
+ break;
+ }
+
+ if (ac_test) {
+ x->skip_txfm[0] = 2;
+
+ if (dc_test)
+ x->skip_txfm[0] = 1;
+ } else if (dc_test) {
+ skip_dc = 1;
+ }
+ }
+
+ if (x->skip_txfm[0] == 1) {
+ int skip_uv[2] = {0};
+ unsigned int var_uv[2];
+ unsigned int sse_uv[2];
+
+ *out_rate_sum = 0;
+ *out_dist_sum = sse << 4;
+
+ // Transform skipping test in UV planes.
+ for (i = 1; i <= 2; i++) {
+ struct macroblock_plane *const p = &x->plane[i];
+ struct macroblockd_plane *const pd = &xd->plane[i];
+ const TX_SIZE uv_tx_size = get_uv_tx_size(&xd->mi[0].src_mi->mbmi, pd);
+ const BLOCK_SIZE unit_size = txsize_to_bsize[uv_tx_size];
+ const BLOCK_SIZE uv_bsize = get_plane_block_size(bsize, pd);
+ const int uv_bw = b_width_log2_lookup[uv_bsize];
+ const int uv_bh = b_height_log2_lookup[uv_bsize];
+ const int sf = (uv_bw - b_width_log2_lookup[unit_size]) +
+ (uv_bh - b_height_log2_lookup[unit_size]);
+ const uint32_t uv_dc_thr = pd->dequant[0] * pd->dequant[0] >> (6 - sf);
+ const uint32_t uv_ac_thr = pd->dequant[1] * pd->dequant[1] >> (6 - sf);
+ int j = i - 1;
+
+ vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, i);
+ var_uv[j] = cpi->fn_ptr[uv_bsize].vf(p->src.buf, p->src.stride,
+ pd->dst.buf, pd->dst.stride, &sse_uv[j]);
+
+ if ((var_uv[j] < uv_ac_thr || var_uv[j] == 0) &&
+ (sse_uv[j] - var_uv[j] < uv_dc_thr || sse_uv[j] == var_uv[j]))
+ skip_uv[j] = 1;
+ else
+ break;
+ }
+
+    // If the transforms in the YUV planes are skippable, the mode search
+    // checks fewer inter modes and doesn't check intra modes.
+ if (skip_uv[0] & skip_uv[1]) {
+ *early_term = 1;
+ }
+
+ return;
+ }
+
+ if (!skip_dc) {
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> (xd->bd - 5), &rate, &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> 3, &rate, &dist);
+ }
+#else
+ vp9_model_rd_from_var_lapndz(sse - var, num_pels_log2_lookup[bsize],
+ dc_quant >> 3, &rate, &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+ }
+
+ if (!skip_dc) {
+ *out_rate_sum = rate >> 1;
+ *out_dist_sum = dist << 3;
+ } else {
+ *out_rate_sum = 0;
+ *out_dist_sum = (sse - var) << 4;
+ }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+ if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> (xd->bd - 5), &rate, &dist);
+ } else {
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> 3, &rate, &dist);
+ }
+#else
+ vp9_model_rd_from_var_lapndz(var, num_pels_log2_lookup[bsize],
+ ac_quant >> 3, &rate, &dist);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
+ *out_rate_sum += rate;
+ *out_dist_sum += dist << 4;
+}
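
The skippable-block test in model_rd_for_sb_y_large() treats a transform block as all-zero after quantization when its AC energy (var) and DC energy (sse - var) each fall below thresholds built from the squared quantizer steps, and it must hold for every sub-block of the chosen tx size. A standalone per-block restatement of just that decision; the 0/1/2 encoding mirrors x->skip_txfm[0], the dc-only case is tracked separately as skip_dc in the real code, and the numbers are hypothetical:

#include <stdio.h>

/* 0 = code normally, 1 = skip DC and AC, 2 = skip AC only. */
static int skip_txfm_decision(unsigned int sse, unsigned int var,
                              unsigned int dc_thr, unsigned int ac_thr) {
  const int ac_zero = (var < ac_thr) || (var == 0);
  const int dc_zero = (sse - var < dc_thr) || (sse == var);
  if (ac_zero)
    return dc_zero ? 1 : 2;
  return 0;
}

int main(void) {
  printf("%d\n", skip_txfm_decision(50, 10, 100, 100));    /* 1 */
  printf("%d\n", skip_txfm_decision(500, 10, 100, 100));   /* 2 */
  printf("%d\n", skip_txfm_decision(500, 400, 100, 100));  /* 0 */
  return 0;
}
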
static void model_rd_for_sb_y(VP9_COMP *cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
@@ -312,6 +560,132 @@
*out_dist_sum += dist << 4;
}
+#if CONFIG_VP9_HIGHBITDEPTH
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+ int *skippable, int64_t *sse, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ unsigned int var_y, sse_y;
+ (void)plane;
+ (void)tx_size;
+ model_rd_for_sb_y(cpi, bsize, x, xd, rate, dist, &var_y, &sse_y);
+ *sse = INT_MAX;
+ *skippable = 0;
+ return;
+}
+#else
+static void block_yrd(VP9_COMP *cpi, MACROBLOCK *x, int *rate, int64_t *dist,
+ int *skippable, int64_t *sse, int plane,
+ BLOCK_SIZE bsize, TX_SIZE tx_size) {
+ MACROBLOCKD *xd = &x->e_mbd;
+ const struct macroblockd_plane *pd = &xd->plane[plane];
+ const struct macroblock_plane *const p = &x->plane[plane];
+ const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
+ const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
+ const int step = 1 << (tx_size << 1);
+ const int block_step = (1 << tx_size);
+ int block = 0, r, c;
+ int shift = tx_size == TX_32X32 ? 0 : 2;
+ const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ? 0 :
+ xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+ const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ? 0 :
+ xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+ int eob_cost = 0;
+
+ (void)cpi;
+ vp9_subtract_plane(x, bsize, plane);
+ *skippable = 1;
+ // Keep track of the row and column of the blocks we use so that we know
+ // if we are in the unrestricted motion border.
+ for (r = 0; r < max_blocks_high; r += block_step) {
+ for (c = 0; c < num_4x4_w; c += block_step) {
+ if (c < max_blocks_wide) {
+ const scan_order *const scan_order = &vp9_default_scan_orders[tx_size];
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+ const int diff_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
+ const int16_t *src_diff;
+ src_diff = &p->src_diff[(r * diff_stride + c) << 2];
+
+ switch (tx_size) {
+ case TX_32X32:
+ vp9_fdct32x32_rd(src_diff, coeff, diff_stride);
+ vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
+ p->round_fp, p->quant_fp, p->quant_shift,
+ qcoeff, dqcoeff, pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_16X16:
+ vp9_hadamard_16x16(src_diff, diff_stride, (int16_t *)coeff);
+ vp9_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_8X8:
+ vp9_hadamard_8x8(src_diff, diff_stride, (int16_t *)coeff);
+ vp9_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ case TX_4X4:
+ x->fwd_txm4x4(src_diff, coeff, diff_stride);
+ vp9_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
+ p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
+ pd->dequant, eob,
+ scan_order->scan, scan_order->iscan);
+ break;
+ default:
+ assert(0);
+ break;
+ }
+ *skippable &= (*eob == 0);
+ eob_cost += 1;
+ }
+ block += step;
+ }
+ }
+
+ if (*skippable && *sse < INT64_MAX) {
+ *rate = 0;
+ *dist = (*sse << 6) >> shift;
+ *sse = *dist;
+ return;
+ }
+
+ block = 0;
+ *rate = 0;
+ *dist = 0;
+ *sse = (*sse << 6) >> shift;
+ for (r = 0; r < max_blocks_high; r += block_step) {
+ for (c = 0; c < num_4x4_w; c += block_step) {
+ if (c < max_blocks_wide) {
+ tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+ tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+ tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+ uint16_t *const eob = &p->eobs[block];
+
+ if (*eob == 1)
+ *rate += (int)abs(qcoeff[0]);
+ else if (*eob > 1)
+ *rate += (int)vp9_satd((const int16_t *)qcoeff, step << 4);
+
+ *dist += vp9_block_error_fp(coeff, dqcoeff, step << 4) >> shift;
+ }
+ block += step;
+ }
+ }
+
+ if (*skippable == 0) {
+ *rate <<= 10;
+ *rate += (eob_cost << 8);
+ }
+}
+#endif
+
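block_yrd() prices a block with the SATD of the quantized coefficients plus a small per-coded-block eob charge, and vp9_block_error_fp() supplies the distortion; the caller then compares coding against skipping with the usual rate-distortion cost. A hedged standalone sketch of that comparison, with rdcost() written in the conventional libvpx-style fixed-point form and all numbers hypothetical:

#include <stdint.h>
#include <stdio.h>

/* Conventional libvpx-style RD cost: rate weighted by rdmult with a
 * >> 8 rescale, distortion weighted by << rddiv. */
static int64_t rdcost(int rdmult, int rddiv, int rate, int64_t dist) {
  return ((128 + (int64_t)rate * rdmult) >> 8) + (dist << rddiv);
}

int main(void) {
  const int rdmult = 300, rddiv = 2;  /* hypothetical weights */
  const int64_t j_coded = rdcost(rdmult, rddiv, 4000, 1000);
  const int64_t j_skip  = rdcost(rdmult, rddiv, 0, 1800 /* sse */);
  /* Mirrors the choice made after block_yrd(): skip when cheaper. */
  printf("%s\n", j_skip < j_coded ? "skip" : "code");  /* skip */
  return 0;
}
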
static void model_rd_for_sb_uv(VP9_COMP *cpi, BLOCK_SIZE bsize,
MACROBLOCK *x, MACROBLOCKD *xd,
int *out_rate_sum, int64_t *out_dist_sum,
@@ -518,7 +892,9 @@
int i, j;
int rate;
int64_t dist;
- unsigned int var_y, sse_y;
+ int64_t this_sse = INT64_MAX;
+ int is_skippable;
+
txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
assert(plane == 0);
(void) plane;
@@ -533,8 +909,13 @@
x->skip_encode ? src_stride : dst_stride,
pd->dst.buf, dst_stride,
i, j, 0);
- // This procedure assumes zero offset from p->src.buf and pd->dst.buf.
- model_rd_for_sb_y(cpi, bsize_tx, x, xd, &rate, &dist, &var_y, &sse_y);
+
+ // TODO(jingning): This needs further refactoring.
+ block_yrd(cpi, x, &rate, &dist, &is_skippable, &this_sse, 0,
+ bsize_tx, MIN(tx_size, TX_16X16));
+ x->skip_txfm[0] = is_skippable;
+ rate += vp9_cost_bit(vp9_get_skip_prob(&cpi->common, xd), is_skippable);
+
p->src.buf = src_buf_base;
pd->dst.buf = dst_buf_base;
args->rate += rate;
@@ -602,9 +983,23 @@
*rd_cost = best_rdc;
}
-static const int ref_frame_cost[MAX_REF_FRAMES] = {
- 1235, 229, 530, 615,
-};
+static void init_ref_frame_cost(VP9_COMMON *const cm,
+ MACROBLOCKD *const xd,
+ int ref_frame_cost[MAX_REF_FRAMES]) {
+ vp9_prob intra_inter_p = vp9_get_intra_inter_prob(cm, xd);
+ vp9_prob ref_single_p1 = vp9_get_pred_prob_single_ref_p1(cm, xd);
+ vp9_prob ref_single_p2 = vp9_get_pred_prob_single_ref_p2(cm, xd);
+
+ ref_frame_cost[INTRA_FRAME] = vp9_cost_bit(intra_inter_p, 0);
+ ref_frame_cost[LAST_FRAME] = ref_frame_cost[GOLDEN_FRAME] =
+ ref_frame_cost[ALTREF_FRAME] = vp9_cost_bit(intra_inter_p, 1);
+
+ ref_frame_cost[LAST_FRAME] += vp9_cost_bit(ref_single_p1, 0);
+ ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+ ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p1, 1);
+ ref_frame_cost[GOLDEN_FRAME] += vp9_cost_bit(ref_single_p2, 0);
+ ref_frame_cost[ALTREF_FRAME] += vp9_cost_bit(ref_single_p2, 1);
+}
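
init_ref_frame_cost() replaces the old hard-coded table with per-frame prices: the sum of the entropy costs of the tree decisions that select that reference (intra/inter, then single-ref p1, then p2 for GOLDEN vs ALTREF). Conceptually each vp9_cost_bit(p, bit) is a scaled -log2 of the branch probability; a floating-point stand-in, where cost_bit() is not the library's fixed-point table and the probabilities are made up:

#include <math.h>
#include <stdio.h>

/* Hypothetical stand-in for vp9_cost_bit(): p is the probability of
 * the 0-branch, out of 256; returns the cost of 'bit' in bits. */
static double cost_bit(int p, int bit) {
  const double p0 = p / 256.0;
  return -log2(bit ? 1.0 - p0 : p0);
}

int main(void) {
  const int intra_inter_p = 64;   /* P(intra) = 64/256 */
  const int ref_single_p1 = 128;  /* P(LAST | inter) = 128/256 */
  const double last = cost_bit(intra_inter_p, 1)   /* "inter" */
                    + cost_bit(ref_single_p1, 0);  /* "LAST" */
  printf("LAST_FRAME ~ %.2f bits\n", last);  /* ~1.42 */
  return 0;
}
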
typedef struct {
MV_REFERENCE_FRAME ref_frame;
@@ -682,6 +1077,10 @@
int ref_frame_skip_mask = 0;
int idx;
int best_pred_sad = INT_MAX;
+ int best_early_term = 0;
+ int ref_frame_cost[MAX_REF_FRAMES];
+
+ init_ref_frame_cost(cm, xd, ref_frame_cost);
if (reuse_inter_pred) {
int i;
@@ -726,7 +1125,6 @@
#endif
if (cpi->rc.frames_since_golden == 0) {
- cpi->ref_frame_flags &= (~VP9_GOLD_FLAG);
usable_ref_frame = LAST_FRAME;
} else {
usable_ref_frame = GOLDEN_FRAME;
@@ -773,6 +1171,10 @@
int mode_index;
int i;
PREDICTION_MODE this_mode = ref_mode_set[idx].pred_mode;
+ int64_t this_sse;
+ int is_skippable;
+ int this_early_term = 0;
+
if (!(cpi->sf.inter_mode_mask[bsize] & (1 << this_mode)))
continue;
@@ -925,17 +1327,55 @@
var_y = pf_var[best_filter];
sse_y = pf_sse[best_filter];
x->skip_txfm[0] = skip_txfm;
+ if (reuse_inter_pred) {
+ pd->dst.buf = this_mode_pred->data;
+ pd->dst.stride = this_mode_pred->stride;
+ }
} else {
mbmi->interp_filter = (filter_ref == SWITCHABLE) ? EIGHTTAP : filter_ref;
vp9_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
- model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
- &var_y, &sse_y);
- this_rdc.rate +=
- cm->interp_filter == SWITCHABLE ?
- vp9_get_switchable_rate(cpi, xd) : 0;
+
+ // For large partition blocks, extra testing is done.
+ if (bsize > BLOCK_32X32 &&
+ !cyclic_refresh_segment_id_boosted(xd->mi[0].src_mi->mbmi.segment_id) &&
+ cm->base_qindex) {
+ model_rd_for_sb_y_large(cpi, bsize, x, xd, &this_rdc.rate,
+ &this_rdc.dist, &var_y, &sse_y, mi_row, mi_col,
+ &this_early_term);
+ } else {
+ model_rd_for_sb_y(cpi, bsize, x, xd, &this_rdc.rate, &this_rdc.dist,
+ &var_y, &sse_y);
+ }
}
- // chroma component rate-distortion cost modeling
+ if (!this_early_term) {
+ this_sse = (int64_t)sse_y;
+ block_yrd(cpi, x, &this_rdc.rate, &this_rdc.dist, &is_skippable,
+ &this_sse, 0, bsize, MIN(mbmi->tx_size, TX_16X16));
+ x->skip_txfm[0] = is_skippable;
+ if (is_skippable) {
+ this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ } else {
+ if (RDCOST(x->rdmult, x->rddiv, this_rdc.rate, this_rdc.dist) <
+ RDCOST(x->rdmult, x->rddiv, 0, this_sse)) {
+ this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 0);
+ } else {
+ this_rdc.rate = vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ this_rdc.dist = this_sse;
+ x->skip_txfm[0] = 1;
+ }
+ }
+
+ if (cm->interp_filter == SWITCHABLE) {
+ if ((mbmi->mv[0].as_mv.row | mbmi->mv[0].as_mv.col) & 0x07)
+ this_rdc.rate += vp9_get_switchable_rate(cpi, xd);
+ }
+ } else {
+ this_rdc.rate += cm->interp_filter == SWITCHABLE ?
+ vp9_get_switchable_rate(cpi, xd) : 0;
+ this_rdc.rate += vp9_cost_bit(vp9_get_skip_prob(cm, xd), 1);
+ }
+
if (x->color_sensitivity[0] || x->color_sensitivity[1]) {
int uv_rate = 0;
int64_t uv_dist = 0;
@@ -943,7 +1383,8 @@
vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 1);
if (x->color_sensitivity[1])
vp9_build_inter_predictors_sbp(xd, mi_row, mi_col, bsize, 2);
- model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist, &var_y, &sse_y);
+ model_rd_for_sb_uv(cpi, bsize, x, xd, &uv_rate, &uv_dist,
+ &var_y, &sse_y);
this_rdc.rate += uv_rate;
this_rdc.dist += uv_dist;
}
@@ -982,6 +1423,7 @@
best_tx_size = mbmi->tx_size;
best_ref_frame = ref_frame;
best_mode_skip_txfm = x->skip_txfm[0];
+ best_early_term = this_early_term;
if (reuse_inter_pred) {
free_pred_buffer(best_pred);
@@ -994,6 +1436,13 @@
if (x->skip)
break;
+
+ // If early termination flag is 1 and at least 2 modes are checked,
+ // the mode search is terminated.
+ if (best_early_term && idx > 0) {
+ x->skip = 1;
+ break;
+ }
}
mbmi->mode = best_mode;
@@ -1042,6 +1491,8 @@
const PREDICTION_MODE this_mode = intra_mode_list[i];
if (!((1 << this_mode) & cpi->sf.intra_y_mode_mask[intra_tx_size]))
continue;
+ mbmi->mode = this_mode;
+ mbmi->ref_frame[0] = INTRA_FRAME;
args.mode = this_mode;
args.rate = 0;
args.dist = 0;
@@ -1058,17 +1509,17 @@
if (this_rdc.rdcost < best_rdc.rdcost) {
best_rdc = this_rdc;
- mbmi->mode = this_mode;
+ best_mode = this_mode;
best_intra_tx_size = mbmi->tx_size;
- mbmi->ref_frame[0] = INTRA_FRAME;
+ best_ref_frame = INTRA_FRAME;
mbmi->uv_mode = this_mode;
mbmi->mv[0].as_int = INVALID_MV;
+ best_mode_skip_txfm = x->skip_txfm[0];
}
}
// Reset mb_mode_info to the best inter mode.
- if (mbmi->ref_frame[0] != INTRA_FRAME) {
- x->skip_txfm[0] = best_mode_skip_txfm;
+ if (best_ref_frame != INTRA_FRAME) {
mbmi->tx_size = best_tx_size;
} else {
mbmi->tx_size = best_intra_tx_size;
@@ -1076,6 +1527,9 @@
}
pd->dst = orig_dst;
+ mbmi->mode = best_mode;
+ mbmi->ref_frame[0] = best_ref_frame;
+ x->skip_txfm[0] = best_mode_skip_txfm;
if (reuse_inter_pred && best_pred != NULL) {
if (best_pred->data != orig_dst.buf && is_inter_mode(mbmi->mode)) {
diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 5efa40b..166535b 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c
@@ -292,6 +292,18 @@
return error;
}
+int64_t vp9_block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
+ int block_size) {
+ int i;
+ int64_t error = 0;
+
+ for (i = 0; i < block_size; i++) {
+ const int diff = coeff[i] - dqcoeff[i];
+ error += diff * diff;
+ }
+
+ return error;
+}
#if CONFIG_VP9_HIGHBITDEPTH
int64_t vp9_highbd_block_error_c(const tran_low_t *coeff,
@@ -1540,6 +1552,7 @@
int mi_row, int mi_col,
int_mv single_newmv[MAX_REF_FRAMES],
int *rate_mv) {
+ const VP9_COMMON *const cm = &cpi->common;
const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
MACROBLOCKD *xd = &x->e_mbd;
@@ -1548,14 +1561,8 @@
mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]};
int_mv ref_mv[2];
int ite, ref;
- // Prediction buffer from second frame.
-#if CONFIG_VP9_HIGHBITDEPTH
- uint8_t *second_pred;
- uint8_t *second_pred_alloc;
-#else
- uint8_t *second_pred = vpx_memalign(16, pw * ph * sizeof(uint8_t));
-#endif // CONFIG_VP9_HIGHBITDEPTH
const InterpKernel *kernel = vp9_get_interp_kernel(mbmi->interp_filter);
+ struct scale_factors sf;
// Do joint motion search in compound mode to get more accurate mv.
struct buf_2d backup_yv12[2][MAX_MB_PLANE];
@@ -1564,14 +1571,13 @@
vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[0]),
vp9_get_scaled_ref_frame(cpi, mbmi->ref_frame[1])
};
+
+ // Prediction buffer from second frame.
#if CONFIG_VP9_HIGHBITDEPTH
- if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
- second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint16_t));
- second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc);
- } else {
- second_pred_alloc = vpx_memalign(16, pw * ph * sizeof(uint8_t));
- second_pred = second_pred_alloc;
- }
+ DECLARE_ALIGNED_ARRAY(16, uint16_t, second_pred_alloc_16, 64 * 64);
+ uint8_t *second_pred;
+#else
+ DECLARE_ALIGNED_ARRAY(16, uint8_t, second_pred, 64 * 64);
#endif // CONFIG_VP9_HIGHBITDEPTH
for (ref = 0; ref < 2; ++ref) {
@@ -1591,6 +1597,17 @@
frame_mv[refs[ref]].as_int = single_newmv[refs[ref]].as_int;
}
+ // Since we have scaled the reference frames to match the size of the current
+ // frame we must use a unit scaling factor during mode selection.
+#if CONFIG_VP9_HIGHBITDEPTH
+ vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
+ cm->width, cm->height,
+ cm->use_highbitdepth);
+#else
+ vp9_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
+ cm->width, cm->height);
+#endif // CONFIG_VP9_HIGHBITDEPTH
+
// Allow joint search multiple times iteratively for each reference frame
// and break out of the search loop if it couldn't find a better mv.
for (ite = 0; ite < 4; ite++) {
@@ -1615,22 +1632,22 @@
// Get the prediction block from the 'other' reference frame.
#if CONFIG_VP9_HIGHBITDEPTH
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+ second_pred = CONVERT_TO_BYTEPTR(second_pred_alloc_16);
vp9_highbd_build_inter_predictor(ref_yv12[!id].buf,
ref_yv12[!id].stride,
second_pred, pw,
&frame_mv[refs[!id]].as_mv,
- &xd->block_refs[!id]->sf,
- pw, ph, 0,
+ &sf, pw, ph, 0,
kernel, MV_PRECISION_Q3,
mi_col * MI_SIZE, mi_row * MI_SIZE,
xd->bd);
} else {
+ second_pred = (uint8_t *)second_pred_alloc_16;
vp9_build_inter_predictor(ref_yv12[!id].buf,
ref_yv12[!id].stride,
second_pred, pw,
&frame_mv[refs[!id]].as_mv,
- &xd->block_refs[!id]->sf,
- pw, ph, 0,
+ &sf, pw, ph, 0,
kernel, MV_PRECISION_Q3,
mi_col * MI_SIZE, mi_row * MI_SIZE);
}
@@ -1639,8 +1656,7 @@
ref_yv12[!id].stride,
second_pred, pw,
&frame_mv[refs[!id]].as_mv,
- &xd->block_refs[!id]->sf,
- pw, ph, 0,
+ &sf, pw, ph, 0,
kernel, MV_PRECISION_Q3,
mi_col * MI_SIZE, mi_row * MI_SIZE);
#endif // CONFIG_VP9_HIGHBITDEPTH
@@ -1712,12 +1728,6 @@
&mbmi->ref_mvs[refs[ref]][0].as_mv,
x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
}
-
-#if CONFIG_VP9_HIGHBITDEPTH
- vpx_free(second_pred_alloc);
-#else
- vpx_free(second_pred);
-#endif // CONFIG_VP9_HIGHBITDEPTH
}
static int64_t rd_pick_best_sub8x8_mode(VP9_COMP *cpi, MACROBLOCK *x,
@@ -2412,7 +2422,6 @@
int_mv cur_mv[2];
#if CONFIG_VP9_HIGHBITDEPTH
DECLARE_ALIGNED_ARRAY(16, uint16_t, tmp_buf16, MAX_MB_PLANE * 64 * 64);
- DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf8, MAX_MB_PLANE * 64 * 64);
uint8_t *tmp_buf;
#else
DECLARE_ALIGNED_ARRAY(16, uint8_t, tmp_buf, MAX_MB_PLANE * 64 * 64);
@@ -2441,7 +2450,7 @@
if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
} else {
- tmp_buf = tmp_buf8;
+ tmp_buf = (uint8_t *)tmp_buf16;
}
#endif // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp9/encoder/vp9_speed_features.c b/vp9/encoder/vp9_speed_features.c
index 3515b6e..4c5ba5d 100644
--- a/vp9/encoder/vp9_speed_features.c
+++ b/vp9/encoder/vp9_speed_features.c
@@ -301,7 +301,7 @@
(frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
sf->max_delta_qindex = is_keyframe ? 20 : 15;
sf->partition_search_type = REFERENCE_PARTITION;
- sf->use_nonrd_pick_mode = !is_keyframe;
+ sf->use_nonrd_pick_mode = 1;
sf->allow_skip_recode = 0;
sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO;
sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO;
diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 4c89953..9d2595b 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c
@@ -65,18 +65,6 @@
-CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 10 = CAT_FIVE
};
-// Unconstrained Node Tree
-const vp9_tree_index vp9_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
- 2, 6, // 0 = LOW_VAL
- -TWO_TOKEN, 4, // 1 = TWO
- -THREE_TOKEN, -FOUR_TOKEN, // 2 = THREE
- 8, 10, // 3 = HIGH_LOW
- -CATEGORY1_TOKEN, -CATEGORY2_TOKEN, // 4 = CAT_ONE
- 12, 14, // 5 = CAT_THREEFOUR
- -CATEGORY3_TOKEN, -CATEGORY4_TOKEN, // 6 = CAT_THREE
- -CATEGORY5_TOKEN, -CATEGORY6_TOKEN // 7 = CAT_FIVE
-};
-
static const vp9_tree_index cat1[2] = {0, 0};
static const vp9_tree_index cat2[4] = {2, 2, 0, 0};
static const vp9_tree_index cat3[6] = {2, 2, 4, 4, 0, 0};
@@ -625,7 +613,6 @@
MACROBLOCK *const x = &td->mb;
MACROBLOCKD *const xd = &x->e_mbd;
MB_MODE_INFO *const mbmi = &xd->mi[0].src_mi->mbmi;
- TOKENEXTRA *t_backup = *t;
const int ctx = vp9_get_skip_context(xd);
const int skip_inc = !vp9_segfeature_active(&cm->seg, mbmi->segment_id,
SEG_LVL_SKIP);
@@ -634,8 +621,6 @@
if (!dry_run)
td->counts->skip[ctx][1] += skip_inc;
reset_skip_context(xd, bsize);
- if (dry_run)
- *t = t_backup;
return;
}
@@ -644,6 +629,5 @@
vp9_foreach_transformed_block(xd, bsize, tokenize_b, &arg);
} else {
vp9_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
- *t = t_backup;
}
}
diff --git a/vp9/encoder/x86/vp9_avg_intrin_sse2.c b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
index 4e80b25..4672aa6 100644
--- a/vp9/encoder/x86/vp9_avg_intrin_sse2.c
+++ b/vp9/encoder/x86/vp9_avg_intrin_sse2.c
@@ -11,6 +11,83 @@
#include <emmintrin.h>
#include "vpx_ports/mem.h"
+void vp9_minmax_8x8_sse2(const uint8_t *s, int p, const uint8_t *d, int dp,
+ int *min, int *max) {
+ __m128i u0, s0, d0, diff, maxabsdiff, minabsdiff, negdiff, absdiff0, absdiff;
+ u0 = _mm_setzero_si128();
+ // Row 0
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff0 = _mm_max_epi16(diff, negdiff);
+ // Row 1
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(absdiff0, absdiff);
+ minabsdiff = _mm_min_epi16(absdiff0, absdiff);
+ // Row 2
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 2 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 3
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 3 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 4
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 4 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 5
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 5 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 6
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 6 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+ // Row 7
+ s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
+ d0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(d + 7 * dp)), u0);
+ diff = _mm_subs_epi16(s0, d0);
+ negdiff = _mm_subs_epi16(u0, diff);
+ absdiff = _mm_max_epi16(diff, negdiff);
+ maxabsdiff = _mm_max_epi16(maxabsdiff, absdiff);
+ minabsdiff = _mm_min_epi16(minabsdiff, absdiff);
+
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_si128(maxabsdiff, 8));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 32));
+ maxabsdiff = _mm_max_epi16(maxabsdiff, _mm_srli_epi64(maxabsdiff, 16));
+ *max = _mm_extract_epi16(maxabsdiff, 0);
+
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_si128(minabsdiff, 8));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 32));
+ minabsdiff = _mm_min_epi16(minabsdiff, _mm_srli_epi64(minabsdiff, 16));
+ *min = _mm_extract_epi16(minabsdiff, 0);
+}
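
For readers following the unrolled intrinsics above: the routine computes, per 16-bit lane, the running min and max of absolute sample differences across the eight rows, then reduces horizontally. A minimal scalar sketch of the same contract (the name minmax_8x8_c and the standalone form are assumptions, not part of the patch):

  #include <stdint.h>

  // Scalar reference for vp9_minmax_8x8_sse2: minimum and maximum absolute
  // difference between two 8x8 blocks with strides p and dp.
  static void minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
                           int *min, int *max) {
    int r, c;
    *min = 255;
    *max = 0;
    for (r = 0; r < 8; ++r) {
      for (c = 0; c < 8; ++c) {
        int diff = s[r * p + c] - d[r * dp + c];
        if (diff < 0) diff = -diff;
        if (diff < *min) *min = diff;
        if (diff > *max) *max = diff;
      }
    }
  }
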
unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
__m128i s0, s1, u0;
@@ -57,6 +134,179 @@
return (avg + 8) >> 4;
}
+static void hadamard_col8_sse2(__m128i *in, int iter) {
+ __m128i a0 = in[0];
+ __m128i a1 = in[1];
+ __m128i a2 = in[2];
+ __m128i a3 = in[3];
+ __m128i a4 = in[4];
+ __m128i a5 = in[5];
+ __m128i a6 = in[6];
+ __m128i a7 = in[7];
+
+ __m128i b0 = _mm_add_epi16(a0, a1);
+ __m128i b1 = _mm_sub_epi16(a0, a1);
+ __m128i b2 = _mm_add_epi16(a2, a3);
+ __m128i b3 = _mm_sub_epi16(a2, a3);
+ __m128i b4 = _mm_add_epi16(a4, a5);
+ __m128i b5 = _mm_sub_epi16(a4, a5);
+ __m128i b6 = _mm_add_epi16(a6, a7);
+ __m128i b7 = _mm_sub_epi16(a6, a7);
+
+ a0 = _mm_add_epi16(b0, b2);
+ a1 = _mm_add_epi16(b1, b3);
+ a2 = _mm_sub_epi16(b0, b2);
+ a3 = _mm_sub_epi16(b1, b3);
+ a4 = _mm_add_epi16(b4, b6);
+ a5 = _mm_add_epi16(b5, b7);
+ a6 = _mm_sub_epi16(b4, b6);
+ a7 = _mm_sub_epi16(b5, b7);
+
+ if (iter == 0) {
+ b0 = _mm_add_epi16(a0, a4);
+ b7 = _mm_add_epi16(a1, a5);
+ b3 = _mm_add_epi16(a2, a6);
+ b4 = _mm_add_epi16(a3, a7);
+ b2 = _mm_sub_epi16(a0, a4);
+ b6 = _mm_sub_epi16(a1, a5);
+ b1 = _mm_sub_epi16(a2, a6);
+ b5 = _mm_sub_epi16(a3, a7);
+
+ a0 = _mm_unpacklo_epi16(b0, b1);
+ a1 = _mm_unpacklo_epi16(b2, b3);
+ a2 = _mm_unpackhi_epi16(b0, b1);
+ a3 = _mm_unpackhi_epi16(b2, b3);
+ a4 = _mm_unpacklo_epi16(b4, b5);
+ a5 = _mm_unpacklo_epi16(b6, b7);
+ a6 = _mm_unpackhi_epi16(b4, b5);
+ a7 = _mm_unpackhi_epi16(b6, b7);
+
+ b0 = _mm_unpacklo_epi32(a0, a1);
+ b1 = _mm_unpacklo_epi32(a4, a5);
+ b2 = _mm_unpackhi_epi32(a0, a1);
+ b3 = _mm_unpackhi_epi32(a4, a5);
+ b4 = _mm_unpacklo_epi32(a2, a3);
+ b5 = _mm_unpacklo_epi32(a6, a7);
+ b6 = _mm_unpackhi_epi32(a2, a3);
+ b7 = _mm_unpackhi_epi32(a6, a7);
+
+ in[0] = _mm_unpacklo_epi64(b0, b1);
+ in[1] = _mm_unpackhi_epi64(b0, b1);
+ in[2] = _mm_unpacklo_epi64(b2, b3);
+ in[3] = _mm_unpackhi_epi64(b2, b3);
+ in[4] = _mm_unpacklo_epi64(b4, b5);
+ in[5] = _mm_unpackhi_epi64(b4, b5);
+ in[6] = _mm_unpacklo_epi64(b6, b7);
+ in[7] = _mm_unpackhi_epi64(b6, b7);
+ } else {
+ in[0] = _mm_add_epi16(a0, a4);
+ in[7] = _mm_add_epi16(a1, a5);
+ in[3] = _mm_add_epi16(a2, a6);
+ in[4] = _mm_add_epi16(a3, a7);
+ in[2] = _mm_sub_epi16(a0, a4);
+ in[6] = _mm_sub_epi16(a1, a5);
+ in[1] = _mm_sub_epi16(a2, a6);
+ in[5] = _mm_sub_epi16(a3, a7);
+ }
+}
+
+void vp9_hadamard_8x8_sse2(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ __m128i src[8];
+ src[0] = _mm_load_si128((const __m128i *)src_diff);
+ src[1] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[2] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[3] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[4] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[5] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[6] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+ src[7] = _mm_load_si128((const __m128i *)(src_diff += src_stride));
+
+ hadamard_col8_sse2(src, 0);
+ hadamard_col8_sse2(src, 1);
+
+ _mm_store_si128((__m128i *)coeff, src[0]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[1]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[2]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[3]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[4]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[5]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[6]);
+ coeff += 8;
+ _mm_store_si128((__m128i *)coeff, src[7]);
+}
+
+void vp9_hadamard_16x16_sse2(int16_t const *src_diff, int src_stride,
+ int16_t *coeff) {
+ int idx;
+ for (idx = 0; idx < 4; ++idx) {
+ int16_t const *src_ptr = src_diff + (idx >> 1) * 8 * src_stride
+ + (idx & 0x01) * 8;
+ vp9_hadamard_8x8_sse2(src_ptr, src_stride, coeff + idx * 64);
+ }
+
+ for (idx = 0; idx < 64; idx += 8) {
+ __m128i coeff0 = _mm_load_si128((const __m128i *)coeff);
+ __m128i coeff1 = _mm_load_si128((const __m128i *)(coeff + 64));
+ __m128i coeff2 = _mm_load_si128((const __m128i *)(coeff + 128));
+ __m128i coeff3 = _mm_load_si128((const __m128i *)(coeff + 192));
+
+ __m128i b0 = _mm_add_epi16(coeff0, coeff1);
+ __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
+ __m128i b2 = _mm_add_epi16(coeff2, coeff3);
+ __m128i b3 = _mm_sub_epi16(coeff2, coeff3);
+
+ coeff0 = _mm_add_epi16(b0, b2);
+ coeff1 = _mm_add_epi16(b1, b3);
+ coeff0 = _mm_srai_epi16(coeff0, 1);
+ coeff1 = _mm_srai_epi16(coeff1, 1);
+ _mm_store_si128((__m128i *)coeff, coeff0);
+ _mm_store_si128((__m128i *)(coeff + 64), coeff1);
+
+ coeff2 = _mm_sub_epi16(b0, b2);
+ coeff3 = _mm_sub_epi16(b1, b3);
+ coeff2 = _mm_srai_epi16(coeff2, 1);
+ coeff3 = _mm_srai_epi16(coeff3, 1);
+ _mm_store_si128((__m128i *)(coeff + 128), coeff2);
+ _mm_store_si128((__m128i *)(coeff + 192), coeff3);
+
+ coeff += 8;
+ }
+}
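
The 16x16 version above builds on the 8x8 kernel: it transforms the four 8x8 quadrants independently, then runs one extra butterfly stage across the four 64-coefficient blocks, shifting each result right by one (the _mm_srai_epi16 pairs) so the combined coefficients still fit in int16_t.
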
+
+int16_t vp9_satd_sse2(const int16_t *coeff, int length) {
+ int i;
+ __m128i sum = _mm_load_si128((const __m128i *)coeff);
+ __m128i sign = _mm_srai_epi16(sum, 15);
+ __m128i val = _mm_xor_si128(sum, sign);
+ sum = _mm_sub_epi16(val, sign);
+ coeff += 8;
+
+ for (i = 8; i < length; i += 8) {
+ __m128i src_line = _mm_load_si128((const __m128i *)coeff);
+ sign = _mm_srai_epi16(src_line, 15);
+ val = _mm_xor_si128(src_line, sign);
+ val = _mm_sub_epi16(val, sign);
+ sum = _mm_add_epi16(sum, val);
+ coeff += 8;
+ }
+
+ val = _mm_srli_si128(sum, 8);
+ sum = _mm_add_epi16(sum, val);
+ val = _mm_srli_epi64(sum, 32);
+ sum = _mm_add_epi16(sum, val);
+ val = _mm_srli_epi32(sum, 16);
+ sum = _mm_add_epi16(sum, val);
+
+ return _mm_extract_epi16(sum, 0);
+}
+
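
The SATD path pairs the Hadamard transform above with vp9_satd_sse2, which is simply a sum of absolute transform coefficients; note that the int16_t return type assumes the accumulated sum stays within 16-bit range. A scalar sketch of the accumulation (reference only, not from the patch):

  #include <stdint.h>

  // Scalar reference for vp9_satd_sse2: sum of absolute values over
  // `length` transform coefficients.
  static int satd_c(const int16_t *coeff, int length) {
    int i, satd = 0;
    for (i = 0; i < length; ++i)
      satd += coeff[i] >= 0 ? coeff[i] : -coeff[i];
    return satd;
  }
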
void vp9_int_pro_row_sse2(int16_t *hbuf, uint8_t const*ref,
const int ref_stride, const int height) {
int idx;
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c
index bdc75e9..1c1005a 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -293,7 +293,8 @@
if (!skip_block) {
__m128i eob;
- __m128i round, quant, dequant;
+ __m128i round, quant, dequant, thr;
+ int16_t nzflag;
{
__m128i coeff0, coeff1;
@@ -368,6 +369,7 @@
// AC only loop
index = 2;
+ thr = _mm_srai_epi16(dequant, 1);
while (n_coeffs < 0) {
__m128i coeff0, coeff1;
{
@@ -387,28 +389,39 @@
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ } else {
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ }
}
- {
+ if (nzflag) {
// Scan for eob
__m128i zero_coeff0, zero_coeff1;
__m128i nzero_coeff0, nzero_coeff1;
diff --git a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
index 28458dc..3a29aba 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_dct_ssse3_x86_64.asm
@@ -179,4 +179,77 @@
mova [outputq + 112], m7
RET
+
+%macro HMD8_1D 0
+ psubw m8, m0, m1
+ psubw m9, m2, m3
+ paddw m0, m1
+ paddw m2, m3
+ SWAP 1, 8
+ SWAP 3, 9
+ psubw m8, m4, m5
+ psubw m9, m6, m7
+ paddw m4, m5
+ paddw m6, m7
+ SWAP 5, 8
+ SWAP 7, 9
+
+ psubw m8, m0, m2
+ psubw m9, m1, m3
+ paddw m0, m2
+ paddw m1, m3
+ SWAP 2, 8
+ SWAP 3, 9
+ psubw m8, m4, m6
+ psubw m9, m5, m7
+ paddw m4, m6
+ paddw m5, m7
+ SWAP 6, 8
+ SWAP 7, 9
+
+ psubw m8, m0, m4
+ psubw m9, m1, m5
+ paddw m0, m4
+ paddw m1, m5
+ SWAP 4, 8
+ SWAP 5, 9
+ psubw m8, m2, m6
+ psubw m9, m3, m7
+ paddw m2, m6
+ paddw m3, m7
+ SWAP 6, 8
+ SWAP 7, 9
+%endmacro
+
+INIT_XMM ssse3
+cglobal hadamard_8x8, 3, 5, 10, input, stride, output
+ lea r3, [2 * strideq]
+ lea r4, [4 * strideq]
+
+ mova m0, [inputq]
+ mova m1, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m2, [inputq]
+ mova m3, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m4, [inputq]
+ mova m5, [inputq + r3]
+ lea inputq, [inputq + r4]
+ mova m6, [inputq]
+ mova m7, [inputq + r3]
+
+ HMD8_1D
+ TRANSPOSE8X8 0, 1, 2, 3, 4, 5, 6, 7, 9
+ HMD8_1D
+
+ mova [outputq + 0], m0
+ mova [outputq + 16], m1
+ mova [outputq + 32], m2
+ mova [outputq + 48], m3
+ mova [outputq + 64], m4
+ mova [outputq + 80], m5
+ mova [outputq + 96], m6
+ mova [outputq + 112], m7
+
+ RET
%endif
diff --git a/vp9/encoder/x86/vp9_error_sse2.asm b/vp9/encoder/x86/vp9_error_sse2.asm
index 1126fdb..56373e8 100644
--- a/vp9/encoder/x86/vp9_error_sse2.asm
+++ b/vp9/encoder/x86/vp9_error_sse2.asm
@@ -72,3 +72,49 @@
movd edx, m5
%endif
RET
+
+; Compute the sum of squared difference between two int16_t vectors.
+; int64_t vp9_block_error_fp(int16_t *coeff, int16_t *dqcoeff,
+; intptr_t block_size)
+
+INIT_XMM sse2
+cglobal block_error_fp, 3, 3, 6, uqc, dqc, size
+ pxor m4, m4 ; sse accumulator
+ pxor m5, m5 ; dedicated zero register
+ lea uqcq, [uqcq+sizeq*2]
+ lea dqcq, [dqcq+sizeq*2]
+ neg sizeq
+.loop:
+ mova m2, [uqcq+sizeq*2]
+ mova m0, [dqcq+sizeq*2]
+ mova m3, [uqcq+sizeq*2+mmsize]
+ mova m1, [dqcq+sizeq*2+mmsize]
+ psubw m0, m2
+ psubw m1, m3
+ ; individual errors are max. 15bit+sign, so squares are 30bit, and
+ ; thus the sum of 2 should fit in a 31bit integer (+ unused sign bit)
+ pmaddwd m0, m0
+ pmaddwd m1, m1
+ ; accumulate in 64bit
+ punpckldq m3, m0, m5
+ punpckhdq m0, m5
+ paddq m4, m3
+ punpckldq m3, m1, m5
+ paddq m4, m0
+ punpckhdq m1, m5
+ paddq m4, m3
+ paddq m4, m1
+ add sizeq, mmsize
+ jl .loop
+
+ ; accumulate horizontally and store in return value
+ movhlps m5, m4
+ paddq m4, m5
+%if ARCH_X86_64
+ movq rax, m4
+%else
+ pshufd m5, m4, 0x1
+ movd eax, m4
+ movd edx, m5
+%endif
+ RET
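
As the comment in the assembly notes, squared int16_t differences are accumulated into a 64-bit total. A scalar sketch of the same contract, with the signature taken from the comment above (the body is illustrative, not the project's C implementation):

  #include <stdint.h>

  // Scalar sketch of vp9_block_error_fp: sum of squared differences
  // between the coefficient and dequantized-coefficient vectors.
  static int64_t block_error_fp_c(const int16_t *coeff, const int16_t *dqcoeff,
                                  intptr_t block_size) {
    int64_t error = 0;
    intptr_t i;
    for (i = 0; i < block_size; ++i) {
      const int diff = coeff[i] - dqcoeff[i];
      error += (int64_t)diff * diff;
    }
    return error;
  }
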
diff --git a/vp9/encoder/x86/vp9_quantize_sse2.c b/vp9/encoder/x86/vp9_quantize_sse2.c
index 679c66e..00abd3c 100644
--- a/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -230,6 +230,8 @@
const int16_t* scan_ptr,
const int16_t* iscan_ptr) {
__m128i zero;
+ __m128i thr;
+ int16_t nzflag;
(void)scan_ptr;
(void)zbin_ptr;
(void)quant_shift_ptr;
@@ -316,6 +318,8 @@
n_coeffs += 8 * 2;
}
+ thr = _mm_srai_epi16(dequant, 1);
+
// AC only loop
while (n_coeffs < 0) {
__m128i coeff0, coeff1;
@@ -335,28 +339,39 @@
qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- qcoeff0 = _mm_adds_epi16(qcoeff0, round);
- qcoeff1 = _mm_adds_epi16(qcoeff1, round);
- qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
- qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
+ nzflag = _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff0, thr)) |
+ _mm_movemask_epi8(_mm_cmpgt_epi16(qcoeff1, thr));
- // Reinsert signs
- qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
- qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
- qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
- qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
+ if (nzflag) {
+ qcoeff0 = _mm_adds_epi16(qcoeff0, round);
+ qcoeff1 = _mm_adds_epi16(qcoeff1, round);
+ qtmp0 = _mm_mulhi_epi16(qcoeff0, quant);
+ qtmp1 = _mm_mulhi_epi16(qcoeff1, quant);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
- _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
+ // Reinsert signs
+ qcoeff0 = _mm_xor_si128(qtmp0, coeff0_sign);
+ qcoeff1 = _mm_xor_si128(qtmp1, coeff1_sign);
+ qcoeff0 = _mm_sub_epi16(qcoeff0, coeff0_sign);
+ qcoeff1 = _mm_sub_epi16(qcoeff1, coeff1_sign);
- coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
- coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), qcoeff0);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, qcoeff1);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
- _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ coeff0 = _mm_mullo_epi16(qcoeff0, dequant);
+ coeff1 = _mm_mullo_epi16(qcoeff1, dequant);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), coeff0);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, coeff1);
+ } else {
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(qcoeff_ptr + n_coeffs) + 1, zero);
+
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs), zero);
+ _mm_store_si128((__m128i*)(dqcoeff_ptr + n_coeffs) + 1, zero);
+ }
}
- {
+ if (nzflag) {
// Scan for eob
__m128i zero_coeff0, zero_coeff1;
__m128i nzero_coeff0, nzero_coeff1;
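
Both this file and vp9_dct_ssse3.c gain the same fast path: with thr set to the AC dequant step shifted right by one, a 16-coefficient group in which no |coeff| exceeds thr is treated as quantizing to all zeros, so its qcoeff/dqcoeff entries are stored as zero and the round/mulhi arithmetic is skipped. A scalar sketch of the per-group test (the helper name and the scalar `thr` argument are assumptions; the real code keeps per-lane SIMD values):

  #include <stdint.h>

  // Returns nonzero if any coefficient in the group can survive
  // quantization; thr mirrors thr = _mm_srai_epi16(dequant, 1) above.
  static int group_has_nonzero(const int16_t *coeff, int16_t thr, int n) {
    int i, nzflag = 0;
    for (i = 0; i < n; ++i) {
      const int abs_coeff = coeff[i] >= 0 ? coeff[i] : -coeff[i];
      nzflag |= abs_coeff > thr;
    }
    return nzflag;
  }
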
diff --git a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index c35eb36..449d52b 100644
--- a/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -282,6 +282,8 @@
psignw m8, m9
psignw m13, m10
psrlw m0, m3, 2
+%else
+ psrlw m0, m3, 1
%endif
mova [r4q+ncoeffq*2+ 0], m8
mova [r4q+ncoeffq*2+16], m13
@@ -302,7 +304,7 @@
mova m10, [ coeffq+ncoeffq*2+16] ; m10 = c[i]
pabsw m6, m9 ; m6 = abs(m9)
pabsw m11, m10 ; m11 = abs(m10)
-%ifidn %1, fp_32x32
+
pcmpgtw m7, m6, m0
pcmpgtw m12, m11, m0
pmovmskb r6d, m7
@@ -310,7 +312,7 @@
or r6, r2
jz .skip_iter
-%endif
+
pcmpeqw m7, m7
paddsw m6, m1 ; m6 += round
@@ -348,7 +350,6 @@
add ncoeffq, mmsize
jl .ac_only_loop
-%ifidn %1, fp_32x32
jmp .accumulate_eob
.skip_iter:
mova [r3q+ncoeffq*2+ 0], m5
@@ -357,7 +358,6 @@
mova [r4q+ncoeffq*2+16], m5
add ncoeffq, mmsize
jl .ac_only_loop
-%endif
.accumulate_eob:
; horizontally accumulate/max eobs and write into [eob] memory pointer
diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index c854983..037367b 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk
@@ -129,6 +129,9 @@
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_horiz_loopfilter_dspr2.c
VP9_COMMON_SRCS-$(HAVE_DSPR2) += common/mips/dspr2/vp9_mblpf_vert_loopfilter_dspr2.c
+# common (msa)
+VP9_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/vp9_macros_msa.h
+
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.c
VP9_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp9_idct_intrin_sse2.h
VP9_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp9_idct_intrin_ssse3.c
@@ -141,8 +144,10 @@
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_mb_lpf_neon$(ASM)
VP9_COMMON_SRCS-$(HAVE_NEON_ASM) += common/arm/neon/vp9_save_reg_neon$(ASM)
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht4x4_add_neon.c
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_iht8x8_add_neon.c
+endif
VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon.c
# neon with assembly and intrinsics implementations. If both are available
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 0ce37aa..cba15e6 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -1260,6 +1260,21 @@
}
}
+static vpx_codec_err_t ctrl_get_active_map(vpx_codec_alg_priv_t *ctx,
+ va_list args) {
+ vpx_active_map_t *const map = va_arg(args, vpx_active_map_t *);
+
+ if (map) {
+ if (!vp9_get_active_map(ctx->cpi, map->active_map,
+ (int)map->rows, (int)map->cols))
+ return VPX_CODEC_OK;
+ else
+ return VPX_CODEC_INVALID_PARAM;
+ } else {
+ return VPX_CODEC_INVALID_PARAM;
+ }
+}
+
static vpx_codec_err_t ctrl_set_scale_mode(vpx_codec_alg_priv_t *ctx,
va_list args) {
vpx_scaling_mode_t *const mode = va_arg(args, vpx_scaling_mode_t *);
@@ -1417,6 +1432,7 @@
#if VPX_ENCODER_ABI_VERSION > (4 + VPX_CODEC_ABI_VERSION)
{VP9E_GET_SVC_LAYER_ID, ctrl_get_svc_layer_id},
#endif
+ {VP9E_GET_ACTIVEMAP, ctrl_get_active_map},
{ -1, NULL},
};
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index 7350cb3..e9c58cc 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -462,7 +462,6 @@
static vpx_codec_err_t decode_one(vpx_codec_alg_priv_t *ctx,
const uint8_t **data, unsigned int data_sz,
void *user_priv, int64_t deadline) {
- vp9_ppflags_t flags = {0, 0, 0};
const VP9WorkerInterface *const winterface = vp9_get_worker_interface();
(void)deadline;
@@ -547,9 +546,6 @@
winterface->launch(worker);
}
- if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
- set_ppflags(ctx, &flags);
-
return VPX_CODEC_OK;
}
@@ -755,6 +751,8 @@
(FrameWorkerData *)worker->data1;
ctx->next_output_worker_id =
(ctx->next_output_worker_id + 1) % ctx->num_frame_workers;
+ if (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC)
+ set_ppflags(ctx, &flags);
// Wait for the frame from worker thread.
if (winterface->sync(worker)) {
// Check if worker has received any frames.
diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 8a5007b..e886dbd 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk
@@ -153,12 +153,14 @@
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_error_intrin_avx2.c
VP9_CX_SRCS-$(HAVE_AVX2) += encoder/x86/vp9_variance_avx2.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad4d_neon.c
+ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
-VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
+endif
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_avg_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_quantize_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad4d_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_sad_neon.c
VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_subtract_neon.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_variance_neon.c
VP9_CX_SRCS-yes := $(filter-out $(VP9_CX_SRCS_REMOVE-yes),$(VP9_CX_SRCS-yes))
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 60b588f..0e8adc1 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -508,6 +508,12 @@
* Supported in codecs: VP9
*/
VP9E_SET_COLOR_SPACE,
+
+ /*!\brief Codec control function to get an Active map back from the encoder.
+ *
+ * Supported in codecs: VP9
+ */
+ VP9E_GET_ACTIVEMAP,
};
/*!\brief vpx 1-D scaling mode
@@ -691,6 +697,8 @@
VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */
VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int)
+
+VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *)
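
A minimal caller-side sketch for the new control, assuming `codec` is an initialized VP9 encoder context and `cfg` its vpx_codec_enc_cfg_t; the map dimensions here assume 16x16 macroblock units, mirroring VP8E_SET_ACTIVEMAP:

  vpx_active_map_t map;
  map.rows = (cfg.g_h + 15) / 16;
  map.cols = (cfg.g_w + 15) / 16;
  map.active_map = (unsigned char *)malloc(map.rows * map.cols);
  if (vpx_codec_control(&codec, VP9E_GET_ACTIVEMAP, &map) != VPX_CODEC_OK) {
    // VPX_CODEC_INVALID_PARAM: NULL map or mismatched dimensions
  }
  free(map.active_map);
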
/*! @} - end defgroup vp8_encoder */
#ifdef __cplusplus
} // extern "C"
diff --git a/vpx_ports/vpx_once.h b/vpx_ports/vpx_once.h
index bd9eebd..f1df394 100644
--- a/vpx_ports/vpx_once.h
+++ b/vpx_ports/vpx_once.h
@@ -110,7 +110,7 @@
#else
-/* No-op version that performs no synchronization. vp8_rtcd() is idempotent,
+/* No-op version that performs no synchronization. *_rtcd() is idempotent,
* so as long as your platform provides atomic loads/stores of pointers
* no synchronization is strictly necessary.
*/
diff --git a/vpx_scale/vpx_scale_rtcd.c b/vpx_scale/vpx_scale_rtcd.c
index 656a22f..bea603f 100644
--- a/vpx_scale/vpx_scale_rtcd.c
+++ b/vpx_scale/vpx_scale_rtcd.c
@@ -7,9 +7,9 @@
* in the file PATENTS. All contributing project authors may
* be found in the AUTHORS file in the root of the source tree.
*/
-#include "vpx_config.h"
+#include "./vpx_config.h"
#define RTCD_C
-#include "vpx_scale_rtcd.h"
+#include "./vpx_scale_rtcd.h"
#include "vpx_ports/vpx_once.h"
void vpx_scale_rtcd()
diff --git a/vpx_scale/win32/scaleopt.c b/vpx_scale/win32/scaleopt.c
deleted file mode 100644
index 4336ece..0000000
--- a/vpx_scale/win32/scaleopt.c
+++ /dev/null
@@ -1,525 +0,0 @@
-/*
- * Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- * Use of this source code is governed by a BSD-style license
- * that can be found in the LICENSE file in the root of the source
- * tree. An additional intellectual property rights grant can be found
- * in the file PATENTS. All contributing project authors may
- * be found in the AUTHORS file in the root of the source tree.
- */
-
-
-/****************************************************************************
-*
-* Module Title : scaleopt.cpp
-*
-* Description : Optimized scaling functions
-*
-****************************************************************************/
-#include "pragmas.h"
-
-/****************************************************************************
-* Module Statics
-****************************************************************************/
-__declspec(align(16)) const static unsigned short round_values[] = { 128, 128, 128, 128 };
-
-#include "vpx_scale/vpx_scale.h"
-#include "vpx_mem/vpx_mem.h"
-
-__declspec(align(16)) const static unsigned short const54_2[] = { 0, 64, 128, 192 };
-__declspec(align(16)) const static unsigned short const54_1[] = {256, 192, 128, 64 };
-
-
-/****************************************************************************
- *
- * ROUTINE : horizontal_line_5_4_scale_mmx
- *
- * INPUTS : const unsigned char *source : Pointer to source data.
- * unsigned int source_width : Stride of source.
- * unsigned char *dest : Pointer to destination data.
- * unsigned int dest_width : Stride of destination (NOT USED).
- *
- * OUTPUTS : None.
- *
- * RETURNS : void
- *
- * FUNCTION : Copies a horizontal line of pixels from source to
- * destination, scaling down from 5 pixels to 4.
- *
- * SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void horizontal_line_5_4_scale_mmx
-(
- const unsigned char *source,
- unsigned int source_width,
- unsigned char *dest,
- unsigned int dest_width
-) {
- /*
- unsigned i;
- unsigned int a, b, c, d, e;
- unsigned char *des = dest;
- const unsigned char *src = source;
-
- (void) dest_width;
-
- for ( i=0; i<source_width; i+=5 )
- {
- a = src[0];
- b = src[1];
- c = src[2];
- d = src[3];
- e = src[4];
-
- des[0] = a;
- des[1] = ((b*192 + c* 64 + 128)>>8);
- des[2] = ((c*128 + d*128 + 128)>>8);
- des[3] = ((d* 64 + e*192 + 128)>>8);
-
- src += 5;
- des += 4;
- }
- */
- (void) dest_width;
-
- __asm {
-
- mov esi, source;
- mov edi, dest;
-
- mov ecx, source_width;
- movq mm5, const54_1;
-
- pxor mm7, mm7;
- movq mm6, const54_2;
-
- movq mm4, round_values;
- lea edx, [esi+ecx];
- horizontal_line_5_4_loop:
-
- movq mm0, QWORD PTR [esi]; // 00 01 02 03 04 05 06 07
- movq mm1, mm0; // 00 01 02 03 04 05 06 07
-
- psrlq mm0, 8; // 01 02 03 04 05 06 07 xx
- punpcklbw mm1, mm7; // xx 00 xx 01 xx 02 xx 03
-
- punpcklbw mm0, mm7; // xx 01 xx 02 xx 03 xx 04
- pmullw mm1, mm5
-
- pmullw mm0, mm6
- add esi, 5
-
- add edi, 4
- paddw mm1, mm0
-
- paddw mm1, mm4
- psrlw mm1, 8
-
- cmp esi, edx
- packuswb mm1, mm7
-
- movd DWORD PTR [edi-4], mm1
-
- jl horizontal_line_5_4_loop
-
- }
-
-}
-__declspec(align(16)) const static unsigned short one_fourths[] = { 64, 64, 64, 64 };
-__declspec(align(16)) const static unsigned short two_fourths[] = { 128, 128, 128, 128 };
-__declspec(align(16)) const static unsigned short three_fourths[] = { 192, 192, 192, 192 };
-
-static
-void vertical_band_5_4_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
-
- __asm {
- push ebx
-
- mov esi, source // Get the source and destination pointer
- mov ecx, src_pitch // Get the pitch size
-
- mov edi, dest // two lines below
- pxor mm7, mm7 // clear out mm7
-
- mov edx, dest_pitch // Loop counter
- mov ebx, dest_width
-
- vs_5_4_loop:
-
- movd mm0, DWORD ptr [esi] // src[0];
- movd mm1, DWORD ptr [esi+ecx] // src[1];
-
- movd mm2, DWORD ptr [esi+ecx*2]
- lea eax, [esi+ecx*2] //
-
- punpcklbw mm1, mm7
- punpcklbw mm2, mm7
-
- movq mm3, mm2
- pmullw mm1, three_fourths
-
- pmullw mm2, one_fourths
- movd mm4, [eax+ecx]
-
- pmullw mm3, two_fourths
- punpcklbw mm4, mm7
-
- movq mm5, mm4
- pmullw mm4, two_fourths
-
- paddw mm1, mm2
- movd mm6, [eax+ecx*2]
-
- pmullw mm5, one_fourths
- paddw mm1, round_values;
-
- paddw mm3, mm4
- psrlw mm1, 8
-
- punpcklbw mm6, mm7
- paddw mm3, round_values
-
- pmullw mm6, three_fourths
- psrlw mm3, 8
-
- packuswb mm1, mm7
- packuswb mm3, mm7
-
- movd DWORD PTR [edi], mm0
- movd DWORD PTR [edi+edx], mm1
-
-
- paddw mm5, mm6
- movd DWORD PTR [edi+edx*2], mm3
-
- lea eax, [edi+edx*2]
- paddw mm5, round_values
-
- psrlw mm5, 8
- add edi, 4
-
- packuswb mm5, mm7
- movd DWORD PTR [eax+edx], mm5
-
- add esi, 4
- sub ebx, 4
-
- jg vs_5_4_loop
-
- pop ebx
- }
-}
-
-
-__declspec(align(16)) const static unsigned short const53_1[] = { 0, 85, 171, 0 };
-__declspec(align(16)) const static unsigned short const53_2[] = {256, 171, 85, 0 };
-
-
-static
-void horizontal_line_5_3_scale_mmx
-(
- const unsigned char *source,
- unsigned int source_width,
- unsigned char *dest,
- unsigned int dest_width
-) {
-
- (void) dest_width;
- __asm {
-
- mov esi, source;
- mov edi, dest;
-
- mov ecx, source_width;
- movq mm5, const53_1;
-
- pxor mm7, mm7;
- movq mm6, const53_2;
-
- movq mm4, round_values;
- lea edx, [esi+ecx-5];
- horizontal_line_5_3_loop:
-
- movq mm0, QWORD PTR [esi]; // 00 01 02 03 04 05 06 07
- movq mm1, mm0; // 00 01 02 03 04 05 06 07
-
- psllw mm0, 8; // xx 00 xx 02 xx 04 xx 06
- psrlw mm1, 8; // 01 xx 03 xx 05 xx 07 xx
-
- psrlw mm0, 8; // 00 xx 02 xx 04 xx 06 xx
- psllq mm1, 16; // xx xx 01 xx 03 xx 05 xx
-
- pmullw mm0, mm6
-
- pmullw mm1, mm5
- add esi, 5
-
- add edi, 3
- paddw mm1, mm0
-
- paddw mm1, mm4
- psrlw mm1, 8
-
- cmp esi, edx
- packuswb mm1, mm7
-
- movd DWORD PTR [edi-3], mm1
- jl horizontal_line_5_3_loop
-
-// exit condition
- movq mm0, QWORD PTR [esi]; // 00 01 02 03 04 05 06 07
- movq mm1, mm0; // 00 01 02 03 04 05 06 07
-
- psllw mm0, 8; // xx 00 xx 02 xx 04 xx 06
- psrlw mm1, 8; // 01 xx 03 xx 05 xx 07 xx
-
- psrlw mm0, 8; // 00 xx 02 xx 04 xx 06 xx
- psllq mm1, 16; // xx xx 01 xx 03 xx 05 xx
-
- pmullw mm0, mm6
-
- pmullw mm1, mm5
- paddw mm1, mm0
-
- paddw mm1, mm4
- psrlw mm1, 8
-
- packuswb mm1, mm7
- movd eax, mm1
-
- mov edx, eax
- shr edx, 16
-
- mov WORD PTR[edi], ax
- mov BYTE PTR[edi+2], dl
-
- }
-
-}
-
-__declspec(align(16)) const static unsigned short one_thirds[] = { 85, 85, 85, 85 };
-__declspec(align(16)) const static unsigned short two_thirds[] = { 171, 171, 171, 171 };
-
-static
-void vertical_band_5_3_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
-
- __asm {
- push ebx
-
- mov esi, source // Get the source and destination pointer
- mov ecx, src_pitch // Get the pitch size
-
- mov edi, dest // two lines below
- pxor mm7, mm7 // clear out mm7
-
- mov edx, dest_pitch // Loop counter
- movq mm5, one_thirds
-
- movq mm6, two_thirds
- mov ebx, dest_width;
-
- vs_5_3_loop:
-
- movd mm0, DWORD ptr [esi] // src[0];
- movd mm1, DWORD ptr [esi+ecx] // src[1];
-
- movd mm2, DWORD ptr [esi+ecx*2]
- lea eax, [esi+ecx*2] //
-
- punpcklbw mm1, mm7
- punpcklbw mm2, mm7
-
- pmullw mm1, mm5
- pmullw mm2, mm6
-
- movd mm3, DWORD ptr [eax+ecx]
- movd mm4, DWORD ptr [eax+ecx*2]
-
- punpcklbw mm3, mm7
- punpcklbw mm4, mm7
-
- pmullw mm3, mm6
- pmullw mm4, mm5
-
-
- movd DWORD PTR [edi], mm0
- paddw mm1, mm2
-
- paddw mm1, round_values
- psrlw mm1, 8
-
- packuswb mm1, mm7
- paddw mm3, mm4
-
- paddw mm3, round_values
- movd DWORD PTR [edi+edx], mm1
-
- psrlw mm3, 8
- packuswb mm3, mm7
-
- movd DWORD PTR [edi+edx*2], mm3
-
-
- add edi, 4
- add esi, 4
-
- sub ebx, 4
- jg vs_5_3_loop
-
- pop ebx
- }
-}
-
-
-
-
-/****************************************************************************
- *
- * ROUTINE : horizontal_line_2_1_scale
- *
- * INPUTS : const unsigned char *source :
- * unsigned int source_width :
- * unsigned char *dest :
- * unsigned int dest_width :
- *
- * OUTPUTS : None.
- *
- * RETURNS : void
- *
- * FUNCTION : 2 to 1 down-scaling of a horizontal line of pixels.
- *
- * SPECIAL NOTES : None.
- *
- ****************************************************************************/
-static
-void horizontal_line_2_1_scale_mmx
-(
- const unsigned char *source,
- unsigned int source_width,
- unsigned char *dest,
- unsigned int dest_width
-) {
- (void) dest_width;
- (void) source_width;
- __asm {
- mov esi, source
- mov edi, dest
-
- pxor mm7, mm7
- mov ecx, dest_width
-
- xor edx, edx
- hs_2_1_loop:
-
- movq mm0, [esi+edx*2]
- psllw mm0, 8
-
- psrlw mm0, 8
- packuswb mm0, mm7
-
- movd DWORD Ptr [edi+edx], mm0;
- add edx, 4
-
- cmp edx, ecx
- jl hs_2_1_loop
-
- }
-}
-
-
-
-static
-void vertical_band_2_1_scale_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
- (void) dest_pitch;
- (void) src_pitch;
- vpx_memcpy(dest, source, dest_width);
-}
-
-
-__declspec(align(16)) const static unsigned short three_sixteenths[] = { 48, 48, 48, 48 };
-__declspec(align(16)) const static unsigned short ten_sixteenths[] = { 160, 160, 160, 160 };
-
-static
-void vertical_band_2_1_scale_i_mmx(unsigned char *source, unsigned int src_pitch, unsigned char *dest, unsigned int dest_pitch, unsigned int dest_width) {
-
- (void) dest_pitch;
- __asm {
- mov esi, source
- mov edi, dest
-
- mov eax, src_pitch
- mov edx, dest_width
-
- pxor mm7, mm7
- sub esi, eax // back one line
-
-
- lea ecx, [esi+edx];
- movq mm6, round_values;
-
- movq mm5, three_sixteenths;
- movq mm4, ten_sixteenths;
-
- vs_2_1_i_loop:
- movd mm0, [esi] //
- movd mm1, [esi+eax] //
-
- movd mm2, [esi+eax*2] //
- punpcklbw mm0, mm7
-
- pmullw mm0, mm5
- punpcklbw mm1, mm7
-
- pmullw mm1, mm4
- punpcklbw mm2, mm7
-
- pmullw mm2, mm5
- paddw mm0, round_values
-
- paddw mm1, mm2
- paddw mm0, mm1
-
- psrlw mm0, 8
- packuswb mm0, mm7
-
- movd DWORD PTR [edi], mm0
- add esi, 4
-
- add edi, 4;
- cmp esi, ecx
- jl vs_2_1_i_loop
-
- }
-}
-
-
-
-void
-register_mmxscalers(void) {
- vp8_vertical_band_5_4_scale = vertical_band_5_4_scale_mmx;
- vp8_vertical_band_5_3_scale = vertical_band_5_3_scale_mmx;
- vp8_vertical_band_2_1_scale = vertical_band_2_1_scale_mmx;
- vp8_vertical_band_2_1_scale_i = vertical_band_2_1_scale_i_mmx;
- vp8_horizontal_line_2_1_scale = horizontal_line_2_1_scale_mmx;
- vp8_horizontal_line_5_3_scale = horizontal_line_5_3_scale_mmx;
- vp8_horizontal_line_5_4_scale = horizontal_line_5_4_scale_mmx;
-}
diff --git a/vpxdec.c b/vpxdec.c
index 0403550..8c938df 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -1080,9 +1080,6 @@
}
}
}
-
- if (stop_after && frame_in >= stop_after)
- break;
}
if (summary || progress) {
diff --git a/webmdec.cc b/webmdec.cc
index d591f3e..e152f5e 100644
--- a/webmdec.cc
+++ b/webmdec.cc
@@ -63,6 +63,7 @@
struct VpxInputContext *vpx_ctx) {
mkvparser::MkvReader *const reader = new mkvparser::MkvReader(vpx_ctx->file);
webm_ctx->reader = reader;
+ webm_ctx->reached_eos = 0;
mkvparser::EBMLHeader header;
long long pos = 0;
@@ -121,6 +122,11 @@
uint8_t **buffer,
size_t *bytes_in_buffer,
size_t *buffer_size) {
+ // This check is needed for frame-parallel decoding, where this function
+ // may be called again even after the end of the input stream is reached.
+ if (webm_ctx->reached_eos) {
+ return 1;
+ }
mkvparser::Segment *const segment =
reinterpret_cast<mkvparser::Segment*>(webm_ctx->segment);
const mkvparser::Cluster* cluster =
@@ -140,6 +146,7 @@
cluster = segment->GetNext(cluster);
if (cluster == NULL || cluster->EOS()) {
*bytes_in_buffer = 0;
+ webm_ctx->reached_eos = 1;
return 1;
}
status = cluster->GetFirst(block_entry);
@@ -212,6 +219,7 @@
webm_ctx->block_entry = NULL;
webm_ctx->block_frame_index = 0;
webm_ctx->timestamp_ns = 0;
+ webm_ctx->reached_eos = 0;
return 0;
}
diff --git a/webmdec.h b/webmdec.h
index 1cd35d4..7d16380 100644
--- a/webmdec.h
+++ b/webmdec.h
@@ -29,6 +29,7 @@
int video_track_index;
uint64_t timestamp_ns;
int is_key_frame;
+ int reached_eos;
};
// Checks if the input is a WebM file. If so, initializes WebMInputContext so