Merge "vpxenc: add --aq-mode flag to control adaptive quantization"

commit: b00057c88a6c98472fd78a9957453ec012a08336 [log] [tgz]
author: Guillaume Martres <smarter3@gmail.com> Wed Nov 20 08:13:28 2013 -0800
committer: Gerrit Code Review <gerrit@gerrit.golo.chromium.org> Wed Nov 20 08:13:28 2013 -0800
tree: e9bf24d5a6c06d1a674b81923c5a8064f6c23c7c
parent: d486427cf1bb1b64ab1e0a746e5e2b7c5bb3a0e2 [diff]
parent: 17084657e6da5b02ab1e492b237e52f2bd38ade3 [diff]
diff --git a/.mailmap b/.mailmap
index ba1279b..fb82a24 100644
--- a/.mailmap
+++ b/.mailmap

@@ -1,8 +1,18 @@
 Adrian Grange <agrange@google.com>
+Alexis Ballier <aballier@gentoo.org> <alexis.ballier@gmail.com>
+Hangyu Kuang <hkuang@google.com>
+Jim Bankoski <jimbankoski@google.com>
+John Koleszar <jkoleszar@google.com>
 Johann Koenig <johannkoenig@google.com>
+Johann Koenig <johannkoenig@google.com> <johann.koenig@duck.com>
+Johann Koenig <johannkoenig@google.com> <johannkoenig@dhcp-172-19-7-52.mtv.corp.google.com>
+Pascal Massimino <pascal.massimino@gmail.com>
+Sami Pietilä <samipietila@google.com>
 Tero Rintaluoma <teror@google.com> <tero.rintaluoma@on2.com>
+Timothy B. Terriberry <tterribe@xiph.org> Tim Terriberry <tterriberry@mozilla.com>
 Tom Finegan <tomfinegan@google.com>
 Ralph Giles <giles@xiph.org> <giles@entropywave.com>
 Ralph Giles <giles@xiph.org> <giles@mozilla.com>
 Alpha Lam <hclam@google.com> <hclam@chromium.org>
 Deb Mukherjee <debargha@google.com>
+Yaowu Xu <yaowu@google.com> <yaowu@xuyaowu.com>

diff --git a/AUTHORS b/AUTHORS
index 0937d5d..a9aa481 100644
--- a/AUTHORS
+++ b/AUTHORS

@@ -2,62 +2,97 @@
 # by tools/gen_authors.sh.
 
 Aaron Watry <awatry@gmail.com>
+Abo Talib Mahfoodh <ab.mahfoodh@gmail.com>
 Adrian Grange <agrange@google.com>
+Ahmad Sharif <asharif@google.com>
+Alexander Voronov <avoronov@graphics.cs.msu.ru>
 Alex Converse <alex.converse@gmail.com>
 Alexis Ballier <aballier@gentoo.org>
 Alok Ahuja <waveletcoeff@gmail.com>
 Alpha Lam <hclam@google.com>
+A.Mahfoodh <ab.mahfoodh@gmail.com>
+Ami Fischman <fischman@chromium.org>
 Andoni Morales Alastruey <ylatuya@gmail.com>
 Andres Mejia <mcitadel@gmail.com>
 Aron Rosenberg <arosenberg@logitech.com>
 Attila Nagy <attilanagy@google.com>
+changjun.yang <changjun.yang@intel.com>
+chm <chm@rock-chips.com>
+Christian Duvivier <cduvivier@google.com>
+Daniel Kang <ddkang@google.com>
 Deb Mukherjee <debargha@google.com>
+Dmitry Kovalev <dkovalev@google.com>
+Dragan Mrdjan <dmrdjan@mips.com>
+Erik Niemeyer <erik.a.niemeyer@gmail.com>
 Fabio Pedretti <fabio.ped@libero.it>
 Frank Galligan <fgalligan@google.com>
 Fredrik Söderquist <fs@opera.com>
 Fritz Koenig <frkoenig@google.com>
 Gaute Strokkenes <gaute.strokkenes@broadcom.com>
 Giuseppe Scrivano <gscrivano@gnu.org>
+Guillaume Martres <gmartres@google.com>
 Guillermo Ballester Valor <gbvalor@gmail.com>
+Hangyu Kuang <hkuang@google.com>
 Henrik Lundin <hlundin@google.com>
+Hui Su <huisu@google.com>
+Ivan Maltz <ivanmaltz@google.com>
 James Berry <jamesberry@google.com>
 James Zern <jzern@google.com>
 Jan Kratochvil <jan.kratochvil@redhat.com>
+Janne Salonen <jsalonen@google.com>
 Jeff Faust <jfaust@google.com>
 Jeff Muizelaar <jmuizelaar@mozilla.com>
+Jeff Petkau <jpet@chromium.org>
 Jim Bankoski <jimbankoski@google.com>
+Jingning Han <jingning@google.com>
 Johann Koenig <johannkoenig@google.com>
 John Koleszar <jkoleszar@google.com>
 Joshua Bleecher Snyder <josh@treelinelabs.com>
+Joshua Litt <joshualitt@google.com>
 Justin Clift <justin@salasaga.org>
 Justin Lebar <justin.lebar@gmail.com>
 KO Myung-Hun <komh@chollian.net>
 Lou Quillio <louquillio@google.com>
 Luca Barbato <lu_zero@gentoo.org>
 Makoto Kato <makoto.kt@gmail.com>
+Mans Rullgard <mans@mansr.com>
 Marco Paniconi <marpan@google.com>
+Mark Mentovai <mark@chromium.org>
 Martin Ettl <ettl.martin78@googlemail.com>
+Martin Storsjo <martin@martin.st>
+Matthew Heaney <matthewjheaney@chromium.org>
 Michael Kohler <michaelkohler@live.com>
+Mike Frysinger <vapier@chromium.org>
 Mike Hommey <mhommey@mozilla.com>
 Mikhal Shemer <mikhal@google.com>
+Morton Jonuschat <yabawock@gmail.com>
+Parag Salasakar <img.mips1@gmail.com>
 Pascal Massimino <pascal.massimino@gmail.com>
 Patrik Westin <patrik.westin@gmail.com>
 Paul Wilkins <paulwilkins@google.com>
 Pavol Rusnak <stick@gk2.sk>
+Paweł Hajdan <phajdan@google.com>
 Philip Jägenstedt <philipj@opera.com>
 Priit Laes <plaes@plaes.org>
 Rafael Ávila de Espíndola <rafael.espindola@gmail.com>
 Rafaël Carré <funman@videolan.org>
 Ralph Giles <giles@xiph.org>
+Rob Bradford <rob@linux.intel.com>
 Ronald S. Bultje <rbultje@google.com>
+Sami Pietilä <samipietila@google.com>
+Scott Graham <scottmg@chromium.org>
 Scott LaVarnway <slavarnway@google.com>
+Shimon Doodkin <helpmepro1@gmail.com>
 Stefan Holmer <holmer@google.com>
+Suman Sunkara <sunkaras@google.com>
 Taekhyun Kim <takim@nvidia.com>
 Takanori MATSUURA <t.matsuu@gmail.com>
+Tamar Levy <tamar.levy@intel.com>
 Tero Rintaluoma <teror@google.com>
 Thijs Vermeir <thijsvermeir@gmail.com>
 Timothy B. Terriberry <tterribe@xiph.org>
 Tom Finegan <tomfinegan@google.com>
+Vignesh Venkatasubramanian <vigneshv@google.com>
 Yaowu Xu <yaowu@google.com>
 Yunqing Wang <yunqingwang@google.com>
 Google Inc.

diff --git a/CHANGELOG b/CHANGELOG
index ef64a96..97c9a7b 100644
--- a/CHANGELOG
+++ b/CHANGELOG

@@ -1,3 +1,53 @@
+2013-11-15 v1.3.0 "Forest"
+  This release introduces the VP9 codec in a backward-compatible way.
+  All existing users of VP8 can continue to use the library without
+  modification. However, some VP8 options do not map to VP9 in the same manner.
+
+  The VP9 encoder in this release is not feature complete. Users interested in
+  the encoder are advised to use the git master branch and discuss issues on
+  libvpx mailing lists.
+
+  - Upgrading:
+    This release is ABI and API compatible with Duclair (v1.0.0). Users
+    of older releases should refer to the Upgrading notes in this document
+    for that release.
+
+  - Enhancements:
+      Get rid of bashisms in the main build scripts
+      Added usage info on command line options
+      Add lossless compression mode
+      DLL build of libvpx
+      Add additional Mac OS X targets: 10.7, 10.8 and 10.9 (darwin11-13)
+      Add option to disable documentation
+      configure: add --enable-external-build support
+      make: support V=1 as short form of verbose=yes
+      configure: support mingw-w64
+      configure: support hardfloat armv7 CHOSTS
+      configure: add support for android x86
+      Add estimated completion time to vpxenc
+      Don't exit on decode errors in vpxenc
+      vpxenc: support scaling prior to encoding
+      vpxdec: support scaling output
+      vpxenc: improve progress indicators with --skip
+      msvs: Don't link to winmm.lib
+      Add a new script for producing vcxproj files
+      Produce Visual Studio 10 and 11 project files
+      Produce Windows Phone project files
+      msvs-build: use msbuild for vs >= 2005
+      configure: default configure log to config.log
+      Add encoding option --static-thresh
+
+  - Speed:
+      Miscellaneous speed optimizations for VP8 and VP9.
+
+  - Quality:
+      In general, quality is consistent with the Eider release.
+
+  - Bug Fixes:
+      This release represents approximately a year of engineering effort,
+      and contains multiple bug fixes. Please refer to git history for details.
+
+
 2012-12-21 v1.2.0
   This release acts as a checkpoint for a large amount of internal refactoring
   and testing. It also contains a number of small bugfixes, so all users are

diff --git a/README b/README
index d7cb11a..ce9c1c6 100644
--- a/README
+++ b/README

@@ -64,6 +64,7 @@
     armv7-linux-gcc
     armv7-none-rvct
     armv7-win32-vs11
+    armv7-win32-vs12
     mips32-linux-gcc
     ppc32-darwin8-gcc
     ppc32-darwin9-gcc
@@ -91,6 +92,7 @@
     x86-win32-vs9
     x86-win32-vs10
     x86-win32-vs11
+    x86-win32-vs12
     x86_64-darwin9-gcc
     x86_64-darwin10-gcc
     x86_64-darwin11-gcc
@@ -104,6 +106,7 @@
     x86_64-win64-vs9
     x86_64-win64-vs10
     x86_64-win64-vs11
+    x86_64-win64-vs12
     universal-darwin8-gcc
     universal-darwin9-gcc
     universal-darwin10-gcc

diff --git a/build/make/ads2gas_apple.pl b/build/make/ads2gas_apple.pl
index 51e6fbc..befb3db 100755
--- a/build/make/ads2gas_apple.pl
+++ b/build/make/ads2gas_apple.pl

@@ -17,6 +17,13 @@
 #
 # Usage: cat inputfile | perl ads2gas_apple.pl > outputfile
 #
+
+my $chromium = 0;
+
+foreach my $arg (@ARGV) {
+    $chromium = 1 if ($arg eq "-chromium");
+}
+
 print "@ This file was created from a .asm file\n";
 print "@  using the ads2gas_apple.pl script.\n\n";
 print "\t.set WIDE_REFERENCE, 0\n";
@@ -47,7 +54,7 @@
     s/@/,:/g;
 
     # Comment character
-    s/;/@/g;
+    s/;/ @/g;
 
     # Hexadecimal constants prefaced by 0x
     s/#&/#0x/g;
@@ -188,7 +195,7 @@
         $trimmed =~ s/,//g;
 
         # string to array
-        @incoming_array = split(/ /, $trimmed);
+        @incoming_array = split(/\s+/, $trimmed);
 
         print ".macro @incoming_array[0]\n";
 
@@ -210,5 +217,19 @@
 #   s/\$/\\/g;                  # End macro definition
     s/MEND/.endm/;              # No need to tell it where to stop assembling
     next if /^\s*END\s*$/;
+
+    # Clang used by Chromium differs slightly from clang in XCode in what it
+    # will accept in the assembly.
+    if ($chromium) {
+        s/qsubaddx/qsax/i;
+        s/qaddsubx/qasx/i;
+        s/ldrneb/ldrbne/i;
+        s/ldrneh/ldrhne/i;
+        s/(vqshrun\.s16 .*, \#)0$/${1}8/i;
+
+        # http://llvm.org/bugs/show_bug.cgi?id=16022
+        s/\.include/#include/;
+    }
+
     print;
 }

diff --git a/build/make/configure.sh b/build/make/configure.sh
index b43a4ec..8dcb9bb 100755
--- a/build/make/configure.sh
+++ b/build/make/configure.sh

@@ -925,41 +925,26 @@
           ;;
 
         darwin*)
-            if [ -z "${sdk_path}" ]; then
-                SDK_PATH=`xcode-select -print-path 2> /dev/null`
-                SDK_PATH=${SDK_PATH}/Platforms/iPhoneOS.platform/Developer
-            else
-                SDK_PATH=${sdk_path}
-            fi
-            TOOLCHAIN_PATH=${SDK_PATH}/usr/bin
-            CXX=${TOOLCHAIN_PATH}/g++
-            CC=${TOOLCHAIN_PATH}/gcc
-            AR=${TOOLCHAIN_PATH}/ar
-            LD=${TOOLCHAIN_PATH}/arm-apple-darwin10-llvm-gcc-4.2
-            AS=${TOOLCHAIN_PATH}/as
-            STRIP=${TOOLCHAIN_PATH}/strip
-            NM=${TOOLCHAIN_PATH}/nm
+
+            XCRUN_FIND="xcrun --sdk iphoneos -find"
+            CXX="$(${XCRUN_FIND} clang++)"
+            CC="$(${XCRUN_FIND} clang)"
+            AR="$(${XCRUN_FIND} ar)"
+            LD="$(${XCRUN_FIND} ld)"
+            AS="$(${XCRUN_FIND} as)"
+            STRIP="$(${XCRUN_FIND} strip)"
+            NM="$(${XCRUN_FIND} nm)"
+            RANLIB="$(${XCRUN_FIND} ranlib)"
             AS_SFX=.s
 
             # ASFLAGS is written here instead of using check_add_asflags
             # because we need to overwrite all of ASFLAGS and purge the
             # options that were put in above
-            ASFLAGS="-version -arch ${tgt_isa} -g"
+            ASFLAGS="-arch ${tgt_isa} -g"
 
-            add_cflags -arch ${tgt_isa}
-            add_ldflags -arch_only ${tgt_isa}
-
-            if [ -z "${alt_libc}" ]; then
-                alt_libc=${SDK_PATH}/SDKs/iPhoneOS6.0.sdk
-            fi
-
-            add_cflags  "-isysroot ${alt_libc}"
-
-            # Add the paths for the alternate libc
-            for d in usr/include; do
-                try_dir="${alt_libc}/${d}"
-                [ -d "${try_dir}" ] && add_cflags -I"${try_dir}"
-            done
+            alt_libc="$(xcrun --sdk iphoneos --show-sdk-path)"
+            add_cflags -arch ${tgt_isa} -isysroot ${alt_libc}
+            add_ldflags -arch ${tgt_isa} -ios_version_min 7.0
 
             for d in lib usr/lib usr/lib/system; do
                 try_dir="${alt_libc}/${d}"
@@ -1093,7 +1078,7 @@
                 msvs_arch_dir=x86-msvs
                 vc_version=${tgt_cc##vs}
                 case $vc_version in
-                    7|8|9)
+                    7|8|9|10)
                          echo "${tgt_cc} does not support avx/avx2, disabling....."
                          RTCD_OPTIONS="${RTCD_OPTIONS}--disable-avx --disable-avx2 "
                          soft_disable avx

diff --git a/build/make/gen_msvs_sln.sh b/build/make/gen_msvs_sln.sh
index 0c269b1..ffa3706 100755
--- a/build/make/gen_msvs_sln.sh
+++ b/build/make/gen_msvs_sln.sh

@@ -255,7 +255,7 @@
     ;;
     --ver=*) vs_ver="$optval"
              case $optval in
-             [789]|10|11)
+             [789]|10|11|12)
              ;;
              *) die Unrecognized Visual Studio Version in $opt
              ;;
@@ -297,12 +297,15 @@
     11) sln_vers="12.00"
        sln_vers_str="Visual Studio 2012"
     ;;
+    12) sln_vers="12.00"
+       sln_vers_str="Visual Studio 2013"
+    ;;
 esac
 case "${vs_ver:-8}" in
     [789])
     sfx=vcproj
     ;;
-    10|11)
+    10|11|12)
     sfx=vcxproj
     ;;
 esac

diff --git a/build/make/gen_msvs_vcxproj.sh b/build/make/gen_msvs_vcxproj.sh
index 4875915..359157c 100755
--- a/build/make/gen_msvs_vcxproj.sh
+++ b/build/make/gen_msvs_vcxproj.sh

@@ -33,7 +33,7 @@
     --name=project_name         Name of the project (required)
     --proj-guid=GUID            GUID to use for the project
     --module-def=filename       File containing export definitions (for DLLs)
-    --ver=version               Version (10,11) of visual studio to generate for
+    --ver=version               Version (10,11,12) of visual studio to generate for
     --src-path-bare=dir         Path to root of source tree
     -Ipath/to/include           Additional include directories
     -DFLAG[=value]              Preprocessor macros to define
@@ -228,7 +228,7 @@
         --ver=*)
             vs_ver="$optval"
             case "$optval" in
-                10|11)
+                10|11|12)
                 ;;
                 *) die Unrecognized Visual Studio Version in $opt
                 ;;
@@ -269,7 +269,7 @@
 asm_use_custom_step=false
 uses_asm=${uses_asm:-false}
 case "${vs_ver:-11}" in
-    10|11)
+    10|11|12)
        asm_use_custom_step=$uses_asm
     ;;
 esac
@@ -383,6 +383,20 @@
                     tag_content PlatformToolset v110
                 fi
             fi
+            if [ "$vs_ver" = "12" ]; then
+                if [ "$plat" = "ARM" ]; then
+                    # Setting the wp80 toolchain automatically sets the
+                    # WINAPI_FAMILY define, which is required for building
+                    # code for arm with the windows headers. Alternatively,
+                    # one could add AppContainerApplication=true in the Globals
+                    # section and add PrecompiledHeader=NotUsing and
+                    # CompileAsWinRT=false in ClCompile and SubSystem=Console
+                    # in Link.
+                    tag_content PlatformToolset v120_wp80
+                else
+                    tag_content PlatformToolset v120
+                fi
+            fi
             tag_content CharacterSet Unicode
             if [ "$config" = "Release" ]; then
                 tag_content WholeProgramOptimization true

diff --git a/configure b/configure
index 729c986..f9454ba 100755
--- a/configure
+++ b/configure

@@ -24,9 +24,10 @@
   ${toggle_examples}              examples
   ${toggle_docs}                  documentation
   ${toggle_unit_tests}            unit tests
+  ${toggle_decode_perf_tests}     build decoder perf tests with unit tests
   --libc=PATH                     path to alternate libc
   --as={yasm|nasm|auto}           use specified assembler [auto, yasm preferred]
-  --sdk-path=PATH                 path to root of sdk (iOS, android builds only)
+  --sdk-path=PATH                 path to root of sdk (android builds only)
   ${toggle_fast_unaligned}        don't use unaligned accesses, even when
                                   supported by hardware [auto]
   ${toggle_codec_srcs}            in/exclude codec library source code
@@ -100,6 +101,7 @@
 all_platforms="${all_platforms} armv7-linux-gcc"     #neon Cortex-A8
 all_platforms="${all_platforms} armv7-none-rvct"     #neon Cortex-A8
 all_platforms="${all_platforms} armv7-win32-vs11"
+all_platforms="${all_platforms} armv7-win32-vs12"
 all_platforms="${all_platforms} mips32-linux-gcc"
 all_platforms="${all_platforms} ppc32-darwin8-gcc"
 all_platforms="${all_platforms} ppc32-darwin9-gcc"
@@ -127,6 +129,7 @@
 all_platforms="${all_platforms} x86-win32-vs9"
 all_platforms="${all_platforms} x86-win32-vs10"
 all_platforms="${all_platforms} x86-win32-vs11"
+all_platforms="${all_platforms} x86-win32-vs12"
 all_platforms="${all_platforms} x86_64-darwin9-gcc"
 all_platforms="${all_platforms} x86_64-darwin10-gcc"
 all_platforms="${all_platforms} x86_64-darwin11-gcc"
@@ -140,6 +143,7 @@
 all_platforms="${all_platforms} x86_64-win64-vs9"
 all_platforms="${all_platforms} x86_64-win64-vs10"
 all_platforms="${all_platforms} x86_64-win64-vs11"
+all_platforms="${all_platforms} x86_64-win64-vs12"
 all_platforms="${all_platforms} universal-darwin8-gcc"
 all_platforms="${all_platforms} universal-darwin9-gcc"
 all_platforms="${all_platforms} universal-darwin10-gcc"
@@ -299,6 +303,7 @@
     postproc_visualizer
     os_support
     unit_tests
+    decode_perf_tests
     multi_res_encoding
     temporal_denoising
     experimental
@@ -352,6 +357,7 @@
     small
     postproc_visualizer
     unit_tests
+    decode_perf_tests
     multi_res_encoding
     temporal_denoising
     experimental
@@ -670,7 +676,7 @@
                  VCPROJ_SFX=vcproj
                  gen_vcproj_cmd=${source_path}/build/make/gen_msvs_proj.sh
                  ;;
-             10|11)
+             10|11|12)
                  VCPROJ_SFX=vcxproj
                  gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh
                  ;;

diff --git a/examples.mk b/examples.mk
index 16f3c8f..36d20df 100644
--- a/examples.mk
+++ b/examples.mk

@@ -23,7 +23,9 @@
 vpxdec.SRCS                 += vpx_ports/vpx_timer.h
 vpxdec.SRCS                 += vpx/vpx_integer.h
 vpxdec.SRCS                 += args.c args.h
+vpxdec.SRCS                 += ivfdec.c ivfdec.h
 vpxdec.SRCS                 += tools_common.c tools_common.h
+vpxdec.SRCS                 += webmdec.c webmdec.h
 vpxdec.SRCS                 += nestegg/halloc/halloc.h
 vpxdec.SRCS                 += nestegg/halloc/src/align.h
 vpxdec.SRCS                 += nestegg/halloc/src/halloc.c
@@ -36,6 +38,8 @@
 vpxdec.DESCRIPTION           = Full featured decoder
 UTILS-$(CONFIG_ENCODERS)    += vpxenc.c
 vpxenc.SRCS                 += args.c args.h y4minput.c y4minput.h
+vpxenc.SRCS                 += ivfdec.c ivfdec.h
+vpxenc.SRCS                 += ivfenc.c ivfenc.h
 vpxenc.SRCS                 += tools_common.c tools_common.h
 vpxenc.SRCS                 += webmenc.c webmenc.h
 vpxenc.SRCS                 += vpx_ports/mem_ops.h
@@ -53,18 +57,11 @@
 vp8_scalable_patterns.DESCRIPTION = Temporal Scalability Encoder
 UTILS-$(CONFIG_VP9_ENCODER)    += vp9_spatial_scalable_encoder.c
 vp9_spatial_scalable_encoder.SRCS += args.c args.h
+vp9_spatial_scalable_encoder.SRCS += ivfenc.c ivfenc.h
+vp9_spatial_scalable_encoder.SRCS += tools_common.c tools_common.h
 vp9_spatial_scalable_encoder.GUID   = 4A38598D-627D-4505-9C7B-D4020C84100D
 vp9_spatial_scalable_encoder.DESCRIPTION = Spatial Scalable Encoder
 
-# Clean up old ivfenc, ivfdec binaries.
-ifeq ($(CONFIG_MSVS),yes)
-CLEAN-OBJS += $(foreach p,$(VS_PLATFORMS),$(p)/Release/ivfenc.exe)
-CLEAN-OBJS += $(foreach p,$(VS_PLATFORMS),$(p)/Release/ivfdec.exe)
-else
-CLEAN-OBJS += ivfenc{.c.o,.c.d,.dox,.exe,}
-CLEAN-OBJS += ivfdec{.c.o,.c.d,.dox,.exe,}
-endif
-
 # XMA example disabled for now, not used in VP8
 #UTILS-$(CONFIG_DECODERS)    += example_xma.c
 #example_xma.GUID             = A955FC4A-73F1-44F7-135E-30D84D32F022

diff --git a/ivfdec.c b/ivfdec.c
new file mode 100644
index 0000000..4a0816f
--- /dev/null
+++ b/ivfdec.c

@@ -0,0 +1,119 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./ivfdec.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+
+/* Checks whether input_ctx->file is an IVF file and, if so, parses its
+ * 32-byte file header.
+ *
+ * The stream is rewound before probing. When the first four bytes are
+ * "DKIF", fourcc, width, height and framerate in input_ctx are filled in
+ * from the header. Otherwise the stream is rewound again and
+ * input_ctx->detect.buf_read is reset to 0.
+ *
+ * Returns 1 if the file is IVF, 0 otherwise (including when fewer than
+ * 32 bytes could be read).
+ */
+int file_is_ivf(struct VpxInputContext *input_ctx) {
+  char raw_hdr[32];
+  int is_ivf = 0;
+
+  // TODO(tomfinegan): This can eventually go away, but for now it's required
+  // because the means by which file types are detected differ in vpxdec and
+  // vpxenc.
+  rewind(input_ctx->file);
+
+  if (fread(raw_hdr, 1, 32, input_ctx->file) == 32) {
+    if (raw_hdr[0] == 'D' && raw_hdr[1] == 'K' &&
+        raw_hdr[2] == 'I' && raw_hdr[3] == 'F') {
+      is_ivf = 1;
+
+      /* A non-zero version is reported but not treated as fatal. The
+       * message ends with a newline so it does not run into whatever is
+       * printed to stderr next.
+       */
+      if (mem_get_le16(raw_hdr + 4) != 0) {
+        fprintf(stderr, "Error: Unrecognized IVF version! This file may not"
+                " decode properly.\n");
+      }
+
+      input_ctx->fourcc = mem_get_le32(raw_hdr + 8);
+      input_ctx->width = mem_get_le16(raw_hdr + 12);
+      input_ctx->height = mem_get_le16(raw_hdr + 14);
+      input_ctx->framerate.numerator = mem_get_le32(raw_hdr + 16);
+      input_ctx->framerate.denominator = mem_get_le32(raw_hdr + 20);
+
+      /* Some versions of vpxenc used 1/(2*fps) for the timebase, so
+       * we can guess the framerate using only the timebase in this
+       * case. Other files would require reading ahead to guess the
+       * timebase, like we do for webm.
+       */
+      if (input_ctx->framerate.numerator < 1000) {
+        /* Correct for the factor of 2 applied to the timebase in the
+         * encoder.
+         */
+        if (input_ctx->framerate.numerator & 1)
+          input_ctx->framerate.denominator <<= 1;
+        else
+          input_ctx->framerate.numerator >>= 1;
+      } else {
+        /* Don't know FPS for sure, and don't have readahead code
+         * (yet?), so just default to 30fps.
+         */
+        input_ctx->framerate.numerator = 30;
+        input_ctx->framerate.denominator = 1;
+      }
+    }
+  }
+
+  if (!is_ivf) {
+    rewind(input_ctx->file);
+    input_ctx->detect.buf_read = 0;
+  } else {
+    /* NOTE(review): presumably marks the 4-byte signature as consumed for
+     * the detection buffer - confirm against the users of detect.position.
+     */
+    input_ctx->detect.position = 4;
+  }
+  return is_ivf;
+}
+
+/* Reads the next frame of an IVF stream into *buffer.
+ *
+ * Each frame starts with an IVF_FRAME_HDR_SZ-byte header; the payload size
+ * is decoded from its first bytes with mem_get_le32(). When *buffer is too
+ * small it is grown with realloc() to twice the frame size and *buffer_size
+ * is updated to match.
+ *
+ * Returns 0 on success, with *bytes_read set to the payload size. Returns 1
+ * at end of file and on every failure: unreadable frame header, frame size
+ * above 256 MB, allocation failure, or a truncated payload. On failure
+ * *buffer and *buffer_size still describe a valid allocation owned by the
+ * caller.
+ *
+ * When input_ctx->file_type is not FILE_TYPE_IVF nothing is read and 0 is
+ * returned without touching *bytes_read.
+ */
+int ivf_read_frame(struct VpxInputContext *input_ctx,
+                   uint8_t **buffer,
+                   size_t *bytes_read,
+                   size_t *buffer_size) {
+  char raw_header[IVF_FRAME_HDR_SZ] = {0};
+  size_t frame_size = 0;
+  FILE *infile = input_ctx->file;
+
+  if (input_ctx->file_type != FILE_TYPE_IVF)
+    return 0;
+
+  if (fread(raw_header, IVF_FRAME_HDR_SZ, 1, infile) != 1) {
+    /* A clean EOF is the normal end of the stream and stays silent. Any
+     * other short read is an error and must not be reported as a
+     * successful zero-length frame.
+     */
+    if (!feof(infile))
+      warn("Failed to read frame size\n");
+    return 1;
+  }
+
+  frame_size = mem_get_le32(raw_header);
+
+  /* Reject implausible sizes outright: continuing with a zero-length read
+   * would leave the stream positioned inside this frame's payload, so the
+   * next call would parse payload bytes as a frame header.
+   */
+  if (frame_size > 256 * 1024 * 1024) {
+    warn("Read invalid frame size (%u)\n", (unsigned int)frame_size);
+    return 1;
+  }
+
+  if (frame_size > *buffer_size) {
+    /* Keep the old pointer until realloc() is known to have succeeded. */
+    uint8_t *new_buffer = realloc(*buffer, 2 * frame_size);
+
+    if (!new_buffer) {
+      warn("Failed to allocate compressed data buffer\n");
+      return 1;
+    }
+
+    *buffer = new_buffer;
+    *buffer_size = 2 * frame_size;
+  }
+
+  /* Skip the read for an empty frame so fread() is never handed a buffer
+   * that may still be NULL.
+   */
+  if (frame_size != 0 &&
+      fread(*buffer, 1, frame_size, infile) != frame_size) {
+    warn("Failed to read full frame\n");
+    return 1;
+  }
+
+  *bytes_read = frame_size;
+  return 0;
+}

diff --git a/ivfdec.h b/ivfdec.h
new file mode 100644
index 0000000..b1468a9
--- /dev/null
+++ b/ivfdec.h

@@ -0,0 +1,30 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef IVFDEC_H_
+#define IVFDEC_H_
+
+#include "./tools_common.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int file_is_ivf(struct VpxInputContext *input);
+
+int ivf_read_frame(struct VpxInputContext *input,
+                   uint8_t **buffer,
+                   size_t *bytes_read,
+                   size_t *buffer_size);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+#endif  /* IVFDEC_H_ */

diff --git a/ivfenc.c b/ivfenc.c
new file mode 100644
index 0000000..fa92566
--- /dev/null
+++ b/ivfenc.c

@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./ivfenc.h"
+
+#include "./tools_common.h"
+#include "vpx/vpx_encoder.h"
+#include "vpx_ports/mem_ops.h"
+
+/* Writes the 32-byte IVF file header to outfile.
+ *
+ * outfile   - destination stream.
+ * cfg       - encoder configuration; supplies the frame dimensions
+ *             (g_w, g_h) and the timebase (g_timebase).
+ * fourcc    - codec FourCC stored at offset 8.
+ * frame_cnt - frame count stored at offset 24.
+ *
+ * Nothing is written unless cfg->g_pass is VPX_RC_ONE_PASS or
+ * VPX_RC_LAST_PASS. The result of fwrite() is deliberately ignored, so
+ * write errors are not reported to the caller.
+ *
+ * The fourcc parameter is declared uint32_t to match the prototype in
+ * ivfenc.h; spelling it "unsigned int" here is a conflicting declaration
+ * on targets where uint32_t is a different type.
+ */
+void ivf_write_file_header(FILE *outfile,
+                           const struct vpx_codec_enc_cfg *cfg,
+                           uint32_t fourcc,
+                           int frame_cnt) {
+  char header[32];
+
+  if (cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS)
+    return;
+
+  header[0] = 'D';
+  header[1] = 'K';
+  header[2] = 'I';
+  header[3] = 'F';
+  mem_put_le16(header + 4,  0);                 /* version */
+  mem_put_le16(header + 6,  32);                /* headersize */
+  mem_put_le32(header + 8,  fourcc);            /* four CC */
+  mem_put_le16(header + 12, cfg->g_w);          /* width */
+  mem_put_le16(header + 14, cfg->g_h);          /* height */
+  mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */
+  mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */
+  mem_put_le32(header + 24, frame_cnt);         /* length */
+  mem_put_le32(header + 28, 0);                 /* unused */
+
+  (void) fwrite(header, 1, sizeof(header), outfile);
+}
+
+/* Writes the 12-byte IVF frame header for pkt to outfile: the frame size
+ * followed by the low and high 32 bits of the presentation timestamp, each
+ * stored with mem_put_le32().
+ *
+ * Packets whose kind is not VPX_CODEC_CX_FRAME_PKT are ignored. The result
+ * of fwrite() is not checked.
+ */
+void ivf_write_frame_header(FILE *outfile, const struct vpx_codec_cx_pkt *pkt) {
+  if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+    char frame_hdr[12];
+    const vpx_codec_pts_t timestamp = pkt->data.frame.pts;
+
+    /* Bytes 0-3: payload size. */
+    mem_put_le32(frame_hdr, (int)pkt->data.frame.sz);
+    /* Bytes 4-7: low half of the timestamp. */
+    mem_put_le32(frame_hdr + 4, timestamp & 0xFFFFFFFF);
+    /* Bytes 8-11: high half of the timestamp. */
+    mem_put_le32(frame_hdr + 8, timestamp >> 32);
+
+    (void) fwrite(frame_hdr, 1, sizeof(frame_hdr), outfile);
+  }
+}
+
+/* Encodes size with mem_put_le32() and writes the resulting 4 bytes to
+ * outfile. The result of fwrite() is not checked.
+ */
+void ivf_write_frame_size(FILE *outfile, size_t size) {
+  char size_field[4];
+
+  mem_put_le32(size_field, (int)size);
+  (void) fwrite(size_field, 1, sizeof(size_field), outfile);
+}

diff --git a/ivfenc.h b/ivfenc.h
new file mode 100644
index 0000000..a332c7d
--- /dev/null
+++ b/ivfenc.h

@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef IVFENC_H_
+#define IVFENC_H_
+
+#include "./tools_common.h"
+
+struct vpx_codec_enc_cfg;
+struct vpx_codec_cx_pkt;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void ivf_write_file_header(FILE *outfile,
+                           const struct vpx_codec_enc_cfg *cfg,
+                           uint32_t fourcc,
+                           int frame_cnt);
+void ivf_write_frame_header(FILE *outfile, const struct vpx_codec_cx_pkt *pkt);
+void ivf_write_frame_size(FILE *outfile, size_t size);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
+#endif  /* IVFENC_H_ */

diff --git a/test/android/Android.mk b/test/android/Android.mk
index 8d8ce16..13af601 100644
--- a/test/android/Android.mk
+++ b/test/android/Android.mk

@@ -10,9 +10,6 @@
 # The test app itself runs on the command line through adb shell
 # The paths are really messed up as the libvpx make file
 # expects to be made from a parent directory.
-# TODO(joshualitt)
-# Fix android make files so they can be built from anywhere, will require
-# changing the libvpx make file and this one.
 CUR_WD := $(call my-dir)
 BINDINGS_DIR := $(CUR_WD)/../../..
 LOCAL_PATH := $(CUR_WD)/../../..
@@ -20,12 +17,11 @@
 #libvpx
 include $(CLEAR_VARS)
 include $(BINDINGS_DIR)/libvpx/build/make/Android.mk
-# Restore path
-# TODO joshualitt Fix makefiles so this is no longer needed
 LOCAL_PATH := $(CUR_WD)/../..
 
 #libgtest
 include $(CLEAR_VARS)
+LOCAL_ARM_MODE := arm
 LOCAL_CPP_EXTENSION := .cc
 LOCAL_MODULE := gtest
 LOCAL_C_INCLUDES := $(LOCAL_PATH)/third_party/googletest/src/
@@ -33,31 +29,14 @@
 LOCAL_SRC_FILES := ./third_party/googletest/src/src/gtest-all.cc
 include $(BUILD_STATIC_LIBRARY)
 
-#libnestegg
-include $(CLEAR_VARS)
-LOCAL_CPP_EXTENSION := .cc
-LOCAL_MODULE := nestegg
-NESTEGG_PATH := $(LOCAL_PATH)/nestegg
-LOCAL_C_INCLUDES := $(NESTEGG_PATH)/include
-LOCAL_C_INCLUDES += $(LOCAL_PATH)/
-LOCAL_C_INCLUDES += $(NESTEGG_PATH)/halloc/
-LOCAL_SRC_FILES := ./nestegg/halloc/src/halloc.c
-LOCAL_SRC_FILES += ./nestegg/src/nestegg.c
-include $(BUILD_STATIC_LIBRARY)
-
 #libvpx_test
 include $(CLEAR_VARS)
+LOCAL_ARM_MODE := arm
 LOCAL_MODULE := libvpx_test
 LOCAL_STATIC_LIBRARIES := gtest
-LOCAL_STATIC_LIBRARIES += nestegg
-LOCAL_STATIC_LIBRARIES += cpufeatures
 LOCAL_SHARED_LIBRARIES := vpx
-LOCAL_C_INCLUDES := $(LOCAL_PATH)/
-LOCAL_C_INCLUDES += $(BINDINGS_DIR)/
-LOCAL_C_INCLUDES += $(LOCAL_PATH)/third_party/googletest/src/include
-LOCAL_SRC_FILES := ./args.c
-LOCAL_SRC_FILES += ./md5_utils.c
-LOCAL_SRC_FILES += ./test/decode_test_driver.cc
-LOCAL_SRC_FILES += ./test/test_libvpx.cc
-LOCAL_SRC_FILES += ./test/test_vector_test.cc
+include $(LOCAL_PATH)/test/test.mk
+LOCAL_C_INCLUDES := $(BINDINGS_DIR)
+FILTERED_SRC := $(sort $(filter %.cc %.c, $(LIBVPX_TEST_SRCS-yes)))
+LOCAL_SRC_FILES := $(addprefix ./test/, $(FILTERED_SRC))
 include $(BUILD_EXECUTABLE)

diff --git a/test/android/README b/test/android/README
index 8bc1569..6840d91 100644
--- a/test/android/README
+++ b/test/android/README

@@ -1,23 +1,31 @@
 Android.mk will build vpx unittests on android.
-1) configure libvpx from the parent directory:
-./libvpx/configure --target=armv7-android-gcc --enable-external-build --enable-postproc --disable-install-srcs --enable-multi-res-encoding --enable-temporal-denoising --disable-unit-tests --disable-install-docs --disable-examples --disable-runtime-cpu-detect --sdk=$NDK
+1) Configure libvpx from the parent directory:
+./libvpx/configure --target=armv7-android-gcc --enable-external-build \
+  --enable-postproc --disable-install-srcs --enable-multi-res-encoding \
+  --enable-temporal-denoising --disable-unit-tests --disable-install-docs \
+  --disable-examples --disable-runtime-cpu-detect --sdk=$NDK
 
-2) from the parent directory, invoke ndk-build:
-NDK_PROJECT_PATH=. ndk-build APP_BUILD_SCRIPT=./libvpx/test/android/Android.mk APP_ABI=armeabi-v7a APP_PLATFORM=android-18 APP_OPTIM=release APP_STL=gnustl_static APP_CPPFLAGS=-frtti
+2) From the parent directory, invoke ndk-build:
+NDK_PROJECT_PATH=. ndk-build APP_BUILD_SCRIPT=./libvpx/test/android/Android.mk \
+  APP_ABI=armeabi-v7a APP_PLATFORM=android-18 APP_OPTIM=release \
+  APP_STL=gnustl_static
+
+Note: Both adb and ndk-build are available prebuilt at:
+  https://chromium.googlesource.com/android_tools
 
 3) Run get_files.py to download the test files:
-python get_files.py -i /path/to/test-data.sha1 -o /path/to/put/files -u http://libvpx-test-file-url
+python get_files.py -i /path/to/test-data.sha1 -o /path/to/put/files \
+  -u http://downloads.webmproject.org/test_data/libvpx
 
-NOTE: currently the url of the test files is http://downloads.webmproject.org/test_data/libvpx
-
-4) transfer files to device using adb.  Currently, I put these files in /data/local/tmp
+4) Transfer files to device using adb. Ensure you have proper permissions for
+the target.
 
 adb push /path/to/test_files /data/local/tmp
 adb push /path/to/built_libs /data/local/tmp
 
 NOTE: Built_libs defaults to parent_dir/libs/armeabi-v7a
 
-5) run tests:
+5) Run tests:
 adb shell
 (on device)
 cd /data/local/tmp

diff --git a/test/datarate_test.cc b/test/datarate_test.cc
index 85f4bb6..5785a0a 100644
--- a/test/datarate_test.cc
+++ b/test/datarate_test.cc

@@ -248,9 +248,9 @@
     cfg_.rc_target_bitrate = i;
     ResetModel();
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.8)
+    ASSERT_GE(cfg_.rc_target_bitrate, effective_datarate_ * 0.9)
         << " The datarate for the file exceeds the target by too much!";
-    ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_ * 1.3)
+    ASSERT_LE(cfg_.rc_target_bitrate, effective_datarate_ * 1.1)
         << " The datarate for the file missed the target!";
   }
 }

diff --git a/test/decode_perf_test.cc b/test/decode_perf_test.cc
new file mode 100644
index 0000000..95600db
--- /dev/null
+++ b/test/decode_perf_test.cc

@@ -0,0 +1,105 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "test/codec_factory.h"
+#include "test/decode_test_driver.h"
+#include "test/ivf_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+#include "test/webm_video_source.h"
+#include "vpx_ports/vpx_timer.h"
+#include "./vpx_version.h"
+
+using std::tr1::make_tuple;
+
+namespace {
+
+#define VIDEO_NAME 0
+#define THREADS 1
+
+const double kUsecsInSec = 1000000.0;
+
+/*
+ DecodePerfTest takes a tuple of filename + number of threads to decode with
+ */
+typedef std::tr1::tuple<const char *const, unsigned> decode_perf_param_t;
+
+const decode_perf_param_t kVP9DecodePerfVectors[] = {
+  make_tuple("vp90-2-bbb_426x240_tile_1x1_180kbps.webm", 1),
+  make_tuple("vp90-2-bbb_640x360_tile_1x2_337kbps.webm", 2),
+  make_tuple("vp90-2-bbb_854x480_tile_1x2_651kbps.webm", 2),
+  make_tuple("vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm", 4),
+  make_tuple("vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm", 1),
+  make_tuple("vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm", 4),
+  make_tuple("vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm", 4),
+  make_tuple("vp90-2-sintel_426x182_tile_1x1_171kbps.webm", 1),
+  make_tuple("vp90-2-sintel_640x272_tile_1x2_318kbps.webm", 2),
+  make_tuple("vp90-2-sintel_854x364_tile_1x2_621kbps.webm", 2),
+  make_tuple("vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm", 4),
+  make_tuple("vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm", 4),
+  make_tuple("vp90-2-tos_426x178_tile_1x1_181kbps.webm", 1),
+  make_tuple("vp90-2-tos_640x266_tile_1x2_336kbps.webm", 2),
+  make_tuple("vp90-2-tos_854x356_tile_1x2_656kbps.webm", 2),
+  make_tuple("vp90-2-tos_1280x534_tile_1x4_1306kbps.webm", 4),
+  make_tuple("vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm", 4),
+};
+
+/*
+ In order to reflect real world performance as much as possible, Perf tests
+ *DO NOT* do any correctness checks. Please run them alongside correctness
+ tests to ensure proper codec integrity. Furthermore, in this test we
+ deliberately limit the amount of system calls we make to avoid OS
+ preemption.
+
+ TODO(joshualitt) create a more detailed perf measurement test to collect
+   power/temp/min max frame decode times/etc
+ */
+
+class DecodePerfTest : public ::testing::TestWithParam<decode_perf_param_t> {
+};
+
+// Decodes the whole WebM clip named by the test parameter using the requested
+// number of decoder threads, times the decode loop only, and prints one
+// JSON-style record with the measured throughput to stdout.
+TEST_P(DecodePerfTest, PerfTest) {
+  const char *const video_name = GET_PARAM(VIDEO_NAME);
+  const unsigned threads = GET_PARAM(THREADS);
+
+  libvpx_test::WebMVideoSource video(video_name);
+  video.Init();
+
+  vpx_codec_dec_cfg_t cfg = {0};
+  cfg.threads = threads;
+  libvpx_test::VP9Decoder decoder(cfg, 0);
+
+  // The timer is started after the source and decoder have been set up, so
+  // only the decode loop below is measured.
+  vpx_usec_timer t;
+  vpx_usec_timer_start(&t);
+
+  // The DecodeFrame() result is not checked: this test measures speed only.
+  for (video.Begin(); video.cxdata() != NULL; video.Next()) {
+    decoder.DecodeFrame(video.cxdata(), video.frame_size());
+  }
+
+  vpx_usec_timer_mark(&t);
+  const double elapsed_secs = double(vpx_usec_timer_elapsed(&t))
+                              / kUsecsInSec;
+  const unsigned frames = video.frame_number();
+  const double fps = double(frames) / elapsed_secs;
+
+  // Emit the result as a JSON-style object, one key per line.
+  printf("{\n");
+  printf("\t\"version\" : \"%s\",\n", VERSION_STRING_NOSP);
+  printf("\t\"videoName\" : \"%s\",\n", video_name);
+  printf("\t\"threadCount\" : %u,\n", threads);
+  printf("\t\"decodeTimeSecs\" : %f,\n", elapsed_secs);
+  printf("\t\"totalFrames\" : %u,\n", frames);
+  printf("\t\"framesPerSecond\" : %f\n", fps);
+  printf("}\n");
+}
+
+INSTANTIATE_TEST_CASE_P(VP9, DecodePerfTest,
+                        ::testing::ValuesIn(kVP9DecodePerfVectors));
+
+}  // namespace

diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 796a2e9..9d8b0bd 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc

@@ -13,178 +13,288 @@
 #include <string.h>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
 
 extern "C" {
+#include "vp9/common/vp9_entropy.h"
 #include "./vp9_rtcd.h"
+void vp9_idct4x4_16_add_c(const int16_t *input, uint8_t *output, int pitch);
 }
-
-#include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
-#include "vpx_ports/mem.h"
 
 using libvpx_test::ACMRandom;
 
 namespace {
-void fdct4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
-             int stride, int /*tx_type*/) {
+const int kNumCoeffs = 16;
+typedef void (*fdct_t)(const int16_t *in, int16_t *out, int stride);
+typedef void (*idct_t)(const int16_t *in, uint8_t *out, int stride);
+typedef void (*fht_t) (const int16_t *in, int16_t *out, int stride,
+                       int tx_type);
+typedef void (*iht_t) (const int16_t *in, uint8_t *out, int stride,
+                       int tx_type);
+
+void fdct4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
   vp9_fdct4x4_c(in, out, stride);
 }
-void idct4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
-                 int stride, int /*tx_type*/) {
-  vp9_idct4x4_16_add_c(out, dst, stride);
-}
-void fht4x4(int16_t *in, int16_t *out, uint8_t* /*dst*/,
-            int stride, int tx_type) {
+
+void fht4x4_ref(const int16_t *in, int16_t *out, int stride, int tx_type) {
   vp9_short_fht4x4_c(in, out, stride, tx_type);
 }
-void iht4x4_add(int16_t* /*in*/, int16_t *out, uint8_t *dst,
-                int stride, int tx_type) {
-  vp9_iht4x4_16_add_c(out, dst, stride, tx_type);
-}
 
-class FwdTrans4x4Test : public ::testing::TestWithParam<int> {
+class Trans4x4TestBase {
  public:
-  virtual ~FwdTrans4x4Test() {}
-  virtual void SetUp() {
-    tx_type_ = GetParam();
-    if (tx_type_ == 0) {
-      fwd_txfm_ = fdct4x4;
-      inv_txfm_ = idct4x4_add;
-    } else {
-      fwd_txfm_ = fht4x4;
-      inv_txfm_ = iht4x4_add;
-    }
-  }
+  virtual ~Trans4x4TestBase() {}
 
  protected:
-  void RunFwdTxfm(int16_t *in, int16_t *out, uint8_t *dst,
-                  int stride, int tx_type) {
-    (*fwd_txfm_)(in, out, dst, stride, tx_type);
+  virtual void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) = 0;
+
+  virtual void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) = 0;
+
+  // Round-trip check: runs the forward transform under test followed by the
+  // inverse transform under test on random residual blocks and verifies
+  // that no pixel has a squared error above 1 and that the accumulated
+  // squared error does not exceed one per tested block.
+  void RunAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    uint32_t max_error = 0;
+    int64_t total_error = 0;
+    const int count_test_block = 10000;
+    for (int i = 0; i < count_test_block; ++i) {
+      DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+      DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        src[j] = rnd.Rand8();
+        dst[j] = rnd.Rand8();
+        test_input_block[j] = src[j] - dst[j];
+      }
+
+      REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
+                                      test_temp_block, pitch_));
+      REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+
+      // dst is compared against src after the inverse transform. A negative
+      // difference wraps when stored as uint32_t, but its square modulo 2^32
+      // still equals the true squared difference for 8-bit pixels.
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        const uint32_t diff = dst[j] - src[j];
+        const uint32_t error = diff * diff;
+        if (max_error < error)
+          max_error = error;
+        total_error += error;
+      }
+    }
+
+    EXPECT_GE(1u, max_error)
+        << "Error: 4x4 FHT/IHT has an individual round trip error > 1";
+
+    EXPECT_GE(count_test_block , total_error)
+        << "Error: 4x4 FHT/IHT has average round trip error > 1 per block";
   }
 
-  void RunInvTxfm(int16_t *in, int16_t *out, uint8_t *dst,
-                  int stride, int tx_type) {
-    (*inv_txfm_)(in, out, dst, stride, tx_type);
+  // Bit-exactness check: for random residual blocks, the forward transform
+  // under test must produce exactly the same coefficients as the reference
+  // implementation stored in fwd_txfm_ref.
+  void RunCoeffCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 5000;
+    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < kNumCoeffs; ++j)
+        input_block[j] = rnd.Rand8() - rnd.Rand8();
+
+      fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
+      REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
+
+      // Every coefficient must match the reference output exactly.
+      for (int j = 0; j < kNumCoeffs; ++j)
+        EXPECT_EQ(output_block[j], output_ref_block[j]);
+    }
   }
 
+  // Extreme-input check: feeds blocks whose samples are all +255 or -255
+  // (iteration 0 is all +255, iteration 1 is all -255, later iterations are
+  // a random mix) to the forward transform under test. The output must
+  // match the reference transform exactly and every coefficient magnitude
+  // must stay within 4 * DCT_MAX_VALUE.
+  void RunMemCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 5000;
+    DECLARE_ALIGNED_ARRAY(16, int16_t, input_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, input_extreme_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_ref_block, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, output_block, kNumCoeffs);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // input_block is filled but never read in this check; the assignment
+      // is kept so the random sequence feeding input_extreme_block does not
+      // change.
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        input_block[j] = rnd.Rand8() - rnd.Rand8();
+        input_extreme_block[j] = rnd.Rand8() % 2 ? 255 : -255;
+      }
+      if (i == 0)
+        for (int j = 0; j < kNumCoeffs; ++j)
+          input_extreme_block[j] = 255;
+      if (i == 1)
+        for (int j = 0; j < kNumCoeffs; ++j)
+          input_extreme_block[j] = -255;
+
+      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
+      REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
+                                      output_block, pitch_));
+
+      // Exact match against the reference, plus a bound on the magnitude.
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        EXPECT_EQ(output_block[j], output_ref_block[j]);
+        EXPECT_GE(4 * DCT_MAX_VALUE, abs(output_block[j]))
+            << "Error: 4x4 FDCT/FHT has coefficient larger than "
+            << "4*DCT_MAX_VALUE";
+      }
+    }
+  }
+
+  // Inverse-transform check: coefficients are produced by the reference
+  // forward transform (fwd_txfm_ref) and passed to the inverse transform
+  // under test. Afterwards every pixel of dst must be within 1 of the
+  // corresponding src pixel (squared error <= 1).
+  void RunInvAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+    DECLARE_ALIGNED_ARRAY(16, int16_t, in, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, int16_t, coeff, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, kNumCoeffs);
+    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, kNumCoeffs);
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        src[j] = rnd.Rand8();
+        dst[j] = rnd.Rand8();
+        in[j] = src[j] - dst[j];
+      }
+
+      fwd_txfm_ref(in, coeff, pitch_, tx_type_);
+
+      REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+
+      // A negative difference wraps when stored as uint32_t, but its square
+      // modulo 2^32 still equals the true squared difference for 8-bit
+      // pixels.
+      for (int j = 0; j < kNumCoeffs; ++j) {
+        const uint32_t diff = dst[j] - src[j];
+        const uint32_t error = diff * diff;
+        EXPECT_GE(1u, error)
+            << "Error: 4x4 IDCT/IHT has error " << error
+            << " at index " << j;
+      }
+    }
+  }
+
+  int pitch_;
   int tx_type_;
-  void (*fwd_txfm_)(int16_t *in, int16_t *out, uint8_t *dst,
-                   int stride, int tx_type);
-  void (*inv_txfm_)(int16_t *in, int16_t *out, uint8_t *dst,
-                   int stride, int tx_type);
+  fht_t fwd_txfm_ref;
 };
 
-TEST_P(FwdTrans4x4Test, SignBiasCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
-  DECLARE_ALIGNED_ARRAY(16, int16_t, test_output_block, 16);
-  const int pitch = 4;
-  int count_sign_block[16][2];
-  const int count_test_block = 1000000;
+class Trans4x4DCT
+    : public Trans4x4TestBase,
+      public PARAMS(fdct_t, idct_t, int) {
+ public:
+  virtual ~Trans4x4DCT() {}
 
-  memset(count_sign_block, 0, sizeof(count_sign_block));
-  for (int i = 0; i < count_test_block; ++i) {
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 16; ++j)
-      test_input_block[j] = rnd.Rand8() - rnd.Rand8();
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_  = GET_PARAM(2);
+    pitch_    = 4;
+    fwd_txfm_ref = fdct4x4_ref;
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
-    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
-
-    for (int j = 0; j < 16; ++j) {
-      if (test_output_block[j] < 0)
-        ++count_sign_block[j][0];
-      else if (test_output_block[j] > 0)
-        ++count_sign_block[j][1];
-    }
+ protected:
+  void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+    fwd_txfm_(in, out, stride);
+  }
+  void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride);
   }
 
-  for (int j = 0; j < 16; ++j) {
-    const bool bias_acceptable = (abs(count_sign_block[j][0] -
-                                      count_sign_block[j][1]) < 10000);
-    EXPECT_TRUE(bias_acceptable)
-        << "Error: 4x4 FDCT/FHT has a sign bias > 1%"
-        << " for input range [-255, 255] at index " << j
-        << " tx_type " << tx_type_;
-  }
+  fdct_t fwd_txfm_;
+  idct_t inv_txfm_;
+};
 
-  memset(count_sign_block, 0, sizeof(count_sign_block));
-  for (int i = 0; i < count_test_block; ++i) {
-    // Initialize a test block with input range [-15, 15].
-    for (int j = 0; j < 16; ++j)
-      test_input_block[j] = (rnd.Rand8() >> 4) - (rnd.Rand8() >> 4);
-
-    RunFwdTxfm(test_input_block, test_output_block, NULL, pitch, tx_type_);
-
-    for (int j = 0; j < 16; ++j) {
-      if (test_output_block[j] < 0)
-        ++count_sign_block[j][0];
-      else if (test_output_block[j] > 0)
-        ++count_sign_block[j][1];
-    }
-  }
-
-  for (int j = 0; j < 16; ++j) {
-    const bool bias_acceptable = (abs(count_sign_block[j][0] -
-                                      count_sign_block[j][1]) < 100000);
-    EXPECT_TRUE(bias_acceptable)
-        << "Error: 4x4 FDCT/FHT has a sign bias > 10%"
-        << " for input range [-15, 15] at index " << j;
-  }
+TEST_P(Trans4x4DCT, AccuracyCheck) {
+  RunAccuracyCheck();
 }
 
-TEST_P(FwdTrans4x4Test, RoundTripErrorCheck) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-  int max_error = 0;
-  int total_error = 0;
-  const int count_test_block = 1000000;
-  for (int i = 0; i < count_test_block; ++i) {
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_input_block, 16);
-    DECLARE_ALIGNED_ARRAY(16, int16_t, test_temp_block, 16);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, dst, 16);
-    DECLARE_ALIGNED_ARRAY(16, uint8_t, src, 16);
-
-    for (int j = 0; j < 16; ++j) {
-      src[j] = rnd.Rand8();
-      dst[j] = rnd.Rand8();
-    }
-    // Initialize a test block with input range [-255, 255].
-    for (int j = 0; j < 16; ++j)
-      test_input_block[j] = src[j] - dst[j];
-
-    const int pitch = 4;
-    RunFwdTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
-
-    for (int j = 0; j < 16; ++j) {
-        if (test_temp_block[j] > 0) {
-          test_temp_block[j] += 2;
-          test_temp_block[j] /= 4;
-          test_temp_block[j] *= 4;
-        } else {
-          test_temp_block[j] -= 2;
-          test_temp_block[j] /= 4;
-          test_temp_block[j] *= 4;
-        }
-    }
-
-    // inverse transform and reconstruct the pixel block
-    RunInvTxfm(test_input_block, test_temp_block, dst, pitch, tx_type_);
-
-    for (int j = 0; j < 16; ++j) {
-      const int diff = dst[j] - src[j];
-      const int error = diff * diff;
-      if (max_error < error)
-        max_error = error;
-      total_error += error;
-    }
-  }
-  EXPECT_GE(1, max_error)
-      << "Error: FDCT/IDCT or FHT/IHT has an individual roundtrip error > 1";
-
-  EXPECT_GE(count_test_block, total_error)
-      << "Error: FDCT/IDCT or FHT/IHT has average "
-      << "roundtrip error > 1 per block";
+TEST_P(Trans4x4DCT, CoeffCheck) {
+  RunCoeffCheck();
 }
 
-INSTANTIATE_TEST_CASE_P(VP9, FwdTrans4x4Test, ::testing::Range(0, 4));
+TEST_P(Trans4x4DCT, MemCheck) {
+  RunMemCheck();
+}
+
+TEST_P(Trans4x4DCT, InvAccuracyCheck) {
+  RunInvAccuracyCheck();
+}
+
+// Fixture for the 4x4 hybrid transform tests. Each test parameter supplies a
+// forward transform, an inverse transform and a tx_type; the shared checks in
+// Trans4x4TestBase compare them against fht4x4_ref.
+// NOTE(review): PARAMS is not defined in this file (presumably it comes from
+// test/util.h and expands to a gtest TestWithParam base) -- confirm.
+class Trans4x4HT
+    : public Trans4x4TestBase,
+      public PARAMS(fht_t, iht_t, int) {
+ public:
+  virtual ~Trans4x4HT() {}
+
+  // Unpacks the (forward, inverse, tx_type) parameter tuple and selects the
+  // reference forward transform used by the base-class checks.
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_  = GET_PARAM(2);
+    pitch_    = 4;
+    fwd_txfm_ref = fht4x4_ref;
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  // Forwards to the transform under test, supplying the fixture's tx_type_.
+  void RunFwdTxfm(const int16_t *in, int16_t *out, int stride) {
+    fwd_txfm_(in, out, stride, tx_type_);
+  }
+
+  // Forwards to the inverse transform under test with the same tx_type_.
+  void RunInvTxfm(const int16_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride, tx_type_);
+  }
+
+  fht_t fwd_txfm_;  // Forward transform under test.
+  iht_t inv_txfm_;  // Inverse transform under test.
+};
+
+TEST_P(Trans4x4HT, AccuracyCheck) {
+  RunAccuracyCheck();
+}
+
+TEST_P(Trans4x4HT, CoeffCheck) {
+  RunCoeffCheck();
+}
+
+TEST_P(Trans4x4HT, MemCheck) {
+  RunMemCheck();
+}
+
+TEST_P(Trans4x4HT, InvAccuracyCheck) {
+  RunInvAccuracyCheck();
+}
+
+using std::tr1::make_tuple;
+
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct4x4_c, &vp9_idct4x4_16_add_c, 0)));
+INSTANTIATE_TEST_CASE_P(
+    C, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 0),
+        make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 1),
+        make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 2),
+        make_tuple(&vp9_short_fht4x4_c, &vp9_iht4x4_16_add_c, 3)));
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans4x4DCT,
+    ::testing::Values(
+        make_tuple(&vp9_fdct4x4_sse2,
+                   &vp9_idct4x4_16_add_sse2, 0)));
+INSTANTIATE_TEST_CASE_P(
+    SSE2, Trans4x4HT,
+    ::testing::Values(
+        make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0),
+        make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1),
+        make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2),
+        make_tuple(&vp9_short_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3)));
+#endif
+
 }  // namespace

diff --git a/test/test-data.sha1 b/test/test-data.sha1
index 5229d09..827ae31 100644
--- a/test/test-data.sha1
+++ b/test/test-data.sha1

@@ -542,3 +542,20 @@
 ae7451810247fd13975cc257aa0301ff17102255  vp90-2-08-tile-4x4.webm.md5
 2ec6e15422ac7a61af072dc5f27fcaf1942ce116  vp90-2-08-tile-4x1.webm
 0094f5ee5e46345017c30e0aa4835b550212d853  vp90-2-08-tile-4x1.webm.md5
+8cdd435d89029987ee196896e21520e5f879f04d  vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm
+091b373aa2ecb59aa5c647affd5bcafcc7547364  vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm
+87ee28032b0963a44b73a850fcc816a6dc83efbb  vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm
+c6ce25c4bfd4bdfc2932b70428e3dfe11210ec4f  vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm
+2064bdb22aa71c2691e0469fb62e8087a43f08f8  vp90-2-bbb_426x240_tile_1x1_180kbps.webm
+8080eda22694910162f0996e8a962612f381a57f  vp90-2-bbb_640x360_tile_1x2_337kbps.webm
+a484b335c27ea189c0f0d77babea4a510ce12d50  vp90-2-bbb_854x480_tile_1x2_651kbps.webm
+3eacf1f006250be4cc5c92a7ef146e385ee62653  vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm
+217f089a16447490823127b36ce0d945522accfd  vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm
+eedb3c641e60dacbe082491a16df529a5c9187df  vp90-2-sintel_426x182_tile_1x1_171kbps.webm
+cb7e4955af183dff33bcba0c837f0922ab066400  vp90-2-sintel_640x272_tile_1x2_318kbps.webm
+48613f9380e2580002f8a09d6e412ea4e89a52b9  vp90-2-sintel_854x364_tile_1x2_621kbps.webm
+990a91f24dd284562d21d714ae773dff5452cad8  vp90-2-tos_1280x534_tile_1x4_1306kbps.webm
+b6dd558c90bca466b4bcbd03b3371648186465a7  vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm
+1a9c2914ba932a38f0a143efc1ad0e318e78888b  vp90-2-tos_426x178_tile_1x1_181kbps.webm
+a3d2b09f24debad4747a1b3066f572be4273bced  vp90-2-tos_640x266_tile_1x2_336kbps.webm
+c64b03b5c090e6888cb39685c31f00a6b79fa45c  vp90-2-tos_854x356_tile_1x2_656kbps.webm

diff --git a/test/test.mk b/test/test.mk
index ac072d0..32601c5 100644
--- a/test/test.mk
+++ b/test/test.mk

@@ -44,6 +44,10 @@
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += webm_video_source.h
 
 LIBVPX_TEST_SRCS-$(CONFIG_DECODERS)    += test_vector_test.cc
+# Currently we only support decoder perf tests for vp9
+ifeq ($(CONFIG_DECODE_PERF_TESTS)$(CONFIG_VP9_DECODER), yesyes)
+LIBVPX_TEST_SRCS-yes                   += decode_perf_test.cc
+endif
 
 ##
 ## WHITE BOX TESTS
@@ -652,3 +656,43 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp90-2-08-tile-4x1.webm.md5
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm
 LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += vp91-2-04-yv444.webm.md5
+
+ifeq ($(CONFIG_DECODE_PERF_TESTS),yes)
+# BBB VP9 streams
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-bbb_426x240_tile_1x1_180kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-bbb_640x360_tile_1x2_337kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-bbb_854x480_tile_1x2_651kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-bbb_1280x720_tile_1x4_1310kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-bbb_1920x1080_tile_1x1_2581kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-bbb_1920x1080_tile_1x4_2586kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-bbb_1920x1080_tile_1x4_fpm_2304kbps.webm
+#Sintel VP9 streams
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-sintel_426x182_tile_1x1_171kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-sintel_640x272_tile_1x2_318kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-sintel_854x364_tile_1x2_621kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-sintel_1280x546_tile_1x4_1257kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-sintel_1920x818_tile_1x4_fpm_2279kbps.webm
+# TOS VP9 streams
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-tos_426x178_tile_1x1_181kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-tos_640x266_tile_1x2_336kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-tos_854x356_tile_1x2_656kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-tos_1280x534_tile_1x4_1306kbps.webm
+LIBVPX_TEST_DATA-$(CONFIG_VP9_DECODER) += \
+  vp90-2-tos_1920x800_tile_1x4_fpm_2335kbps.webm
+endif  # CONFIG_DECODE_PERF_TESTS

diff --git a/tools_common.c b/tools_common.c
index 44b2a3f..9c24983 100644
--- a/tools_common.c
+++ b/tools_common.c

@@ -7,10 +7,13 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+
 #include "tools_common.h"
 
 #include <stdarg.h>
+#include <stdio.h>
 #include <stdlib.h>
+#include <string.h>
 
 #if defined(_WIN32) || defined(__OS2__)
 #include <io.h>
@@ -56,3 +59,74 @@
 void warn(const char *fmt, ...) {
   LOG_ERROR("Warning");
 }
+
+/* Reads a 16-bit little-endian value from the two bytes at |data|. */
+uint16_t mem_get_le16(const void *data) {
+  const uint8_t *bytes = (const uint8_t *)data;
+
+  /* Byte 0 is the low-order byte, byte 1 the high-order byte. */
+  return (uint16_t)(bytes[0] | (bytes[1] << 8));
+}
+
+/* Reads a 32-bit little-endian value from the four bytes at |data|. */
+uint32_t mem_get_le32(const void *data) {
+  uint32_t val;
+  const uint8_t *mem = (const uint8_t*)data;
+
+  /* Widen each byte to uint32_t before shifting: a uint8_t promotes to
+   * (signed) int, and shifting a value >= 0x80 left by 24 would overflow
+   * into the sign bit, which is undefined behaviour.
+   */
+  val = (uint32_t)mem[3] << 24;
+  val |= (uint32_t)mem[2] << 16;
+  val |= (uint32_t)mem[1] << 8;
+  val |= mem[0];
+  return val;
+}
+
+/* Reads one frame of planar YUV data from input_ctx->file into yuv_frame,
+ * row by row. Bytes still pending in input_ctx->detect are consumed first;
+ * the rest comes from the file.
+ *
+ * Returns non-zero if any fread() delivered fewer bytes than requested,
+ * 0 otherwise.
+ */
+int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame) {
+  FILE *f = input_ctx->file;
+  struct FileTypeDetectionBuffer *detect = &input_ctx->detect;
+  int plane = 0;
+  int shortread = 0;
+
+  for (plane = 0; plane < 3; ++plane) {
+    uint8_t *ptr;
+    /* Planes 1 and 2 use half the displayed width/height, rounded up. */
+    const int w = (plane ? (1 + yuv_frame->d_w) / 2 : yuv_frame->d_w);
+    const int h = (plane ? (1 + yuv_frame->d_h) / 2 : yuv_frame->d_h);
+    int r;
+
+    /* Determine the correct plane based on the image format. The for-loop
+     * always counts in Y,U,V order, but this may not match the order of
+     * the data on disk.
+     */
+    switch (plane) {
+      case 1:
+        ptr = yuv_frame->planes[
+            yuv_frame->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_V : VPX_PLANE_U];
+        break;
+      case 2:
+        ptr = yuv_frame->planes[
+            yuv_frame->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_U : VPX_PLANE_V];
+        break;
+      default:
+        ptr = yuv_frame->planes[plane];
+    }
+
+    for (r = 0; r < h; ++r) {
+      size_t needed = w;
+      size_t buf_position = 0;
+      /* Bytes in the detection buffer that have not been consumed yet. */
+      const size_t left = detect->buf_read - detect->position;
+      if (left > 0) {
+        const size_t more = (left < needed) ? left : needed;
+        memcpy(ptr, detect->buf + detect->position, more);
+        buf_position = more;
+        needed -= more;
+        detect->position += more;
+      }
+      /* Whatever the detection buffer could not supply is read from file. */
+      if (needed > 0) {
+        shortread |= (fread(ptr + buf_position, 1, needed, f) < needed);
+      }
+
+      /* NOTE(review): the stride is indexed by the loop counter, not by the
+       * plane selected in the switch above; this assumes the two chroma
+       * strides are equal -- confirm.
+       */
+      ptr += yuv_frame->stride[plane];
+    }
+  }
+
+  return shortread;
+}

diff --git a/tools_common.h b/tools_common.h
index 068e7b5..7500523 100644
--- a/tools_common.h
+++ b/tools_common.h

@@ -13,6 +13,12 @@
 #include <stdio.h>
 
 #include "./vpx_config.h"
+#include "vpx/vpx_image.h"
+#include "vpx/vpx_integer.h"
+
+#if CONFIG_ENCODERS
+#include "./y4minput.h"
+#endif
 
 #if defined(_MSC_VER)
 /* MSVS doesn't define off_t, and uses _f{seek,tell}i64. */
@@ -52,11 +58,55 @@
 #define PATH_MAX 512
 #endif
 
+#define IVF_FRAME_HDR_SZ (4 + 8)  /* 4 byte size + 8 byte timestamp */
+#define IVF_FILE_HDR_SZ 32
+
+#define RAW_FRAME_HDR_SZ sizeof(uint32_t)
+
 #define VP8_FOURCC (0x30385056)
 #define VP9_FOURCC (0x30395056)
 #define VP8_FOURCC_MASK (0x00385056)
 #define VP9_FOURCC_MASK (0x00395056)
 
+enum VideoFileType {
+  FILE_TYPE_RAW,
+  FILE_TYPE_IVF,
+  FILE_TYPE_Y4M,
+  FILE_TYPE_WEBM
+};
+
+/* Small buffer of bytes already taken from the input stream (presumably
+ * while sniffing the file type -- confirm). read_yuv_frame() drains the
+ * bytes in [position, buf_read) before it reads from the file itself.
+ */
+struct FileTypeDetectionBuffer {
+  char buf[4];      /* Buffered bytes. */
+  size_t buf_read;  /* Number of valid bytes in buf. */
+  size_t position;  /* Number of bytes of buf already consumed. */
+};
+
+struct VpxRational {
+  int numerator;
+  int denominator;
+};
+
+struct VpxInputContext {
+  const char *filename;
+  FILE *file;
+  off_t length;
+  struct FileTypeDetectionBuffer detect;
+  enum VideoFileType file_type;
+  uint32_t width;
+  uint32_t height;
+  int use_i420;
+  int only_i420;
+  uint32_t fourcc;
+  struct VpxRational framerate;
+#if CONFIG_ENCODERS
+  y4m_input y4m;
+#endif
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* Sets a stdio stream into binary mode */
 FILE *set_binary_mode(FILE *stream);
 
@@ -67,4 +117,13 @@
 /* The tool including this file must define usage_exit() */
 void usage_exit();
 
+uint16_t mem_get_le16(const void *data);
+uint32_t mem_get_le32(const void *data);
+
+int read_yuv_frame(struct VpxInputContext *input_ctx, vpx_image_t *yuv_frame);
+
+#ifdef __cplusplus
+}  /* extern "C" */
+#endif
+
 #endif  // TOOLS_COMMON_H_

diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index 4b60cfd..881ada1 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c

@@ -3808,7 +3808,7 @@
 
     /* Setup background Q adjustment for error resilient mode.
      * For multi-layer encodes only enable this for the base layer.
-     */
+    */
     if (cpi->cyclic_refresh_mode_enabled)
     {
       if (cpi->current_layer==0)

diff --git a/vp9/common/arm/neon/vp9_loopfilter_16_neon.c b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c
new file mode 100644
index 0000000..2f022dc
--- /dev/null
+++ b/vp9/common/arm/neon/vp9_loopfilter_16_neon.c

@@ -0,0 +1,33 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp9_rtcd.h"
+
+/* Filters a 16-wide horizontal edge as two calls to
+ * vp9_loop_filter_horizontal_edge(): one at s with the blimit0/limit0/thresh0
+ * set and one at s + 8 with the blimit1/limit1/thresh1 set.
+ * NOTE(review): the trailing argument 1 is presumably a count of 8-pixel
+ * units -- confirm against the prototype in vp9_rtcd.h.
+ */
+void vp9_loop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
+                                             const uint8_t *blimit0,
+                                             const uint8_t *limit0,
+                                             const uint8_t *thresh0,
+                                             const uint8_t *blimit1,
+                                             const uint8_t *limit1,
+                                             const uint8_t *thresh1) {
+  vp9_loop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);
+  vp9_loop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);
+}
+
+/* Filters a 16-wide horizontal edge as two calls to
+ * vp9_mbloop_filter_horizontal_edge(): one at s with the
+ * blimit0/limit0/thresh0 set and one at s + 8 with the
+ * blimit1/limit1/thresh1 set.
+ * NOTE(review): the trailing argument 1 is presumably a count of 8-pixel
+ * units -- confirm against the prototype in vp9_rtcd.h.
+ */
+void vp9_mbloop_filter_horizontal_edge_16_neon(uint8_t *s, int p /* pitch */,
+                                               const uint8_t *blimit0,
+                                               const uint8_t *limit0,
+                                               const uint8_t *thresh0,
+                                               const uint8_t *blimit1,
+                                               const uint8_t *limit1,
+                                               const uint8_t *thresh1) {
+  vp9_mbloop_filter_horizontal_edge(s, p, blimit0, limit0, thresh0, 1);
+  vp9_mbloop_filter_horizontal_edge(s + 8, p, blimit1, limit1, thresh1, 1);
+}

diff --git a/vp9/common/vp9_alloccommon.c b/vp9/common/vp9_alloccommon.c
index 0d65651..d298160 100644
--- a/vp9/common/vp9_alloccommon.c
+++ b/vp9/common/vp9_alloccommon.c

@@ -79,6 +79,57 @@
   vp9_update_mode_info_border(cm, cm->prev_mip);
 }
 
+int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height) {
+  const int aligned_width = ALIGN_POWER_OF_TWO(width, MI_SIZE_LOG2);
+  const int aligned_height = ALIGN_POWER_OF_TWO(height, MI_SIZE_LOG2);
+  const int ss_x = cm->subsampling_x;
+  const int ss_y = cm->subsampling_y;
+  int mi_size;
+
+  if (vp9_realloc_frame_buffer(&cm->post_proc_buffer, width, height, ss_x, ss_y,
+                             VP9BORDERINPIXELS) < 0)
+    goto fail;
+
+  set_mb_mi(cm, aligned_width, aligned_height);
+
+  // Allocation
+  mi_size = cm->mode_info_stride * (cm->mi_rows + MI_BLOCK_SIZE);
+
+  vpx_free(cm->mip);
+  cm->mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
+  if (!cm->mip)
+    goto fail;
+
+  vpx_free(cm->prev_mip);
+  cm->prev_mip = vpx_calloc(mi_size, sizeof(MODE_INFO));
+  if (!cm->prev_mip)
+    goto fail;
+
+  vpx_free(cm->mi_grid_base);
+  cm->mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->mi_grid_base));
+  if (!cm->mi_grid_base)
+    goto fail;
+
+  vpx_free(cm->prev_mi_grid_base);
+  cm->prev_mi_grid_base = vpx_calloc(mi_size, sizeof(*cm->prev_mi_grid_base));
+  if (!cm->prev_mi_grid_base)
+    goto fail;
+
+  setup_mi(cm);
+
+  // Create the segmentation map structure and set to 0.
+  vpx_free(cm->last_frame_seg_map);
+  cm->last_frame_seg_map = vpx_calloc(cm->mi_rows * cm->mi_cols, 1);
+  if (!cm->last_frame_seg_map)
+    goto fail;
+
+  return 0;
+
+ fail:
+  vp9_free_frame_buffers(cm);
+  return 1;
+}
+
 int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height) {
   int i;
 

diff --git a/vp9/common/vp9_alloccommon.h b/vp9/common/vp9_alloccommon.h
index 5d5fae9..cf8dca5 100644
--- a/vp9/common/vp9_alloccommon.h
+++ b/vp9/common/vp9_alloccommon.h

@@ -21,6 +21,7 @@
 void vp9_create_common(VP9_COMMON *cm);
 void vp9_remove_common(VP9_COMMON *cm);
 
+int vp9_resize_frame_buffers(VP9_COMMON *cm, int width, int height);
 int vp9_alloc_frame_buffers(VP9_COMMON *cm, int width, int height);
 void vp9_free_frame_buffers(VP9_COMMON *cm);
 

diff --git a/vp9/common/vp9_blockd.h b/vp9/common/vp9_blockd.h
index c5da375..121947b 100644
--- a/vp9/common/vp9_blockd.h
+++ b/vp9/common/vp9_blockd.h

@@ -381,22 +381,6 @@
   const int stride = 4 << b_width_log2(plane_bsize);
   return base + raster_block_offset(plane_bsize, raster_block, stride);
 }
-static uint8_t* raster_block_offset_uint8(BLOCK_SIZE plane_bsize,
-                                          int raster_block, uint8_t *base,
-                                          int stride) {
-  return base + raster_block_offset(plane_bsize, raster_block, stride);
-}
-
-static int txfrm_block_to_raster_block(BLOCK_SIZE plane_bsize,
-                                       TX_SIZE tx_size, int block) {
-  const int bwl = b_width_log2(plane_bsize);
-  const int tx_cols_log2 = bwl - tx_size;
-  const int tx_cols = 1 << tx_cols_log2;
-  const int raster_mb = block >> (tx_size << 1);
-  const int x = (raster_mb & (tx_cols - 1)) << tx_size;
-  const int y = (raster_mb >> tx_cols_log2) << tx_size;
-  return x + (y << bwl);
-}
 
 static void txfrm_block_to_raster_xy(BLOCK_SIZE plane_bsize,
                                      TX_SIZE tx_size, int block,

diff --git a/vp9/common/vp9_common_data.c b/vp9/common/vp9_common_data.c
index f858900..388f38d 100644
--- a/vp9/common/vp9_common_data.c
+++ b/vp9/common/vp9_common_data.c

@@ -123,8 +123,6 @@
   TX_32X32,  // TX_MODE_SELECT
 };
 
-
-
 const BLOCK_SIZE ss_size_lookup[BLOCK_SIZES][2][2] = {
 //  ss_x == 0    ss_x == 0        ss_x == 1      ss_x == 1
 //  ss_y == 0    ss_y == 1        ss_y == 0      ss_y == 1
@@ -143,4 +141,24 @@
   {{BLOCK_64X64, BLOCK_64X32},   {BLOCK_32X64,   BLOCK_32X32}},
 };
 
-
+// Generates a 4-bit field in which each bit set to 1 represents
+// a block size partition: 1111 means we split 64x64, 32x32, 16x16
+// and 8x8; 1000 means we just split the 64x64 to 32x32.
+const struct {
+  PARTITION_CONTEXT above;
+  PARTITION_CONTEXT left;
+} partition_context_lookup[BLOCK_SIZES]= {
+  {15, 15},  // 4X4   - {0b1111, 0b1111}
+  {15, 14},  // 4X8   - {0b1111, 0b1110}
+  {14, 15},  // 8X4   - {0b1110, 0b1111}
+  {14, 14},  // 8X8   - {0b1110, 0b1110}
+  {14, 12},  // 8X16  - {0b1110, 0b1100}
+  {12, 14},  // 16X8  - {0b1100, 0b1110}
+  {12, 12},  // 16X16 - {0b1100, 0b1100}
+  {12, 8 },  // 16X32 - {0b1100, 0b1000}
+  {8,  12},  // 32X16 - {0b1000, 0b1100}
+  {8,  8 },  // 32X32 - {0b1000, 0b1000}
+  {8,  0 },  // 32X64 - {0b1000, 0b0000}
+  {0,  8 },  // 64X32 - {0b0000, 0b1000}
+  {0,  0 },  // 64X64 - {0b0000, 0b0000}
+};

diff --git a/vp9/common/vp9_entropy.c b/vp9/common/vp9_entropy.c
index 2676762..82aa77e 100644
--- a/vp9/common/vp9_entropy.c
+++ b/vp9/common/vp9_entropy.c

@@ -15,7 +15,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx/vpx_integer.h"
 
-#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
 
 DECLARE_ALIGNED(16, const uint8_t, vp9_norm[256]) = {
   0, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4,
@@ -37,25 +36,84 @@
 };
 
 DECLARE_ALIGNED(16, const uint8_t,
-                vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1]) = {
+                vp9_coefband_trans_8x8plus[1024]) = {
   0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4,
-  4, 4, 4, 4, 4, 5
+  4, 4, 4, 4, 4, 5,
+  // from index MAXBAND_INDEX + 1 onward all values are filled as 5
+                    5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
+  5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
 };
 
-DECLARE_ALIGNED(16, const uint8_t,
-                vp9_coefband_trans_4x4[MAXBAND_INDEX + 1]) = {
+DECLARE_ALIGNED(16, const uint8_t, vp9_coefband_trans_4x4[16]) = {
   0, 1, 1, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 5, 5, 5,
-  5, 5, 5, 5, 5, 5
 };
 
 DECLARE_ALIGNED(16, const uint8_t, vp9_pt_energy_class[MAX_ENTROPY_TOKENS]) = {
   0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
 };
 
-
-
-/* Array indices are identical to previously-existing CONTEXT_NODE indices */
-
+// Array indices are identical to previously-existing CONTEXT_NODE indices
 const vp9_tree_index vp9_coef_tree[TREE_SIZE(MAX_ENTROPY_TOKENS)] = {
   -DCT_EOB_TOKEN, 2,                          /* 0 = EOB */
   -ZERO_TOKEN, 4,                             /* 1 = ZERO */
@@ -99,7 +157,9 @@
 // the probabilities for the rest of the nodes.
 
 // beta = 8
-static const vp9_prob modelcoefprobs_pareto8[COEFPROB_MODELS][MODEL_NODES] = {
+
+
+static const vp9_prob pareto8_probs[COEFPROB_MODELS][MODEL_NODES] = {
   {  3,  86, 128,   6,  86,  23,  88,  29},
   {  9,  86, 129,  17,  88,  61,  94,  76},
   { 15,  87, 129,  28,  89,  93, 100, 110},
@@ -230,26 +290,285 @@
   {255, 246, 247, 255, 239, 255, 253, 255}
 };
 
-static void extend_model_to_full_distribution(vp9_prob p,
-                                              vp9_prob *tree_probs) {
+// This table is an expansion of pareto8_probs to all 255 probabilities
+// (row p - 1 holds the entry for p, with l = (p - 1) / 2): odd p copies
+// pareto8_probs[l]; even p interpolates each node i as
+//   (pareto8_probs[l][i] + pareto8_probs[l + 1][i]) >> 1
+const vp9_prob vp9_pareto8_full[255][MODEL_NODES] = {
+    {  3, 86, 128,  6, 86, 23, 88, 29},
+    {  6, 86, 128, 11, 87, 42, 91, 52},
+    {  9, 86, 129, 17, 88, 61, 94, 76},
+    { 12, 86, 129, 22, 88, 77, 97, 93},
+    { 15, 87, 129, 28, 89, 93, 100, 110},
+    { 17, 87, 129, 33, 90, 105, 103, 123},
+    { 20, 88, 130, 38, 91, 118, 106, 136},
+    { 23, 88, 130, 43, 91, 128, 108, 146},
+    { 26, 89, 131, 48, 92, 139, 111, 156},
+    { 28, 89, 131, 53, 93, 147, 114, 163},
+    { 31, 90, 131, 58, 94, 156, 117, 171},
+    { 34, 90, 131, 62, 94, 163, 119, 177},
+    { 37, 90, 132, 66, 95, 171, 122, 184},
+    { 39, 90, 132, 70, 96, 177, 124, 189},
+    { 42, 91, 132, 75, 97, 183, 127, 194},
+    { 44, 91, 132, 79, 97, 188, 129, 198},
+    { 47, 92, 133, 83, 98, 193, 132, 202},
+    { 49, 92, 133, 86, 99, 197, 134, 205},
+    { 52, 93, 133, 90, 100, 201, 137, 208},
+    { 54, 93, 133, 94, 100, 204, 139, 211},
+    { 57, 94, 134, 98, 101, 208, 142, 214},
+    { 59, 94, 134, 101, 102, 211, 144, 216},
+    { 62, 94, 135, 105, 103, 214, 146, 218},
+    { 64, 94, 135, 108, 103, 216, 148, 220},
+    { 66, 95, 135, 111, 104, 219, 151, 222},
+    { 68, 95, 135, 114, 105, 221, 153, 223},
+    { 71, 96, 136, 117, 106, 224, 155, 225},
+    { 73, 96, 136, 120, 106, 225, 157, 226},
+    { 76, 97, 136, 123, 107, 227, 159, 228},
+    { 78, 97, 136, 126, 108, 229, 160, 229},
+    { 80, 98, 137, 129, 109, 231, 162, 231},
+    { 82, 98, 137, 131, 109, 232, 164, 232},
+    { 84, 98, 138, 134, 110, 234, 166, 233},
+    { 86, 98, 138, 137, 111, 235, 168, 234},
+    { 89, 99, 138, 140, 112, 236, 170, 235},
+    { 91, 99, 138, 142, 112, 237, 171, 235},
+    { 93, 100, 139, 145, 113, 238, 173, 236},
+    { 95, 100, 139, 147, 114, 239, 174, 237},
+    { 97, 101, 140, 149, 115, 240, 176, 238},
+    { 99, 101, 140, 151, 115, 241, 177, 238},
+    {101, 102, 140, 154, 116, 242, 179, 239},
+    {103, 102, 140, 156, 117, 242, 180, 239},
+    {105, 103, 141, 158, 118, 243, 182, 240},
+    {107, 103, 141, 160, 118, 243, 183, 240},
+    {109, 104, 141, 162, 119, 244, 185, 241},
+    {111, 104, 141, 164, 119, 244, 186, 241},
+    {113, 104, 142, 166, 120, 245, 187, 242},
+    {114, 104, 142, 168, 121, 245, 188, 242},
+    {116, 105, 143, 170, 122, 246, 190, 243},
+    {118, 105, 143, 171, 122, 246, 191, 243},
+    {120, 106, 143, 173, 123, 247, 192, 244},
+    {121, 106, 143, 175, 124, 247, 193, 244},
+    {123, 107, 144, 177, 125, 248, 195, 244},
+    {125, 107, 144, 178, 125, 248, 196, 244},
+    {127, 108, 145, 180, 126, 249, 197, 245},
+    {128, 108, 145, 181, 127, 249, 198, 245},
+    {130, 109, 145, 183, 128, 249, 199, 245},
+    {132, 109, 145, 184, 128, 249, 200, 245},
+    {134, 110, 146, 186, 129, 250, 201, 246},
+    {135, 110, 146, 187, 130, 250, 202, 246},
+    {137, 111, 147, 189, 131, 251, 203, 246},
+    {138, 111, 147, 190, 131, 251, 204, 246},
+    {140, 112, 147, 192, 132, 251, 205, 247},
+    {141, 112, 147, 193, 132, 251, 206, 247},
+    {143, 113, 148, 194, 133, 251, 207, 247},
+    {144, 113, 148, 195, 134, 251, 207, 247},
+    {146, 114, 149, 197, 135, 252, 208, 248},
+    {147, 114, 149, 198, 135, 252, 209, 248},
+    {149, 115, 149, 199, 136, 252, 210, 248},
+    {150, 115, 149, 200, 137, 252, 210, 248},
+    {152, 115, 150, 201, 138, 252, 211, 248},
+    {153, 115, 150, 202, 138, 252, 212, 248},
+    {155, 116, 151, 204, 139, 253, 213, 249},
+    {156, 116, 151, 205, 139, 253, 213, 249},
+    {158, 117, 151, 206, 140, 253, 214, 249},
+    {159, 117, 151, 207, 141, 253, 215, 249},
+    {161, 118, 152, 208, 142, 253, 216, 249},
+    {162, 118, 152, 209, 142, 253, 216, 249},
+    {163, 119, 153, 210, 143, 253, 217, 249},
+    {164, 119, 153, 211, 143, 253, 217, 249},
+    {166, 120, 153, 212, 144, 254, 218, 250},
+    {167, 120, 153, 212, 145, 254, 219, 250},
+    {168, 121, 154, 213, 146, 254, 220, 250},
+    {169, 121, 154, 214, 146, 254, 220, 250},
+    {171, 122, 155, 215, 147, 254, 221, 250},
+    {172, 122, 155, 216, 147, 254, 221, 250},
+    {173, 123, 155, 217, 148, 254, 222, 250},
+    {174, 123, 155, 217, 149, 254, 222, 250},
+    {176, 124, 156, 218, 150, 254, 223, 250},
+    {177, 124, 156, 219, 150, 254, 223, 250},
+    {178, 125, 157, 220, 151, 254, 224, 251},
+    {179, 125, 157, 220, 151, 254, 224, 251},
+    {180, 126, 157, 221, 152, 254, 225, 251},
+    {181, 126, 157, 221, 152, 254, 225, 251},
+    {183, 127, 158, 222, 153, 254, 226, 251},
+    {184, 127, 158, 223, 154, 254, 226, 251},
+    {185, 128, 159, 224, 155, 255, 227, 251},
+    {186, 128, 159, 224, 155, 255, 227, 251},
+    {187, 129, 160, 225, 156, 255, 228, 251},
+    {188, 130, 160, 225, 156, 255, 228, 251},
+    {189, 131, 160, 226, 157, 255, 228, 251},
+    {190, 131, 160, 226, 158, 255, 228, 251},
+    {191, 132, 161, 227, 159, 255, 229, 251},
+    {192, 132, 161, 227, 159, 255, 229, 251},
+    {193, 133, 162, 228, 160, 255, 230, 252},
+    {194, 133, 162, 229, 160, 255, 230, 252},
+    {195, 134, 163, 230, 161, 255, 231, 252},
+    {196, 134, 163, 230, 161, 255, 231, 252},
+    {197, 135, 163, 231, 162, 255, 231, 252},
+    {198, 135, 163, 231, 162, 255, 231, 252},
+    {199, 136, 164, 232, 163, 255, 232, 252},
+    {200, 136, 164, 232, 164, 255, 232, 252},
+    {201, 137, 165, 233, 165, 255, 233, 252},
+    {201, 137, 165, 233, 165, 255, 233, 252},
+    {202, 138, 166, 233, 166, 255, 233, 252},
+    {203, 138, 166, 233, 166, 255, 233, 252},
+    {204, 139, 166, 234, 167, 255, 234, 252},
+    {205, 139, 166, 234, 167, 255, 234, 252},
+    {206, 140, 167, 235, 168, 255, 235, 252},
+    {206, 140, 167, 235, 168, 255, 235, 252},
+    {207, 141, 168, 236, 169, 255, 235, 252},
+    {208, 141, 168, 236, 170, 255, 235, 252},
+    {209, 142, 169, 237, 171, 255, 236, 252},
+    {209, 143, 169, 237, 171, 255, 236, 252},
+    {210, 144, 169, 237, 172, 255, 236, 252},
+    {211, 144, 169, 237, 172, 255, 236, 252},
+    {212, 145, 170, 238, 173, 255, 237, 252},
+    {213, 145, 170, 238, 173, 255, 237, 252},
+    {214, 146, 171, 239, 174, 255, 237, 253},
+    {214, 146, 171, 239, 174, 255, 237, 253},
+    {215, 147, 172, 240, 175, 255, 238, 253},
+    {215, 147, 172, 240, 175, 255, 238, 253},
+    {216, 148, 173, 240, 176, 255, 238, 253},
+    {217, 148, 173, 240, 176, 255, 238, 253},
+    {218, 149, 173, 241, 177, 255, 239, 253},
+    {218, 149, 173, 241, 178, 255, 239, 253},
+    {219, 150, 174, 241, 179, 255, 239, 253},
+    {219, 151, 174, 241, 179, 255, 239, 253},
+    {220, 152, 175, 242, 180, 255, 240, 253},
+    {221, 152, 175, 242, 180, 255, 240, 253},
+    {222, 153, 176, 242, 181, 255, 240, 253},
+    {222, 153, 176, 242, 181, 255, 240, 253},
+    {223, 154, 177, 243, 182, 255, 240, 253},
+    {223, 154, 177, 243, 182, 255, 240, 253},
+    {224, 155, 178, 244, 183, 255, 241, 253},
+    {224, 155, 178, 244, 183, 255, 241, 253},
+    {225, 156, 178, 244, 184, 255, 241, 253},
+    {225, 157, 178, 244, 184, 255, 241, 253},
+    {226, 158, 179, 244, 185, 255, 242, 253},
+    {227, 158, 179, 244, 185, 255, 242, 253},
+    {228, 159, 180, 245, 186, 255, 242, 253},
+    {228, 159, 180, 245, 186, 255, 242, 253},
+    {229, 160, 181, 245, 187, 255, 242, 253},
+    {229, 160, 181, 245, 187, 255, 242, 253},
+    {230, 161, 182, 246, 188, 255, 243, 253},
+    {230, 162, 182, 246, 188, 255, 243, 253},
+    {231, 163, 183, 246, 189, 255, 243, 253},
+    {231, 163, 183, 246, 189, 255, 243, 253},
+    {232, 164, 184, 247, 190, 255, 243, 253},
+    {232, 164, 184, 247, 190, 255, 243, 253},
+    {233, 165, 185, 247, 191, 255, 244, 253},
+    {233, 165, 185, 247, 191, 255, 244, 253},
+    {234, 166, 185, 247, 192, 255, 244, 253},
+    {234, 167, 185, 247, 192, 255, 244, 253},
+    {235, 168, 186, 248, 193, 255, 244, 253},
+    {235, 168, 186, 248, 193, 255, 244, 253},
+    {236, 169, 187, 248, 194, 255, 244, 253},
+    {236, 169, 187, 248, 194, 255, 244, 253},
+    {236, 170, 188, 248, 195, 255, 245, 253},
+    {236, 170, 188, 248, 195, 255, 245, 253},
+    {237, 171, 189, 249, 196, 255, 245, 254},
+    {237, 172, 189, 249, 196, 255, 245, 254},
+    {238, 173, 190, 249, 197, 255, 245, 254},
+    {238, 173, 190, 249, 197, 255, 245, 254},
+    {239, 174, 191, 249, 198, 255, 245, 254},
+    {239, 174, 191, 249, 198, 255, 245, 254},
+    {240, 175, 192, 249, 199, 255, 246, 254},
+    {240, 176, 192, 249, 199, 255, 246, 254},
+    {240, 177, 193, 250, 200, 255, 246, 254},
+    {240, 177, 193, 250, 200, 255, 246, 254},
+    {241, 178, 194, 250, 201, 255, 246, 254},
+    {241, 178, 194, 250, 201, 255, 246, 254},
+    {242, 179, 195, 250, 202, 255, 246, 254},
+    {242, 180, 195, 250, 202, 255, 246, 254},
+    {242, 181, 196, 250, 203, 255, 247, 254},
+    {242, 181, 196, 250, 203, 255, 247, 254},
+    {243, 182, 197, 251, 204, 255, 247, 254},
+    {243, 183, 197, 251, 204, 255, 247, 254},
+    {244, 184, 198, 251, 205, 255, 247, 254},
+    {244, 184, 198, 251, 205, 255, 247, 254},
+    {244, 185, 199, 251, 206, 255, 247, 254},
+    {244, 185, 199, 251, 206, 255, 247, 254},
+    {245, 186, 200, 251, 207, 255, 247, 254},
+    {245, 187, 200, 251, 207, 255, 247, 254},
+    {246, 188, 201, 252, 207, 255, 248, 254},
+    {246, 188, 201, 252, 207, 255, 248, 254},
+    {246, 189, 202, 252, 208, 255, 248, 254},
+    {246, 190, 202, 252, 208, 255, 248, 254},
+    {247, 191, 203, 252, 209, 255, 248, 254},
+    {247, 191, 203, 252, 209, 255, 248, 254},
+    {247, 192, 204, 252, 210, 255, 248, 254},
+    {247, 193, 204, 252, 210, 255, 248, 254},
+    {248, 194, 205, 252, 211, 255, 248, 254},
+    {248, 194, 205, 252, 211, 255, 248, 254},
+    {248, 195, 206, 252, 212, 255, 249, 254},
+    {248, 196, 206, 252, 212, 255, 249, 254},
+    {249, 197, 207, 253, 213, 255, 249, 254},
+    {249, 197, 207, 253, 213, 255, 249, 254},
+    {249, 198, 208, 253, 214, 255, 249, 254},
+    {249, 199, 209, 253, 214, 255, 249, 254},
+    {250, 200, 210, 253, 215, 255, 249, 254},
+    {250, 200, 210, 253, 215, 255, 249, 254},
+    {250, 201, 211, 253, 215, 255, 249, 254},
+    {250, 202, 211, 253, 215, 255, 249, 254},
+    {250, 203, 212, 253, 216, 255, 249, 254},
+    {250, 203, 212, 253, 216, 255, 249, 254},
+    {251, 204, 213, 253, 217, 255, 250, 254},
+    {251, 205, 213, 253, 217, 255, 250, 254},
+    {251, 206, 214, 254, 218, 255, 250, 254},
+    {251, 206, 215, 254, 218, 255, 250, 254},
+    {252, 207, 216, 254, 219, 255, 250, 254},
+    {252, 208, 216, 254, 219, 255, 250, 254},
+    {252, 209, 217, 254, 220, 255, 250, 254},
+    {252, 210, 217, 254, 220, 255, 250, 254},
+    {252, 211, 218, 254, 221, 255, 250, 254},
+    {252, 212, 218, 254, 221, 255, 250, 254},
+    {253, 213, 219, 254, 222, 255, 250, 254},
+    {253, 213, 220, 254, 222, 255, 250, 254},
+    {253, 214, 221, 254, 223, 255, 250, 254},
+    {253, 215, 221, 254, 223, 255, 250, 254},
+    {253, 216, 222, 254, 224, 255, 251, 254},
+    {253, 217, 223, 254, 224, 255, 251, 254},
+    {253, 218, 224, 254, 225, 255, 251, 254},
+    {253, 219, 224, 254, 225, 255, 251, 254},
+    {254, 220, 225, 254, 225, 255, 251, 254},
+    {254, 221, 226, 254, 225, 255, 251, 254},
+    {254, 222, 227, 255, 226, 255, 251, 254},
+    {254, 223, 227, 255, 226, 255, 251, 254},
+    {254, 224, 228, 255, 227, 255, 251, 254},
+    {254, 225, 229, 255, 227, 255, 251, 254},
+    {254, 226, 230, 255, 228, 255, 251, 254},
+    {254, 227, 230, 255, 229, 255, 251, 254},
+    {255, 228, 231, 255, 230, 255, 251, 254},
+    {255, 229, 232, 255, 230, 255, 251, 254},
+    {255, 230, 233, 255, 231, 255, 252, 254},
+    {255, 231, 234, 255, 231, 255, 252, 254},
+    {255, 232, 235, 255, 232, 255, 252, 254},
+    {255, 233, 236, 255, 232, 255, 252, 254},
+    {255, 235, 237, 255, 233, 255, 252, 254},
+    {255, 236, 238, 255, 234, 255, 252, 254},
+    {255, 238, 240, 255, 235, 255, 252, 255},
+    {255, 239, 241, 255, 235, 255, 252, 254},
+    {255, 241, 243, 255, 236, 255, 252, 254},
+    {255, 243, 245, 255, 237, 255, 252, 254},
+    {255, 246, 247, 255, 239, 255, 253, 255},
+};
+
+static void extend_to_full_distribution(vp9_prob *probs, vp9_prob p) {
   const int l = (p - 1) / 2;
-  const vp9_prob (*model)[MODEL_NODES] = modelcoefprobs_pareto8;
   if (p & 1) {
-    vpx_memcpy(tree_probs + UNCONSTRAINED_NODES,
-               model[l], MODEL_NODES * sizeof(vp9_prob));
+    // Just copy
+    vpx_memcpy(probs, pareto8_probs[l], MODEL_NODES * sizeof(vp9_prob));
   } else {
-    // interpolate
+    // Interpolate
     int i;
-    for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
-      tree_probs[i] = (model[l][i - UNCONSTRAINED_NODES] +
-                       model[l + 1][i - UNCONSTRAINED_NODES]) >> 1;
+    for (i = 0; i < MODEL_NODES; ++i)
+      probs[i] = (pareto8_probs[l][i] + pareto8_probs[l + 1][i]) >> 1;
   }
 }
 
 void vp9_model_to_full_probs(const vp9_prob *model, vp9_prob *full) {
   if (full != model)
     vpx_memcpy(full, model, sizeof(vp9_prob) * UNCONSTRAINED_NODES);
-  extend_model_to_full_distribution(model[PIVOT_NODE], full);
+  extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
 }
 
 static vp9_tree_index cat1[2], cat2[4], cat3[6], cat4[8], cat5[10], cat6[28];
@@ -275,18 +594,18 @@
 }
 
 const vp9_extra_bit vp9_extra_bits[MAX_ENTROPY_TOKENS] = {
-  { 0, 0, 0, 0},
-  { 0, 0, 0, 1},
-  { 0, 0, 0, 2},
-  { 0, 0, 0, 3},
-  { 0, 0, 0, 4},
-  { cat1, Pcat1, 1, 5},
-  { cat2, Pcat2, 2, 7},
-  { cat3, Pcat3, 3, 11},
-  { cat4, Pcat4, 4, 19},
-  { cat5, Pcat5, 5, 35},
-  { cat6, Pcat6, 14, 67},
-  { 0, 0, 0, 0}
+  {0, 0, 0, 0},           // ZERO_TOKEN
+  {0, 0, 0, 1},           // ONE_TOKEN
+  {0, 0, 0, 2},           // TWO_TOKEN
+  {0, 0, 0, 3},           // THREE_TOKEN
+  {0, 0, 0, 4},           // FOUR_TOKEN
+  {cat1, Pcat1, 1, 5},    // DCT_VAL_CATEGORY1
+  {cat2, Pcat2, 2, 7},    // DCT_VAL_CATEGORY2
+  {cat3, Pcat3, 3, 11},   // DCT_VAL_CATEGORY3
+  {cat4, Pcat4, 4, 19},   // DCT_VAL_CATEGORY4
+  {cat5, Pcat5, 5, 35},   // DCT_VAL_CATEGORY5
+  {cat6, Pcat6, 14, 67},  // DCT_VAL_CATEGORY6
+  {0, 0, 0, 0}            // DCT_EOB_TOKEN
 };
 
 #include "vp9/common/vp9_default_coef_probs.h"
@@ -303,8 +622,6 @@
   vp9_tokens_from_tree(vp9_coef_encodings, vp9_coef_tree);
 }
 
-// #define COEF_COUNT_TESTING
-
 #define COEF_COUNT_SAT 24
 #define COEF_MAX_UPDATE_FACTOR 112
 #define COEF_COUNT_SAT_KEY 24

diff --git a/vp9/common/vp9_entropy.h b/vp9/common/vp9_entropy.h
index 67b1669..0370b32 100644
--- a/vp9/common/vp9_entropy.h
+++ b/vp9/common/vp9_entropy.h

@@ -17,6 +17,7 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_scan.h"
 #include "vp9/common/vp9_treecoder.h"
+#include "vp9/common/vp9_entropymode.h"
 
 #define DIFF_UPDATE_PROB 252
 
@@ -120,12 +121,16 @@
 
 // This is the index in the scan order beyond which all coefficients for
 // 8x8 transform and above are in the top band.
-// For 4x4 blocks the index is less but to keep things common the lookup
-// table for 4x4 is padded out to this index.
+// This macro is currently unused but may be used by certain implementations
 #define MAXBAND_INDEX 21
 
-extern const uint8_t vp9_coefband_trans_8x8plus[MAXBAND_INDEX + 1];
-extern const uint8_t vp9_coefband_trans_4x4[MAXBAND_INDEX + 1];
+extern const uint8_t vp9_coefband_trans_8x8plus[1024];
+extern const uint8_t vp9_coefband_trans_4x4[16];
+
+static const uint8_t *get_band_translate(TX_SIZE tx_size) {
+  return tx_size == TX_4X4 ? vp9_coefband_trans_4x4
+                           : vp9_coefband_trans_8x8plus;
+}
 
 // 128 lists of probabilities are stored for the following ONE node probs:
 // 1, 3, 5, 7, ..., 253, 255
@@ -137,6 +142,9 @@
 
 #define PIVOT_NODE                  2   // which node is pivot
 
+#define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
+extern const vp9_prob vp9_pareto8_full[255][MODEL_NODES];
+
 typedef vp9_prob vp9_coeff_probs_model[REF_TYPES][COEF_BANDS]
                                       [PREV_COEF_CONTEXTS]
                                       [UNCONSTRAINED_NODES];
@@ -178,23 +186,19 @@
 static void get_scan(const MACROBLOCKD *xd, TX_SIZE tx_size,
                      PLANE_TYPE type, int block_idx,
                      const int16_t **scan, const int16_t **scan_nb) {
-  switch (tx_size) {
-    case TX_4X4:
-      get_scan_nb_4x4(get_tx_type_4x4(type, xd, block_idx), scan, scan_nb);
-      break;
-    case TX_8X8:
-      get_scan_nb_8x8(get_tx_type_8x8(type, xd), scan, scan_nb);
-      break;
-    case TX_16X16:
-      get_scan_nb_16x16(get_tx_type_16x16(type, xd), scan, scan_nb);
-      break;
-    case TX_32X32:
-      *scan = vp9_default_scan_32x32;
-      *scan_nb = vp9_default_scan_32x32_neighbors;
-      break;
-    default:
-      assert(!"Invalid transform size.");
+  const MODE_INFO *const mi = xd->mi_8x8[0];
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const scan_order *so;
+
+  if (is_inter_block(mbmi) || type != PLANE_TYPE_Y_WITH_DC || xd->lossless) {
+    so = &inter_scan_orders[tx_size];
+  } else {
+    const MB_PREDICTION_MODE mode =
+        mbmi->sb_type < BLOCK_8X8 ? mi->bmi[block_idx].as_mode : mbmi->mode;
+    so = &intra_scan_orders[tx_size][mode];
   }
+  *scan = so->scan;
+  *scan_nb = so->neighbors;
 }
 
 #endif  // VP9_COMMON_VP9_ENTROPY_H_

diff --git a/vp9/common/vp9_idct.c b/vp9/common/vp9_idct.c
index ea8683e..149362a 100644
--- a/vp9/common/vp9_idct.c
+++ b/vp9/common/vp9_idct.c

@@ -835,7 +835,8 @@
     ht.cols(temp_in, temp_out);
     for (j = 0; j < 16; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * stride + i]);  }
+                                        + dest[j * stride + i]);
+  }
 }
 
 void vp9_idct16x16_10_add_c(const int16_t *input, uint8_t *dest, int stride) {
@@ -1276,7 +1277,7 @@
     idct32_1d(temp_in, temp_out);
     for (j = 0; j < 32; ++j)
       dest[j * stride + i] = clip_pixel(ROUND_POWER_OF_TWO(temp_out[j], 6)
-                                  + dest[j * stride + i]);
+                                        + dest[j * stride + i]);
   }
 }
 

diff --git a/vp9/common/vp9_loopfilter.c b/vp9/common/vp9_loopfilter.c
index ff504a1..8e13afb 100644
--- a/vp9/common/vp9_loopfilter.c
+++ b/vp9/common/vp9_loopfilter.c

@@ -283,10 +283,10 @@
   // n_shift is the a multiplier for lf_deltas
   // the multiplier is 1 for when filter_lvl is between 0 and 31;
   // 2 when filter_lvl is between 32 and 63
-  const int n_shift = default_filt_lvl >> 5;
+  const int scale = 1 << (default_filt_lvl >> 5);
   loop_filter_info_n *const lfi = &cm->lf_info;
   struct loopfilter *const lf = &cm->lf;
-  struct segmentation *const seg = &cm->seg;
+  const struct segmentation *const seg = &cm->seg;
 
   // update limits if sharpness has changed
   if (lf->last_sharpness_level != lf->sharpness_level) {
@@ -295,9 +295,7 @@
   }
 
   for (seg_id = 0; seg_id < MAX_SEGMENTS; seg_id++) {
-    int lvl_seg = default_filt_lvl, ref, mode, intra_lvl;
-
-    // Set the baseline filter values for each segment
+    int lvl_seg = default_filt_lvl;
     if (vp9_segfeature_active(seg, seg_id, SEG_LVL_ALT_LF)) {
       const int data = vp9_get_segdata(seg, seg_id, SEG_LVL_ALT_LF);
       lvl_seg = seg->abs_delta == SEGMENT_ABSDATA
@@ -309,31 +307,22 @@
       // we could get rid of this if we assume that deltas are set to
       // zero when not in use; encoder always uses deltas
       vpx_memset(lfi->lvl[seg_id], lvl_seg, sizeof(lfi->lvl[seg_id]));
-      continue;
-    }
+    } else {
+      int ref, mode;
+      const int intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * scale;
+      lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
 
-    intra_lvl = lvl_seg + lf->ref_deltas[INTRA_FRAME] * (1 << n_shift);
-    lfi->lvl[seg_id][INTRA_FRAME][0] = clamp(intra_lvl, 0, MAX_LOOP_FILTER);
-
-    for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref)
-      for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
-        const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * (1 << n_shift)
-                                      + lf->mode_deltas[mode] * (1 << n_shift);
-        lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
+      for (ref = LAST_FRAME; ref < MAX_REF_FRAMES; ++ref) {
+        for (mode = 0; mode < MAX_MODE_LF_DELTAS; ++mode) {
+          const int inter_lvl = lvl_seg + lf->ref_deltas[ref] * scale
+                                        + lf->mode_deltas[mode] * scale;
+          lfi->lvl[seg_id][ref][mode] = clamp(inter_lvl, 0, MAX_LOOP_FILTER);
+        }
       }
+    }
   }
 }
 
-static uint8_t build_lfi(const loop_filter_info_n *lfi_n,
-                     const MB_MODE_INFO *mbmi) {
-  const int seg = mbmi->segment_id;
-  const int ref = mbmi->ref_frame[0];
-  const int mode = lfi_n->mode_lf_lut[mbmi->mode];
-  const int filter_level = lfi_n->lvl[seg][ref][mode];
-
-  return filter_level;
-}
-
 static void filter_selectively_vert(uint8_t *s, int pitch,
                                     unsigned int mask_16x16,
                                     unsigned int mask_8x8,
@@ -351,19 +340,12 @@
       if (mask_16x16 & 1) {
         vp9_mb_lpf_vertical_edge_w(s, pitch, lfi->mblim, lfi->lim,
                                    lfi->hev_thr);
-        assert(!(mask_8x8 & 1));
-        assert(!(mask_4x4 & 1));
-        assert(!(mask_4x4_int & 1));
       } else if (mask_8x8 & 1) {
         vp9_mbloop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim,
                                         lfi->hev_thr, 1);
-        assert(!(mask_16x16 & 1));
-        assert(!(mask_4x4 & 1));
       } else if (mask_4x4 & 1) {
         vp9_loop_filter_vertical_edge(s, pitch, lfi->mblim, lfi->lim,
                                       lfi->hev_thr, 1);
-        assert(!(mask_16x16 & 1));
-        assert(!(mask_8x8 & 1));
       }
     }
     if (mask_4x4_int & 1)
@@ -403,27 +385,21 @@
           vp9_mb_lpf_horizontal_edge_w(s, pitch, lfi->mblim, lfi->lim,
                                        lfi->hev_thr, 1);
         }
-        assert(!(mask_8x8 & 1));
-        assert(!(mask_4x4 & 1));
-        assert(!(mask_4x4_int & 1));
       } else if (mask_8x8 & 1) {
         if ((mask_8x8 & 3) == 3) {
           // Next block's thresholds
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
-          // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
-          vp9_mbloop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
-                                            lfi->hev_thr, 1);
-          vp9_mbloop_filter_horizontal_edge(s + 8, pitch, lfin->mblim,
-                                            lfin->lim, lfin->hev_thr, 1);
+          vp9_mbloop_filter_horizontal_edge_16(s, pitch, lfi->mblim,
+                                               lfi->lim, lfi->hev_thr,
+                                               lfin->mblim, lfin->lim,
+                                               lfin->hev_thr);
 
           if ((mask_4x4_int & 3) == 3) {
-            // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
-            vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
-                                            lfi->lim, lfi->hev_thr, 1);
-            vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
-                                            lfin->mblim, lfin->lim,
-                                            lfin->hev_thr, 1);
+            vp9_loop_filter_horizontal_edge_16(s + 4 * pitch, pitch, lfi->mblim,
+                                               lfi->lim, lfi->hev_thr,
+                                               lfin->mblim, lfin->lim,
+                                               lfin->hev_thr);
           } else {
             if (mask_4x4_int & 1)
               vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
@@ -442,26 +418,20 @@
             vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
                                             lfi->lim, lfi->hev_thr, 1);
         }
-        assert(!(mask_16x16 & 1));
-        assert(!(mask_4x4 & 1));
       } else if (mask_4x4 & 1) {
         if ((mask_4x4 & 3) == 3) {
           // Next block's thresholds
           const loop_filter_thresh *lfin = lfi_n->lfthr + *(lfl + 1);
 
-          // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
-          vp9_loop_filter_horizontal_edge(s, pitch, lfi->mblim, lfi->lim,
-                                            lfi->hev_thr, 1);
-          vp9_loop_filter_horizontal_edge(s + 8, pitch, lfin->mblim, lfin->lim,
-                                            lfin->hev_thr, 1);
-
+          vp9_loop_filter_horizontal_edge_16(s, pitch, lfi->mblim,
+                                             lfi->lim, lfi->hev_thr,
+                                             lfin->mblim, lfin->lim,
+                                             lfin->hev_thr);
           if ((mask_4x4_int & 3) == 3) {
-            // TODO(yunqingwang): Combine next 2 calls as 1 wide filtering.
-            vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
-                                            lfi->lim, lfi->hev_thr, 1);
-            vp9_loop_filter_horizontal_edge(s + 8 + 4 * pitch, pitch,
-                                            lfin->mblim, lfin->lim,
-                                            lfin->hev_thr, 1);
+            vp9_loop_filter_horizontal_edge_16(s + 4 * pitch, pitch, lfi->mblim,
+                                               lfi->lim, lfi->hev_thr,
+                                               lfin->mblim, lfin->lim,
+                                               lfin->hev_thr);
           } else {
             if (mask_4x4_int & 1)
               vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
@@ -480,8 +450,6 @@
           vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
                                           lfi->lim, lfi->hev_thr, 1);
         }
-        assert(!(mask_16x16 & 1));
-        assert(!(mask_8x8 & 1));
       } else if (mask_4x4_int & 1) {
         vp9_loop_filter_horizontal_edge(s + 4 * pitch, pitch, lfi->mblim,
                                         lfi->lim, lfi->hev_thr, 1);
@@ -864,9 +832,37 @@
       lfm->left_uv[i] &= 0xeeee;
     }
   }
+
+  // Assert if we try to apply 2 different loop filters at the same position.
+  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_8X8]));
+  assert(!(lfm->left_y[TX_16X16] & lfm->left_y[TX_4X4]));
+  assert(!(lfm->left_y[TX_8X8] & lfm->left_y[TX_4X4]));
+  assert(!(lfm->int_4x4_y & lfm->left_y[TX_16X16]));
+  assert(!(lfm->left_uv[TX_16X16]&lfm->left_uv[TX_8X8]));
+  assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4]));
+  assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4]));
+  assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16]));
+  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8]));
+  assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4]));
+  assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4]));
+  assert(!(lfm->int_4x4_y & lfm->above_y[TX_16X16]));
+  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8]));
+  assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4]));
+  assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4]));
+  assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16]));
 }
 
 #if CONFIG_NON420
+static uint8_t build_lfi(const loop_filter_info_n *lfi_n,
+                     const MB_MODE_INFO *mbmi) {
+  const int seg = mbmi->segment_id;
+  const int ref = mbmi->ref_frame[0];
+  const int mode = lfi_n->mode_lf_lut[mbmi->mode];
+  const int filter_level = lfi_n->lvl[seg][ref][mode];
+
+  return filter_level;
+}
+
 static void filter_block_plane_non420(VP9_COMMON *cm,
                                       struct macroblockd_plane *plane,
                                       MODE_INFO **mi_8x8,

diff --git a/vp9/common/vp9_loopfilter_filters.c b/vp9/common/vp9_loopfilter_filters.c
index 2c4bf6c..9edf870 100644
--- a/vp9/common/vp9_loopfilter_filters.c
+++ b/vp9/common/vp9_loopfilter_filters.c

@@ -121,6 +121,34 @@
   }
 }
 
+void vp9_loop_filter_horizontal_edge_16_c(uint8_t *s, int p /* pitch */,
+                                          const uint8_t *blimit0,
+                                          const uint8_t *limit0,
+                                          const uint8_t *thresh0,
+                                          const uint8_t *blimit1,
+                                          const uint8_t *limit1,
+                                          const uint8_t *thresh1) {
+  int i, j;
+  const uint8_t *blimit = blimit0;
+  const uint8_t *limit = limit0;
+  const uint8_t *thresh = thresh0;
+
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < 8; ++j) {
+      const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+      const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+      const int8_t mask = filter_mask(*limit, *blimit,
+                                      p3, p2, p1, p0, q0, q1, q2, q3);
+      const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
+      filter4(mask, hev, s - 2 * p, s - 1 * p, s, s + 1 * p);
+      ++s;
+    }
+    blimit = blimit1;
+    limit = limit1;
+    thresh = thresh1;
+  }
+}
+
 void vp9_loop_filter_vertical_edge_c(uint8_t *s, int pitch,
                                      const uint8_t *blimit,
                                      const uint8_t *limit,
@@ -185,6 +213,37 @@
   }
 }
 
+void vp9_mbloop_filter_horizontal_edge_16_c(uint8_t *s, int p /* pitch */,
+                                            const uint8_t *blimit0,
+                                            const uint8_t *limit0,
+                                            const uint8_t *thresh0,
+                                            const uint8_t *blimit1,
+                                            const uint8_t *limit1,
+                                            const uint8_t *thresh1) {
+  int i, j;
+  const uint8_t *blimit = blimit0;
+  const uint8_t *limit = limit0;
+  const uint8_t *thresh = thresh0;
+
+  for (i = 0; i < 2; ++i) {
+    for (j = 0; j < 8; ++j) {
+      const uint8_t p3 = s[-4 * p], p2 = s[-3 * p], p1 = s[-2 * p], p0 = s[-p];
+      const uint8_t q0 = s[0 * p], q1 = s[1 * p], q2 = s[2 * p], q3 = s[3 * p];
+
+      const int8_t mask = filter_mask(*limit, *blimit,
+                                      p3, p2, p1, p0, q0, q1, q2, q3);
+      const int8_t hev = hev_mask(*thresh, p1, p0, q0, q1);
+      const int8_t flat = flat_mask4(1, p3, p2, p1, p0, q0, q1, q2, q3);
+      filter8(mask, hev, flat, s - 4 * p, s - 3 * p, s - 2 * p, s - 1 * p,
+                               s,         s + 1 * p, s + 2 * p, s + 3 * p);
+      ++s;
+    }
+    blimit = blimit1;
+    limit = limit1;
+    thresh = thresh1;
+  }
+}
+
 void vp9_mbloop_filter_vertical_edge_c(uint8_t *s, int pitch,
                                        const uint8_t *blimit,
                                        const uint8_t *limit,

diff --git a/vp9/common/vp9_onyxc_int.h b/vp9/common/vp9_onyxc_int.h
index a2af57a..fb959cb 100644
--- a/vp9/common/vp9_onyxc_int.h
+++ b/vp9/common/vp9_onyxc_int.h

@@ -38,6 +38,11 @@
 #define NUM_FRAME_CONTEXTS_LOG2 2
 #define NUM_FRAME_CONTEXTS (1 << NUM_FRAME_CONTEXTS_LOG2)
 
+extern const struct {
+  PARTITION_CONTEXT above;
+  PARTITION_CONTEXT left;
+} partition_context_lookup[BLOCK_SIZES];
+
 typedef struct frame_contexts {
   vp9_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
   vp9_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
@@ -298,52 +303,40 @@
 static INLINE void update_partition_context(
     PARTITION_CONTEXT *above_seg_context,
     PARTITION_CONTEXT left_seg_context[8],
-    int mi_row, int mi_col,
-    BLOCK_SIZE sb_type,
-    BLOCK_SIZE sb_size) {
-  PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col;
-  PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK);
+    int mi_row, int mi_col, BLOCK_SIZE subsize, BLOCK_SIZE bsize) {
+  PARTITION_CONTEXT *const above_ctx = above_seg_context + mi_col;
+  PARTITION_CONTEXT *const left_ctx = left_seg_context + (mi_row & MI_MASK);
 
-  const int bsl = b_width_log2(sb_size), bs = (1 << bsl) / 2;
-  const int bwl = b_width_log2(sb_type);
-  const int bhl = b_height_log2(sb_type);
-  const int boffset = b_width_log2(BLOCK_64X64) - bsl;
-  const char pcval0 = ~(0xe << boffset);
-  const char pcval1 = ~(0xf << boffset);
-  const char pcvalue[2] = {pcval0, pcval1};
-
-  assert(MAX(bwl, bhl) <= bsl);
+  // num_4x4_blocks_wide_lookup[bsize] / 2
+  const int bs = num_8x8_blocks_wide_lookup[bsize];
 
   // update the partition context at the end notes. set partition bits
   // of block sizes larger than the current one to be one, and partition
   // bits of smaller block sizes to be zero.
-  vpx_memset(above_ctx, pcvalue[bwl == bsl], bs);
-  vpx_memset(left_ctx, pcvalue[bhl == bsl], bs);
+  vpx_memset(above_ctx, partition_context_lookup[subsize].above, bs);
+  vpx_memset(left_ctx, partition_context_lookup[subsize].left, bs);
 }
 
 static INLINE int partition_plane_context(
     const PARTITION_CONTEXT *above_seg_context,
     const PARTITION_CONTEXT left_seg_context[8],
-    int mi_row, int mi_col,
-    BLOCK_SIZE sb_type) {
+    int mi_row, int mi_col, BLOCK_SIZE bsize) {
   const PARTITION_CONTEXT *above_ctx = above_seg_context + mi_col;
   const PARTITION_CONTEXT *left_ctx = left_seg_context + (mi_row & MI_MASK);
 
-  int bsl = mi_width_log2(sb_type), bs = 1 << bsl;
+  const int bsl = mi_width_log2(bsize);
+  const int bs = 1 << bsl;
   int above = 0, left = 0, i;
-  int boffset = mi_width_log2(BLOCK_64X64) - bsl;
 
-  assert(mi_width_log2(sb_type) == mi_height_log2(sb_type));
+  assert(mi_width_log2(bsize) == mi_height_log2(bsize));
   assert(bsl >= 0);
-  assert(boffset >= 0);
 
-  for (i = 0; i < bs; i++)
-    above |= (above_ctx[i] & (1 << boffset));
-  for (i = 0; i < bs; i++)
-    left |= (left_ctx[i] & (1 << boffset));
-
-  above = (above > 0);
-  left  = (left > 0);
+  for (i = 0; i < bs; i++) {
+    above |= above_ctx[i];
+    left |= left_ctx[i];
+  }
+  above = (above & bs) > 0;
+  left  = (left & bs) > 0;
 
   return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
 }

diff --git a/vp9/common/vp9_reconinter.c b/vp9/common/vp9_reconinter.c
index 1c96788..3add81b 100644
--- a/vp9/common/vp9_reconinter.c
+++ b/vp9/common/vp9_reconinter.c

@@ -20,25 +20,6 @@
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
 
-void vp9_setup_interp_filters(MACROBLOCKD *xd,
-                              INTERPOLATION_TYPE mcomp_filter_type,
-                              VP9_COMMON *cm) {
-  if (xd->mi_8x8 && xd->mi_8x8[0]) {
-    MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
-
-    set_scale_factors(xd, mbmi->ref_frame[0] - LAST_FRAME,
-                          mbmi->ref_frame[1] - LAST_FRAME,
-                          cm->active_ref_scale);
-  } else {
-    set_scale_factors(xd, -1, -1, cm->active_ref_scale);
-  }
-
-  xd->subpix.filter_x = xd->subpix.filter_y =
-      vp9_get_filter_kernel(mcomp_filter_type == SWITCHABLE ?
-                               EIGHTTAP : mcomp_filter_type);
-
-  assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
-}
 
 static void inter_predictor(const uint8_t *src, int src_stride,
                             uint8_t *dst, int dst_stride,
@@ -117,16 +98,13 @@
   return clamped_mv;
 }
 
-struct build_inter_predictors_args {
-  MACROBLOCKD *xd;
-  int x, y;
-};
 
-static void build_inter_predictors(int plane, int block, BLOCK_SIZE bsize,
-                                   int pred_w, int pred_h,
-                                   void *argv) {
-  const struct build_inter_predictors_args* const arg = argv;
-  MACROBLOCKD *const xd = arg->xd;
+// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
+// calculate the subsampled BLOCK_SIZE, but that type isn't defined for
+// sizes smaller than 16x16 yet.
+static void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
+                                   BLOCK_SIZE bsize, int pred_w, int pred_h,
+                                   int mi_x, int mi_y) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int bwl = b_width_log2(bsize) - pd->subsampling_x;
   const int bw = 4 << bwl;
@@ -172,7 +150,7 @@
 
     if (vp9_is_scaled(scale->sfc)) {
       pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, scale);
-      scale->sfc->set_scaled_offsets(scale, arg->y + y, arg->x + x);
+      scale->sfc->set_scaled_offsets(scale, mi_y + y, mi_x + x);
       scaled_mv = scale->sfc->scale_mv(&mv_q4, scale);
       xs = scale->sfc->x_step_q4;
       ys = scale->sfc->y_step_q4;
@@ -190,40 +168,25 @@
   }
 }
 
-// TODO(jkoleszar): In principle, pred_w, pred_h are unnecessary, as we could
-// calculate the subsampled BLOCK_SIZE, but that type isn't defined for
-// sizes smaller than 16x16 yet.
-typedef void (*foreach_predicted_block_visitor)(int plane, int block,
-                                                BLOCK_SIZE bsize,
-                                                int pred_w, int pred_h,
-                                                void *arg);
-static INLINE void foreach_predicted_block_in_plane(
-    const MACROBLOCKD* const xd, BLOCK_SIZE bsize, int plane,
-    foreach_predicted_block_visitor visit, void *arg) {
-  const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
-  const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
-
-  if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) {
-    int i = 0, x, y;
-    assert(bsize == BLOCK_8X8);
-    for (y = 0; y < 1 << bhl; ++y)
-      for (x = 0; x < 1 << bwl; ++x)
-        visit(plane, i++, bsize, 0, 0, arg);
-  } else {
-    visit(plane, 0, bsize, bwl, bhl, arg);
-  }
-}
-
 static void build_inter_predictors_for_planes(MACROBLOCKD *xd, BLOCK_SIZE bsize,
                                               int mi_row, int mi_col,
                                               int plane_from, int plane_to) {
   int plane;
   for (plane = plane_from; plane <= plane_to; ++plane) {
-    struct build_inter_predictors_args args = {
-      xd, mi_col * MI_SIZE, mi_row * MI_SIZE,
-    };
-    foreach_predicted_block_in_plane(xd, bsize, plane, build_inter_predictors,
-                                     &args);
+    const int mi_x = mi_col * MI_SIZE;
+    const int mi_y = mi_row * MI_SIZE;
+    const int bwl = b_width_log2(bsize) - xd->plane[plane].subsampling_x;
+    const int bhl = b_height_log2(bsize) - xd->plane[plane].subsampling_y;
+
+    if (xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8) {
+      int i = 0, x, y;
+      assert(bsize == BLOCK_8X8);
+      for (y = 0; y < 1 << bhl; ++y)
+        for (x = 0; x < 1 << bwl; ++x)
+          build_inter_predictors(xd, plane, i++, bsize, 0, 0, mi_x, mi_y);
+    } else {
+      build_inter_predictors(xd, plane, 0, bsize, bwl, bhl, mi_x, mi_y);
+    }
   }
 }
 

diff --git a/vp9/common/vp9_reconinter.h b/vp9/common/vp9_reconinter.h
index 2c8a6e4..b328754 100644
--- a/vp9/common/vp9_reconinter.h
+++ b/vp9/common/vp9_reconinter.h

@@ -24,10 +24,6 @@
 void vp9_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
                                    BLOCK_SIZE bsize);
 
-void vp9_setup_interp_filters(MACROBLOCKD *xd,
-                              INTERPOLATION_TYPE filter,
-                              VP9_COMMON *cm);
-
 void vp9_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const MV *mv_q3,

diff --git a/vp9/common/vp9_rtcd_defs.sh b/vp9/common/vp9_rtcd_defs.sh
index 2c0864e..e18e757 100644
--- a/vp9/common/vp9_rtcd_defs.sh
+++ b/vp9/common/vp9_rtcd_defs.sh

@@ -205,9 +205,15 @@
 prototype void vp9_mbloop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_mbloop_filter_horizontal_edge sse2 neon dspr2
 
+prototype void vp9_mbloop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
+specialize vp9_mbloop_filter_horizontal_edge_16 sse2 neon
+
 prototype void vp9_loop_filter_horizontal_edge "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh, int count"
 specialize vp9_loop_filter_horizontal_edge mmx neon dspr2
 
+prototype void vp9_loop_filter_horizontal_edge_16 "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1"
+specialize vp9_loop_filter_horizontal_edge_16 sse2 neon
+
 #
 # post proc
 #
@@ -296,7 +302,8 @@
 specialize vp9_idct32x32_1024_add sse2 neon dspr2
 
 prototype void vp9_idct32x32_34_add "const int16_t *input, uint8_t *dest, int dest_stride"
-specialize vp9_idct32x32_34_add sse2 dspr2
+specialize vp9_idct32x32_34_add sse2 neon dspr2
+vp9_idct32x32_34_add_neon=vp9_idct32x32_1024_add_neon
 
 prototype void vp9_idct32x32_1_add "const int16_t *input, uint8_t *dest, int dest_stride"
 specialize vp9_idct32x32_1_add sse2 neon dspr2

diff --git a/vp9/common/vp9_scan.c b/vp9/common/vp9_scan.c
index f17da91..f62150f 100644
--- a/vp9/common/vp9_scan.c
+++ b/vp9/common/vp9_scan.c

@@ -266,6 +266,62 @@
 DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_16x16[256]);
 DECLARE_ALIGNED(16, int16_t, vp9_default_iscan_32x32[1024]);
 
+const scan_order inter_scan_orders[TX_SIZES] = {
+  {vp9_default_scan_4x4,   vp9_default_scan_4x4_neighbors},    // 4x4
+  {vp9_default_scan_8x8,   vp9_default_scan_8x8_neighbors},    // 8x8
+  {vp9_default_scan_16x16, vp9_default_scan_16x16_neighbors},  // 16x16
+  {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors},  // 32x32
+};
+
+const scan_order intra_scan_orders[TX_SIZES][INTRA_MODES] = {
+  {  // 4X4
+    {vp9_default_scan_4x4,   vp9_default_scan_4x4_neighbors},    // DC
+    {vp9_row_scan_4x4,       vp9_row_scan_4x4_neighbors},        // V
+    {vp9_col_scan_4x4,       vp9_col_scan_4x4_neighbors},        // H
+    {vp9_default_scan_4x4,   vp9_default_scan_4x4_neighbors},    // D45
+    {vp9_default_scan_4x4,   vp9_default_scan_4x4_neighbors},    // D135
+    {vp9_row_scan_4x4,       vp9_row_scan_4x4_neighbors},        // D117
+    {vp9_col_scan_4x4,       vp9_col_scan_4x4_neighbors},        // D153
+    {vp9_col_scan_4x4,       vp9_col_scan_4x4_neighbors},        // D207
+    {vp9_row_scan_4x4,       vp9_row_scan_4x4_neighbors},        // D63
+    {vp9_default_scan_4x4,   vp9_default_scan_4x4_neighbors},    // TM
+  }, {  // 8x8
+    {vp9_default_scan_8x8,   vp9_default_scan_8x8_neighbors},    // DC
+    {vp9_row_scan_8x8,       vp9_row_scan_8x8_neighbors},        // V
+    {vp9_col_scan_8x8,       vp9_col_scan_8x8_neighbors},        // H
+    {vp9_default_scan_8x8,   vp9_default_scan_8x8_neighbors},    // D45
+    {vp9_default_scan_8x8,   vp9_default_scan_8x8_neighbors},    // D135
+    {vp9_row_scan_8x8,       vp9_row_scan_8x8_neighbors},        // D117
+    {vp9_col_scan_8x8,       vp9_col_scan_8x8_neighbors},        // D153
+    {vp9_col_scan_8x8,       vp9_col_scan_8x8_neighbors},        // D207
+    {vp9_row_scan_8x8,       vp9_row_scan_8x8_neighbors},        // D63
+    {vp9_default_scan_8x8,   vp9_default_scan_8x8_neighbors},    // TM
+  }, {  // 16x16
+    {vp9_default_scan_16x16, vp9_default_scan_16x16_neighbors},  // DC
+    {vp9_row_scan_16x16,     vp9_row_scan_16x16_neighbors},      // V
+    {vp9_col_scan_16x16,     vp9_col_scan_16x16_neighbors},      // H
+    {vp9_default_scan_16x16, vp9_default_scan_16x16_neighbors},  // D45
+    {vp9_default_scan_16x16, vp9_default_scan_16x16_neighbors},  // D135
+    {vp9_row_scan_16x16,     vp9_row_scan_16x16_neighbors},      // D117
+    {vp9_col_scan_16x16,     vp9_col_scan_16x16_neighbors},      // D153
+    {vp9_col_scan_16x16,     vp9_col_scan_16x16_neighbors},      // D207
+    {vp9_row_scan_16x16,     vp9_row_scan_16x16_neighbors},      // D63
+    {vp9_default_scan_16x16, vp9_default_scan_16x16_neighbors},  // TM
+  }, {  // 32x32
+    {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors},  // DC
+    {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors},  // V
+    {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors},  // H
+    {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors},  // D45
+    {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors},  // D135
+    {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors},  // D117
+    {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors},  // D153
+    {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors},  // D207
+    {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors},  // D63
+    {vp9_default_scan_32x32, vp9_default_scan_32x32_neighbors},  // TM
+  }
+};
+
+
 static int find_in_scan(const int16_t *scan, int l, int idx) {
   int n, l2 = l * l;
   for (n = 0; n < l2; n++) {

diff --git a/vp9/common/vp9_scan.h b/vp9/common/vp9_scan.h
index 14a1a7e..98fc607 100644
--- a/vp9/common/vp9_scan.h
+++ b/vp9/common/vp9_scan.h

@@ -15,6 +15,7 @@
 #include "vpx_ports/mem.h"
 
 #include "vp9/common/vp9_enums.h"
+#include "vp9/common/vp9_blockd.h"
 
 #define MAX_NEIGHBORS 2
 
@@ -67,9 +68,16 @@
 extern DECLARE_ALIGNED(16, int16_t,
                        vp9_default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]);
 
-
 void vp9_init_neighbors();
 
+typedef struct {
+  const int16_t *scan;
+  const int16_t *neighbors;
+} scan_order;
+
+extern const scan_order intra_scan_orders[TX_SIZES][INTRA_MODES];
+extern const scan_order inter_scan_orders[TX_SIZES];
+
 static INLINE const int16_t* get_scan_4x4(TX_TYPE tx_type) {
   switch (tx_type) {
     case ADST_DCT:

diff --git a/vp9/common/vp9_tile_common.c b/vp9/common/vp9_tile_common.c
index e3035d0..78909dd 100644
--- a/vp9/common/vp9_tile_common.c
+++ b/vp9/common/vp9_tile_common.c

@@ -15,46 +15,37 @@
 #define MIN_TILE_WIDTH_B64 4
 #define MAX_TILE_WIDTH_B64 64
 
-static int to_sbs(n_mis) {
-  return mi_cols_aligned_to_sb(n_mis) >> MI_BLOCK_SIZE_LOG2;
+static int get_tile_offset(int idx, int mis, int log2) {
+  const int sb_cols = mi_cols_aligned_to_sb(mis) >> MI_BLOCK_SIZE_LOG2;
+  const int offset = ((idx * sb_cols) >> log2) << MI_BLOCK_SIZE_LOG2;
+  return MIN(offset, mis);
 }
 
-static void get_tile_offsets(int *min_tile_off, int *max_tile_off,
-                             int tile_idx, int log2_n_tiles, int n_mis) {
-  const int n_sbs = to_sbs(n_mis);
-  const int sb_off1 =  (tile_idx      * n_sbs) >> log2_n_tiles;
-  const int sb_off2 = ((tile_idx + 1) * n_sbs) >> log2_n_tiles;
-
-  *min_tile_off = MIN(sb_off1 << 3, n_mis);
-  *max_tile_off = MIN(sb_off2 << 3, n_mis);
-}
-
-void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm,
-                   int row_idx, int col_idx) {
-  get_tile_offsets(&tile->mi_row_start, &tile->mi_row_end,
-                   row_idx, cm->log2_tile_rows, cm->mi_rows);
-  get_tile_offsets(&tile->mi_col_start, &tile->mi_col_end,
-                   col_idx, cm->log2_tile_cols, cm->mi_cols);
+void vp9_tile_init(TileInfo *tile, const VP9_COMMON *cm, int row, int col) {
+  tile->mi_row_start = get_tile_offset(row, cm->mi_rows, cm->log2_tile_rows);
+  tile->mi_row_end = get_tile_offset(row + 1, cm->mi_rows, cm->log2_tile_rows);
+  tile->mi_col_start = get_tile_offset(col, cm->mi_cols, cm->log2_tile_cols);
+  tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols);
 }
 
 void vp9_get_tile_n_bits(int mi_cols,
                          int *min_log2_tile_cols, int *max_log2_tile_cols) {
-  const int sb_cols = to_sbs(mi_cols);
-  int min_log2_n_tiles, max_log2_n_tiles;
+  const int sb_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  int min_log2 = 0, max_log2 = 0;
 
-  for (max_log2_n_tiles = 0;
-       (sb_cols >> max_log2_n_tiles) >= MIN_TILE_WIDTH_B64;
-       max_log2_n_tiles++) {}
-  max_log2_n_tiles--;
-  if (max_log2_n_tiles <  0)
-    max_log2_n_tiles = 0;
+  // max: largest log2 with (sb_cols >> log2) >= MIN_TILE_WIDTH_B64, else 0
+  while ((sb_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
+    ++max_log2;
+  --max_log2;
+  if (max_log2 < 0)
+    max_log2 = 0;
 
-  for (min_log2_n_tiles = 0;
-       (MAX_TILE_WIDTH_B64 << min_log2_n_tiles) < sb_cols;
-       min_log2_n_tiles++) {}
+  // min: smallest log2 with (MAX_TILE_WIDTH_B64 << log2) >= sb_cols
+  while ((MAX_TILE_WIDTH_B64 << min_log2) < sb_cols)
+    ++min_log2;
 
-  assert(min_log2_n_tiles <= max_log2_n_tiles);
+  assert(min_log2 <= max_log2);
 
-  *min_log2_tile_cols = min_log2_n_tiles;
-  *max_log2_tile_cols = max_log2_n_tiles;
+  *min_log2_tile_cols = min_log2;
+  *max_log2_tile_cols = max_log2;
 }

diff --git a/vp9/common/vp9_tile_common.h b/vp9/common/vp9_tile_common.h
index a110abb..a09876e 100644
--- a/vp9/common/vp9_tile_common.h
+++ b/vp9/common/vp9_tile_common.h

@@ -18,10 +18,10 @@
   int mi_col_start, mi_col_end;
 } TileInfo;
 
-// initializes 'tile->mi_(row|col)_(start|end)' for (row_idx, col_idx) based on
+// initializes 'tile->mi_(row|col)_(start|end)' for (row, col) based on
 // 'cm->log2_tile_(rows|cols)' & 'cm->mi_(rows|cols)'
 void vp9_tile_init(TileInfo *tile, const struct VP9Common *cm,
-                   int row_idx, int col_idx);
+                   int row, int col);
 
 void vp9_get_tile_n_bits(int mi_cols,
                          int *min_log2_tile_cols, int *max_log2_tile_cols);

diff --git a/vp9/common/x86/vp9_idct_intrin_sse2.c b/vp9/common/x86/vp9_idct_intrin_sse2.c
index 2a33844..c65184f 100644
--- a/vp9/common/x86/vp9_idct_intrin_sse2.c
+++ b/vp9/common/x86/vp9_idct_intrin_sse2.c

@@ -174,12 +174,10 @@
 
 static INLINE void transpose_4x4(__m128i *res) {
   const __m128i tr0_0 = _mm_unpacklo_epi16(res[0], res[1]);
-  const __m128i tr0_1 = _mm_unpacklo_epi16(res[2], res[3]);
-  res[0] = _mm_unpacklo_epi32(tr0_0, tr0_1);
-  res[2] = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  const __m128i tr0_1 = _mm_unpackhi_epi16(res[0], res[1]);
 
-  res[1] = _mm_unpackhi_epi64(res[0], res[0]);
-  res[3] = _mm_unpackhi_epi64(res[2], res[2]);
+  res[0] = _mm_unpacklo_epi16(tr0_0, tr0_1);
+  res[1] = _mm_unpackhi_epi16(tr0_0, tr0_1);
 }
 
 static void idct4_1d_sse2(__m128i *in) {
@@ -192,8 +190,8 @@
 
   transpose_4x4(in);
   // stage 1
-  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
-  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
   v[0] = _mm_madd_epi16(u[0], k__cospi_p16_p16);
   v[1] = _mm_madd_epi16(u[0], k__cospi_p16_m16);
   v[2] = _mm_madd_epi16(u[1], k__cospi_p24_m08);
@@ -209,16 +207,13 @@
   v[2] = _mm_srai_epi32(u[2], DCT_CONST_BITS);
   v[3] = _mm_srai_epi32(u[3], DCT_CONST_BITS);
 
-  u[0] = _mm_packs_epi32(v[0], v[2]);
-  u[1] = _mm_packs_epi32(v[1], v[3]);
-  u[2] = _mm_unpackhi_epi64(u[0], u[0]);
-  u[3] = _mm_unpackhi_epi64(u[1], u[1]);
+  u[0] = _mm_packs_epi32(v[0], v[1]);
+  u[1] = _mm_packs_epi32(v[3], v[2]);
 
   // stage 2
-  in[0] = _mm_add_epi16(u[0], u[3]);
-  in[1] = _mm_add_epi16(u[1], u[2]);
-  in[2] = _mm_sub_epi16(u[1], u[2]);
-  in[3] = _mm_sub_epi16(u[0], u[3]);
+  in[0] = _mm_add_epi16(u[0], u[1]);
+  in[1] = _mm_sub_epi16(u[0], u[1]);
+  in[1] = _mm_shuffle_epi32(in[1], 0x4E);
 }
 
 static void iadst4_1d_sse2(__m128i *in) {
@@ -232,13 +227,14 @@
   __m128i u[8], v[8], in7;
 
   transpose_4x4(in);
-  in7 = _mm_add_epi16(in[0], in[3]);
-  in7 = _mm_sub_epi16(in7, in[2]);
+  in7 = _mm_srli_si128(in[1], 8);
+  in7 = _mm_add_epi16(in7, in[0]);
+  in7 = _mm_sub_epi16(in7, in[1]);
 
-  u[0] = _mm_unpacklo_epi16(in[0], in[2]);
-  u[1] = _mm_unpacklo_epi16(in[1], in[3]);
+  u[0] = _mm_unpacklo_epi16(in[0], in[1]);
+  u[1] = _mm_unpackhi_epi16(in[0], in[1]);
   u[2] = _mm_unpacklo_epi16(in7, kZero);
-  u[3] = _mm_unpacklo_epi16(in[1], kZero);
+  u[3] = _mm_unpackhi_epi16(in[0], kZero);
 
   v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p04);  // s0 + s3
   v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p02);  // s2 + s5
@@ -265,22 +261,18 @@
   u[2] = _mm_srai_epi32(v[2], DCT_CONST_BITS);
   u[3] = _mm_srai_epi32(v[3], DCT_CONST_BITS);
 
-  in[0] = _mm_packs_epi32(u[0], u[2]);
-  in[1] = _mm_packs_epi32(u[1], u[3]);
-  in[2] = _mm_unpackhi_epi64(in[0], in[0]);
-  in[3] = _mm_unpackhi_epi64(in[1], in[1]);
+  in[0] = _mm_packs_epi32(u[0], u[1]);
+  in[1] = _mm_packs_epi32(u[2], u[3]);
 }
 
 void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                             int tx_type) {
-  __m128i in[4];
+  __m128i in[2];
   const __m128i zero = _mm_setzero_si128();
   const __m128i eight = _mm_set1_epi16(8);
 
-  in[0] = _mm_loadl_epi64((const __m128i *)input);
-  in[1] = _mm_loadl_epi64((const __m128i *)(input + 4));
-  in[2] = _mm_loadl_epi64((const __m128i *)(input + 8));
-  in[3] = _mm_loadl_epi64((const __m128i *)(input + 12));
+  in[0]= _mm_loadu_si128((const __m128i *)(input));
+  in[1]= _mm_loadu_si128((const __m128i *)(input + 8));
 
   switch (tx_type) {
     case 0:  // DCT_DCT
@@ -307,18 +299,35 @@
   // Final round and shift
   in[0] = _mm_add_epi16(in[0], eight);
   in[1] = _mm_add_epi16(in[1], eight);
-  in[2] = _mm_add_epi16(in[2], eight);
-  in[3] = _mm_add_epi16(in[3], eight);
 
   in[0] = _mm_srai_epi16(in[0], 4);
   in[1] = _mm_srai_epi16(in[1], 4);
-  in[2] = _mm_srai_epi16(in[2], 4);
-  in[3] = _mm_srai_epi16(in[3], 4);
 
-  RECON_AND_STORE4X4(dest, in[0]);
-  RECON_AND_STORE4X4(dest, in[1]);
-  RECON_AND_STORE4X4(dest, in[2]);
-  RECON_AND_STORE4X4(dest, in[3]);
+  // Reconstruction and Store
+  {
+     __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
+     d0 = _mm_unpacklo_epi32(d0,
+          _mm_cvtsi32_si128(*(const int *) (dest + stride)));
+     d2 = _mm_unpacklo_epi32(d2, _mm_cvtsi32_si128(
+                    *(const int *) (dest + stride * 3)));
+     d0 = _mm_unpacklo_epi8(d0, zero);
+     d2 = _mm_unpacklo_epi8(d2, zero);
+     d0 = _mm_add_epi16(d0, in[0]);
+     d2 = _mm_add_epi16(d2, in[1]);
+     d0 = _mm_packus_epi16(d0, d2);
+     // store result[0]
+     *(int *)dest = _mm_cvtsi128_si32(d0);
+     // store result[1]
+     d0 = _mm_srli_si128(d0, 4);
+     *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
+     // store result[2]
+     d0 = _mm_srli_si128(d0, 4);
+     *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
+     // store result[3]
+     d0 = _mm_srli_si128(d0, 4);
+     *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
+  }
 }
 
 #define TRANSPOSE_8X8(in0, in1, in2, in3, in4, in5, in6, in7, \

diff --git a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
index fa4dd9b..925f74d 100644
--- a/vp9/common/x86/vp9_loopfilter_intrin_sse2.c
+++ b/vp9/common/x86/vp9_loopfilter_intrin_sse2.c

@@ -17,20 +17,14 @@
                                             const unsigned char *_blimit,
                                             const unsigned char *_limit,
                                             const unsigned char *_thresh) {
-  __m128i mask, hev, flat, flat2;
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
+  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i mask, hev, flat, flat2;
   __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
   __m128i abs_p1p0;
-  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
-  const unsigned int extended_limit  = _limit[0]  * 0x01010101u;
-  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
-  const __m128i thresh =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
-  const __m128i limit =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
-  const __m128i blimit =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
 
   q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
   q4p4 = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(q4p4),
@@ -375,32 +369,25 @@
                                              const unsigned char *_blimit,
                                              const unsigned char *_limit,
                                              const unsigned char *_thresh) {
-  DECLARE_ALIGNED(16, unsigned char, flat2_op[7][16]);
-  DECLARE_ALIGNED(16, unsigned char, flat2_oq[7][16]);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_op, 7 * 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat2_oq, 7 * 16);
 
-  DECLARE_ALIGNED(16, unsigned char, flat_op[3][16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq[3][16]);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op, 3 * 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq, 3 * 16);
 
-  DECLARE_ALIGNED(16, unsigned char, ap[8][16]);
-  DECLARE_ALIGNED(16, unsigned char, aq[8][16]);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, ap, 8 * 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, aq, 8 * 16);
 
-
-  __m128i mask, hev, flat, flat2;
   const __m128i zero = _mm_set1_epi16(0);
   const __m128i one = _mm_set1_epi8(1);
+  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i mask, hev, flat, flat2;
   __m128i p7, p6, p5;
   __m128i p4, p3, p2, p1, p0, q0, q1, q2, q3, q4;
   __m128i q5, q6, q7;
   int i = 0;
-  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
-  const unsigned int extended_limit  = _limit[0]  * 0x01010101u;
-  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
-  const __m128i thresh =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
-  const __m128i limit =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
-  const __m128i blimit =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
 
   p4 = _mm_loadu_si128((__m128i *)(s - 5 * p));
   p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));
@@ -413,16 +400,16 @@
   q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
   q4 = _mm_loadu_si128((__m128i *)(s + 4 * p));
 
-  _mm_store_si128((__m128i *)ap[4], p4);
-  _mm_store_si128((__m128i *)ap[3], p3);
-  _mm_store_si128((__m128i *)ap[2], p2);
-  _mm_store_si128((__m128i *)ap[1], p1);
-  _mm_store_si128((__m128i *)ap[0], p0);
-  _mm_store_si128((__m128i *)aq[4], q4);
-  _mm_store_si128((__m128i *)aq[3], q3);
-  _mm_store_si128((__m128i *)aq[2], q2);
-  _mm_store_si128((__m128i *)aq[1], q1);
-  _mm_store_si128((__m128i *)aq[0], q0);
+  _mm_store_si128((__m128i *)&ap[4 * 16], p4);
+  _mm_store_si128((__m128i *)&ap[3 * 16], p3);
+  _mm_store_si128((__m128i *)&ap[2 * 16], p2);
+  _mm_store_si128((__m128i *)&ap[1 * 16], p1);
+  _mm_store_si128((__m128i *)&ap[0 * 16], p0);
+  _mm_store_si128((__m128i *)&aq[4 * 16], q4);
+  _mm_store_si128((__m128i *)&aq[3 * 16], q3);
+  _mm_store_si128((__m128i *)&aq[2 * 16], q2);
+  _mm_store_si128((__m128i *)&aq[1 * 16], q1);
+  _mm_store_si128((__m128i *)&aq[0 * 16], q0);
 
 
   {
@@ -546,8 +533,8 @@
                                        _mm_subs_epu8(p0, p5)),
                            _mm_or_si128(_mm_subs_epu8(q5, q0),
                                         _mm_subs_epu8(q0, q5)));
-      _mm_store_si128((__m128i *)ap[5], p5);
-      _mm_store_si128((__m128i *)aq[5], q5);
+      _mm_store_si128((__m128i *)&ap[5 * 16], p5);
+      _mm_store_si128((__m128i *)&aq[5 * 16], q5);
       flat2 = _mm_max_epu8(work, flat2);
       p6 = _mm_loadu_si128((__m128i *)(s - 7 * p));
       q6 = _mm_loadu_si128((__m128i *)(s + 6 * p));
@@ -555,8 +542,8 @@
                                        _mm_subs_epu8(p0, p6)),
                            _mm_or_si128(_mm_subs_epu8(q6, q0),
                                         _mm_subs_epu8(q0, q6)));
-      _mm_store_si128((__m128i *)ap[6], p6);
-      _mm_store_si128((__m128i *)aq[6], q6);
+      _mm_store_si128((__m128i *)&ap[6 * 16], p6);
+      _mm_store_si128((__m128i *)&aq[6 * 16], q6);
       flat2 = _mm_max_epu8(work, flat2);
 
       p7 = _mm_loadu_si128((__m128i *)(s - 8 * p));
@@ -565,8 +552,8 @@
                                        _mm_subs_epu8(p0, p7)),
                            _mm_or_si128(_mm_subs_epu8(q7, q0),
                                         _mm_subs_epu8(q0, q7)));
-      _mm_store_si128((__m128i *)ap[7], p7);
-      _mm_store_si128((__m128i *)aq[7], q7);
+      _mm_store_si128((__m128i *)&ap[7 * 16], p7);
+      _mm_store_si128((__m128i *)&aq[7 * 16], q7);
       flat2 = _mm_max_epu8(work, flat2);
       flat2 = _mm_subs_epu8(flat2, one);
       flat2 = _mm_cmpeq_epi8(flat2, zero);
@@ -586,22 +573,38 @@
         __m128i a, b, c;
 
         unsigned int off = i * 8;
-        p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[7] + off)), zero);
-        p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[6] + off)), zero);
-        p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[5] + off)), zero);
-        p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[4] + off)), zero);
-        p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[3] + off)), zero);
-        p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[2] + off)), zero);
-        p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[1] + off)), zero);
-        p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(ap[0] + off)), zero);
-        q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[0] + off)), zero);
-        q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[1] + off)), zero);
-        q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[2] + off)), zero);
-        q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[3] + off)), zero);
-        q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[4] + off)), zero);
-        q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[5] + off)), zero);
-        q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[6] + off)), zero);
-        q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(aq[7] + off)), zero);
+        p7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[7 * 16] + off)),
+                               zero);
+        p6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[6 * 16] + off)),
+                               zero);
+        p5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[5 * 16] + off)),
+                               zero);
+        p4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[4 * 16] + off)),
+                               zero);
+        p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[3 * 16] + off)),
+                               zero);
+        p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[2 * 16] + off)),
+                               zero);
+        p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[1 * 16] + off)),
+                               zero);
+        p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&ap[0 * 16] + off)),
+                               zero);
+        q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[0 * 16] + off)),
+                               zero);
+        q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[1 * 16] + off)),
+                               zero);
+        q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[2 * 16] + off)),
+                               zero);
+        q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[3 * 16] + off)),
+                               zero);
+        q4 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[4 * 16] + off)),
+                               zero);
+        q5 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[5 * 16] + off)),
+                               zero);
+        q6 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[6 * 16] + off)),
+                               zero);
+        q7 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(&aq[7 * 16] + off)),
+                               zero);
 
         c = _mm_sub_epi16(_mm_slli_epi16(p7, 3), p7);  // p7 * 7
         c = _mm_add_epi16(_mm_slli_epi16(p6, 1), _mm_add_epi16(p4, c));
@@ -610,117 +613,117 @@
         a = _mm_add_epi16(p3, _mm_add_epi16(p2, p1));
         a = _mm_add_epi16(_mm_add_epi16(p0, q0), a);
 
-        _mm_storel_epi64((__m128i *)&flat_op[2][i*8],
+        _mm_storel_epi64((__m128i *)&flat_op[2 * 16 + i * 8],
                          _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                           , b));
 
         c = _mm_add_epi16(_mm_add_epi16(p5, eight), c);
         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[6][i*8],
+        _mm_storel_epi64((__m128i *)&flat2_op[6 * 16 + i * 8],
                          _mm_packus_epi16(workp_shft, workp_shft));
 
         a = _mm_add_epi16(q1, a);
         b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p2)), p1);
-        _mm_storel_epi64((__m128i *)&flat_op[1][i*8],
+        _mm_storel_epi64((__m128i *)&flat_op[1 * 16 + i * 8],
                          _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                           , b));
 
         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p6)), p5);
         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[5][i*8],
+        _mm_storel_epi64((__m128i *)&flat2_op[5 * 16 + i * 8],
                          _mm_packus_epi16(workp_shft, workp_shft));
 
         a = _mm_add_epi16(q2, a);
         b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p1)), p0);
-        _mm_storel_epi64((__m128i *)&flat_op[0][i*8],
+        _mm_storel_epi64((__m128i *)&flat_op[i * 8],
                          _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                           , b));
 
         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p5)), p4);
         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[4][i*8],
+        _mm_storel_epi64((__m128i *)&flat2_op[4 * 16 + i * 8],
                          _mm_packus_epi16(workp_shft, workp_shft));
 
         a = _mm_add_epi16(q3, a);
         b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p3, p0)), q0);
-        _mm_storel_epi64((__m128i *)&flat_oq[0][i*8],
+        _mm_storel_epi64((__m128i *)&flat_oq[i * 8],
                          _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                           , b));
 
         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p4)), p3);
         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[3][i*8],
+        _mm_storel_epi64((__m128i *)&flat2_op[3 * 16 + i * 8],
                          _mm_packus_epi16(workp_shft, workp_shft));
 
         b = _mm_add_epi16(q3, b);
         b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p2, q0)), q1);
-        _mm_storel_epi64((__m128i *)&flat_oq[1][i*8],
+        _mm_storel_epi64((__m128i *)&flat_oq[16 + i * 8],
                          _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                           , b));
 
         c = _mm_add_epi16(q4, c);
         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p3)), p2);
         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[2][i*8],
+        _mm_storel_epi64((__m128i *)&flat2_op[2 * 16 + i * 8],
                          _mm_packus_epi16(workp_shft, workp_shft));
 
         b = _mm_add_epi16(q3, b);
         b = _mm_add_epi16(_mm_sub_epi16(b, _mm_add_epi16(p1, q1)), q2);
-        _mm_storel_epi64((__m128i *)&flat_oq[2][i*8],
+        _mm_storel_epi64((__m128i *)&flat_oq[2 * 16 + i * 8],
                          _mm_packus_epi16(_mm_srli_epi16(_mm_add_epi16(a, b), 3)
                                           , b));
         a = _mm_add_epi16(q5, a);
         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p2)), p1);
         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[1][i*8],
+        _mm_storel_epi64((__m128i *)&flat2_op[16 + i * 8],
                          _mm_packus_epi16(workp_shft, workp_shft));
 
         a = _mm_add_epi16(q6, a);
         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p1)), p0);
         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_op[0][i*8],
+        _mm_storel_epi64((__m128i *)&flat2_op[i * 8],
                          _mm_packus_epi16(workp_shft, workp_shft));
 
         a = _mm_add_epi16(q7, a);
         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p7, p0)), q0);
         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[0][i*8],
+        _mm_storel_epi64((__m128i *)&flat2_oq[i * 8],
                          _mm_packus_epi16(workp_shft, workp_shft));
 
         a = _mm_add_epi16(q7, a);
         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p6, q0)), q1);
         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[1][i*8],
+        _mm_storel_epi64((__m128i *)&flat2_oq[16 + i * 8],
                          _mm_packus_epi16(workp_shft, workp_shft));
 
         a = _mm_add_epi16(q7, a);
         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p5, q1)), q2);
         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[2][i*8],
+        _mm_storel_epi64((__m128i *)&flat2_oq[2 * 16 + i * 8],
                          _mm_packus_epi16(workp_shft, workp_shft));
 
         a = _mm_add_epi16(q7, a);
         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p4, q2)), q3);
         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[3][i*8],
+        _mm_storel_epi64((__m128i *)&flat2_oq[3 * 16 + i * 8],
                          _mm_packus_epi16(workp_shft, workp_shft));
 
         a = _mm_add_epi16(q7, a);
         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p3, q3)), q4);
         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[4][i*8],
+        _mm_storel_epi64((__m128i *)&flat2_oq[4 * 16 + i * 8],
                          _mm_packus_epi16(workp_shft, workp_shft));
 
         a = _mm_add_epi16(q7, a);
         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p2, q4)), q5);
         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[5][i*8],
+        _mm_storel_epi64((__m128i *)&flat2_oq[5 * 16 + i * 8],
                          _mm_packus_epi16(workp_shft, workp_shft));
 
         a = _mm_add_epi16(q7, a);
         c = _mm_add_epi16(_mm_sub_epi16(c, _mm_add_epi16(p1, q5)), q6);
         workp_shft = _mm_srli_epi16(_mm_add_epi16(a, c), 4);
-        _mm_storel_epi64((__m128i *)&flat2_oq[6][i*8],
+        _mm_storel_epi64((__m128i *)&flat2_oq[6 * 16 + i * 8],
                          _mm_packus_epi16(workp_shft, workp_shft));
 
         temp_flat2 = _mm_srli_si128(temp_flat2, 8);
@@ -730,51 +733,51 @@
     // wide flat
     // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-    work_a = _mm_load_si128((__m128i *)ap[2]);
-    p2 = _mm_load_si128((__m128i *)flat_op[2]);
+    work_a = _mm_load_si128((__m128i *)&ap[2 * 16]);
+    p2 = _mm_load_si128((__m128i *)&flat_op[2 * 16]);
     work_a = _mm_andnot_si128(flat, work_a);
     p2 = _mm_and_si128(flat, p2);
     p2 = _mm_or_si128(work_a, p2);
-    _mm_store_si128((__m128i *)flat_op[2], p2);
+    _mm_store_si128((__m128i *)&flat_op[2 * 16], p2);
 
-    p1 = _mm_load_si128((__m128i *)flat_op[1]);
+    p1 = _mm_load_si128((__m128i *)&flat_op[1 * 16]);
     work_a = _mm_andnot_si128(flat, ps1);
     p1 = _mm_and_si128(flat, p1);
     p1 = _mm_or_si128(work_a, p1);
-    _mm_store_si128((__m128i *)flat_op[1], p1);
+    _mm_store_si128((__m128i *)&flat_op[1 * 16], p1);
 
-    p0 = _mm_load_si128((__m128i *)flat_op[0]);
+    p0 = _mm_load_si128((__m128i *)&flat_op[0]);
     work_a = _mm_andnot_si128(flat, ps0);
     p0 = _mm_and_si128(flat, p0);
     p0 = _mm_or_si128(work_a, p0);
-    _mm_store_si128((__m128i *)flat_op[0], p0);
+    _mm_store_si128((__m128i *)&flat_op[0], p0);
 
-    q0 = _mm_load_si128((__m128i *)flat_oq[0]);
+    q0 = _mm_load_si128((__m128i *)&flat_oq[0]);
     work_a = _mm_andnot_si128(flat, qs0);
     q0 = _mm_and_si128(flat, q0);
     q0 = _mm_or_si128(work_a, q0);
-    _mm_store_si128((__m128i *)flat_oq[0], q0);
+    _mm_store_si128((__m128i *)&flat_oq[0], q0);
 
-    q1 = _mm_load_si128((__m128i *)flat_oq[1]);
+    q1 = _mm_load_si128((__m128i *)&flat_oq[1 * 16]);
     work_a = _mm_andnot_si128(flat, qs1);
     q1 = _mm_and_si128(flat, q1);
     q1 = _mm_or_si128(work_a, q1);
-    _mm_store_si128((__m128i *)flat_oq[1], q1);
+    _mm_store_si128((__m128i *)&flat_oq[1 * 16], q1);
 
-    work_a = _mm_load_si128((__m128i *)aq[2]);
-    q2 = _mm_load_si128((__m128i *)flat_oq[2]);
+    work_a = _mm_load_si128((__m128i *)&aq[2 * 16]);
+    q2 = _mm_load_si128((__m128i *)&flat_oq[2 * 16]);
     work_a = _mm_andnot_si128(flat, work_a);
     q2 = _mm_and_si128(flat, q2);
     q2 = _mm_or_si128(work_a, q2);
-    _mm_store_si128((__m128i *)flat_oq[2], q2);
+    _mm_store_si128((__m128i *)&flat_oq[2 * 16], q2);
 
     // write out op6 - op3
     {
       unsigned char *dst = (s - 7 * p);
       for (i = 6; i > 2; i--) {
         __m128i flat2_output;
-        work_a = _mm_load_si128((__m128i *)ap[i]);
-        flat2_output = _mm_load_si128((__m128i *)flat2_op[i]);
+        work_a = _mm_load_si128((__m128i *)&ap[i * 16]);
+        flat2_output = _mm_load_si128((__m128i *)&flat2_op[i * 16]);
         work_a = _mm_andnot_si128(flat2, work_a);
         flat2_output = _mm_and_si128(flat2, flat2_output);
         work_a = _mm_or_si128(work_a, flat2_output);
@@ -783,43 +786,43 @@
       }
     }
 
-    work_a = _mm_load_si128((__m128i *)flat_op[2]);
-    p2 = _mm_load_si128((__m128i *)flat2_op[2]);
+    work_a = _mm_load_si128((__m128i *)&flat_op[2 * 16]);
+    p2 = _mm_load_si128((__m128i *)&flat2_op[2 * 16]);
     work_a = _mm_andnot_si128(flat2, work_a);
     p2 = _mm_and_si128(flat2, p2);
     p2 = _mm_or_si128(work_a, p2);
     _mm_storeu_si128((__m128i *)(s - 3 * p), p2);
 
-    work_a = _mm_load_si128((__m128i *)flat_op[1]);
-    p1 = _mm_load_si128((__m128i *)flat2_op[1]);
+    work_a = _mm_load_si128((__m128i *)&flat_op[1 * 16]);
+    p1 = _mm_load_si128((__m128i *)&flat2_op[1 * 16]);
     work_a = _mm_andnot_si128(flat2, work_a);
     p1 = _mm_and_si128(flat2, p1);
     p1 = _mm_or_si128(work_a, p1);
     _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
 
-    work_a = _mm_load_si128((__m128i *)flat_op[0]);
-    p0 = _mm_load_si128((__m128i *)flat2_op[0]);
+    work_a = _mm_load_si128((__m128i *)&flat_op[0]);
+    p0 = _mm_load_si128((__m128i *)&flat2_op[0]);
     work_a = _mm_andnot_si128(flat2, work_a);
     p0 = _mm_and_si128(flat2, p0);
     p0 = _mm_or_si128(work_a, p0);
     _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
 
-    work_a = _mm_load_si128((__m128i *)flat_oq[0]);
-    q0 = _mm_load_si128((__m128i *)flat2_oq[0]);
+    work_a = _mm_load_si128((__m128i *)&flat_oq[0]);
+    q0 = _mm_load_si128((__m128i *)&flat2_oq[0]);
     work_a = _mm_andnot_si128(flat2, work_a);
     q0 = _mm_and_si128(flat2, q0);
     q0 = _mm_or_si128(work_a, q0);
     _mm_storeu_si128((__m128i *)(s - 0 * p), q0);
 
-    work_a = _mm_load_si128((__m128i *)flat_oq[1]);
-    q1 = _mm_load_si128((__m128i *)flat2_oq[1]);
+    work_a = _mm_load_si128((__m128i *)&flat_oq[1 * 16]);
+    q1 = _mm_load_si128((__m128i *)&flat2_oq[16]);
     work_a = _mm_andnot_si128(flat2, work_a);
     q1 = _mm_and_si128(flat2, q1);
     q1 = _mm_or_si128(work_a, q1);
     _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
 
-    work_a = _mm_load_si128((__m128i *)flat_oq[2]);
-    q2 = _mm_load_si128((__m128i *)flat2_oq[2]);
+    work_a = _mm_load_si128((__m128i *)&flat_oq[2 * 16]);
+    q2 = _mm_load_si128((__m128i *)&flat2_oq[2 * 16]);
     work_a = _mm_andnot_si128(flat2, work_a);
     q2 = _mm_and_si128(flat2, q2);
     q2 = _mm_or_si128(work_a, q2);
@@ -830,8 +833,8 @@
       unsigned char *dst = (s + 3 * p);
       for (i = 3; i < 7; i++) {
         __m128i flat2_output;
-        work_a = _mm_load_si128((__m128i *)aq[i]);
-        flat2_output = _mm_load_si128((__m128i *)flat2_oq[i]);
+        work_a = _mm_load_si128((__m128i *)&aq[i * 16]);
+        flat2_output = _mm_load_si128((__m128i *)&flat2_oq[i * 16]);
         work_a = _mm_andnot_si128(flat2, work_a);
         flat2_output = _mm_and_si128(flat2, flat2_output);
         work_a = _mm_or_si128(work_a, flat2_output);
@@ -842,6 +845,7 @@
   }
 }
 
+// TODO(yunqingwang): remove count and call these 2 functions (8 or 16) directly.
 void vp9_mb_lpf_horizontal_edge_w_sse2(unsigned char *s,
                                        int p,
                                        const unsigned char *_blimit,
@@ -860,47 +864,47 @@
                                             const unsigned char *_limit,
                                             const unsigned char *_thresh,
                                             int count) {
-  DECLARE_ALIGNED(16, unsigned char, flat_op2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_op0[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq2[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq1[16]);
-  DECLARE_ALIGNED(16, unsigned char, flat_oq0[16]);
-  __m128i mask, hev, flat;
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
   const __m128i zero = _mm_set1_epi16(0);
+  const __m128i blimit = _mm_load_si128((const __m128i *)_blimit);
+  const __m128i limit = _mm_load_si128((const __m128i *)_limit);
+  const __m128i thresh = _mm_load_si128((const __m128i *)_thresh);
+  __m128i mask, hev, flat;
   __m128i p3, p2, p1, p0, q0, q1, q2, q3;
-  const unsigned int extended_thresh = _thresh[0] * 0x01010101u;
-  const unsigned int extended_limit  = _limit[0]  * 0x01010101u;
-  const unsigned int extended_blimit = _blimit[0] * 0x01010101u;
-  const __m128i thresh =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_thresh), 0);
-  const __m128i limit =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_limit), 0);
-  const __m128i blimit =
-      _mm_shuffle_epi32(_mm_cvtsi32_si128((int)extended_blimit), 0);
+  __m128i q3p3, q2p2, q1p1, q0p0, p1q1, p0q0;
 
   (void)count;
-  p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
-  p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
-  p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
-  p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
-  q0 = _mm_loadl_epi64((__m128i *)(s - 0 * p));
-  q1 = _mm_loadl_epi64((__m128i *)(s + 1 * p));
-  q2 = _mm_loadl_epi64((__m128i *)(s + 2 * p));
-  q3 = _mm_loadl_epi64((__m128i *)(s + 3 * p));
+
+  q3p3 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 4 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 3 * p)));
+  q2p2 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 3 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 2 * p)));
+  q1p1 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
+                            _mm_loadl_epi64((__m128i *)(s + 1 * p)));
+  q0p0 = _mm_unpacklo_epi64(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
+                            _mm_loadl_epi64((__m128i *)(s - 0 * p)));
+  p1q1 = _mm_shuffle_epi32(q1p1, 78);
+  p0q0 = _mm_shuffle_epi32(q0p0, 78);
+
   {
-    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),
-                                          _mm_subs_epu8(p0, p1));
-    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
-                                          _mm_subs_epu8(q0, q1));
+    // filter_mask and hev_mask
     const __m128i one = _mm_set1_epi8(1);
     const __m128i fe = _mm_set1_epi8(0xfe);
-    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
-    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
-                                    _mm_subs_epu8(q0, p0));
-    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
-                                    _mm_subs_epu8(q1, p1));
-    __m128i work;
+    const __m128i ff = _mm_cmpeq_epi8(fe, fe);
+    __m128i abs_p1q1, abs_p0q0, abs_q1q0, abs_p1p0, work;
+    abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0),
+                            _mm_subs_epu8(q0p0, q1p1));
+    abs_q1q0 =  _mm_srli_si128(abs_p1p0, 8);
+
+    abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0),
+                            _mm_subs_epu8(p0q0, q0p0));
+    abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1),
+                            _mm_subs_epu8(p1q1, q1p1));
     flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
     hev = _mm_subs_epu8(flat, thresh);
     hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);
@@ -910,36 +914,32 @@
     mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
     mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
     // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
-    mask = _mm_max_epu8(flat, mask);
+    mask = _mm_max_epu8(abs_p1p0, mask);
     // mask |= (abs(p1 - p0) > limit) * -1;
     // mask |= (abs(q1 - q0) > limit) * -1;
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
-                                     _mm_subs_epu8(p1, p2)),
-                         _mm_or_si128(_mm_subs_epu8(p3, p2),
-                                      _mm_subs_epu8(p2, p3)));
+
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q1p1),
+                                     _mm_subs_epu8(q1p1, q2p2)),
+                        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2),
+                                     _mm_subs_epu8(q2p2, q3p3)));
     mask = _mm_max_epu8(work, mask);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
-                                     _mm_subs_epu8(q1, q2)),
-                         _mm_or_si128(_mm_subs_epu8(q3, q2),
-                                      _mm_subs_epu8(q2, q3)));
-    mask = _mm_max_epu8(work, mask);
+    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
     mask = _mm_subs_epu8(mask, limit);
     mask = _mm_cmpeq_epi8(mask, zero);
 
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
-                                     _mm_subs_epu8(p0, p2)),
-                         _mm_or_si128(_mm_subs_epu8(q2, q0),
-                                      _mm_subs_epu8(q0, q2)));
-    flat = _mm_max_epu8(work, flat);
-    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
-                                     _mm_subs_epu8(p0, p3)),
-                         _mm_or_si128(_mm_subs_epu8(q3, q0),
-                                      _mm_subs_epu8(q0, q3)));
-    flat = _mm_max_epu8(work, flat);
+    // flat_mask4
+
+    flat = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2p2, q0p0),
+                                     _mm_subs_epu8(q0p0, q2p2)),
+                        _mm_or_si128(_mm_subs_epu8(q3p3, q0p0),
+                                     _mm_subs_epu8(q0p0, q3p3)));
+    flat = _mm_max_epu8(abs_p1p0, flat);
+    flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
     flat = _mm_subs_epu8(flat, one);
     flat = _mm_cmpeq_epi8(flat, zero);
     flat = _mm_and_si128(flat, mask);
   }
+
   {
     const __m128i four = _mm_set1_epi16(4);
     unsigned char *src = s;
@@ -996,11 +996,7 @@
     const __m128i t4 = _mm_set1_epi8(4);
     const __m128i t3 = _mm_set1_epi8(3);
     const __m128i t80 = _mm_set1_epi8(0x80);
-    const __m128i te0 = _mm_set1_epi8(0xe0);
-    const __m128i t1f = _mm_set1_epi8(0x1f);
     const __m128i t1 = _mm_set1_epi8(0x1);
-    const __m128i t7f = _mm_set1_epi8(0x7f);
-
     const __m128i ps1 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 2 * p)),
                                       t80);
     const __m128i ps0 = _mm_xor_si128(_mm_loadl_epi64((__m128i *)(s - 1 * p)),
@@ -1025,26 +1021,20 @@
     filter2 = _mm_adds_epi8(filt, t3);
 
     /* Filter1 >> 3 */
-    work_a = _mm_cmpgt_epi8(zero, filter1);
-    filter1 = _mm_srli_epi16(filter1, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter1 = _mm_and_si128(filter1, t1f);
-    filter1 = _mm_or_si128(filter1, work_a);
+    filter1 = _mm_unpacklo_epi8(zero, filter1);
+    filter1 = _mm_srai_epi16(filter1, 11);
+    filter1 = _mm_packs_epi16(filter1, filter1);
 
     /* Filter2 >> 3 */
-    work_a = _mm_cmpgt_epi8(zero, filter2);
-    filter2 = _mm_srli_epi16(filter2, 3);
-    work_a = _mm_and_si128(work_a, te0);
-    filter2 = _mm_and_si128(filter2, t1f);
-    filter2 = _mm_or_si128(filter2, work_a);
+    filter2 = _mm_unpacklo_epi8(zero, filter2);
+    filter2 = _mm_srai_epi16(filter2, 11);
+    filter2 = _mm_packs_epi16(filter2, zero);
 
     /* filt >> 1 */
     filt = _mm_adds_epi8(filter1, t1);
-    work_a = _mm_cmpgt_epi8(zero, filt);
-    filt = _mm_srli_epi16(filt, 1);
-    work_a = _mm_and_si128(work_a, t80);
-    filt = _mm_and_si128(filt, t7f);
-    filt = _mm_or_si128(filt, work_a);
+    filt = _mm_unpacklo_epi8(zero, filt);
+    filt = _mm_srai_epi16(filt, 9);
+    filt = _mm_packs_epi16(filt, zero);
 
     filt = _mm_andnot_si128(hev, filt);
 
@@ -1093,6 +1083,392 @@
   }
 }
 
+void vp9_mbloop_filter_horizontal_edge_16_sse2(uint8_t *s, int p /* pitch */,  // SSE2 filter for a 16-byte-wide edge between rows s - p and s.
+                                               const uint8_t *_blimit0,  // The *0 thresholds apply to bytes 0-7 of each row.
+                                               const uint8_t *_limit0,
+                                               const uint8_t *_thresh0,
+                                               const uint8_t *_blimit1,  // The *1 thresholds apply to bytes 8-15 of each row.
+                                               const uint8_t *_limit1,
+                                               const uint8_t *_thresh1) {  // All six pointers are read with aligned loads (_mm_load_si128).
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op2, 16);  // Scratch rows holding the flat-filter outputs for p2..q2.
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op1, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_op0, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq2, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq1, 16);
+  DECLARE_ALIGNED_ARRAY(16, unsigned char, flat_oq0, 16);
+  const __m128i zero = _mm_set1_epi16(0);
+  const __m128i blimit =  // Low 8 bytes come from set 0, high 8 bytes from set 1.
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  const __m128i limit =
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
+                         _mm_load_si128((const __m128i *)_limit1));
+  const __m128i thresh =
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
+                         _mm_load_si128((const __m128i *)_thresh1));
+
+  __m128i mask, hev, flat;
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));  // Load 16 bytes from each of the eight rows s - 4p .. s + 3p.
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+  {
+    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),  // |a - b|: one of the two saturating differences is always 0.
+                                          _mm_subs_epu8(p0, p1));
+    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+                                          _mm_subs_epu8(q0, q1));
+    const __m128i one = _mm_set1_epi8(1);
+    const __m128i fe = _mm_set1_epi8(0xfe);  // Clears each byte's LSB before the 16-bit shift below.
+    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);  // All ones.
+    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+                                    _mm_subs_epu8(q0, p0));
+    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+                                    _mm_subs_epu8(q1, p1));
+    __m128i work;
+
+    // filter_mask and hev_mask
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);  // max(|p1 - p0|, |q1 - q0|)
+    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);  // hev = 0xff where that max > thresh.
+
+    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);  // 2 * |p0 - q0|, saturating.
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);  // |p1 - q1| / 2 per byte.
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);  // 0xff where the sum > blimit.
+    // i.e. mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(flat, mask);  // Fold in max(|p1 - p0|, |q1 - q0|).
+    // Every difference folded into mask with max() is tested against limit
+    // in one step below, where the per-byte maximum is compared with limit.
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
+                                     _mm_subs_epu8(p1, p2)),
+                         _mm_or_si128(_mm_subs_epu8(p3, p2),
+                                      _mm_subs_epu8(p2, p3)));
+    mask = _mm_max_epu8(work, mask);  // Fold in max(|p2 - p1|, |p3 - p2|).
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
+                                     _mm_subs_epu8(q1, q2)),
+                         _mm_or_si128(_mm_subs_epu8(q3, q2),
+                                      _mm_subs_epu8(q2, q3)));
+    mask = _mm_max_epu8(work, mask);  // Fold in max(|q2 - q1|, |q3 - q2|).
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);  // mask = 0xff where the running max <= limit.
+
+    // flat_mask4
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p0),
+                                     _mm_subs_epu8(p0, p2)),
+                         _mm_or_si128(_mm_subs_epu8(q2, q0),
+                                      _mm_subs_epu8(q0, q2)));
+    flat = _mm_max_epu8(work, flat);  // Fold in max(|p2 - p0|, |q2 - q0|).
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p3, p0),
+                                     _mm_subs_epu8(p0, p3)),
+                         _mm_or_si128(_mm_subs_epu8(q3, q0),
+                                      _mm_subs_epu8(q0, q3)));
+    flat = _mm_max_epu8(work, flat);  // Fold in max(|p3 - p0|, |q3 - q0|).
+    flat = _mm_subs_epu8(flat, one);
+    flat = _mm_cmpeq_epi8(flat, zero);  // 0xff where every folded difference <= 1.
+    flat = _mm_and_si128(flat, mask);  // flat is only kept where mask is also set.
+  }
+  {
+    const __m128i four = _mm_set1_epi16(4);  // Rounding term for the >> 3 below.
+    unsigned char *src = s;
+    int i = 0;  // Index of the 8-byte half being processed.
+
+    do {
+      __m128i workp_a, workp_b, workp_shft;
+      p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 4 * p)), zero);  // Zero-extend 8 bytes of each row to 16 bits.
+      p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 3 * p)), zero);
+      p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 2 * p)), zero);
+      p0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 1 * p)), zero);
+      q0 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src - 0 * p)), zero);
+      q1 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 1 * p)), zero);
+      q2 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 2 * p)), zero);
+      q3 = _mm_unpacklo_epi8(_mm_loadl_epi64((__m128i *)(src + 3 * p)), zero);
+
+      workp_a = _mm_add_epi16(_mm_add_epi16(p3, p3), _mm_add_epi16(p2, p1));
+      workp_a = _mm_add_epi16(_mm_add_epi16(workp_a, four), p0);
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, p2), p3);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);  // (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3
+      _mm_storel_epi64((__m128i *)&flat_op2[i * 8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_b = _mm_add_epi16(_mm_add_epi16(q0, q1), p1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);  // (2*p3 + p2 + 2*p1 + p0 + q0 + q1 + 4) >> 3
+      _mm_storel_epi64((__m128i *)&flat_op1[i * 8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q2);  // The running sums are updated by dropping one sample and adding another.
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p1), p0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);  // (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3
+      _mm_storel_epi64((__m128i *)&flat_op0[i * 8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p3), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, p0), q0);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);  // (p2 + p1 + p0 + 2*q0 + q1 + q2 + q3 + 4) >> 3
+      _mm_storel_epi64((__m128i *)&flat_oq0[i * 8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p2), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q0), q1);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);  // (p1 + p0 + q0 + 2*q1 + q2 + 2*q3 + 4) >> 3
+      _mm_storel_epi64((__m128i *)&flat_oq1[i * 8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      workp_a = _mm_add_epi16(_mm_sub_epi16(workp_a, p1), q3);
+      workp_b = _mm_add_epi16(_mm_sub_epi16(workp_b, q1), q2);
+      workp_shft = _mm_srli_epi16(_mm_add_epi16(workp_a, workp_b), 3);  // (p0 + q0 + q1 + 2*q2 + 3*q3 + 4) >> 3
+      _mm_storel_epi64((__m128i *)&flat_oq2[i * 8],
+                       _mm_packus_epi16(workp_shft, workp_shft));
+
+      src += 8;  // Move to the next 8 bytes of every row.
+    } while (++i < 2);
+  }
+  // lp filter
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8(0x80);  // XOR with 0x80 converts between unsigned and signed bytes.
+    const __m128i te0 = _mm_set1_epi8(0xe0);  // Sign-extension bits for the emulated >> 3.
+    const __m128i t1f = _mm_set1_epi8(0x1f);  // Keeps the 5 valid low bits after >> 3.
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);  // Keeps the 7 valid low bits after >> 1.
+
+    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
+                                      t80);
+    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
+                                      t80);
+    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
+                                      t80);
+    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
+                                      t80);
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);  // (ps1 - qs1) is only used where hev is set.
+    work_a = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);  // Three saturating adds give filt + 3 * (qs0 - ps0).
+    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    /* Filter1 >> 3 */
+    work_a = _mm_cmpgt_epi8(zero, filter1);  // 0xff where filter1 < 0.
+    filter1 = _mm_srli_epi16(filter1, 3);  // 16-bit logical shift; bits from the neighbouring byte leak in.
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);  // Drop the leaked bits.
+    filter1 = _mm_or_si128(filter1, work_a);  // Restore the sign bits of negative bytes.
+
+    /* Filter2 >> 3 */
+    work_a = _mm_cmpgt_epi8(zero, filter2);  // Same emulated arithmetic shift as for filter1.
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
+
+    /* filt >> 1 */
+    filt = _mm_adds_epi8(filter1, t1);  // filt becomes (filter1 + 1) >> 1 below.
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+
+    filt = _mm_andnot_si128(hev, filt);  // Zeroed where hev is set.
+
+    work_a = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);  // Narrow-filter q0, converted back to unsigned.
+    q0 = _mm_load_si128((__m128i *)flat_oq0);  // Flat-filter q0 from the scratch row.
+    work_a = _mm_andnot_si128(flat, work_a);
+    q0 = _mm_and_si128(flat, q0);
+    q0 = _mm_or_si128(work_a, q0);  // Per byte: flat result where flat is set, narrow result otherwise.
+
+    work_a = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+    q1 = _mm_load_si128((__m128i *)flat_oq1);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q1 = _mm_and_si128(flat, q1);
+    q1 = _mm_or_si128(work_a, q1);
+
+    work_a = _mm_loadu_si128((__m128i *)(s + 2 * p));  // The original q2 row is kept where flat is clear.
+    q2 = _mm_load_si128((__m128i *)flat_oq2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    q2 = _mm_and_si128(flat, q2);
+    q2 = _mm_or_si128(work_a, q2);
+
+    work_a = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    p0 = _mm_load_si128((__m128i *)flat_op0);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p0 = _mm_and_si128(flat, p0);
+    p0 = _mm_or_si128(work_a, p0);
+
+    work_a = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+    p1 = _mm_load_si128((__m128i *)flat_op1);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p1 = _mm_and_si128(flat, p1);
+    p1 = _mm_or_si128(work_a, p1);
+
+    work_a = _mm_loadu_si128((__m128i *)(s - 3 * p));  // The original p2 row is kept where flat is clear.
+    p2 = _mm_load_si128((__m128i *)flat_op2);
+    work_a = _mm_andnot_si128(flat, work_a);
+    p2 = _mm_and_si128(flat, p2);
+    p2 = _mm_or_si128(work_a, p2);
+
+    _mm_storeu_si128((__m128i *)(s - 3 * p), p2);  // Six rows (p2..q2) are written back; p3 and q3 are not stored.
+    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);
+    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+    _mm_storeu_si128((__m128i *)(s + 2 * p), q2);
+  }
+}
+
+void vp9_loop_filter_horizontal_edge_16_sse2(unsigned char *s,  // SSE2 filter for a 16-byte-wide edge between rows s - p and s.
+                                            int p,  // Byte offset between consecutive rows.
+                                            const unsigned char *_blimit0,  // The *0 thresholds apply to bytes 0-7 of each row.
+                                            const unsigned char *_limit0,
+                                            const unsigned char *_thresh0,
+                                            const unsigned char *_blimit1,  // The *1 thresholds apply to bytes 8-15 of each row.
+                                            const unsigned char *_limit1,
+                                            const unsigned char *_thresh1) {  // All six pointers are read with aligned loads (_mm_load_si128).
+  const __m128i blimit =  // Low 8 bytes come from set 0, high 8 bytes from set 1.
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_blimit0),
+                         _mm_load_si128((const __m128i *)_blimit1));
+  const __m128i limit =
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_limit0),
+                         _mm_load_si128((const __m128i *)_limit1));
+  const __m128i thresh =
+      _mm_unpacklo_epi64(_mm_load_si128((const __m128i *)_thresh0),
+                         _mm_load_si128((const __m128i *)_thresh1));
+  const __m128i zero = _mm_set1_epi16(0);
+  __m128i p3, p2, p1, p0, q0, q1, q2, q3;
+  __m128i mask, hev, flat;
+
+  p3 = _mm_loadu_si128((__m128i *)(s - 4 * p));  // Load 16 bytes from each of the eight rows s - 4p .. s + 3p.
+  p2 = _mm_loadu_si128((__m128i *)(s - 3 * p));
+  p1 = _mm_loadu_si128((__m128i *)(s - 2 * p));
+  p0 = _mm_loadu_si128((__m128i *)(s - 1 * p));
+  q0 = _mm_loadu_si128((__m128i *)(s - 0 * p));
+  q1 = _mm_loadu_si128((__m128i *)(s + 1 * p));
+  q2 = _mm_loadu_si128((__m128i *)(s + 2 * p));
+  q3 = _mm_loadu_si128((__m128i *)(s + 3 * p));
+
+  // filter_mask and hev_mask
+  {
+    const __m128i abs_p1p0 = _mm_or_si128(_mm_subs_epu8(p1, p0),  // |a - b|: one of the two saturating differences is always 0.
+                                          _mm_subs_epu8(p0, p1));
+    const __m128i abs_q1q0 = _mm_or_si128(_mm_subs_epu8(q1, q0),
+                                          _mm_subs_epu8(q0, q1));
+    const __m128i fe = _mm_set1_epi8(0xfe);  // Clears each byte's LSB before the 16-bit shift below.
+    const __m128i ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);  // All ones.
+    __m128i abs_p0q0 = _mm_or_si128(_mm_subs_epu8(p0, q0),
+                                    _mm_subs_epu8(q0, p0));
+    __m128i abs_p1q1 = _mm_or_si128(_mm_subs_epu8(p1, q1),
+                                    _mm_subs_epu8(q1, p1));
+    __m128i work;
+
+    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);  // Here flat is only a temporary: max(|p1 - p0|, |q1 - q0|).
+    hev = _mm_subs_epu8(flat, thresh);
+    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);  // hev = 0xff where that max > thresh.
+
+    abs_p0q0 =_mm_adds_epu8(abs_p0q0, abs_p0q0);  // 2 * |p0 - q0|, saturating.
+    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);  // |p1 - q1| / 2 per byte.
+    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
+    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);  // 0xff where the sum > blimit.
+    // i.e. mask = (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
+    mask = _mm_max_epu8(flat, mask);  // Fold in max(|p1 - p0|, |q1 - q0|).
+    // Every difference folded into mask with max() is tested against limit
+    // in one step below, where the per-byte maximum is compared with limit.
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(p2, p1),
+                                     _mm_subs_epu8(p1, p2)),
+                         _mm_or_si128(_mm_subs_epu8(p3, p2),
+                                      _mm_subs_epu8(p2, p3)));
+    mask = _mm_max_epu8(work, mask);  // Fold in max(|p2 - p1|, |p3 - p2|).
+    work = _mm_max_epu8(_mm_or_si128(_mm_subs_epu8(q2, q1),
+                                     _mm_subs_epu8(q1, q2)),
+                         _mm_or_si128(_mm_subs_epu8(q3, q2),
+                                      _mm_subs_epu8(q2, q3)));
+    mask = _mm_max_epu8(work, mask);  // Fold in max(|q2 - q1|, |q3 - q2|).
+    mask = _mm_subs_epu8(mask, limit);
+    mask = _mm_cmpeq_epi8(mask, zero);  // mask = 0xff where the running max <= limit.
+  }
+
+  // filter4
+  {
+    const __m128i t4 = _mm_set1_epi8(4);
+    const __m128i t3 = _mm_set1_epi8(3);
+    const __m128i t80 = _mm_set1_epi8(0x80);  // XOR with 0x80 converts between unsigned and signed bytes.
+    const __m128i te0 = _mm_set1_epi8(0xe0);  // Sign-extension bits for the emulated >> 3.
+    const __m128i t1f = _mm_set1_epi8(0x1f);  // Keeps the 5 valid low bits after >> 3.
+    const __m128i t1 = _mm_set1_epi8(0x1);
+    const __m128i t7f = _mm_set1_epi8(0x7f);  // Keeps the 7 valid low bits after >> 1.
+
+    const __m128i ps1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 2 * p)),
+                                      t80);
+    const __m128i ps0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s - 1 * p)),
+                                      t80);
+    const __m128i qs0 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 0 * p)),
+                                      t80);
+    const __m128i qs1 = _mm_xor_si128(_mm_loadu_si128((__m128i *)(s + 1 * p)),
+                                      t80);
+    __m128i filt;
+    __m128i work_a;
+    __m128i filter1, filter2;
+
+    filt = _mm_and_si128(_mm_subs_epi8(ps1, qs1), hev);  // (ps1 - qs1) is only used where hev is set.
+    work_a = _mm_subs_epi8(qs0, ps0);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);
+    filt = _mm_adds_epi8(filt, work_a);  // Three saturating adds give filt + 3 * (qs0 - ps0).
+    /* (vp9_filter + 3 * (qs0 - ps0)) & mask */
+    filt = _mm_and_si128(filt, mask);
+
+    filter1 = _mm_adds_epi8(filt, t4);
+    filter2 = _mm_adds_epi8(filt, t3);
+
+    /* Filter1 >> 3 */
+    work_a = _mm_cmpgt_epi8(zero, filter1);  // 0xff where filter1 < 0.
+    filter1 = _mm_srli_epi16(filter1, 3);  // 16-bit logical shift; bits from the neighbouring byte leak in.
+    work_a = _mm_and_si128(work_a, te0);
+    filter1 = _mm_and_si128(filter1, t1f);  // Drop the leaked bits.
+    filter1 = _mm_or_si128(filter1, work_a);  // Restore the sign bits of negative bytes.
+
+    /* Filter2 >> 3 */
+    work_a = _mm_cmpgt_epi8(zero, filter2);  // Same emulated arithmetic shift as for filter1.
+    filter2 = _mm_srli_epi16(filter2, 3);
+    work_a = _mm_and_si128(work_a, te0);
+    filter2 = _mm_and_si128(filter2, t1f);
+    filter2 = _mm_or_si128(filter2, work_a);
+
+    /* filt >> 1 */
+    filt = _mm_adds_epi8(filter1, t1);  // filt becomes (filter1 + 1) >> 1 below.
+    work_a = _mm_cmpgt_epi8(zero, filt);
+    filt = _mm_srli_epi16(filt, 1);
+    work_a = _mm_and_si128(work_a, t80);
+    filt = _mm_and_si128(filt, t7f);
+    filt = _mm_or_si128(filt, work_a);
+
+    filt = _mm_andnot_si128(hev, filt);  // Zeroed where hev is set.
+
+    q0 = _mm_xor_si128(_mm_subs_epi8(qs0, filter1), t80);  // Apply the adjustments with saturation and convert back to unsigned.
+    q1 = _mm_xor_si128(_mm_subs_epi8(qs1, filt), t80);
+    p0 = _mm_xor_si128(_mm_adds_epi8(ps0, filter2), t80);
+    p1 = _mm_xor_si128(_mm_adds_epi8(ps1, filt), t80);
+
+    _mm_storeu_si128((__m128i *)(s - 2 * p), p1);  // Only the four rows p1..q1 are written back.
+    _mm_storeu_si128((__m128i *)(s - 1 * p), p0);
+    _mm_storeu_si128((__m128i *)(s + 0 * p), q0);
+    _mm_storeu_si128((__m128i *)(s + 1 * p), q1);
+  }
+}
+
 static INLINE void transpose8x16(unsigned char *in0, unsigned char *in1,
                                  int in_p, unsigned char *out, int out_p) {
   __m128i x0, x1, x2, x3, x4, x5, x6, x7;

diff --git a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
index 7a5cca0..dbc17ec 100644
--- a/vp9/common/x86/vp9_subpixel_8t_ssse3.asm
+++ b/vp9/common/x86/vp9_subpixel_8t_ssse3.asm

@@ -11,17 +11,6 @@
 
 %include "vpx_ports/x86_abi_support.asm"
 
-;/************************************************************************************
-; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The
-; input pixel array has output_height rows. This routine assumes that output_height is an
-; even number. This function handles 8 pixels in horizontal direction, calculating ONE
-; rows each iteration to take advantage of the 128 bits operations.
-;
-; This is an implementation of some of the SSE optimizations first seen in ffvp8
-;
-;*************************************************************************************/
-
-
 %macro VERTx4 1
     mov         rdx, arg(5)                 ;filter ptr
     mov         rsi, arg(0)                 ;src_ptr
@@ -81,11 +70,14 @@
     pmaddubsw   xmm4, k4k5
     pmaddubsw   xmm6, k6k7
 
+    movdqa      xmm1, xmm2
     paddsw      xmm0, xmm6
-    paddsw      xmm0, xmm2
+    pmaxsw      xmm2, xmm4
+    pminsw      xmm4, xmm1
     paddsw      xmm0, xmm4
-    paddsw      xmm0, krd
+    paddsw      xmm0, xmm2
 
+    paddsw      xmm0, krd
     psraw       xmm0, 7
     packuswb    xmm0, xmm0
 
@@ -538,14 +530,22 @@
     movdqa      %2,   %1
     pshufb      %1,   [GLOBAL(shuf_t0t1)]
     pshufb      %2,   [GLOBAL(shuf_t2t3)]
-    pmaddubsw   %1,   xmm6
-    pmaddubsw   %2,   xmm7
+    pmaddubsw   %1,   k0k1k4k5
+    pmaddubsw   %2,   k2k3k6k7
 
-    paddsw      %1,   %2
-    movdqa      %2,   %1
+    movdqa      xmm4, %1
+    movdqa      xmm5, %2
+    psrldq      %1,   8
     psrldq      %2,   8
-    paddsw      %1,   %2
-    paddsw      %1,   xmm5
+    movdqa      xmm6, xmm5
+
+    paddsw      xmm4, %2
+    pmaxsw      xmm5, %1
+    pminsw      %1, xmm6
+    paddsw      %1, xmm4
+    paddsw      %1, xmm5
+
+    paddsw      %1,   krd
     psraw       %1,   7
     packuswb    %1,   %1
 %endm
@@ -565,6 +565,10 @@
     pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
     pshufd      xmm5, xmm5, 0               ;rounding
 
+    movdqa      k0k1k4k5, xmm6
+    movdqa      k2k3k6k7, xmm7
+    movdqa      krd, xmm5
+
     movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
     movsxd      rdx, dword ptr arg(3)       ;output_pitch
     movsxd      rcx, dword ptr arg(4)       ;output_height
@@ -826,8 +830,15 @@
     push        rdi
     ; end prolog
 
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 3
+    %define k0k1k4k5 [rsp + 16 * 0]
+    %define k2k3k6k7 [rsp + 16 * 1]
+    %define krd      [rsp + 16 * 2]
+
     HORIZx4 0
 
+    add rsp, 16 * 3
     ; begin epilog
     pop rdi
     pop rsi
@@ -932,8 +943,15 @@
     push        rdi
     ; end prolog
 
+    ALIGN_STACK 16, rax
+    sub         rsp, 16 * 3
+    %define k0k1k4k5 [rsp + 16 * 0]
+    %define k2k3k6k7 [rsp + 16 * 1]
+    %define krd      [rsp + 16 * 2]
+
     HORIZx4 1
 
+    add rsp, 16 * 3
     ; begin epilog
     pop rdi
     pop rsi

diff --git a/vp9/decoder/vp9_decodframe.c b/vp9/decoder/vp9_decodeframe.c
similarity index 93%
rename from vp9/decoder/vp9_decodframe.c
rename to vp9/decoder/vp9_decodeframe.c
index 75c6384..dbcae76 100644
--- a/vp9/decoder/vp9_decodframe.c
+++ b/vp9/decoder/vp9_decodeframe.c

@@ -18,7 +18,6 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
-#include "vp9/common/vp9_extend.h"
 #include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_pred_common.h"
 #include "vp9/common/vp9_quant_common.h"
@@ -28,7 +27,7 @@
 #include "vp9/common/vp9_tile_common.h"
 
 #include "vp9/decoder/vp9_dboolhuff.h"
-#include "vp9/decoder/vp9_decodframe.h"
+#include "vp9/decoder/vp9_decodeframe.h"
 #include "vp9/decoder/vp9_detokenize.h"
 #include "vp9/decoder/vp9_decodemv.h"
 #include "vp9/decoder/vp9_dsubexp.h"
@@ -45,7 +44,6 @@
   DECLARE_ALIGNED(16, int16_t,  qcoeff[MAX_MB_PLANE][64 * 64]);
   DECLARE_ALIGNED(16, int16_t,  dqcoeff[MAX_MB_PLANE][64 * 64]);
   DECLARE_ALIGNED(16, uint16_t, eobs[MAX_MB_PLANE][256]);
-  const uint8_t *band_translate[2];
 } TileWorkerData;
 
 static int read_be32(const uint8_t *p) {
@@ -243,16 +241,13 @@
 }
 
 static void inverse_transform_block(MACROBLOCKD* xd, int plane, int block,
-                                    TX_SIZE tx_size, int x, int y) {
+                                    TX_SIZE tx_size, uint8_t *dst, int stride) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int eob = pd->eobs[block];
   if (eob > 0) {
     TX_TYPE tx_type;
     const int plane_type = pd->plane_type;
-    const int stride = pd->dst.stride;
     int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-    uint8_t *const dst = &pd->dst.buf[4 * y * stride + 4 * x];
-
     switch (tx_size) {
       case TX_4X4:
         tx_type = get_tx_type_4x4(plane_type, xd, block);
@@ -295,7 +290,6 @@
   MACROBLOCKD *xd;
   vp9_reader *r;
   uint8_t *token_cache;
-  const uint8_t *band_translate[2];
 };
 
 static void predict_and_reconstruct_intra_block(int plane, int block,
@@ -304,9 +298,6 @@
   struct intra_args *const args = arg;
   VP9_COMMON *const cm = args->cm;
   MACROBLOCKD *const xd = args->xd;
-  const uint8_t *band_translate[2] = {
-    args->band_translate[0], args->band_translate[1]
-  };
   struct macroblockd_plane *const pd = &xd->plane[plane];
   MODE_INFO *const mi = xd->mi_8x8[0];
   const MB_PREDICTION_MODE mode = (plane == 0)
@@ -327,8 +318,8 @@
 
   if (!mi->mbmi.skip_coeff) {
     vp9_decode_block_tokens(cm, xd, plane, block, plane_bsize, x, y, tx_size,
-                            args->r, args->token_cache, band_translate);
-    inverse_transform_block(xd, plane, block, tx_size, x, y);
+                            args->r, args->token_cache);
+    inverse_transform_block(xd, plane, block, tx_size, dst, pd->dst.stride);
   }
 }
 
@@ -338,7 +329,6 @@
   vp9_reader *r;
   int *eobtotal;
   uint8_t *token_cache;
-  const uint8_t *band_translate[2];
 };
 
 static void reconstruct_inter_block(int plane, int block,
@@ -347,17 +337,15 @@
   struct inter_args *args = arg;
   VP9_COMMON *const cm = args->cm;
   MACROBLOCKD *const xd = args->xd;
-  const uint8_t *band_translate[2] = {
-    args->band_translate[0], args->band_translate[1]
-  };
+  struct macroblockd_plane *const pd = &xd->plane[plane];
   int x, y;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &x, &y);
-
   *args->eobtotal += vp9_decode_block_tokens(cm, xd, plane, block,
                                              plane_bsize, x, y, tx_size,
-                                             args->r, args->token_cache,
-                                             band_translate);
-  inverse_transform_block(xd, plane, block, tx_size, x, y);
+                                             args->r, args->token_cache);
+  inverse_transform_block(xd, plane, block, tx_size,
+                          &pd->dst.buf[4 * y * pd->dst.stride + 4 * x],
+                          pd->dst.stride);
 }
 
 static void set_offsets(VP9_COMMON *const cm, MACROBLOCKD *const xd,
@@ -408,8 +396,7 @@
                            const TileInfo *const tile,
                            int mi_row, int mi_col,
                            vp9_reader *r, BLOCK_SIZE bsize,
-                           uint8_t *token_cache,
-                           const uint8_t *band_translate[2]) {
+                           uint8_t *token_cache) {
   const int less8x8 = bsize < BLOCK_8X8;
   MB_MODE_INFO *mbmi;
 
@@ -432,7 +419,7 @@
 
   if (!is_inter_block(mbmi)) {
     struct intra_args arg = {
-      cm, xd, r, token_cache, {band_translate[0], band_translate[1]}
+      cm, xd, r, token_cache
     };
     foreach_transformed_block(xd, bsize, predict_and_reconstruct_intra_block,
                               &arg);
@@ -452,8 +439,7 @@
     if (!mbmi->skip_coeff) {
       int eobtotal = 0;
       struct inter_args arg = {
-        cm, xd, r, &eobtotal, token_cache,
-        {band_translate[0], band_translate[1]}
+        cm, xd, r, &eobtotal, token_cache
       };
       foreach_transformed_block(xd, bsize, reconstruct_inter_block, &arg);
       if (!less8x8 && eobtotal == 0)
@@ -494,8 +480,7 @@
                             const TileInfo *const tile,
                             int mi_row, int mi_col,
                             vp9_reader* r, BLOCK_SIZE bsize,
-                            uint8_t *token_cache,
-                            const uint8_t *band_translate[2]) {
+                            uint8_t *token_cache) {
   const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
@@ -506,37 +491,33 @@
   partition = read_partition(cm, xd, hbs, mi_row, mi_col, bsize, r);
   subsize = get_subsize(bsize, partition);
   if (subsize < BLOCK_8X8) {
-    decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache,
-                   band_translate);
+    decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache,
-                       band_translate);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
         break;
       case PARTITION_HORZ:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache,
-                       band_translate);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
         if (mi_row + hbs < cm->mi_rows)
           decode_modes_b(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
-                         token_cache, band_translate);
+                         token_cache);
         break;
       case PARTITION_VERT:
-        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache,
-                       band_translate);
+        decode_modes_b(cm, xd, tile, mi_row, mi_col, r, subsize, token_cache);
         if (mi_col + hbs < cm->mi_cols)
           decode_modes_b(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
-                         token_cache, band_translate);
+                         token_cache);
         break;
       case PARTITION_SPLIT:
         decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, subsize,
-                        token_cache, band_translate);
+                        token_cache);
         decode_modes_sb(cm, xd, tile, mi_row, mi_col + hbs, r, subsize,
-                        token_cache, band_translate);
+                        token_cache);
         decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col, r, subsize,
-                        token_cache, band_translate);
+                        token_cache);
         decode_modes_sb(cm, xd, tile, mi_row + hbs, mi_col + hbs, r, subsize,
-                        token_cache, band_translate);
+                        token_cache);
         break;
       default:
         assert(!"Invalid partition type");
@@ -721,20 +702,19 @@
   VP9_COMMON *cm = &pbi->common;
 
   if (cm->width != width || cm->height != height) {
-    if (!pbi->initial_width || !pbi->initial_height) {
-      if (vp9_alloc_frame_buffers(cm, width, height))
+    // Change in frame size.
+    if (cm->width == 0 || cm->height == 0) {
+      // Assign new frame buffer on first call.
+      cm->new_fb_idx = NUM_YV12_BUFFERS - 1;
+      cm->fb_idx_ref_cnt[cm->new_fb_idx] = 1;
+    }
+
+    // TODO(agrange) Don't test width/height, check overall size.
+    if (width > cm->width || height > cm->height) {
+      // Rescale frame buffers only if they're not big enough already.
+      if (vp9_resize_frame_buffers(cm, width, height))
         vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                            "Failed to allocate frame buffers");
-      pbi->initial_width = width;
-      pbi->initial_height = height;
-    } else {
-      if (width > pbi->initial_width)
-        vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                           "Frame width too large");
-
-      if (height > pbi->initial_height)
-        vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
-                           "Frame height too large");
     }
 
     cm->width = width;
@@ -820,11 +800,8 @@
     vp9_zero(xd->left_seg_context);
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
          mi_col += MI_BLOCK_SIZE) {
-      const uint8_t *band_translate[2] = {
-        vp9_coefband_trans_4x4, pbi->coefband_trans_8x8plus
-      };
       decode_modes_sb(cm, xd, tile, mi_row, mi_col, r, BLOCK_64X64,
-                      pbi->token_cache, band_translate);
+                      pbi->token_cache);
     }
 
     if (pbi->do_loopfilter_inline) {
@@ -985,8 +962,7 @@
          mi_col += MI_BLOCK_SIZE) {
       decode_modes_sb(tile_data->cm, &tile_data->xd, tile,
                       mi_row, mi_col, &tile_data->bit_reader, BLOCK_64X64,
-                      tile_data->token_cache,
-                      tile_data->band_translate);
+                      tile_data->token_cache);
     }
   }
   return !tile_data->xd.corrupted;
@@ -1045,8 +1021,6 @@
       tile_data->cm = cm;
       tile_data->xd = pbi->mb;
       tile_data->xd.corrupted = 0;
-      tile_data->band_translate[0] = vp9_coefband_trans_4x4;
-      tile_data->band_translate[1] = pbi->coefband_trans_8x8plus;
       vp9_tile_init(tile, tile_data->cm, 0, tile_col);
 
       setup_token_decoder(data, data_end, size, &cm->error,
@@ -1327,13 +1301,6 @@
   const int tile_cols = 1 << cm->log2_tile_cols;
   YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
 
-  vpx_memset(pbi->coefband_trans_8x8plus,
-             (COEF_BANDS - 1),
-             sizeof(pbi->coefband_trans_8x8plus));
-  vpx_memcpy(pbi->coefband_trans_8x8plus,
-             vp9_coefband_trans_8x8plus,
-             sizeof(vp9_coefband_trans_8x8plus));
-
   if (!first_partition_size) {
       // showing a frame directly
       *p_data_end = data + 1;

diff --git a/vp9/decoder/vp9_decodframe.h b/vp9/decoder/vp9_decodeframe.h
similarity index 82%
rename from vp9/decoder/vp9_decodframe.h
rename to vp9/decoder/vp9_decodeframe.h
index c665f6f..7245a98 100644
--- a/vp9/decoder/vp9_decodframe.h
+++ b/vp9/decoder/vp9_decodeframe.h

@@ -9,8 +9,8 @@
  */
 
 
-#ifndef VP9_DECODER_VP9_DECODFRAME_H_
-#define VP9_DECODER_VP9_DECODFRAME_H_
+#ifndef VP9_DECODER_VP9_DECODEFRAME_H_
+#define VP9_DECODER_VP9_DECODEFRAME_H_
 
 struct VP9Common;
 struct VP9Decompressor;
@@ -18,4 +18,4 @@
 void vp9_init_dequantizer(struct VP9Common *cm);
 int vp9_decode_frame(struct VP9Decompressor *cpi, const uint8_t **p_data_end);
 
-#endif  // VP9_DECODER_VP9_DECODFRAME_H_
+#endif  // VP9_DECODER_VP9_DECODEFRAME_H_

diff --git a/vp9/decoder/vp9_decodemv.c b/vp9/decoder/vp9_decodemv.c
index b948429..9c1f610 100644
--- a/vp9/decoder/vp9_decodemv.c
+++ b/vp9/decoder/vp9_decodemv.c

@@ -21,7 +21,7 @@
 #include "vp9/common/vp9_seg_common.h"
 
 #include "vp9/decoder/vp9_decodemv.h"
-#include "vp9/decoder/vp9_decodframe.h"
+#include "vp9/decoder/vp9_decodeframe.h"
 #include "vp9/decoder/vp9_onyxd_int.h"
 #include "vp9/decoder/vp9_treereader.h"
 

diff --git a/vp9/decoder/vp9_detokenize.c b/vp9/decoder/vp9_detokenize.c
index d5ad303..75e7e40 100644
--- a/vp9/decoder/vp9_detokenize.c
+++ b/vp9/decoder/vp9_detokenize.c

@@ -23,14 +23,14 @@
 #define EOB_CONTEXT_NODE            0
 #define ZERO_CONTEXT_NODE           1
 #define ONE_CONTEXT_NODE            2
-#define LOW_VAL_CONTEXT_NODE        3
-#define TWO_CONTEXT_NODE            4
-#define THREE_CONTEXT_NODE          5
-#define HIGH_LOW_CONTEXT_NODE       6
-#define CAT_ONE_CONTEXT_NODE        7
-#define CAT_THREEFOUR_CONTEXT_NODE  8
-#define CAT_THREE_CONTEXT_NODE      9
-#define CAT_FIVE_CONTEXT_NODE       10
+#define LOW_VAL_CONTEXT_NODE        0
+#define TWO_CONTEXT_NODE            1
+#define THREE_CONTEXT_NODE          2
+#define HIGH_LOW_CONTEXT_NODE       3
+#define CAT_ONE_CONTEXT_NODE        4
+#define CAT_THREEFOUR_CONTEXT_NODE  5
+#define CAT_THREE_CONTEXT_NODE      6
+#define CAT_FIVE_CONTEXT_NODE       7
 
 #define CAT1_MIN_VAL    5
 #define CAT2_MIN_VAL    7
@@ -67,42 +67,41 @@
   TWO_TOKEN, TWO_TOKEN, TWO_TOKEN, DCT_EOB_MODEL_TOKEN
 };
 
-#define INCREMENT_COUNT(token)                           \
-  do {                                                   \
-     if (!cm->frame_parallel_decoding_mode) {            \
+#define INCREMENT_COUNT(token)                              \
+  do {                                                      \
+     if (!cm->frame_parallel_decoding_mode)                 \
        ++coef_counts[band][pt][token_to_counttoken[token]]; \
-     }                                                   \
-  } while (0);
+  } while (0)
+
 
 #define WRITE_COEF_CONTINUE(val, token)                  \
   {                                                      \
-    dqcoeff_ptr[scan[c]] = vp9_read_and_apply_sign(r, val) * \
-                            dq[c > 0] / (1 + (tx_size == TX_32X32)); \
+    v = (val * dqv) >> dq_shift; \
+    dqcoeff_ptr[scan[c]] = (vp9_read_bit(r) ? -v : v); \
     INCREMENT_COUNT(token);                              \
     token_cache[scan[c]] = vp9_pt_energy_class[token];   \
-    c++;                                                 \
+    ++c;                                                 \
+    dqv = dq[1];                                          \
     continue;                                            \
   }
 
+
 #define ADJUST_COEF(prob, bits_count)                   \
   do {                                                  \
     val += (vp9_read(r, prob) << bits_count);           \
-  } while (0);
+  } while (0)
 
 static int decode_coefs(VP9_COMMON *cm, const MACROBLOCKD *xd,
                         vp9_reader *r, int block_idx,
                         PLANE_TYPE type, int seg_eob, int16_t *dqcoeff_ptr,
                         TX_SIZE tx_size, const int16_t *dq, int pt,
-                        uint8_t *token_cache,
-                        const uint8_t *band_translate) {
+                        uint8_t *token_cache) {
   const FRAME_CONTEXT *const fc = &cm->fc;
   FRAME_COUNTS *const counts = &cm->counts;
   const int ref = is_inter_block(&xd->mi_8x8[0]->mbmi);
   int band, c = 0;
   const vp9_prob (*coef_probs)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES] =
       fc->coef_probs[tx_size][type][ref];
-  vp9_prob coef_probs_full[COEF_BANDS][PREV_COEF_CONTEXTS][ENTROPY_NODES];
-  uint8_t load_map[COEF_BANDS][PREV_COEF_CONTEXTS] = { { 0 } };
   const vp9_prob *prob;
   unsigned int (*coef_counts)[PREV_COEF_CONTEXTS][UNCONSTRAINED_NODES + 1] =
       counts->coef[tx_size][type][ref];
@@ -110,6 +109,11 @@
       counts->eob_branch[tx_size][type][ref];
   const int16_t *scan, *nb;
   const uint8_t *cat6;
+  const uint8_t *band_translate = get_band_translate(tx_size);
+  const int dq_shift = (tx_size == TX_32X32);
+  int v;
+  int16_t dqv = dq[0];
+
   get_scan(xd, tx_size, type, block_idx, &scan, &nb);
 
   while (c < seg_eob) {
@@ -136,6 +140,7 @@
     if (!vp9_read(r, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       token_cache[scan[c]] = vp9_pt_energy_class[ZERO_TOKEN];
+      dqv = dq[1];                                          \
       ++c;
       goto SKIP_START;
     }
@@ -144,13 +149,9 @@
     if (!vp9_read(r, prob[ONE_CONTEXT_NODE])) {
       WRITE_COEF_CONTINUE(1, ONE_TOKEN);
     }
-    // Load full probabilities if not already loaded
-    if (!load_map[band][pt]) {
-      vp9_model_to_full_probs(coef_probs[band][pt],
-                              coef_probs_full[band][pt]);
-      load_map[band][pt] = 1;
-    }
-    prob = coef_probs_full[band][pt];
+
+    prob = vp9_pareto8_full[coef_probs[band][pt][PIVOT_NODE]-1];
+
     // LOW_VAL_CONTEXT_NODE_0_
     if (!vp9_read(r, prob[LOW_VAL_CONTEXT_NODE])) {
       if (!vp9_read(r, prob[TWO_CONTEXT_NODE])) {
@@ -201,10 +202,10 @@
     }
     val = 0;
     cat6 = cat6_prob;
-    while (*cat6) {
+    while (*cat6)
       val = (val << 1) | vp9_read(r, *cat6++);
-    }
     val += CAT6_MIN_VAL;
+
     WRITE_COEF_CONTINUE(val, DCT_VAL_CATEGORY6);
   }
 
@@ -219,8 +220,7 @@
 int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
                             int plane, int block, BLOCK_SIZE plane_bsize,
                             int x, int y, TX_SIZE tx_size, vp9_reader *r,
-                            uint8_t *token_cache,
-                            const uint8_t *band_translate[2]) {
+                            uint8_t *token_cache) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int seg_eob = get_tx_eob(&cm->seg, xd->mi_8x8[0]->mbmi.segment_id,
                                  tx_size);
@@ -228,8 +228,7 @@
                                               pd->left_context + y);
   const int eob = decode_coefs(cm, xd, r, block, pd->plane_type, seg_eob,
                                BLOCK_OFFSET(pd->dqcoeff, block), tx_size,
-                               pd->dequant, pt, token_cache,
-                               band_translate[tx_size != TX_4X4]);
+                               pd->dequant, pt, token_cache);
   set_contexts(xd, pd, plane_bsize, tx_size, eob > 0, x, y);
   pd->eobs[block] = eob;
   return eob;

diff --git a/vp9/decoder/vp9_detokenize.h b/vp9/decoder/vp9_detokenize.h
index 7522c97..e858a19 100644
--- a/vp9/decoder/vp9_detokenize.h
+++ b/vp9/decoder/vp9_detokenize.h

@@ -18,7 +18,6 @@
 int vp9_decode_block_tokens(VP9_COMMON *cm, MACROBLOCKD *xd,
                             int plane, int block, BLOCK_SIZE plane_bsize,
                             int x, int y, TX_SIZE tx_size, vp9_reader *r,
-                            uint8_t *token_cache,
-                            const uint8_t *band_translate[2]);
+                            uint8_t *token_cache);
 
 #endif  // VP9_DECODER_VP9_DETOKENIZE_H_

diff --git a/vp9/decoder/vp9_onyxd_if.c b/vp9/decoder/vp9_onyxd_if.c
index cb45d37..7c0f91d 100644
--- a/vp9/decoder/vp9_onyxd_if.c
+++ b/vp9/decoder/vp9_onyxd_if.c

@@ -25,7 +25,7 @@
 #include "vpx_scale/vpx_scale.h"
 #include "vp9/common/vp9_systemdependent.h"
 #include "vpx_ports/vpx_timer.h"
-#include "vp9/decoder/vp9_decodframe.h"
+#include "vp9/decoder/vp9_decodeframe.h"
 #include "vp9/decoder/vp9_detokenize.h"
 #include "./vpx_scale_rtcd.h"
 

diff --git a/vp9/decoder/vp9_onyxd_int.h b/vp9/decoder/vp9_onyxd_int.h
index e29b453..d3d29e9 100644
--- a/vp9/decoder/vp9_onyxd_int.h
+++ b/vp9/decoder/vp9_onyxd_int.h

@@ -55,7 +55,6 @@
   PARTITION_CONTEXT *above_seg_context;
 
   DECLARE_ALIGNED(16, uint8_t, token_cache[1024]);
-  DECLARE_ALIGNED(16, uint8_t, coefband_trans_8x8plus[1024]);
 } VP9D_COMP;
 
 #endif  // VP9_DECODER_VP9_ONYXD_INT_H_

diff --git a/vp9/decoder/vp9_treereader.h b/vp9/decoder/vp9_treereader.h
index f612497..41680d2 100644
--- a/vp9/decoder/vp9_treereader.h
+++ b/vp9/decoder/vp9_treereader.h

@@ -15,8 +15,6 @@
 #include "vp9/common/vp9_treecoder.h"
 #include "vp9/decoder/vp9_dboolhuff.h"
 
-#define vp9_read_and_apply_sign(r, value) (vp9_read_bit(r) ? -(value) : (value))
-
 // Intent of tree data structure is to make decoding trivial.
 static int treed_read(vp9_reader *const r, /* !!! must return a 0 or 1 !!! */
                       vp9_tree t,

diff --git a/vp9/encoder/vp9_block.h b/vp9/encoder/vp9_block.h
index 6427f7f..4445970 100644
--- a/vp9/encoder/vp9_block.h
+++ b/vp9/encoder/vp9_block.h

@@ -27,16 +27,16 @@
 typedef struct {
   MODE_INFO mic;
   uint8_t *zcoeff_blk;
-  int16_t *coeff[MAX_MB_PLANE][2];
-  int16_t *qcoeff[MAX_MB_PLANE][2];
-  int16_t *dqcoeff[MAX_MB_PLANE][2];
-  uint16_t *eobs[MAX_MB_PLANE][2];
+  int16_t *coeff[MAX_MB_PLANE][3];
+  int16_t *qcoeff[MAX_MB_PLANE][3];
+  int16_t *dqcoeff[MAX_MB_PLANE][3];
+  uint16_t *eobs[MAX_MB_PLANE][3];
 
   // dual buffer pointers, 0: in use, 1: best in store
-  int16_t *coeff_pbuf[MAX_MB_PLANE][2];
-  int16_t *qcoeff_pbuf[MAX_MB_PLANE][2];
-  int16_t *dqcoeff_pbuf[MAX_MB_PLANE][2];
-  uint16_t *eobs_pbuf[MAX_MB_PLANE][2];
+  int16_t *coeff_pbuf[MAX_MB_PLANE][3];
+  int16_t *qcoeff_pbuf[MAX_MB_PLANE][3];
+  int16_t *dqcoeff_pbuf[MAX_MB_PLANE][3];
+  uint16_t *eobs_pbuf[MAX_MB_PLANE][3];
 
   int is_coded;
   int num_4x4_blk;
@@ -94,6 +94,7 @@
   MACROBLOCKD e_mbd;
   int skip_block;
   int select_txfm_size;
+  int skip_recode;
   int skip_optimize;
   int q_index;
 
@@ -193,9 +194,6 @@
   BLOCK_SIZE sb64_partitioning;
 
   void (*fwd_txm4x4)(const int16_t *input, int16_t *output, int stride);
-
-  // band cache
-  DECLARE_ALIGNED(16, uint8_t, coefband_trans_8x8plus[1024]);
 };
 
 // TODO(jingning): the variables used here are little complicated. need further

diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index c733652..2a85dee 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c

@@ -20,7 +20,6 @@
 #include "vp9/common/vp9_common.h"
 #include "vp9/common/vp9_entropy.h"
 #include "vp9/common/vp9_entropymode.h"
-#include "vp9/common/vp9_extend.h"
 #include "vp9/common/vp9_findnearmv.h"
 #include "vp9/common/vp9_idct.h"
 #include "vp9/common/vp9_mvref_common.h"
@@ -31,9 +30,9 @@
 #include "vp9/common/vp9_seg_common.h"
 #include "vp9/common/vp9_tile_common.h"
 #include "vp9/encoder/vp9_encodeframe.h"
-#include "vp9/encoder/vp9_encodeintra.h"
 #include "vp9/encoder/vp9_encodemb.h"
 #include "vp9/encoder/vp9_encodemv.h"
+#include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_segmentation.h"
@@ -377,6 +376,7 @@
   const int mis = cm->mode_info_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  int max_plane;
 
   assert(mi->mbmi.mode < MB_MODE_COUNT);
   assert(mi->mbmi.ref_frame[0] < MAX_REF_FRAMES);
@@ -385,13 +385,21 @@
 
   *mi_addr = *mi;
 
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
+  max_plane = is_inter_block(mbmi) ? MAX_MB_PLANE : 1;
+  for (i = 0; i < max_plane; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][1];
     pd[i].qcoeff = ctx->qcoeff_pbuf[i][1];
     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
     pd[i].eobs = ctx->eobs_pbuf[i][1];
   }
 
+  for (i = max_plane; i < MAX_MB_PLANE; ++i) {
+    p[i].coeff = ctx->coeff_pbuf[i][2];
+    pd[i].qcoeff = ctx->qcoeff_pbuf[i][2];
+    pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
+    pd[i].eobs = ctx->eobs_pbuf[i][2];
+  }
+
   // Restore the coding context of the MB to that that was in place
   // when the mode was picked for it
   for (y = 0; y < mi_height; y++)
@@ -619,6 +627,7 @@
     pd[i].eobs = ctx->eobs_pbuf[i][0];
   }
   ctx->is_coded = 0;
+  x->skip_recode = 0;
 
   // Set to zero to make sure we do not use the previous encoded frame stats
   xd->mi_8x8[0]->mbmi.skip_coeff = 0;
@@ -2406,6 +2415,7 @@
   const int mis = cm->mode_info_stride;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  x->skip_recode = !x->select_txfm_size && mbmi->sb_type >= BLOCK_8X8;
   x->skip_optimize = ctx->is_coded;
   ctx->is_coded = 1;
   x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;

diff --git a/vp9/encoder/vp9_encodeintra.c b/vp9/encoder/vp9_encodeintra.c
deleted file mode 100644
index 32b4593..0000000
--- a/vp9/encoder/vp9_encodeintra.c
+++ /dev/null

@@ -1,28 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "./vpx_config.h"
-#include "./vp9_rtcd.h"
-#include "vp9/encoder/vp9_quantize.h"
-#include "vp9/common/vp9_reconintra.h"
-#include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/encoder/vp9_encodeintra.h"
-
-int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) {
-  MB_MODE_INFO * mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
-  x->skip_encode = 0;
-  mbmi->mode = DC_PRED;
-  mbmi->ref_frame[0] = INTRA_FRAME;
-  mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16
-                                                                 : TX_8X8)
-                                   : TX_4X4;
-  vp9_encode_intra_block_y(x, mbmi->sb_type);
-  return vp9_get_mb_ss(x->plane[0].src_diff);
-}

diff --git a/vp9/encoder/vp9_encodeintra.h b/vp9/encoder/vp9_encodeintra.h
deleted file mode 100644
index e217924..0000000
--- a/vp9/encoder/vp9_encodeintra.h
+++ /dev/null

@@ -1,20 +0,0 @@
-/*
- *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VP9_ENCODER_VP9_ENCODEINTRA_H_
-#define VP9_ENCODER_VP9_ENCODEINTRA_H_
-
-#include "vp9/encoder/vp9_onyx_int.h"
-
-int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred);
-void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
-                            TX_SIZE tx_size, void *arg);
-
-#endif  // VP9_ENCODER_VP9_ENCODEINTRA_H_

diff --git a/vp9/encoder/vp9_encodemb.c b/vp9/encoder/vp9_encodemb.c
index 7afed26..0e1523b 100644
--- a/vp9/encoder/vp9_encodemb.c
+++ b/vp9/encoder/vp9_encodemb.c

@@ -25,6 +25,26 @@
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_tokenize.h"
 
+void vp9_setup_interp_filters(MACROBLOCKD *xd,
+                              INTERPOLATION_TYPE mcomp_filter_type,
+                              VP9_COMMON *cm) {
+  if (xd->mi_8x8 && xd->mi_8x8[0]) {
+    MB_MODE_INFO *const mbmi = &xd->mi_8x8[0]->mbmi;
+
+    set_scale_factors(xd, mbmi->ref_frame[0] - LAST_FRAME,
+                          mbmi->ref_frame[1] - LAST_FRAME,
+                          cm->active_ref_scale);
+  } else {
+    set_scale_factors(xd, -1, -1, cm->active_ref_scale);
+  }
+
+  xd->subpix.filter_x = xd->subpix.filter_y =
+      vp9_get_filter_kernel(mcomp_filter_type == SWITCHABLE ?
+                               EIGHTTAP : mcomp_filter_type);
+
+  assert(((intptr_t)xd->subpix.filter_x & 0xff) == 0);
+}
+
 void vp9_subtract_block_c(int rows, int cols,
                           int16_t *diff_ptr, ptrdiff_t diff_stride,
                           const uint8_t *src_ptr, ptrdiff_t src_stride,
@@ -136,16 +156,13 @@
   const int16_t *scan, *nb;
   const int mul = 1 + (tx_size == TX_32X32);
   uint8_t token_cache[1024];
-  const int ib = txfrm_block_to_raster_block(plane_bsize, tx_size, block);
   const int16_t *dequant_ptr = pd->dequant;
-  const uint8_t *const band_translate = (tx_size == TX_4X4 ?
-                                         vp9_coefband_trans_4x4 :
-                                         mb->coefband_trans_8x8plus);
+  const uint8_t *const band_translate = get_band_translate(tx_size);
 
   assert((!type && !plane) || (type && plane));
   dqcoeff_ptr = BLOCK_OFFSET(pd->dqcoeff, block);
   qcoeff_ptr = BLOCK_OFFSET(pd->qcoeff, block);
-  get_scan(xd, tx_size, type, ib, &scan, &nb);
+  get_scan(xd, tx_size, type, block, &scan, &nb);
   assert(eob <= default_eob);
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
@@ -422,35 +439,29 @@
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx *const ctx = args->ctx;
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
-                                                       block);
-
   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block,
-                                                 pd->dst.buf, pd->dst.stride);
+  int i, j;
+  uint8_t *dst;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
 
   // TODO(jingning): per transformed block zero forcing only enabled for
   // luma component. will integrate chroma components as well.
   if (x->zcoeff_blk[tx_size][block] && plane == 0) {
-    int i, k;
     pd->eobs[block] = 0;
-    txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &k);
     ctx->ta[plane][i] = 0;
-    ctx->tl[plane][k] = 0;
+    ctx->tl[plane][j] = 0;
     return;
   }
 
-  if (x->select_txfm_size || xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8)
+  if (!x->skip_recode)
     vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
 
-  if (x->optimize && (x->select_txfm_size ||
-      xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8|| !x->skip_optimize)) {
+  if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
     vp9_optimize_b(plane, block, plane_bsize, tx_size, x, ctx);
   } else {
-    int i, k;
-    txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &k);
     ctx->ta[plane][i] = pd->eobs[block] > 0;
-    ctx->tl[plane][k] = pd->eobs[block] > 0;
+    ctx->tl[plane][j] = pd->eobs[block] > 0;
   }
 
   if (x->skip_encode || pd->eobs[block] == 0)
@@ -483,12 +494,11 @@
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int raster_block = txfrm_block_to_raster_block(plane_bsize, tx_size,
-                                                       block);
-
   int16_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  uint8_t *const dst = raster_block_offset_uint8(plane_bsize, raster_block,
-                                                 pd->dst.buf, pd->dst.stride);
+  int i, j;
+  uint8_t *dst;
+  txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &i, &j);
+  dst = &pd->dst.buf[4 * j * pd->dst.stride + 4 * i];
 
   vp9_xform_quant(plane, block, plane_bsize, tx_size, arg);
 
@@ -515,10 +525,10 @@
   struct optimize_ctx ctx;
   struct encode_b_args arg = {x, &ctx};
 
-  if (x->select_txfm_size || xd->mi_8x8[0]->mbmi.sb_type < BLOCK_8X8)
+  if (!x->skip_recode)
     vp9_subtract_sb(x, bsize);
 
-  if (x->optimize) {
+  if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
     int i;
     for (i = 0; i < MAX_MB_PLANE; ++i)
       optimize_init_b(i, bsize, &arg);
@@ -563,19 +573,22 @@
       xoff = 32 * (block & twmask);
       yoff = 32 * (block >> twl);
       dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
-      src = p->src.buf + yoff * p->src.stride + xoff;
-      src_diff = p->src_diff + 4 * bw * yoff + xoff;
       vp9_predict_intra_block(xd, block, bwl, TX_32X32, mode,
                               dst, pd->dst.stride, dst, pd->dst.stride);
-      vp9_subtract_block(32, 32, src_diff, bw * 4,
-                         src, p->src.stride, dst, pd->dst.stride);
-      if (x->use_lp32x32fdct)
-        vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
-      else
-        vp9_fdct32x32(src_diff, coeff, bw * 4);
-      vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
-                           p->quant, p->quant_shift, qcoeff, dqcoeff,
-                           pd->dequant, p->zbin_extra, eob, scan, iscan);
+
+      if (!x->skip_recode) {
+        src = p->src.buf + yoff * p->src.stride + xoff;
+        src_diff = p->src_diff + 4 * bw * yoff + xoff;
+        vp9_subtract_block(32, 32, src_diff, bw * 4,
+                           src, p->src.stride, dst, pd->dst.stride);
+        if (x->use_lp32x32fdct)
+          vp9_fdct32x32_rd(src_diff, coeff, bw * 4);
+        else
+          vp9_fdct32x32(src_diff, coeff, bw * 4);
+        vp9_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
+                             p->quant, p->quant_shift, qcoeff, dqcoeff,
+                             pd->dequant, p->zbin_extra, eob, scan, iscan);
+      }
       if (!x->skip_encode && *eob)
         vp9_idct32x32_add(dqcoeff, dst, pd->dst.stride, *eob);
       break;
@@ -588,16 +601,18 @@
       xoff = 16 * (block & twmask);
       yoff = 16 * (block >> twl);
       dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
-      src = p->src.buf + yoff * p->src.stride + xoff;
-      src_diff = p->src_diff + 4 * bw * yoff + xoff;
       vp9_predict_intra_block(xd, block, bwl, TX_16X16, mode,
                               dst, pd->dst.stride, dst, pd->dst.stride);
-      vp9_subtract_block(16, 16, src_diff, bw * 4,
-                         src, p->src.stride, dst, pd->dst.stride);
-      vp9_fht16x16(tx_type, src_diff, coeff, bw * 4);
-      vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
-                     p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, p->zbin_extra, eob, scan, iscan);
+      if (!x->skip_recode) {
+        src = p->src.buf + yoff * p->src.stride + xoff;
+        src_diff = p->src_diff + 4 * bw * yoff + xoff;
+        vp9_subtract_block(16, 16, src_diff, bw * 4,
+                           src, p->src.stride, dst, pd->dst.stride);
+        vp9_fht16x16(tx_type, src_diff, coeff, bw * 4);
+        vp9_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
+                       p->quant, p->quant_shift, qcoeff, dqcoeff,
+                       pd->dequant, p->zbin_extra, eob, scan, iscan);
+      }
       if (!x->skip_encode && *eob)
         vp9_iht16x16_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
       break;
@@ -610,16 +625,18 @@
       xoff = 8 * (block & twmask);
       yoff = 8 * (block >> twl);
       dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
-      src = p->src.buf + yoff * p->src.stride + xoff;
-      src_diff = p->src_diff + 4 * bw * yoff + xoff;
       vp9_predict_intra_block(xd, block, bwl, TX_8X8, mode,
                               dst, pd->dst.stride, dst, pd->dst.stride);
-      vp9_subtract_block(8, 8, src_diff, bw * 4,
-                         src, p->src.stride, dst, pd->dst.stride);
-      vp9_fht8x8(tx_type, src_diff, coeff, bw * 4);
-      vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
-                     p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, p->zbin_extra, eob, scan, iscan);
+      if (!x->skip_recode) {
+        src = p->src.buf + yoff * p->src.stride + xoff;
+        src_diff = p->src_diff + 4 * bw * yoff + xoff;
+        vp9_subtract_block(8, 8, src_diff, bw * 4,
+                           src, p->src.stride, dst, pd->dst.stride);
+        vp9_fht8x8(tx_type, src_diff, coeff, bw * 4);
+        vp9_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
+                       p->quant_shift, qcoeff, dqcoeff,
+                       pd->dequant, p->zbin_extra, eob, scan, iscan);
+      }
       if (!x->skip_encode && *eob)
         vp9_iht8x8_add(tx_type, dqcoeff, dst, pd->dst.stride, *eob);
       break;
@@ -635,19 +652,23 @@
       xoff = 4 * (block & twmask);
       yoff = 4 * (block >> twl);
       dst = pd->dst.buf + yoff * pd->dst.stride + xoff;
-      src = p->src.buf + yoff * p->src.stride + xoff;
-      src_diff = p->src_diff + 4 * bw * yoff + xoff;
       vp9_predict_intra_block(xd, block, bwl, TX_4X4, mode,
                               dst, pd->dst.stride, dst, pd->dst.stride);
-      vp9_subtract_block(4, 4, src_diff, bw * 4,
-                         src, p->src.stride, dst, pd->dst.stride);
-      if (tx_type != DCT_DCT)
-        vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type);
-      else
-        x->fwd_txm4x4(src_diff, coeff, bw * 4);
-      vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
-                     p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, p->zbin_extra, eob, scan, iscan);
+
+      if (!x->skip_recode) {
+        src = p->src.buf + yoff * p->src.stride + xoff;
+        src_diff = p->src_diff + 4 * bw * yoff + xoff;
+        vp9_subtract_block(4, 4, src_diff, bw * 4,
+                           src, p->src.stride, dst, pd->dst.stride);
+        if (tx_type != DCT_DCT)
+          vp9_short_fht4x4(src_diff, coeff, bw * 4, tx_type);
+        else
+          x->fwd_txm4x4(src_diff, coeff, bw * 4);
+        vp9_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
+                       p->quant_shift, qcoeff, dqcoeff,
+                       pd->dequant, p->zbin_extra, eob, scan, iscan);
+      }
+
       if (!x->skip_encode && *eob) {
         if (tx_type == DCT_DCT)
           // this is like vp9_short_idct4x4 but has a special case around eob<=1
@@ -678,3 +699,14 @@
   foreach_transformed_block_uv(xd, bsize, vp9_encode_block_intra, &arg);
 }
 
+int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred) {
+  MB_MODE_INFO * mbmi = &x->e_mbd.mi_8x8[0]->mbmi;
+  x->skip_encode = 0;
+  mbmi->mode = DC_PRED;
+  mbmi->ref_frame[0] = INTRA_FRAME;
+  mbmi->tx_size = use_16x16_pred ? (mbmi->sb_type >= BLOCK_16X16 ? TX_16X16
+                                                                 : TX_8X8)
+                                   : TX_4X4;
+  vp9_encode_intra_block_y(x, mbmi->sb_type);
+  return vp9_get_mb_ss(x->plane[0].src_diff);
+}

diff --git a/vp9/encoder/vp9_encodemb.h b/vp9/encoder/vp9_encodemb.h
index 61dd735..7be6621 100644
--- a/vp9/encoder/vp9_encodemb.h
+++ b/vp9/encoder/vp9_encodemb.h

@@ -47,8 +47,14 @@
 void vp9_subtract_sbuv(MACROBLOCK *x, BLOCK_SIZE bsize);
 void vp9_subtract_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
 
+void vp9_encode_block_intra(int plane, int block, BLOCK_SIZE plane_bsize,
+                            TX_SIZE tx_size, void *arg);
+
 void vp9_encode_intra_block_y(MACROBLOCK *x, BLOCK_SIZE bsize);
 void vp9_encode_intra_block_uv(MACROBLOCK *x, BLOCK_SIZE bsize);
 
-
+int vp9_encode_intra(MACROBLOCK *x, int use_16x16_pred);
+void vp9_setup_interp_filters(MACROBLOCKD *xd,
+                              INTERPOLATION_TYPE mcomp_filter_type,
+                              VP9_COMMON *cm);
 #endif  // VP9_ENCODER_VP9_ENCODEMB_H_

diff --git a/vp9/encoder/vp9_encodemv.c b/vp9/encoder/vp9_encodemv.c
index 030ca64..7e838c9 100644
--- a/vp9/encoder/vp9_encodemv.c
+++ b/vp9/encoder/vp9_encodemv.c

@@ -126,20 +126,15 @@
 
 static int update_mv(vp9_writer *w, const unsigned int ct[2], vp9_prob *cur_p,
                      vp9_prob upd_p) {
-  const vp9_prob new_p = get_binary_prob(ct[0], ct[1]);
-  vp9_prob mod_p = new_p | 1;
-  const int cur_b = cost_branch256(ct, *cur_p);
-  const int mod_b = cost_branch256(ct, mod_p);
-  const int cost = 7 * 256 + (vp9_cost_one(upd_p) - vp9_cost_zero(upd_p));
-  if (cur_b - mod_b > cost) {
-    *cur_p = mod_p;
-    vp9_write(w, 1, upd_p);
-    vp9_write_literal(w, mod_p >> 1, 7);
-    return 1;
-  } else {
-    vp9_write(w, 0, upd_p);
-    return 0;
+  const vp9_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;
+  const int update = cost_branch256(ct, *cur_p) + vp9_cost_zero(upd_p) >
+                     cost_branch256(ct, new_p) + vp9_cost_one(upd_p) + 7 * 256;
+  vp9_write(w, update, upd_p);
+  if (update) {
+    *cur_p = new_p;
+    vp9_write_literal(w, new_p >> 1, 7);
   }
+  return update;
 }
 
 static void counts_to_nmv_context(

diff --git a/vp9/common/vp9_extend.c b/vp9/encoder/vp9_extend.c
similarity index 98%
rename from vp9/common/vp9_extend.c
rename to vp9/encoder/vp9_extend.c
index 07c68c8..dcbb5ac 100644
--- a/vp9/common/vp9_extend.c
+++ b/vp9/encoder/vp9_extend.c

@@ -11,7 +11,7 @@
 #include "vpx_mem/vpx_mem.h"
 
 #include "vp9/common/vp9_common.h"
-#include "vp9/common/vp9_extend.h"
+#include "vp9/encoder/vp9_extend.h"
 
 static void copy_and_extend_plane(const uint8_t *src, int src_pitch,
                                   uint8_t *dst, int dst_pitch,
@@ -62,7 +62,7 @@
   const int et_y = 16;
   const int el_y = 16;
   // Motion estimation may use src block variance with the block size up
-  // to 64x64, so the right and bottom need to be extended to 64 mulitple
+  // to 64x64, so the right and bottom need to be extended to 64 multiple
   // or up to 16, whichever is greater.
   const int eb_y = MAX(ALIGN_POWER_OF_TWO(src->y_width, 6) - src->y_width,
                        16);

diff --git a/vp9/common/vp9_extend.h b/vp9/encoder/vp9_extend.h
similarity index 100%
rename from vp9/common/vp9_extend.h
rename to vp9/encoder/vp9_extend.h


diff --git a/vp9/encoder/vp9_firstpass.c b/vp9/encoder/vp9_firstpass.c
index 4d6da51..377cffb 100644
--- a/vp9/encoder/vp9_firstpass.c
+++ b/vp9/encoder/vp9_firstpass.c

@@ -11,17 +11,16 @@
 #include <math.h>
 #include <limits.h>
 #include <stdio.h>
+#include "vp9/common/vp9_systemdependent.h"
 #include "vp9/encoder/vp9_block.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "vp9/encoder/vp9_encodeintra.h"
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/encoder/vp9_firstpass.h"
-#include "vpx_scale/vpx_scale.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encodemb.h"
-#include "vp9/common/vp9_extend.h"
-#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_variance.h"
+#include "vpx_scale/vpx_scale.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_scale/yv12config.h"
 #include "vp9/encoder/vp9_quantize.h"
@@ -407,8 +406,6 @@
   // for first pass test
   while ((quart_frm << sr) < MAX_FULL_PEL_VAL)
     sr++;
-  if (sr)
-    sr--;
 
   step_param    += sr;
   further_steps -= sr;
@@ -535,6 +532,7 @@
     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
     pd[i].eobs = ctx->eobs_pbuf[i][1];
   }
+  x->skip_recode = 0;
 
 
   // Initialise the MV cost table to the defaults

diff --git a/vp9/encoder/vp9_lookahead.c b/vp9/encoder/vp9_lookahead.c
index c28c868..277bd7d 100644
--- a/vp9/encoder/vp9_lookahead.c
+++ b/vp9/encoder/vp9_lookahead.c

@@ -12,8 +12,8 @@
 
 #include "./vpx_config.h"
 #include "vp9/common/vp9_common.h"
+#include "vp9/encoder/vp9_extend.h"
 #include "vp9/encoder/vp9_lookahead.h"
-#include "vp9/common/vp9_extend.h"
 
 struct lookahead_ctx {
   unsigned int max_sz;         /* Absolute size of the queue */

diff --git a/vp9/encoder/vp9_mbgraph.c b/vp9/encoder/vp9_mbgraph.c
index 7b605b2..9870738 100644
--- a/vp9/encoder/vp9_mbgraph.c
+++ b/vp9/encoder/vp9_mbgraph.c

@@ -11,7 +11,6 @@
 #include <limits.h>
 
 #include "vpx_mem/vpx_mem.h"
-#include "vp9/encoder/vp9_encodeintra.h"
 #include "vp9/encoder/vp9_rdopt.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vp9/encoder/vp9_mcomp.h"

diff --git a/vp9/encoder/vp9_mcomp.c b/vp9/encoder/vp9_mcomp.c
index a52f5b1..a383164 100644
--- a/vp9/encoder/vp9_mcomp.c
+++ b/vp9/encoder/vp9_mcomp.c

@@ -51,9 +51,6 @@
   while ((size << sr) < MAX_FULL_PEL_VAL)
     sr++;
 
-  if (sr)
-    sr--;
-
   sr += cpi->sf.reduce_first_step_size;
   sr = MIN(sr, (cpi->sf.max_step_search_steps - 2));
   return sr;

diff --git a/vp9/encoder/vp9_mcomp.h b/vp9/encoder/vp9_mcomp.h
index bcab679..10c2e4f 100644
--- a/vp9/encoder/vp9_mcomp.h
+++ b/vp9/encoder/vp9_mcomp.h

@@ -18,8 +18,9 @@
 // The maximum number of steps in a step search given the largest
 // allowed initial step
 #define MAX_MVSEARCH_STEPS 11
-// Max full pel mv specified in 1 pel units
-#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS)) - 1)
+// Max full-pel motion vector magnitude, in units of full pixels.
+// Allows motion vectors in the range [-1023, 1023].
+#define MAX_FULL_PEL_VAL ((1 << (MAX_MVSEARCH_STEPS - 1)) - 1)
 // Maximum size of the first step in full pel units
 #define MAX_FIRST_STEP (1 << (MAX_MVSEARCH_STEPS-1))
 // Allowed motion vector pixel distance outside image border

diff --git a/vp9/encoder/vp9_onyx_if.c b/vp9/encoder/vp9_onyx_if.c
index a383378..b7874d5 100644
--- a/vp9/encoder/vp9_onyx_if.c
+++ b/vp9/encoder/vp9_onyx_if.c

@@ -1222,13 +1222,6 @@
   cpi->fixed_divide[0] = 0;
   for (i = 1; i < 512; i++)
     cpi->fixed_divide[i] = 0x80000 / i;
-
-  vpx_memset(cpi->mb.coefband_trans_8x8plus,
-             (COEF_BANDS-1),
-             sizeof(cpi->mb.coefband_trans_8x8plus));
-  vpx_memcpy(cpi->mb.coefband_trans_8x8plus,
-             vp9_coefband_trans_8x8plus,
-             sizeof(vp9_coefband_trans_8x8plus));
 }
 
 
@@ -1450,7 +1443,7 @@
   CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
                   vpx_calloc(num_4x4_blk, sizeof(uint8_t)));
   for (i = 0; i < MAX_MB_PLANE; ++i) {
-    for (k = 0; k < 2; ++k) {
+    for (k = 0; k < 3; ++k) {
       CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
                       vpx_memalign(16, num_pix * sizeof(int16_t)));
       CHECK_MEM_ERROR(cm, ctx->qcoeff[i][k],
@@ -1472,7 +1465,7 @@
   vpx_free(ctx->zcoeff_blk);
   ctx->zcoeff_blk = 0;
   for (i = 0; i < MAX_MB_PLANE; ++i) {
-    for (k = 0; k < 2; ++k) {
+    for (k = 0; k < 3; ++k) {
       vpx_free(ctx->coeff[i][k]);
       ctx->coeff[i][k] = 0;
       vpx_free(ctx->qcoeff[i][k]);
@@ -3441,7 +3434,9 @@
 
   // Post encode loop adjustment of Q prediction.
   if (!active_worst_qchanged)
-    vp9_update_rate_correction_factors(cpi, (cpi->sf.recode_loop) ? 2 : 0);
+    vp9_update_rate_correction_factors(cpi, (cpi->sf.recode_loop ||
+        cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER) ? 2 : 0);
+
 
   cpi->last_q[cm->frame_type] = cm->base_qindex;
 

diff --git a/vp9/encoder/vp9_rdopt.c b/vp9/encoder/vp9_rdopt.c
index 695a2e2..5d37f83 100644
--- a/vp9/encoder/vp9_rdopt.c
+++ b/vp9/encoder/vp9_rdopt.c

@@ -18,7 +18,6 @@
 #include "vp9/encoder/vp9_treewriter.h"
 #include "vp9/encoder/vp9_onyx_int.h"
 #include "vp9/encoder/vp9_modecosts.h"
-#include "vp9/encoder/vp9_encodeintra.h"
 #include "vp9/common/vp9_entropymode.h"
 #include "vp9/common/vp9_reconinter.h"
 #include "vp9/common/vp9_reconintra.h"
@@ -246,7 +245,8 @@
 
   vp9_set_speed_features(cpi);
 
-  cpi->mb.select_txfm_size = cpi->sf.tx_size_search_method == USE_LARGESTALL ?
+  cpi->mb.select_txfm_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
+                              cm->frame_type != KEY_FRAME) ?
                              0 : 1;
 
   set_block_thresholds(cpi);
@@ -612,8 +612,9 @@
 
   // TODO(jingning): temporarily enabled only for luma component
   rd = MIN(rd1, rd2);
-  if (!xd->lossless && plane == 0)
-    x->zcoeff_blk[tx_size][block] = rd1 > rd2 || !xd->plane[plane].eobs[block];
+  if (plane == 0)
+    x->zcoeff_blk[tx_size][block] = !xd->plane[plane].eobs[block] ||
+                                    (rd1 > rd2 && !xd->lossless);
 
   args->this_rate += args->rate;
   args->this_dist += args->dist;
@@ -1026,10 +1027,10 @@
   struct macroblockd_plane *pd = &xd->plane[0];
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
-  uint8_t *src_init = raster_block_offset_uint8(BLOCK_8X8, ib,
-                                                p->src.buf, src_stride);
-  uint8_t *dst_init = raster_block_offset_uint8(BLOCK_8X8, ib,
-                                                pd->dst.buf, dst_stride);
+  const uint8_t *src_init = &p->src.buf[raster_block_offset(BLOCK_8X8, ib,
+                                                            src_stride)];
+  uint8_t *dst_init = &pd->dst.buf[raster_block_offset(BLOCK_8X8, ib,
+                                                       dst_stride)];
   int16_t *src_diff, *coeff;
 
   ENTROPY_CONTEXT ta[2], tempa[2];
@@ -1071,7 +1072,7 @@
         int64_t ssz;
         const int16_t *scan;
         const int16_t *nb;
-        uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
+        const uint8_t *src = src_init + idx * 4 + idy * 4 * src_stride;
         uint8_t *dst = dst_init + idx * 4 + idy * 4 * dst_stride;
         const int block = ib + idy * 2 + idx;
         TX_TYPE tx_type;
@@ -1329,6 +1330,7 @@
 }
 
 static int64_t rd_pick_intra_sbuv_mode(VP9_COMP *cpi, MACROBLOCK *x,
+                                       PICK_MODE_CONTEXT *ctx,
                                        int *rate, int *rate_tokenonly,
                                        int64_t *distortion, int *skippable,
                                        BLOCK_SIZE bsize) {
@@ -1364,6 +1366,27 @@
       *rate_tokenonly = this_rate_tokenonly;
       *distortion     = this_distortion;
       *skippable      = s;
+      if (!x->select_txfm_size) {
+        int i;
+        struct macroblock_plane *const p = x->plane;
+        struct macroblockd_plane *const pd = x->e_mbd.plane;
+        for (i = 1; i < MAX_MB_PLANE; ++i) {
+          p[i].coeff    = ctx->coeff_pbuf[i][2];
+          pd[i].qcoeff  = ctx->qcoeff_pbuf[i][2];
+          pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][2];
+          pd[i].eobs    = ctx->eobs_pbuf[i][2];
+
+          ctx->coeff_pbuf[i][2]   = ctx->coeff_pbuf[i][0];
+          ctx->qcoeff_pbuf[i][2]  = ctx->qcoeff_pbuf[i][0];
+          ctx->dqcoeff_pbuf[i][2] = ctx->dqcoeff_pbuf[i][0];
+          ctx->eobs_pbuf[i][2]    = ctx->eobs_pbuf[i][0];
+
+          ctx->coeff_pbuf[i][0]   = p[i].coeff;
+          ctx->qcoeff_pbuf[i][0]  = pd[i].qcoeff;
+          ctx->dqcoeff_pbuf[i][0] = pd[i].dqcoeff;
+          ctx->eobs_pbuf[i][0]    = pd[i].eobs;
+        }
+      }
     }
   }
 
@@ -1389,8 +1412,9 @@
   return this_rd;
 }
 
-static void choose_intra_uv_mode(VP9_COMP *cpi, BLOCK_SIZE bsize,
-                                 int *rate_uv, int *rate_uv_tokenonly,
+static void choose_intra_uv_mode(VP9_COMP *cpi, PICK_MODE_CONTEXT *ctx,
+                                 BLOCK_SIZE bsize, int *rate_uv,
+                                 int *rate_uv_tokenonly,
                                  int64_t *dist_uv, int *skip_uv,
                                  MB_PREDICTION_MODE *mode_uv) {
   MACROBLOCK *const x = &cpi->mb;
@@ -1403,7 +1427,7 @@
   // Else do a proper rd search for each possible transform size that may
   // be considered in the main rd loop.
   } else {
-    rd_pick_intra_sbuv_mode(cpi, x,
+    rd_pick_intra_sbuv_mode(cpi, x, ctx,
                             rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
                             bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize);
   }
@@ -1530,16 +1554,16 @@
   const int height = plane_block_height(bsize, pd);
   int idx, idy;
 
-  uint8_t *const src = raster_block_offset_uint8(BLOCK_8X8, i,
-                                                 p->src.buf, p->src.stride);
-  uint8_t *const dst = raster_block_offset_uint8(BLOCK_8X8, i,
-                                                 pd->dst.buf, pd->dst.stride);
+  const uint8_t *const src = &p->src.buf[raster_block_offset(BLOCK_8X8, i,
+                                                             p->src.stride)];
+  uint8_t *const dst = &pd->dst.buf[raster_block_offset(BLOCK_8X8, i,
+                                                        pd->dst.stride)];
   int64_t thisdistortion = 0, thissse = 0;
   int thisrate = 0, ref;
   const int is_compound = has_second_ref(&mi->mbmi);
   for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const uint8_t *pre = raster_block_offset_uint8(BLOCK_8X8, i,
-                                     pd->pre[ref].buf, pd->pre[ref].stride);
+    const uint8_t *pre = &pd->pre[ref].buf[raster_block_offset(BLOCK_8X8, i,
+                                               pd->pre[ref].stride)];
     vp9_build_inter_predictor(pre, pd->pre[ref].stride,
                               dst, pd->dst.stride,
                               &mi->bmi[i].as_mv[ref].as_mv,
@@ -1627,14 +1651,13 @@
   struct macroblock_plane *const p = &x->plane[0];
   struct macroblockd_plane *const pd = &x->e_mbd.plane[0];
 
-  p->src.buf = raster_block_offset_uint8(BLOCK_8X8, i, p->src.buf,
-                                         p->src.stride);
+  p->src.buf = &p->src.buf[raster_block_offset(BLOCK_8X8, i, p->src.stride)];
   assert(((intptr_t)pd->pre[0].buf & 0x7) == 0);
-  pd->pre[0].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[0].buf,
-                                             pd->pre[0].stride);
+  pd->pre[0].buf = &pd->pre[0].buf[raster_block_offset(BLOCK_8X8, i,
+                                                       pd->pre[0].stride)];
   if (has_second_ref(mbmi))
-    pd->pre[1].buf = raster_block_offset_uint8(BLOCK_8X8, i, pd->pre[1].buf,
-                                               pd->pre[1].stride);
+    pd->pre[1].buf = &pd->pre[1].buf[raster_block_offset(BLOCK_8X8, i,
+                                                         pd->pre[1].stride)];
 }
 
 static INLINE void mi_buf_restore(MACROBLOCK *x, struct buf_2d orig_src,
@@ -2613,6 +2636,16 @@
   vpx_free(second_pred);
 }
 
+static INLINE void restore_dst_buf(MACROBLOCKD *xd,
+                                   uint8_t *orig_dst[MAX_MB_PLANE],
+                                   int orig_dst_stride[MAX_MB_PLANE]) {
+  int i;
+  for (i = 0; i < MAX_MB_PLANE; i++) {  // reset each plane's dst buf/stride
+    xd->plane[i].dst.buf = orig_dst[i];
+    xd->plane[i].dst.stride = orig_dst_stride[i];
+  }
+}
+
 static int64_t handle_inter_mode(VP9_COMP *cpi, MACROBLOCK *x,
                                  const TileInfo *const tile,
                                  BLOCK_SIZE bsize,
@@ -2764,6 +2797,7 @@
   if (is_comp_pred)
     intpel_mv &= (mbmi->mv[1].as_mv.row & 15) == 0 &&
         (mbmi->mv[1].as_mv.col & 15) == 0;
+
   // Search for best switchable filter by checking the variance of
   // pred error irrespective of whether the filter will be used
   if (cm->mcomp_filter_type != BILINEAR) {
@@ -2803,10 +2837,7 @@
               (cm->mcomp_filter_type != SWITCHABLE &&
                (cm->mcomp_filter_type == mbmi->interp_filter ||
                 (i == 0 && intpel_mv)))) {
-            for (j = 0; j < MAX_MB_PLANE; j++) {
-              xd->plane[j].dst.buf = orig_dst[j];
-              xd->plane[j].dst.stride = orig_dst_stride[j];
-            }
+            restore_dst_buf(xd, orig_dst, orig_dst_stride);
           } else {
             for (j = 0; j < MAX_MB_PLANE; j++) {
               xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
@@ -2830,10 +2861,7 @@
         }
         if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
           if (rd / 2 > ref_best_rd) {
-            for (i = 0; i < MAX_MB_PLANE; i++) {
-              xd->plane[i].dst.buf = orig_dst[i];
-              xd->plane[i].dst.stride = orig_dst_stride[i];
-            }
+            restore_dst_buf(xd, orig_dst, orig_dst_stride);
             return INT64_MAX;
           }
         }
@@ -2852,11 +2880,7 @@
           pred_exists = 1;
         }
       }
-
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = orig_dst[i];
-        xd->plane[i].dst.stride = orig_dst_stride[i];
-      }
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
     }
   }
   // Set the appropriate filter
@@ -2888,10 +2912,7 @@
     // if current pred_error modeled rd is substantially more than the best
     // so far, do not bother doing full rd
     if (rd / 2 > ref_best_rd) {
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = orig_dst[i];
-        xd->plane[i].dst.stride = orig_dst_stride[i];
-      }
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
       return INT64_MAX;
     }
   }
@@ -2994,10 +3015,7 @@
     if (*rate_y == INT_MAX) {
       *rate2 = INT_MAX;
       *distortion = INT64_MAX;
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = orig_dst[i];
-        xd->plane[i].dst.stride = orig_dst_stride[i];
-      }
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
       return INT64_MAX;
     }
 
@@ -3012,10 +3030,7 @@
     if (*rate_uv == INT_MAX) {
       *rate2 = INT_MAX;
       *distortion = INT64_MAX;
-      for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = orig_dst[i];
-        xd->plane[i].dst.stride = orig_dst_stride[i];
-      }
+      restore_dst_buf(xd, orig_dst, orig_dst_stride);
       return INT64_MAX;
     }
 
@@ -3025,20 +3040,17 @@
     *skippable = skippable_y && skippable_uv;
   }
 
-  for (i = 0; i < MAX_MB_PLANE; i++) {
-    xd->plane[i].dst.buf = orig_dst[i];
-    xd->plane[i].dst.stride = orig_dst_stride[i];
-  }
-
+  restore_dst_buf(xd, orig_dst, orig_dst_stride);
   return this_rd;  // if 0, this will be re-calculated by caller
 }
 
-static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
-  int i;
+static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
+                           int max_plane) {
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = x->e_mbd.plane;
+  int i;
 
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
+  for (i = 0; i < max_plane; ++i) {
     p[i].coeff    = ctx->coeff_pbuf[i][1];
     pd[i].qcoeff  = ctx->qcoeff_pbuf[i][1];
     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
@@ -3075,7 +3087,7 @@
       *returnrate = INT_MAX;
       return;
     }
-    rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+    rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
                             &dist_uv, &uv_skip, bsize);
   } else {
     y_skip = 0;
@@ -3084,7 +3096,7 @@
       *returnrate = INT_MAX;
       return;
     }
-    rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
+    rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
                             &dist_uv, &uv_skip, BLOCK_8X8);
   }
 
@@ -3450,7 +3462,7 @@
 
       uv_tx = MIN(mbmi->tx_size, max_uv_txsize_lookup[bsize]);
       if (rate_uv_intra[uv_tx] == INT_MAX) {
-        choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[uv_tx],
+        choose_intra_uv_mode(cpi, ctx, bsize, &rate_uv_intra[uv_tx],
                              &rate_uv_tokenonly[uv_tx],
                              &dist_uv[uv_tx], &skip_uv[uv_tx],
                              &mode_uv[uv_tx]);
@@ -3584,6 +3596,7 @@
 
     // Did this mode help.. i.e. is it the new best mode
     if (this_rd < best_rd || x->skip) {
+      int max_plane = MAX_MB_PLANE;
       if (!mode_excluded) {
         // Note index of best mode so far
         best_mode_index = mode_index;
@@ -3591,6 +3604,7 @@
         if (ref_frame == INTRA_FRAME) {
           /* required for left and above block mv */
           mbmi->mv[0].as_int = 0;
+          max_plane = 1;
         }
 
         *returnrate = rate2;
@@ -3599,7 +3613,7 @@
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
         if (!x->select_txfm_size)
-          swap_block_ptr(x, ctx);
+          swap_block_ptr(x, ctx, max_plane);
         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
                    sizeof(uint8_t) * ctx->num_4x4_blk);
 
@@ -3706,7 +3720,7 @@
     // Do Intra UV best rd mode selection if best mode choice above was intra.
     if (vp9_mode_order[best_mode_index].ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
-      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
+      rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
                               &rate_uv_tokenonly[uv_tx_size],
                               &dist_uv[uv_tx_size],
                               &skip_uv[uv_tx_size],
@@ -4075,7 +4089,7 @@
       distortion2 += distortion_y;
 
       if (rate_uv_intra[TX_4X4] == INT_MAX) {
-        choose_intra_uv_mode(cpi, bsize, &rate_uv_intra[TX_4X4],
+        choose_intra_uv_mode(cpi, ctx, bsize, &rate_uv_intra[TX_4X4],
                              &rate_uv_tokenonly[TX_4X4],
                              &dist_uv[TX_4X4], &skip_uv[TX_4X4],
                              &mode_uv[TX_4X4]);
@@ -4329,12 +4343,14 @@
     // Did this mode help.. i.e. is it the new best mode
     if (this_rd < best_rd || x->skip) {
       if (!mode_excluded) {
+        int max_plane = MAX_MB_PLANE;
         // Note index of best mode so far
         best_mode_index = mode_index;
 
         if (ref_frame == INTRA_FRAME) {
           /* required for left and above block mv */
           mbmi->mv[0].as_int = 0;
+          max_plane = 1;
         }
 
         *returnrate = rate2;
@@ -4345,7 +4361,7 @@
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
         if (!x->select_txfm_size)
-          swap_block_ptr(x, ctx);
+          swap_block_ptr(x, ctx, max_plane);
         vpx_memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
                    sizeof(uint8_t) * ctx->num_4x4_blk);
 
@@ -4452,7 +4468,7 @@
     // Do Intra UV best rd mode selection if best mode choice above was intra.
     if (vp9_ref_order[best_mode_index].ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx_size = get_uv_tx_size(mbmi);
-      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
+      rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
                               &rate_uv_tokenonly[uv_tx_size],
                               &dist_uv[uv_tx_size],
                               &skip_uv[uv_tx_size],

diff --git a/vp9/encoder/vp9_rdopt.h b/vp9/encoder/vp9_rdopt.h
index 92fb235..f0e8849 100644
--- a/vp9/encoder/vp9_rdopt.h
+++ b/vp9/encoder/vp9_rdopt.h

@@ -8,10 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
 #ifndef VP9_ENCODER_VP9_RDOPT_H_
 #define VP9_ENCODER_VP9_RDOPT_H_
 
+#include "vp9/encoder/vp9_onyx_int.h"
+
 #define RDDIV_BITS          7
 
 #define RDCOST(RM, DM, R, D) \

diff --git a/vp9/encoder/vp9_sad_c.c b/vp9/encoder/vp9_sad_c.c
index 42ddb21..55d595b 100644
--- a/vp9/encoder/vp9_sad_c.c
+++ b/vp9/encoder/vp9_sad_c.c

@@ -10,11 +10,11 @@
 
 
 #include <stdlib.h>
-#include "vp9/common/vp9_sadmxn.h"
-#include "vp9/encoder/vp9_variance.h"
-#include "./vpx_config.h"
-#include "vpx/vpx_integer.h"
 #include "./vp9_rtcd.h"
+#include "./vpx_config.h"
+#include "vp9/encoder/vp9_sadmxn.h"
+#include "vp9/encoder/vp9_variance.h"
+#include "vpx/vpx_integer.h"
 
 #define sad_mxn_func(m, n) \
 unsigned int vp9_sad##m##x##n##_c(const uint8_t *src_ptr, \

diff --git a/vp9/common/vp9_sadmxn.h b/vp9/encoder/vp9_sadmxn.h
similarity index 100%
rename from vp9/common/vp9_sadmxn.h
rename to vp9/encoder/vp9_sadmxn.h


diff --git a/vp9/encoder/vp9_subexp.c b/vp9/encoder/vp9_subexp.c
index 387fc90..f31e568 100644
--- a/vp9/encoder/vp9_subexp.c
+++ b/vp9/encoder/vp9_subexp.c

@@ -14,7 +14,6 @@
 #include "vp9/encoder/vp9_boolhuff.h"
 #include "vp9/encoder/vp9_treewriter.h"
 
-#define vp9_cost_upd  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)) >> 8)
 #define vp9_cost_upd256  ((int)(vp9_cost_one(upd) - vp9_cost_zero(upd)))
 
 static int update_bits[255];

diff --git a/vp9/encoder/vp9_temporal_filter.c b/vp9/encoder/vp9_temporal_filter.c
index 2cace03..3bffb12 100644
--- a/vp9/encoder/vp9_temporal_filter.c
+++ b/vp9/encoder/vp9_temporal_filter.c

@@ -11,22 +11,22 @@
 #include <math.h>
 #include <limits.h>
 
-#include "vp9/common/vp9_onyxc_int.h"
-#include "vp9/common/vp9_reconinter.h"
-#include "vp9/encoder/vp9_onyx_int.h"
-#include "vp9/common/vp9_systemdependent.h"
-#include "vp9/encoder/vp9_quantize.h"
 #include "vp9/common/vp9_alloccommon.h"
-#include "vp9/encoder/vp9_mcomp.h"
-#include "vp9/encoder/vp9_firstpass.h"
-#include "vp9/encoder/vp9_psnr.h"
-#include "vpx_scale/vpx_scale.h"
-#include "vp9/common/vp9_extend.h"
-#include "vp9/encoder/vp9_ratectrl.h"
+#include "vp9/common/vp9_onyxc_int.h"
 #include "vp9/common/vp9_quant_common.h"
+#include "vp9/common/vp9_reconinter.h"
+#include "vp9/common/vp9_systemdependent.h"
+#include "vp9/encoder/vp9_extend.h"
+#include "vp9/encoder/vp9_firstpass.h"
+#include "vp9/encoder/vp9_mcomp.h"
+#include "vp9/encoder/vp9_onyx_int.h"
+#include "vp9/encoder/vp9_psnr.h"
+#include "vp9/encoder/vp9_quantize.h"
+#include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_segmentation.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/vpx_timer.h"
+#include "vpx_scale/vpx_scale.h"
 
 #define ALT_REF_MC_ENABLED 1    // dis/enable MC in AltRef filtering
 #define ALT_REF_SUBPEL_ENABLED 1  // dis/enable subpel in MC AltRef filtering

diff --git a/vp9/encoder/vp9_tokenize.c b/vp9/encoder/vp9_tokenize.c
index 3d21ea8..c7336d0 100644
--- a/vp9/encoder/vp9_tokenize.c
+++ b/vp9/encoder/vp9_tokenize.c

@@ -115,9 +115,7 @@
   vp9_coeff_count *const counts = cpi->coef_counts[tx_size];
   vp9_coeff_probs_model *const coef_probs = cpi->common.fc.coef_probs[tx_size];
   const int ref = is_inter_block(mbmi);
-  const uint8_t *const band_translate = (tx_size == TX_4X4 ?
-                                         vp9_coefband_trans_4x4 :
-                                         cpi->mb.coefband_trans_8x8plus);
+  const uint8_t *const band_translate = get_band_translate(tx_size);
   const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
   int aoff, loff;
   txfrm_block_to_raster_xy(plane_bsize, tx_size, block, &aoff, &loff);

diff --git a/vp9/encoder/vp9_treewriter.h b/vp9/encoder/vp9_treewriter.h
index eeda5cd..41d1bfb 100644
--- a/vp9/encoder/vp9_treewriter.h
+++ b/vp9/encoder/vp9_treewriter.h

@@ -19,31 +19,20 @@
 
 #include "vp9/encoder/vp9_boolhuff.h"       /* for now */
 
-
 #define vp9_write_prob(w, v) vp9_write_literal((w), (v), 8)
 
-/* Approximate length of an encoded bool in 256ths of a bit at given prob */
+#define vp9_cost_zero(prob) (vp9_prob_cost[prob])
 
-#define vp9_cost_zero(x) (vp9_prob_cost[x])
-#define vp9_cost_one(x) vp9_cost_zero(vp9_complement(x))
+#define vp9_cost_one(prob) vp9_cost_zero(vp9_complement(prob))
 
-#define vp9_cost_bit(x, b) vp9_cost_zero((b) ? vp9_complement(x) : (x))
+#define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? vp9_complement(prob) \
+                                                    : (prob))
 
-/* VP8BC version is scaled by 2^20 rather than 2^8; see bool_coder.h */
-
-
-/* Both of these return bits, not scaled bits. */
 static INLINE unsigned int cost_branch256(const unsigned int ct[2],
                                           vp9_prob p) {
   return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p);
 }
 
-static INLINE unsigned int cost_branch(const unsigned int ct[2],
-                                       vp9_prob p) {
-  return cost_branch256(ct, p) >> 8;
-}
-
-
 static INLINE void treed_write(vp9_writer *w,
                                vp9_tree tree, const vp9_prob *probs,
                                int bits, int len) {

diff --git a/vp9/encoder/x86/vp9_dct_sse2.c b/vp9/encoder/x86/vp9_dct_sse2.c
index dc11501..fefca66 100644
--- a/vp9/encoder/x86/vp9_dct_sse2.c
+++ b/vp9/encoder/x86/vp9_dct_sse2.c

@@ -206,12 +206,12 @@
   const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
   __m128i u[8], v[8];
   __m128i in7 = _mm_add_epi16(in[0], in[1]);
-  in7 = _mm_sub_epi16(in7, in[3]);
 
   u[0] = _mm_unpacklo_epi16(in[0], in[1]);
   u[1] = _mm_unpacklo_epi16(in[2], in[3]);
   u[2] = _mm_unpacklo_epi16(in7, kZero);
   u[3] = _mm_unpacklo_epi16(in[2], kZero);
+  u[4] = _mm_unpacklo_epi16(in[3], kZero);
 
   v[0] = _mm_madd_epi16(u[0], k__sinpi_p01_p02);  // s0 + s2
   v[1] = _mm_madd_epi16(u[1], k__sinpi_p03_p04);  // s4 + s5
@@ -219,9 +219,10 @@
   v[3] = _mm_madd_epi16(u[0], k__sinpi_p04_m01);  // s1 - s3
   v[4] = _mm_madd_epi16(u[1], k__sinpi_m03_p02);  // -s4 + s6
   v[5] = _mm_madd_epi16(u[3], k__sinpi_p03_p03);  // s4
+  v[6] = _mm_madd_epi16(u[4], k__sinpi_p03_p03);
 
   u[0] = _mm_add_epi32(v[0], v[1]);
-  u[1] = v[2];
+  u[1] = _mm_sub_epi32(v[2], v[6]);
   u[2] = _mm_add_epi32(v[3], v[4]);
   u[3] = _mm_sub_epi32(u[2], u[0]);
   u[4] = _mm_slli_epi32(v[5], 2);

diff --git a/vp9/vp9_common.mk b/vp9/vp9_common.mk
index 2dd2bf0..c566765 100644
--- a/vp9/vp9_common.mk
+++ b/vp9/vp9_common.mk

@@ -21,7 +21,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_entropy.c
 VP9_COMMON_SRCS-yes += common/vp9_entropymode.c
 VP9_COMMON_SRCS-yes += common/vp9_entropymv.c
-VP9_COMMON_SRCS-yes += common/vp9_extend.c
 VP9_COMMON_SRCS-yes += common/vp9_filter.c
 VP9_COMMON_SRCS-yes += common/vp9_filter.h
 VP9_COMMON_SRCS-yes += common/vp9_findnearmv.c
@@ -34,7 +33,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_entropymode.h
 VP9_COMMON_SRCS-yes += common/vp9_entropymv.h
 VP9_COMMON_SRCS-yes += common/vp9_enums.h
-VP9_COMMON_SRCS-yes += common/vp9_extend.h
 VP9_COMMON_SRCS-yes += common/vp9_findnearmv.h
 VP9_COMMON_SRCS-yes += common/vp9_idct.h
 VP9_COMMON_SRCS-yes += common/vp9_loopfilter.h
@@ -47,7 +45,6 @@
 VP9_COMMON_SRCS-yes += common/vp9_reconintra.h
 VP9_COMMON_SRCS-yes += common/vp9_rtcd.c
 VP9_COMMON_SRCS-yes += common/vp9_rtcd_defs.sh
-VP9_COMMON_SRCS-yes += common/vp9_sadmxn.h
 VP9_COMMON_SRCS-yes += common/vp9_scale.h
 VP9_COMMON_SRCS-yes += common/vp9_scale.c
 VP9_COMMON_SRCS-yes += common/vp9_seg_common.h
@@ -123,6 +120,7 @@
 
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_idct16x16_neon.c
+VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_16_neon.c
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_convolve8_avg_neon$(ASM)
 VP9_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/vp9_loopfilter_neon$(ASM)

diff --git a/vp9/vp9cx.mk b/vp9/vp9cx.mk
index 0993c6c..bd13518 100644
--- a/vp9/vp9cx.mk
+++ b/vp9/vp9cx.mk

@@ -23,17 +23,17 @@
 VP9_CX_SRCS-yes += encoder/vp9_dct.h
 VP9_CX_SRCS-yes += encoder/vp9_encodeframe.c
 VP9_CX_SRCS-yes += encoder/vp9_encodeframe.h
-VP9_CX_SRCS-yes += encoder/vp9_encodeintra.c
 VP9_CX_SRCS-yes += encoder/vp9_encodemb.c
 VP9_CX_SRCS-yes += encoder/vp9_encodemv.c
+VP9_CX_SRCS-yes += encoder/vp9_extend.c
 VP9_CX_SRCS-yes += encoder/vp9_firstpass.c
 VP9_CX_SRCS-yes += encoder/vp9_block.h
 VP9_CX_SRCS-yes += encoder/vp9_boolhuff.h
 VP9_CX_SRCS-yes += encoder/vp9_write_bit_buffer.h
 VP9_CX_SRCS-yes += encoder/vp9_bitstream.h
-VP9_CX_SRCS-yes += encoder/vp9_encodeintra.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemb.h
 VP9_CX_SRCS-yes += encoder/vp9_encodemv.h
+VP9_CX_SRCS-yes += encoder/vp9_extend.h
 VP9_CX_SRCS-yes += encoder/vp9_firstpass.h
 VP9_CX_SRCS-yes += encoder/vp9_lookahead.c
 VP9_CX_SRCS-yes += encoder/vp9_lookahead.h
@@ -44,6 +44,7 @@
 VP9_CX_SRCS-yes += encoder/vp9_quantize.h
 VP9_CX_SRCS-yes += encoder/vp9_ratectrl.h
 VP9_CX_SRCS-yes += encoder/vp9_rdopt.h
+VP9_CX_SRCS-yes += encoder/vp9_sadmxn.h
 VP9_CX_SRCS-yes += encoder/vp9_tokenize.h
 VP9_CX_SRCS-yes += encoder/vp9_treewriter.h
 VP9_CX_SRCS-yes += encoder/vp9_variance.h

diff --git a/vp9/vp9dx.mk b/vp9/vp9dx.mk
index 3a27cdd..7e76682 100644
--- a/vp9/vp9dx.mk
+++ b/vp9/vp9dx.mk

@@ -19,8 +19,8 @@
 
 VP9_DX_SRCS-yes += decoder/vp9_dboolhuff.c
 VP9_DX_SRCS-yes += decoder/vp9_decodemv.c
-VP9_DX_SRCS-yes += decoder/vp9_decodframe.c
-VP9_DX_SRCS-yes += decoder/vp9_decodframe.h
+VP9_DX_SRCS-yes += decoder/vp9_decodeframe.c
+VP9_DX_SRCS-yes += decoder/vp9_decodeframe.h
 VP9_DX_SRCS-yes += decoder/vp9_detokenize.c
 VP9_DX_SRCS-yes += decoder/vp9_dboolhuff.h
 VP9_DX_SRCS-yes += decoder/vp9_read_bit_buffer.h

diff --git a/vp9_spatial_scalable_encoder.c b/vp9_spatial_scalable_encoder.c
index 9acfa29..9aaec82 100644
--- a/vp9_spatial_scalable_encoder.c
+++ b/vp9_spatial_scalable_encoder.c

@@ -19,12 +19,12 @@
 #include <string.h>
 #include <time.h>
 #include "./args.h"
+#include "./ivfenc.h"
+#include "./tools_common.h"
 #include "vpx/svc_context.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
 
-#define VP90_FOURCC 0x30395056
-
 static const struct arg_enum_list encoding_mode_enum[] = {
   {"i", INTER_LAYER_PREDICTION_I},
   {"alt-ip", ALT_INTER_LAYER_PREDICTION_IP},
@@ -77,25 +77,13 @@
 static const int default_use_dummy_frame = 1;
 
 typedef struct {
-  char *input_filename;
   char *output_filename;
   uint32_t frames_to_code;
   uint32_t frames_to_skip;
+  struct VpxInputContext input_ctx;
 } AppInput;
 
-static void mem_put_le16(char *mem, uint32_t val) {
-  mem[0] = val;
-  mem[1] = val >> 8;
-}
-
-static void mem_put_le32(char *mem, uint32_t val) {
-  mem[0] = val;
-  mem[1] = val >> 8;
-  mem[2] = val >> 16;
-  mem[3] = val >> 24;
-}
-
-static void usage(const char *exec_name) {
+void usage_exit(const char *exec_name) {
   fprintf(stderr, "Usage: %s <options> input_filename output_filename\n",
           exec_name);
   fprintf(stderr, "Options:\n");
@@ -103,15 +91,6 @@
   exit(EXIT_FAILURE);
 }
 
-void die(const char *fmt, ...) {
-  va_list ap;
-
-  va_start(ap, fmt);
-  vfprintf(stderr, fmt, ap);
-  if (fmt[strlen(fmt) - 1] != '\n') printf("\n");
-  exit(EXIT_FAILURE);
-}
-
 static void die_codec(vpx_codec_ctx_t *ctx, const char *s) {
   const char *detail = vpx_codec_error_detail(ctx);
 
@@ -120,83 +99,12 @@
   exit(EXIT_FAILURE);
 }
 
-static int read_frame(FILE *f, vpx_image_t *img) {
-  size_t nbytes;
-  int res = 1;
-  int plane;
-
-  for (plane = 0; plane < 3; ++plane) {
-    uint8_t *ptr;
-    const int w = (plane ? (1 + img->d_w) / 2 : img->d_w);
-    const int h = (plane ? (1 + img->d_h) / 2 : img->d_h);
-    int r;
-
-    switch (plane) {
-      case 1:
-        ptr = img->planes[VPX_PLANE_U];
-        break;
-      case 2:
-        ptr = img->planes[VPX_PLANE_V];
-        break;
-      default:
-        ptr = img->planes[plane];
-    }
-    for (r = 0; r < h; ++r) {
-      const int to_read = w;
-
-      nbytes = fread(ptr, 1, to_read, f);
-      if (nbytes != to_read) {
-        res = 0;
-        if (nbytes > 0)
-          printf("Warning: Read partial frame. Check your width & height!\n");
-        break;
-      }
-      ptr += img->stride[plane];
-    }
-    if (!res) break;
-  }
-  return res;
-}
-
 static int create_dummy_frame(vpx_image_t *img) {
   const size_t buf_size = img->w * img->h * 3 / 2;
   memset(img->planes[0], 129, buf_size);
   return 1;
 }
 
-static void write_ivf_file_header(FILE *outfile,
-                                  uint32_t width, uint32_t height,
-                                  int timebase_num, int timebase_den,
-                                  int frame_cnt) {
-  char header[32];
-
-  header[0] = 'D';
-  header[1] = 'K';
-  header[2] = 'I';
-  header[3] = 'F';
-  mem_put_le16(header + 4, 0);             /* version */
-  mem_put_le16(header + 6, 32);            /* headersize */
-  mem_put_le32(header + 8, VP90_FOURCC);   /* fourcc */
-  mem_put_le16(header + 12, width);        /* width */
-  mem_put_le16(header + 14, height);       /* height */
-  mem_put_le32(header + 16, timebase_den); /* rate */
-  mem_put_le32(header + 20, timebase_num); /* scale */
-  mem_put_le32(header + 24, frame_cnt);    /* length */
-  mem_put_le32(header + 28, 0);            /* unused */
-
-  (void)fwrite(header, 1, 32, outfile);
-}
-
-static void write_ivf_frame_header(FILE *outfile, vpx_codec_pts_t pts,
-                                   size_t sz) {
-  char header[12];
-  mem_put_le32(header, (uint32_t)sz);
-  mem_put_le32(header + 4, pts & 0xFFFFFFFF);
-  mem_put_le32(header + 8, pts >> 32);
-
-  (void)fwrite(header, 1, 12, outfile);
-}
-
 static void parse_command_line(int argc, const char **argv_,
                                AppInput *app_input, SvcContext *svc_ctx,
                                vpx_codec_enc_cfg_t *enc_cfg) {
@@ -272,9 +180,9 @@
       die("Error: Unrecognized option %s\n", *argi);
 
   if (argv[0] == NULL || argv[1] == 0) {
-    usage(argv_[0]);
+    usage_exit(argv_[0]);
   }
-  app_input->input_filename = argv[0];
+  app_input->input_ctx.filename = argv[0];
   app_input->output_filename = argv[1];
   free(argv);
 
@@ -298,7 +206,7 @@
 
 int main(int argc, const char **argv) {
   AppInput app_input = {0};
-  FILE *infile, *outfile;
+  FILE *outfile;
   vpx_codec_ctx_t codec;
   vpx_codec_enc_cfg_t enc_cfg;
   SvcContext svc_ctx;
@@ -308,6 +216,8 @@
   vpx_codec_err_t res;
   int pts = 0;            /* PTS starts at 0 */
   int frame_duration = 1; /* 1 timebase tick per frame */
+  vpx_codec_cx_pkt_t packet = {0};
+  packet.kind = VPX_CODEC_CX_FRAME_PKT;
 
   memset(&svc_ctx, 0, sizeof(svc_ctx));
   svc_ctx.log_print = 1;
@@ -317,8 +227,8 @@
   if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, enc_cfg.g_w, enc_cfg.g_h, 32))
     die("Failed to allocate image %dx%d\n", enc_cfg.g_w, enc_cfg.g_h);
 
-  if (!(infile = fopen(app_input.input_filename, "rb")))
-    die("Failed to open %s for reading\n", app_input.input_filename);
+  if (!(app_input.input_ctx.file = fopen(app_input.input_ctx.filename, "rb")))
+    die("Failed to open %s for reading\n", app_input.input_ctx.filename);
 
   if (!(outfile = fopen(app_input.output_filename, "wb")))
     die("Failed to open %s for writing\n", app_input.output_filename);
@@ -328,12 +238,11 @@
       VPX_CODEC_OK)
     die("Failed to initialize encoder\n");
 
-  write_ivf_file_header(outfile, enc_cfg.g_w, enc_cfg.g_h,
-                        enc_cfg.g_timebase.num, enc_cfg.g_timebase.den, 0);
+  ivf_write_file_header(outfile, &enc_cfg, VP9_FOURCC, 0);
 
   // skip initial frames
   for (i = 0; i < app_input.frames_to_skip; ++i) {
-    read_frame(infile, &raw);
+    read_yuv_frame(&app_input.input_ctx, &raw);
   }
 
   // Encode frames
@@ -341,7 +250,7 @@
     if (frame_cnt == 0 && svc_ctx.first_frame_full_size) {
       create_dummy_frame(&raw);
     } else {
-      if (!read_frame(infile, &raw)) break;
+      if (!read_yuv_frame(&app_input.input_ctx, &raw)) break;
     }
     res = vpx_svc_encode(&svc_ctx, &codec, &raw, pts, frame_duration,
                          VPX_DL_REALTIME);
@@ -350,7 +259,9 @@
       die_codec(&codec, "Failed to encode frame");
     }
     if (vpx_svc_get_frame_size(&svc_ctx) > 0) {
-      write_ivf_frame_header(outfile, pts, vpx_svc_get_frame_size(&svc_ctx));
+      packet.data.frame.pts = pts;
+      packet.data.frame.sz = vpx_svc_get_frame_size(&svc_ctx);
+      ivf_write_frame_header(outfile, &packet);
       (void)fwrite(vpx_svc_get_buffer(&svc_ctx), 1,
                    vpx_svc_get_frame_size(&svc_ctx), outfile);
     }
@@ -360,14 +271,12 @@
 
   printf("Processed %d frames\n", frame_cnt - svc_ctx.first_frame_full_size);
 
-  fclose(infile);
+  fclose(app_input.input_ctx.file);
   if (vpx_codec_destroy(&codec)) die_codec(&codec, "Failed to destroy codec");
 
   // rewrite the output file headers with the actual frame count
   if (!fseek(outfile, 0, SEEK_SET)) {
-    write_ivf_file_header(outfile, enc_cfg.g_w, enc_cfg.g_h,
-                          enc_cfg.g_timebase.num, enc_cfg.g_timebase.den,
-                          frame_cnt);
+    ivf_write_file_header(outfile, &enc_cfg, VP9_FOURCC, frame_cnt);
   }
   fclose(outfile);
   vpx_img_free(&raw);

diff --git a/vpx_scale/generic/yv12config.c b/vpx_scale/generic/yv12config.c
index a89e29d..7c3f7ec 100644
--- a/vpx_scale/generic/yv12config.c
+++ b/vpx_scale/generic/yv12config.c

@@ -148,7 +148,10 @@
 #else
     const int frame_size = yplane_size + 2 * uvplane_size;
 #endif
-    if (!ybf->buffer_alloc) {
+    if (frame_size > ybf->buffer_alloc_sz) {
+      // Allocation to hold larger frame, or first allocation.
+      if (ybf->buffer_alloc)
+        vpx_free(ybf->buffer_alloc);
       ybf->buffer_alloc = vpx_memalign(32, frame_size);
       ybf->buffer_alloc_sz = frame_size;
     }

diff --git a/vpxdec.c b/vpxdec.c
index 110e4ac..dc2eec8 100644
--- a/vpxdec.c
+++ b/vpxdec.c

@@ -8,10 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-
-/* This is a simple program that reads ivf files and decodes them
- * using the new interface. Decoded frames are output as YV12 raw.
- */
 #include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -19,27 +15,34 @@
 #include <string.h>
 #include <limits.h>
 
+#include "third_party/libyuv/include/libyuv/scale.h"
+
+#include "./args.h"
+#include "./ivfdec.h"
+
 #define VPX_CODEC_DISABLE_COMPAT 1
-#include "vpx_config.h"
+#include "./vpx_config.h"
 #include "vpx/vpx_decoder.h"
 #include "vpx_ports/vpx_timer.h"
+
 #if CONFIG_VP8_DECODER || CONFIG_VP9_DECODER
 #include "vpx/vp8dx.h"
 #endif
+
 #if CONFIG_MD5
-#include "md5_utils.h"
+#include "./md5_utils.h"
 #endif
-#include "tools_common.h"
-#include "nestegg/include/nestegg/nestegg.h"
-#include "third_party/libyuv/include/libyuv/scale.h"
+
+#include "./tools_common.h"
+#include "./webmdec.h"
 
 static const char *exec_name;
 
 static const struct {
   char const *name;
   const vpx_codec_iface_t *(*iface)(void);
-  unsigned int             fourcc;
-  unsigned int             fourcc_mask;
+  uint32_t fourcc;
+  uint32_t fourcc_mask;
 } ifaces[] = {
 #if CONFIG_VP8_DECODER
   {"vp8",  vpx_codec_vp8_dx,   VP8_FOURCC_MASK, 0x00FFFFFF},
@@ -49,7 +52,11 @@
 #endif
 };
 
-#include "args.h"
+struct VpxDecInputContext {
+  struct VpxInputContext *vpx_input_ctx;
+  struct WebmInputContext *webm_ctx;
+};
+
 static const arg_def_t looparg = ARG_DEF(NULL, "loops", 1,
                                           "Number of times to decode the file");
 static const arg_def_t codecarg = ARG_DEF(NULL, "codec", 1,
@@ -161,123 +168,61 @@
   exit(EXIT_FAILURE);
 }
 
-static unsigned int mem_get_le16(const void *vmem) {
-  unsigned int  val;
-  const unsigned char *mem = (const unsigned char *)vmem;
+static int read_frame(struct VpxDecInputContext *input,
+                      uint8_t **buf,
+                      size_t *bytes_in_buffer,
+                      size_t *buffer_size) {
+  char raw_hdr[RAW_FRAME_HDR_SZ];
+  size_t bytes_to_read = 0;
+  FILE *infile = input->vpx_input_ctx->file;
+  enum VideoFileType kind = input->vpx_input_ctx->file_type;
+  if (kind == FILE_TYPE_WEBM) {
+    return webm_read_frame(input->webm_ctx,
+                           buf, bytes_in_buffer, buffer_size);
+  } else if (kind == FILE_TYPE_RAW) {
+    if (fread(raw_hdr, RAW_FRAME_HDR_SZ, 1, infile) != 1) {
+      if (!feof(infile))
+        warn("Failed to read RAW frame size\n");
+    } else {
+      const int kCorruptFrameThreshold = 256 * 1024 * 1024;
+      const int kFrameTooSmallThreshold = 256 * 1024;
+      bytes_to_read = mem_get_le32(raw_hdr);
 
-  val = mem[1] << 8;
-  val |= mem[0];
-  return val;
-}
+      if (bytes_to_read > kCorruptFrameThreshold) {
+        warn("Read invalid frame size (%u)\n", (unsigned int)bytes_to_read);
+        bytes_to_read = 0;
+      }
 
-static unsigned int mem_get_le32(const void *vmem) {
-  unsigned int  val;
-  const unsigned char *mem = (const unsigned char *)vmem;
+      if (kind == FILE_TYPE_RAW && bytes_to_read < kFrameTooSmallThreshold) {
+        warn("Warning: Read invalid frame size (%u) - not a raw file?\n",
+             (unsigned int)bytes_to_read);
+      }
 
-  val = mem[3] << 24;
-  val |= mem[2] << 16;
-  val |= mem[1] << 8;
-  val |= mem[0];
-  return val;
-}
+      if (bytes_to_read > *buffer_size) {
+        uint8_t *new_buf = realloc(*buf, 2 * bytes_to_read);
 
-enum file_kind {
-  RAW_FILE,
-  IVF_FILE,
-  WEBM_FILE
-};
-
-struct input_ctx {
-  enum file_kind  kind;
-  FILE           *infile;
-  nestegg        *nestegg_ctx;
-  nestegg_packet *pkt;
-  unsigned int    chunk;
-  unsigned int    chunks;
-  unsigned int    video_track;
-};
-
-#define IVF_FRAME_HDR_SZ (sizeof(uint32_t) + sizeof(uint64_t))
-#define RAW_FRAME_HDR_SZ (sizeof(uint32_t))
-static int read_frame(struct input_ctx      *input,
-                      uint8_t               **buf,
-                      size_t                *buf_sz,
-                      size_t                *buf_alloc_sz) {
-  char            raw_hdr[IVF_FRAME_HDR_SZ];
-  size_t          new_buf_sz;
-  FILE           *infile = input->infile;
-  enum file_kind  kind = input->kind;
-  if (kind == WEBM_FILE) {
-    if (input->chunk >= input->chunks) {
-      unsigned int track;
-
-      do {
-        /* End of this packet, get another. */
-        if (input->pkt)
-          nestegg_free_packet(input->pkt);
-
-        if (nestegg_read_packet(input->nestegg_ctx, &input->pkt) <= 0
-            || nestegg_packet_track(input->pkt, &track))
-          return 1;
-
-      } while (track != input->video_track);
-
-      if (nestegg_packet_count(input->pkt, &input->chunks))
-        return 1;
-      input->chunk = 0;
-    }
-
-    if (nestegg_packet_data(input->pkt, input->chunk, buf, buf_sz))
-      return 1;
-    input->chunk++;
-
-    return 0;
-  }
-  /* For both the raw and ivf formats, the frame size is the first 4 bytes
-   * of the frame header. We just need to special case on the header
-   * size.
-   */
-  else if (fread(raw_hdr, kind == IVF_FILE
-                 ? IVF_FRAME_HDR_SZ : RAW_FRAME_HDR_SZ, 1, infile) != 1) {
-    if (!feof(infile))
-      fprintf(stderr, "Failed to read frame size\n");
-
-    new_buf_sz = 0;
-  } else {
-    new_buf_sz = mem_get_le32(raw_hdr);
-
-    if (new_buf_sz > 256 * 1024 * 1024) {
-      fprintf(stderr, "Error: Read invalid frame size (%u)\n",
-              (unsigned int)new_buf_sz);
-      new_buf_sz = 0;
-    }
-
-    if (kind == RAW_FILE && new_buf_sz > 256 * 1024)
-      fprintf(stderr, "Warning: Read invalid frame size (%u)"
-              " - not a raw file?\n", (unsigned int)new_buf_sz);
-
-    if (new_buf_sz > *buf_alloc_sz) {
-      uint8_t *new_buf = realloc(*buf, 2 * new_buf_sz);
-
-      if (new_buf) {
-        *buf = new_buf;
-        *buf_alloc_sz = 2 * new_buf_sz;
-      } else {
-        fprintf(stderr, "Failed to allocate compressed data buffer\n");
-        new_buf_sz = 0;
+        if (new_buf) {
+          *buf = new_buf;
+          *buffer_size = 2 * bytes_to_read;
+        } else {
+          warn("Failed to allocate compressed data buffer\n");
+          bytes_to_read = 0;
+        }
       }
     }
-  }
 
-  *buf_sz = new_buf_sz;
-
-  if (!feof(infile)) {
-    if (fread(*buf, 1, *buf_sz, infile) != *buf_sz) {
-      fprintf(stderr, "Failed to read full frame\n");
-      return 1;
+    if (!feof(infile)) {
+      if (fread(*buf, 1, bytes_to_read, infile) != bytes_to_read) {
+        warn("Failed to read full frame\n");
+        return 1;
+      }
+      *bytes_in_buffer = bytes_to_read;
     }
 
     return 0;
+  } else if (kind == FILE_TYPE_IVF) {
+    return ivf_read_frame(input->vpx_input_ctx,
+                          buf, bytes_in_buffer, buffer_size);
   }
 
   return 1;
@@ -297,8 +242,7 @@
                           : set_binary_mode(stdout);
 
     if (!outfile) {
-      fprintf(stderr, "Failed to output file");
-      exit(EXIT_FAILURE);
+      fatal("Failed to output file");
     }
   }
 
@@ -334,254 +278,42 @@
   }
 }
 
-unsigned int file_is_ivf(FILE *infile,
-                         unsigned int *fourcc,
-                         unsigned int *width,
-                         unsigned int *height,
-                         unsigned int *fps_den,
-                         unsigned int *fps_num) {
-  char raw_hdr[32];
-  int is_ivf = 0;
-
-  if (fread(raw_hdr, 1, 32, infile) == 32) {
-    if (raw_hdr[0] == 'D' && raw_hdr[1] == 'K'
-        && raw_hdr[2] == 'I' && raw_hdr[3] == 'F') {
-      is_ivf = 1;
-
-      if (mem_get_le16(raw_hdr + 4) != 0)
-        fprintf(stderr, "Error: Unrecognized IVF version! This file may not"
-                " decode properly.");
-
-      *fourcc = mem_get_le32(raw_hdr + 8);
-      *width = mem_get_le16(raw_hdr + 12);
-      *height = mem_get_le16(raw_hdr + 14);
-      *fps_num = mem_get_le32(raw_hdr + 16);
-      *fps_den = mem_get_le32(raw_hdr + 20);
-
-      /* Some versions of vpxenc used 1/(2*fps) for the timebase, so
-       * we can guess the framerate using only the timebase in this
-       * case. Other files would require reading ahead to guess the
-       * timebase, like we do for webm.
-       */
-      if (*fps_num < 1000) {
-        /* Correct for the factor of 2 applied to the timebase in the
-         * encoder.
-         */
-        if (*fps_num & 1)*fps_den <<= 1;
-        else *fps_num >>= 1;
-      } else {
-        /* Don't know FPS for sure, and don't have readahead code
-         * (yet?), so just default to 30fps.
-         */
-        *fps_num = 30;
-        *fps_den = 1;
-      }
-    }
-  }
-
-  if (!is_ivf)
-    rewind(infile);
-
-  return is_ivf;
-}
-
-
-unsigned int file_is_raw(FILE *infile,
-                         unsigned int *fourcc,
-                         unsigned int *width,
-                         unsigned int *height,
-                         unsigned int *fps_den,
-                         unsigned int *fps_num) {
-  unsigned char buf[32];
+int file_is_raw(struct VpxInputContext *input) {
+  uint8_t buf[32];
   int is_raw = 0;
   vpx_codec_stream_info_t si;
 
   si.sz = sizeof(si);
 
-  if (fread(buf, 1, 32, infile) == 32) {
+  if (fread(buf, 1, 32, input->file) == 32) {
     int i;
 
-    if (mem_get_le32(buf) < 256 * 1024 * 1024)
-      for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++)
+    if (mem_get_le32(buf) < 256 * 1024 * 1024) {
+      for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++) {
         if (!vpx_codec_peek_stream_info(ifaces[i].iface(),
                                         buf + 4, 32 - 4, &si)) {
           is_raw = 1;
-          *fourcc = ifaces[i].fourcc;
-          *width = si.w;
-          *height = si.h;
-          *fps_num = 30;
-          *fps_den = 1;
+          input->fourcc = ifaces[i].fourcc;
+          input->width = si.w;
+          input->height = si.h;
+          input->framerate.numerator = 30;
+          input->framerate.denominator = 1;
           break;
         }
+      }
+    }
   }
 
-  rewind(infile);
+  rewind(input->file);
   return is_raw;
 }
 
-
-static int
-nestegg_read_cb(void *buffer, size_t length, void *userdata) {
-  FILE *f = userdata;
-
-  if (fread(buffer, 1, length, f) < length) {
-    if (ferror(f))
-      return -1;
-    if (feof(f))
-      return 0;
-  }
-  return 1;
-}
-
-
-static int
-nestegg_seek_cb(int64_t offset, int whence, void *userdata) {
-  switch (whence) {
-    case NESTEGG_SEEK_SET:
-      whence = SEEK_SET;
-      break;
-    case NESTEGG_SEEK_CUR:
-      whence = SEEK_CUR;
-      break;
-    case NESTEGG_SEEK_END:
-      whence = SEEK_END;
-      break;
-  };
-  return fseek(userdata, (long)offset, whence) ? -1 : 0;
-}
-
-
-static int64_t
-nestegg_tell_cb(void *userdata) {
-  return ftell(userdata);
-}
-
-
-static void
-nestegg_log_cb(nestegg *context, unsigned int severity, char const *format,
-               ...) {
-  va_list ap;
-
-  va_start(ap, format);
-  vfprintf(stderr, format, ap);
-  fprintf(stderr, "\n");
-  va_end(ap);
-}
-
-
-static int
-webm_guess_framerate(struct input_ctx *input,
-                     unsigned int     *fps_den,
-                     unsigned int     *fps_num) {
-  unsigned int i;
-  uint64_t     tstamp = 0;
-
-  /* Check to see if we can seek before we parse any data. */
-  if (nestegg_track_seek(input->nestegg_ctx, input->video_track, 0)) {
-    fprintf(stderr,
-            "WARNING: Failed to guess framerate (no Cues), set to 30fps.\n");
-    *fps_num = 30;
-    *fps_den = 1;
-    return 0;
-  }
-
-  /* Guess the framerate. Read up to 1 second, or 50 video packets,
-   * whichever comes first.
-   */
-  for (i = 0; tstamp < 1000000000 && i < 50;) {
-    nestegg_packet *pkt;
-    unsigned int track;
-
-    if (nestegg_read_packet(input->nestegg_ctx, &pkt) <= 0)
-      break;
-
-    nestegg_packet_track(pkt, &track);
-    if (track == input->video_track) {
-      nestegg_packet_tstamp(pkt, &tstamp);
-      i++;
-    }
-
-    nestegg_free_packet(pkt);
-  }
-
-  if (nestegg_track_seek(input->nestegg_ctx, input->video_track, 0))
-    goto fail;
-
-  *fps_num = (i - 1) * 1000000;
-  *fps_den = (unsigned int)(tstamp / 1000);
-  return 0;
-fail:
-  nestegg_destroy(input->nestegg_ctx);
-  input->nestegg_ctx = NULL;
-  rewind(input->infile);
-  return 1;
-}
-
-
-static int
-file_is_webm(struct input_ctx *input,
-             unsigned int     *fourcc,
-             unsigned int     *width,
-             unsigned int     *height,
-             unsigned int     *fps_den,
-             unsigned int     *fps_num) {
-  unsigned int i, n;
-  int          track_type = -1;
-  int          codec_id;
-
-  nestegg_io io = {nestegg_read_cb, nestegg_seek_cb, nestegg_tell_cb, 0};
-  nestegg_video_params params;
-
-  io.userdata = input->infile;
-  if (nestegg_init(&input->nestegg_ctx, io, NULL))
-    goto fail;
-
-  if (nestegg_track_count(input->nestegg_ctx, &n))
-    goto fail;
-
-  for (i = 0; i < n; i++) {
-    track_type = nestegg_track_type(input->nestegg_ctx, i);
-
-    if (track_type == NESTEGG_TRACK_VIDEO)
-      break;
-    else if (track_type < 0)
-      goto fail;
-  }
-
-  codec_id = nestegg_track_codec_id(input->nestegg_ctx, i);
-  if (codec_id == NESTEGG_CODEC_VP8) {
-    *fourcc = VP8_FOURCC_MASK;
-  } else if (codec_id == NESTEGG_CODEC_VP9) {
-    *fourcc = VP9_FOURCC_MASK;
-  } else {
-    fprintf(stderr, "Not VPx video, quitting.\n");
-    exit(1);
-  }
-
-  input->video_track = i;
-
-  if (nestegg_track_video_params(input->nestegg_ctx, i, &params))
-    goto fail;
-
-  *fps_den = 0;
-  *fps_num = 0;
-  *width = params.width;
-  *height = params.height;
-  return 1;
-fail:
-  input->nestegg_ctx = NULL;
-  rewind(input->infile);
-  return 0;
-}
-
-
 void show_progress(int frame_in, int frame_out, unsigned long dx_time) {
   fprintf(stderr, "%d decoded frames/%d showed frames in %lu us (%.2f fps)\r",
           frame_in, frame_out, dx_time,
           (float)frame_out * 1000000.0 / (float)dx_time);
 }
 
-
 void generate_filename(const char *pattern, char *out, size_t q_len,
                        unsigned int d_w, unsigned int d_h,
                        unsigned int frame_in) {
@@ -663,18 +395,18 @@
 
 
 int main_loop(int argc, const char **argv_) {
-  vpx_codec_ctx_t          decoder;
+  vpx_codec_ctx_t       decoder;
   char                  *fn = NULL;
   int                    i;
   uint8_t               *buf = NULL;
-  size_t                 buf_sz = 0, buf_alloc_sz = 0;
+  size_t                 bytes_in_buffer = 0, buffer_size = 0;
   FILE                  *infile;
-  int                    frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0, do_md5 = 0, progress = 0;
+  int                    frame_in = 0, frame_out = 0, flipuv = 0, noblit = 0;
+  int                    do_md5 = 0, progress = 0;
   int                    stop_after = 0, postproc = 0, summary = 0, quiet = 1;
   int                    arg_skip = 0;
   int                    ec_enabled = 0;
   vpx_codec_iface_t       *iface = NULL;
-  unsigned int           fourcc;
   unsigned long          dx_time = 0;
   struct arg               arg;
   char                   **argv, **argi, **argj;
@@ -682,10 +414,6 @@
   char                    outfile[PATH_MAX];
   int                     single_file;
   int                     use_y4m = 1;
-  unsigned int            width;
-  unsigned int            height;
-  unsigned int            fps_den;
-  unsigned int            fps_num;
   void                   *out = NULL;
   vpx_codec_dec_cfg_t     cfg = {0};
 #if CONFIG_VP8_DECODER
@@ -695,7 +423,6 @@
   int                     vp8_dbg_color_b_modes = 0;
   int                     vp8_dbg_display_mv = 0;
 #endif
-  struct input_ctx        input = {0};
   int                     frames_corrupted = 0;
   int                     dec_flags = 0;
   int                     do_scale = 0;
@@ -703,6 +430,12 @@
   vpx_image_t             *scaled_img = NULL;
   int                     frame_avail, got_data;
 
+  struct VpxDecInputContext input = {0};
+  struct VpxInputContext vpx_input_ctx = {0};
+  struct WebmInputContext webm_ctx = {0};
+  input.vpx_input_ctx = &vpx_input_ctx;
+  input.webm_ctx = &webm_ctx;
+
   /* Parse command line */
   exec_name = argv_[0];
   argv = argv_dup(argc - 1, argv_ + 1);
@@ -840,14 +573,13 @@
     return EXIT_FAILURE;
   }
 #endif
-  input.infile = infile;
-  if (file_is_ivf(infile, &fourcc, &width, &height, &fps_den,
-                  &fps_num))
-    input.kind = IVF_FILE;
-  else if (file_is_webm(&input, &fourcc, &width, &height, &fps_den, &fps_num))
-    input.kind = WEBM_FILE;
-  else if (file_is_raw(infile, &fourcc, &width, &height, &fps_den, &fps_num))
-    input.kind = RAW_FILE;
+  input.vpx_input_ctx->file = infile;
+  if (file_is_ivf(input.vpx_input_ctx))
+    input.vpx_input_ctx->file_type = FILE_TYPE_IVF;
+  else if (file_is_webm(input.webm_ctx, input.vpx_input_ctx))
+    input.vpx_input_ctx->file_type = FILE_TYPE_WEBM;
+  else if (file_is_raw(input.vpx_input_ctx))
+    input.vpx_input_ctx->file_type = FILE_TYPE_RAW;
   else {
     fprintf(stderr, "Unrecognized input file type.\n");
     return EXIT_FAILURE;
@@ -874,7 +606,7 @@
 
   if (single_file && !noblit) {
     generate_filename(outfile_pattern, outfile, sizeof(outfile) - 1,
-                      width, height, 0);
+                      vpx_input_ctx.width, vpx_input_ctx.height, 0);
     out = out_open(outfile, do_md5);
   }
 
@@ -887,8 +619,8 @@
       return EXIT_FAILURE;
     }
 
-    if (input.kind == WEBM_FILE)
-      if (webm_guess_framerate(&input, &fps_den, &fps_num)) {
+    if (vpx_input_ctx.file_type == FILE_TYPE_WEBM)
+      if (webm_guess_framerate(input.webm_ctx, input.vpx_input_ctx)) {
         fprintf(stderr, "Failed to guess framerate -- error parsing "
                 "webm file?\n");
         return EXIT_FAILURE;
@@ -899,21 +631,23 @@
        store one, and neither does VP8.
       That will have to wait until these tools support WebM natively.*/
     snprintf(buffer, sizeof(buffer), "YUV4MPEG2 W%u H%u F%u:%u I%c ",
-             width, height, fps_num, fps_den, 'p');
+             vpx_input_ctx.width, vpx_input_ctx.height,
+             vpx_input_ctx.framerate.numerator,
+             vpx_input_ctx.framerate.denominator,
+             'p');
     out_put(out, (unsigned char *)buffer,
             (unsigned int)strlen(buffer), do_md5);
   }
 
   /* Try to determine the codec from the fourcc. */
   for (i = 0; i < sizeof(ifaces) / sizeof(ifaces[0]); i++)
-    if ((fourcc & ifaces[i].fourcc_mask) == ifaces[i].fourcc) {
-      vpx_codec_iface_t  *ivf_iface = ifaces[i].iface();
+    if ((vpx_input_ctx.fourcc & ifaces[i].fourcc_mask) == ifaces[i].fourcc) {
+      vpx_codec_iface_t *vpx_iface = ifaces[i].iface();
 
-      if (iface && iface != ivf_iface)
-        fprintf(stderr, "Notice -- IVF header indicates codec: %s\n",
-                ifaces[i].name);
+      if (iface && iface != vpx_iface)
+        warn("Header indicates codec: %s\n", ifaces[i].name);
       else
-        iface = ivf_iface;
+        iface = vpx_iface;
 
       break;
     }
@@ -963,10 +697,10 @@
 #endif
 
 
-  if(arg_skip)
+  if (arg_skip)
     fprintf(stderr, "Skiping first %d frames.\n", arg_skip);
   while (arg_skip) {
-    if (read_frame(&input, &buf, &buf_sz, &buf_alloc_sz))
+    if (read_frame(&input, &buf, &bytes_in_buffer, &buffer_size))
       break;
     arg_skip--;
   }
@@ -983,19 +717,19 @@
 
     frame_avail = 0;
     if (!stop_after || frame_in < stop_after) {
-      if(!read_frame(&input, &buf, &buf_sz, &buf_alloc_sz)) {
+      if (!read_frame(&input, &buf, &bytes_in_buffer, &buffer_size)) {
         frame_avail = 1;
         frame_in++;
 
         vpx_usec_timer_start(&timer);
 
-        if (vpx_codec_decode(&decoder, buf, (unsigned int)buf_sz, NULL, 0)) {
+        if (vpx_codec_decode(&decoder, buf, bytes_in_buffer, NULL, 0)) {
           const char *detail = vpx_codec_error_detail(&decoder);
-          fprintf(stderr, "Failed to decode frame: %s\n",
-                  vpx_codec_error(&decoder));
+          warn("Failed to decode frame %d: %s",
+               frame_in, vpx_codec_error(&decoder));
 
           if (detail)
-            fprintf(stderr, "  Additional information: %s\n", detail);
+            warn("Additional information: %s", detail);
           goto fail;
         }
 
@@ -1016,8 +750,7 @@
     dx_time += (unsigned int)vpx_usec_timer_elapsed(&timer);
 
     if (vpx_codec_control(&decoder, VP8D_GET_FRAME_CORRUPTED, &corrupted)) {
-      fprintf(stderr, "Failed VP8_GET_FRAME_CORRUPTED: %s\n",
-              vpx_codec_error(&decoder));
+      warn("Failed VP8_GET_FRAME_CORRUPTED: %s", vpx_codec_error(&decoder));
       goto fail;
     }
     frames_corrupted += corrupted;
@@ -1131,10 +864,11 @@
   if (single_file && !noblit)
     out_close(out, outfile, do_md5);
 
-  if (input.nestegg_ctx)
-    nestegg_destroy(input.nestegg_ctx);
-  if (input.kind != WEBM_FILE)
+  if (input.vpx_input_ctx->file_type == FILE_TYPE_WEBM)
+    webm_free(input.webm_ctx);
+  else
     free(buf);
+
   fclose(infile);
   free(argv);
 

diff --git a/vpxenc.c b/vpxenc.c
index 674da14..2d92ae8 100644
--- a/vpxenc.c
+++ b/vpxenc.c

@@ -10,32 +10,23 @@
 
 #include "./vpx_config.h"
 
-#if defined(_WIN32) || defined(__OS2__) || !CONFIG_OS_SUPPORT
-#define USE_POSIX_MMAP 0
-#else
-#define USE_POSIX_MMAP 1
-#endif
-
+#include <assert.h>
+#include <limits.h>
 #include <math.h>
+#include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <stdarg.h>
 #include <string.h>
-#include <limits.h>
-#include <assert.h>
+
 #include "vpx/vpx_encoder.h"
 #if CONFIG_DECODERS
 #include "vpx/vpx_decoder.h"
 #endif
-#if USE_POSIX_MMAP
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <fcntl.h>
-#include <unistd.h>
-#endif
 
 #include "third_party/libyuv/include/libyuv/scale.h"
+#include "./args.h"
+#include "./ivfdec.h"
+#include "./ivfenc.h"
 
 #if CONFIG_VP8_ENCODER || CONFIG_VP9_ENCODER
 #include "vpx/vp8cx.h"
@@ -118,199 +109,28 @@
   va_end(ap);
 }
 
-enum video_file_type {
-  FILE_TYPE_RAW,
-  FILE_TYPE_IVF,
-  FILE_TYPE_Y4M
-};
-
-struct detect_buffer {
-  char buf[4];
-  size_t buf_read;
-  size_t position;
-};
-
-
-struct input_state {
-  char                 *fn;
-  FILE                 *file;
-  off_t                 length;
-  y4m_input             y4m;
-  struct detect_buffer  detect;
-  enum video_file_type  file_type;
-  unsigned int          w;
-  unsigned int          h;
-  struct vpx_rational   framerate;
-  int                   use_i420;
-  int                   only_i420;
-};
-
-#define IVF_FRAME_HDR_SZ (4+8) /* 4 byte size + 8 byte timestamp */
-static int read_frame(struct input_state *input, vpx_image_t *img) {
-  FILE *f = input->file;
-  enum video_file_type file_type = input->file_type;
-  y4m_input *y4m = &input->y4m;
-  struct detect_buffer *detect = &input->detect;
-  int plane = 0;
+int read_frame(struct VpxInputContext *input_ctx, vpx_image_t *img) {
+  FILE *f = input_ctx->file;
+  y4m_input *y4m = &input_ctx->y4m;
   int shortread = 0;
 
-  if (file_type == FILE_TYPE_Y4M) {
+  if (input_ctx->file_type == FILE_TYPE_Y4M) {
     if (y4m_input_fetch_frame(y4m, f, img) < 1)
       return 0;
   } else {
-    if (file_type == FILE_TYPE_IVF) {
-      char junk[IVF_FRAME_HDR_SZ];
-
-      /* Skip the frame header. We know how big the frame should be. See
-       * write_ivf_frame_header() for documentation on the frame header
-       * layout.
-       */
-      (void) fread(junk, 1, IVF_FRAME_HDR_SZ, f);
-    }
-
-    for (plane = 0; plane < 3; plane++) {
-      unsigned char *ptr;
-      int w = (plane ? (1 + img->d_w) / 2 : img->d_w);
-      int h = (plane ? (1 + img->d_h) / 2 : img->d_h);
-      int r;
-
-      /* Determine the correct plane based on the image format. The for-loop
-       * always counts in Y,U,V order, but this may not match the order of
-       * the data on disk.
-       */
-      switch (plane) {
-        case 1:
-          ptr = img->planes[img->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_V : VPX_PLANE_U];
-          break;
-        case 2:
-          ptr = img->planes[img->fmt == VPX_IMG_FMT_YV12 ? VPX_PLANE_U : VPX_PLANE_V];
-          break;
-        default:
-          ptr = img->planes[plane];
-      }
-
-      for (r = 0; r < h; r++) {
-        size_t needed = w;
-        size_t buf_position = 0;
-        const size_t left = detect->buf_read - detect->position;
-        if (left > 0) {
-          const size_t more = (left < needed) ? left : needed;
-          memcpy(ptr, detect->buf + detect->position, more);
-          buf_position = more;
-          needed -= more;
-          detect->position += more;
-        }
-        if (needed > 0) {
-          shortread |= (fread(ptr + buf_position, 1, needed, f) < needed);
-        }
-
-        ptr += img->stride[plane];
-      }
-    }
+    shortread = read_yuv_frame(input_ctx, img);
   }
 
   return !shortread;
 }
 
-
-unsigned int file_is_y4m(FILE      *infile,
-                         y4m_input *y4m,
-                         char       detect[4]) {
+int file_is_y4m(FILE *infile, y4m_input *y4m, const char detect[4]) {
   if (memcmp(detect, "YUV4", 4) == 0) {
     return 1;
   }
   return 0;
 }
 
-#define IVF_FILE_HDR_SZ (32)
-unsigned int file_is_ivf(struct input_state *input,
-                         unsigned int *fourcc) {
-  char raw_hdr[IVF_FILE_HDR_SZ];
-  int is_ivf = 0;
-  FILE *infile = input->file;
-  unsigned int *width = &input->w;
-  unsigned int *height = &input->h;
-  struct detect_buffer *detect = &input->detect;
-
-  if (memcmp(detect->buf, "DKIF", 4) != 0)
-    return 0;
-
-  /* See write_ivf_file_header() for more documentation on the file header
-   * layout.
-   */
-  if (fread(raw_hdr + 4, 1, IVF_FILE_HDR_SZ - 4, infile)
-      == IVF_FILE_HDR_SZ - 4) {
-    {
-      is_ivf = 1;
-
-      if (mem_get_le16(raw_hdr + 4) != 0)
-        warn("Unrecognized IVF version! This file may not decode "
-             "properly.");
-
-      *fourcc = mem_get_le32(raw_hdr + 8);
-    }
-  }
-
-  if (is_ivf) {
-    *width = mem_get_le16(raw_hdr + 12);
-    *height = mem_get_le16(raw_hdr + 14);
-    detect->position = 4;
-  }
-
-  return is_ivf;
-}
-
-
-static void write_ivf_file_header(FILE *outfile,
-                                  const vpx_codec_enc_cfg_t *cfg,
-                                  unsigned int fourcc,
-                                  int frame_cnt) {
-  char header[32];
-
-  if (cfg->g_pass != VPX_RC_ONE_PASS && cfg->g_pass != VPX_RC_LAST_PASS)
-    return;
-
-  header[0] = 'D';
-  header[1] = 'K';
-  header[2] = 'I';
-  header[3] = 'F';
-  mem_put_le16(header + 4,  0);                 /* version */
-  mem_put_le16(header + 6,  32);                /* headersize */
-  mem_put_le32(header + 8,  fourcc);            /* headersize */
-  mem_put_le16(header + 12, cfg->g_w);          /* width */
-  mem_put_le16(header + 14, cfg->g_h);          /* height */
-  mem_put_le32(header + 16, cfg->g_timebase.den); /* rate */
-  mem_put_le32(header + 20, cfg->g_timebase.num); /* scale */
-  mem_put_le32(header + 24, frame_cnt);         /* length */
-  mem_put_le32(header + 28, 0);                 /* unused */
-
-  (void) fwrite(header, 1, 32, outfile);
-}
-
-
-static void write_ivf_frame_header(FILE *outfile,
-                                   const vpx_codec_cx_pkt_t *pkt) {
-  char             header[12];
-  vpx_codec_pts_t  pts;
-
-  if (pkt->kind != VPX_CODEC_CX_FRAME_PKT)
-    return;
-
-  pts = pkt->data.frame.pts;
-  mem_put_le32(header, (int)pkt->data.frame.sz);
-  mem_put_le32(header + 4, pts & 0xFFFFFFFF);
-  mem_put_le32(header + 8, pts >> 32);
-
-  (void) fwrite(header, 1, 12, outfile);
-}
-
-static void write_ivf_frame_size(FILE *outfile, size_t size) {
-  char             header[4];
-  mem_put_le32(header, (int)size);
-  (void) fwrite(header, 1, 4, outfile);
-}
-
-
 
 /* Murmur hash derived from public domain reference implementation at
  *   http:// sites.google.com/site/murmurhash/
@@ -360,7 +180,6 @@
 }
 
 
-#include "args.h"
 static const arg_def_t debugmode = ARG_DEF("D", "debug", 0,
                                            "Debug mode (makes output deterministic)");
 static const arg_def_t outputfile = ARG_DEF("o", "output", 1,
@@ -817,9 +636,9 @@
 };
 
 
-static void init_rate_histogram(struct rate_hist          *hist,
+static void init_rate_histogram(struct rate_hist *hist,
                                 const vpx_codec_enc_cfg_t *cfg,
-                                const vpx_rational_t      *fps) {
+                                const vpx_rational_t *fps) {
   int i;
 
   /* Determine the number of samples in the buffer. Use the file's framerate
@@ -1215,12 +1034,10 @@
 }
 
 
-void open_input_file(struct input_state *input) {
-  unsigned int fourcc;
-
+void open_input_file(struct VpxInputContext *input) {
   /* Parse certain options from the input file, if possible */
-  input->file = strcmp(input->fn, "-") ? fopen(input->fn, "rb")
-                : set_binary_mode(stdin);
+  input->file = strcmp(input->filename, "-")
+      ? fopen(input->filename, "rb") : set_binary_mode(stdin);
 
   if (!input->file)
     fatal("Failed to open input file");
@@ -1244,14 +1061,14 @@
     if (y4m_input_open(&input->y4m, input->file, input->detect.buf, 4,
                        input->only_i420) >= 0) {
       input->file_type = FILE_TYPE_Y4M;
-      input->w = input->y4m.pic_w;
-      input->h = input->y4m.pic_h;
-      input->framerate.num = input->y4m.fps_n;
-      input->framerate.den = input->y4m.fps_d;
+      input->width = input->y4m.pic_w;
+      input->height = input->y4m.pic_h;
+      input->framerate.numerator = input->y4m.fps_n;
+      input->framerate.denominator = input->y4m.fps_d;
       input->use_i420 = 0;
     } else
       fatal("Unsupported Y4M stream.");
-  } else if (input->detect.buf_read == 4 && file_is_ivf(input, &fourcc)) {
+  } else if (input->detect.buf_read == 4 && file_is_ivf(input)) {
     fatal("IVF is not supported as input.");
   } else {
     input->file_type = FILE_TYPE_RAW;
@@ -1259,7 +1076,7 @@
 }
 
 
-static void close_input_file(struct input_state *input) {
+static void close_input_file(struct VpxInputContext *input) {
   fclose(input->file);
   if (input->file_type == FILE_TYPE_Y4M)
     y4m_input_close(&input->y4m);
@@ -1534,7 +1351,7 @@
 
 static void show_stream_config(struct stream_state  *stream,
                                struct global_config *global,
-                               struct input_state   *input) {
+                               struct VpxInputContext *input) {
 
 #define SHOW(field) \
   fprintf(stderr, "    %-28s = %d\n", #field, stream->config.cfg.field)
@@ -1542,7 +1359,7 @@
   if (stream->index == 0) {
     fprintf(stderr, "Codec: %s\n",
             vpx_codec_iface_name(global->codec->iface()));
-    fprintf(stderr, "Source file: %s Format: %s\n", input->fn,
+    fprintf(stderr, "Source file: %s Format: %s\n", input->filename,
             input->use_i420 ? "I420" : "YV12");
   }
   if (stream->next || stream->index)
@@ -1601,7 +1418,7 @@
                            stream->config.stereo_fmt,
                            global->codec->fourcc);
   } else
-    write_ivf_file_header(stream->file, &stream->config.cfg,
+    ivf_write_file_header(stream->file, &stream->config.cfg,
                           global->codec->fourcc, 0);
 }
 
@@ -1614,7 +1431,7 @@
     stream->ebml.cue_list = NULL;
   } else {
     if (!fseek(stream->file, 0, SEEK_SET))
-      write_ivf_file_header(stream->file, &stream->config.cfg,
+      ivf_write_file_header(stream->file, &stream->config.cfg,
                             fourcc,
                             stream->frames_out);
   }
@@ -1774,14 +1591,14 @@
             ivf_header_pos = ftello(stream->file);
             fsize = pkt->data.frame.sz;
 
-            write_ivf_frame_header(stream->file, pkt);
+            ivf_write_frame_header(stream->file, pkt);
           } else {
             fsize += pkt->data.frame.sz;
 
             if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) {
               off_t currpos = ftello(stream->file);
               fseeko(stream->file, ivf_header_pos, SEEK_SET);
-              write_ivf_frame_size(stream->file, fsize);
+              ivf_write_frame_size(stream->file, fsize);
               fseeko(stream->file, currpos, SEEK_SET);
             }
           }
@@ -1939,8 +1756,8 @@
   vpx_image_t            raw;
   int                    frame_avail, got_data;
 
-  struct input_state       input = {0};
-  struct global_config     global;
+  struct VpxInputContext  input = {0};
+  struct global_config    global;
   struct stream_state     *streams = NULL;
   char                   **argv, **argi;
   uint64_t                 cx_time = 0;
@@ -1953,8 +1770,8 @@
     usage_exit();
 
   /* Setup default input stream settings */
-  input.framerate.num = 30;
-  input.framerate.den = 1;
+  input.framerate.numerator = 30;
+  input.framerate.denominator = 1;
   input.use_i420 = 1;
   input.only_i420 = 1;
 
@@ -1986,9 +1803,9 @@
       die("Error: Unrecognized option %s\n", *argi);
 
   /* Handle non-option arguments */
-  input.fn = argv[0];
+  input.filename = argv[0];
 
-  if (!input.fn)
+  if (!input.filename)
     usage_exit();
 
 #if CONFIG_NON420
@@ -2008,20 +1825,20 @@
     /* If the input file doesn't specify its w/h (raw files), try to get
      * the data from the first stream's configuration.
      */
-    if (!input.w || !input.h)
+    if (!input.width || !input.height)
       FOREACH_STREAM( {
       if (stream->config.cfg.g_w && stream->config.cfg.g_h) {
-        input.w = stream->config.cfg.g_w;
-        input.h = stream->config.cfg.g_h;
+        input.width = stream->config.cfg.g_w;
+        input.height = stream->config.cfg.g_h;
         break;
       }
     });
 
     /* Update stream configurations from the input file's parameters */
-    if (!input.w || !input.h)
+    if (!input.width || !input.height)
       fatal("Specify stream dimensions with --width (-w) "
             " and --height (-h)");
-    FOREACH_STREAM(set_stream_dimensions(stream, input.w, input.h));
+    FOREACH_STREAM(set_stream_dimensions(stream, input.width, input.height));
     FOREACH_STREAM(validate_stream_config(stream));
 
     /* Ensure that --passes and --pass are consistent. If --pass is set and
@@ -2037,8 +1854,10 @@
     /* Use the frame rate from the file only if none was specified
      * on the command-line.
      */
-    if (!global.have_framerate)
-      global.framerate = input.framerate;
+    if (!global.have_framerate) {
+      global.framerate.num = input.framerate.numerator;
+      global.framerate.den = input.framerate.denominator;
+    }
 
     FOREACH_STREAM(set_default_kf_interval(stream, &global));
 
@@ -2056,7 +1875,7 @@
         vpx_img_alloc(&raw,
                       input.use_i420 ? VPX_IMG_FMT_I420
                       : VPX_IMG_FMT_YV12,
-                      input.w, input.h, 32);
+                      input.width, input.height, 32);
 
       FOREACH_STREAM(init_rate_histogram(&stream->rate_hist,
                                          &stream->config.cfg,

diff --git a/webmdec.c b/webmdec.c
new file mode 100644
index 0000000..4bf7c7e
--- /dev/null
+++ b/webmdec.c

@@ -0,0 +1,193 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./webmdec.h"
+
+#include <stdarg.h>
+
+#include "nestegg/include/nestegg/nestegg.h"
+
+/* nestegg read callback: fills `buffer` with `length` bytes from the FILE*
+ * handed over as `userdata`.
+ *
+ * Returns 1 when the full amount was read, 0 when the read came up short
+ * because of end-of-file, and -1 when the stream reports an error.
+ */
+static int nestegg_read_cb(void *buffer, size_t length, void *userdata) {
+  FILE *const stream = userdata;
+  const size_t got = fread(buffer, 1, length, stream);
+
+  if (got >= length)
+    return 1;
+
+  /* Short read: the error indicator is checked before end-of-file. */
+  if (ferror(stream))
+    return -1;
+
+  return feof(stream) ? 0 : 1;
+}
+
+/* nestegg seek callback: repositions the FILE* handed over as `userdata`.
+ *
+ * `whence` arrives as one of nestegg's NESTEGG_SEEK_* constants and is
+ * translated to the matching stdio SEEK_* value before calling fseek().
+ * Returns 0 on success and -1 on failure.
+ */
+static int nestegg_seek_cb(int64_t offset, int whence, void *userdata) {
+  const long pos = (long)offset;
+
+  /* fseek() takes a long. Refuse offsets that do not survive the conversion
+   * instead of silently seeking to a truncated position.
+   */
+  if ((int64_t)pos != offset)
+    return -1;
+
+  switch (whence) {
+    case NESTEGG_SEEK_SET:
+      whence = SEEK_SET;
+      break;
+    case NESTEGG_SEEK_CUR:
+      whence = SEEK_CUR;
+      break;
+    case NESTEGG_SEEK_END:
+      whence = SEEK_END;
+      break;
+    default:
+      /* Not a nestegg origin; do not hand an untranslated value to fseek(). */
+      return -1;
+  }
+
+  return fseek(userdata, pos, whence) ? -1 : 0;
+}
+
+/* nestegg tell callback: reports the current position of the FILE* handed
+ * over as `userdata`, exactly as ftell() returns it.
+ */
+static int64_t nestegg_tell_cb(void *userdata) {
+  FILE *const stream = userdata;
+  const long position = ftell(stream);
+
+  return position;
+}
+
+/* nestegg log callback: prints the printf-style message to stderr, followed
+ * by a newline. `context` and `severity` are not consulted.
+ */
+static void nestegg_log_cb(nestegg *context,
+                           unsigned int severity,
+                           char const *format, ...) {
+  va_list args;
+
+  va_start(args, format);
+  vfprintf(stderr, format, args);
+  va_end(args);
+
+  fprintf(stderr, "\n");
+}
+
+/* Probes vpx_ctx->file for a WebM container holding a VP8 or VP9 video track.
+ *
+ * On success returns 1 with webm_ctx->nestegg_ctx and webm_ctx->video_track
+ * set and vpx_ctx->fourcc, width and height filled in; the framerate fields
+ * are zeroed here (webm_guess_framerate() can fill them in afterwards).
+ *
+ * On failure returns 0 with the nestegg context released,
+ * webm_ctx->nestegg_ctx set to NULL and the file rewound. Calls fatal() when
+ * the selected track is neither VP8 nor VP9.
+ */
+int file_is_webm(struct WebmInputContext *webm_ctx,
+                 struct VpxInputContext *vpx_ctx) {
+  uint32_t i, n;
+  int track_type = -1;
+  int codec_id;
+
+  nestegg_io io = {nestegg_read_cb, nestegg_seek_cb, nestegg_tell_cb, 0};
+  nestegg_video_params params;
+
+  io.userdata = vpx_ctx->file;
+  if (nestegg_init(&webm_ctx->nestegg_ctx, io, NULL)) {
+    /* Init failed, so there is no context for the cleanup below to destroy. */
+    webm_ctx->nestegg_ctx = NULL;
+    goto fail;
+  }
+
+  if (nestegg_track_count(webm_ctx->nestegg_ctx, &n))
+    goto fail;
+
+  /* Pick the first video track. */
+  for (i = 0; i < n; i++) {
+    track_type = nestegg_track_type(webm_ctx->nestegg_ctx, i);
+
+    if (track_type == NESTEGG_TRACK_VIDEO)
+      break;
+    else if (track_type < 0)
+      goto fail;
+  }
+
+  /* NOTE(review): when no video track exists, i == n here; presumably nestegg
+   * rejects the out-of-range track and the fatal() below fires - confirm.
+   */
+  codec_id = nestegg_track_codec_id(webm_ctx->nestegg_ctx, i);
+  if (codec_id == NESTEGG_CODEC_VP8) {
+    vpx_ctx->fourcc = VP8_FOURCC_MASK;
+  } else if (codec_id == NESTEGG_CODEC_VP9) {
+    vpx_ctx->fourcc = VP9_FOURCC_MASK;
+  } else {
+    fatal("Not VPx video, quitting.\n");
+  }
+
+  webm_ctx->video_track = i;
+
+  if (nestegg_track_video_params(webm_ctx->nestegg_ctx, i, &params))
+    goto fail;
+
+  vpx_ctx->framerate.denominator = 0;
+  vpx_ctx->framerate.numerator = 0;
+  vpx_ctx->width = params.width;
+  vpx_ctx->height = params.height;
+
+  return 1;
+
+ fail:
+  /* Release the parser instead of just dropping the pointer, matching the
+   * failure path of webm_guess_framerate().
+   */
+  if (webm_ctx->nestegg_ctx) {
+    nestegg_destroy(webm_ctx->nestegg_ctx);
+    webm_ctx->nestegg_ctx = NULL;
+  }
+  rewind(vpx_ctx->file);
+
+  return 0;
+}
+
+/* Hands out the next chunk of video data from the WebM stream.
+ *
+ * When every chunk of the current packet has been consumed, the packet is
+ * freed and packets are read until one belonging to webm_ctx->video_track is
+ * found. On success returns 0 with *buffer and *bytes_in_buffer describing
+ * the chunk as reported by nestegg_packet_data(). Returns 1 when no further
+ * packet can be read or a nestegg call fails.
+ *
+ * buffer_size is accepted for interface compatibility and is not used.
+ */
+int webm_read_frame(struct WebmInputContext *webm_ctx,
+                    uint8_t **buffer,
+                    size_t *bytes_in_buffer,
+                    size_t *buffer_size) {
+  (void)buffer_size;
+
+  if (webm_ctx->chunk >= webm_ctx->chunks) {
+    uint32_t track;
+
+    do {
+      /* End of this packet, get another. */
+      if (webm_ctx->pkt) {
+        nestegg_free_packet(webm_ctx->pkt);
+        /* Clear the pointer so that a failed read below cannot lead to the
+         * same packet being freed again on the next call.
+         */
+        webm_ctx->pkt = NULL;
+      }
+
+      if (nestegg_read_packet(webm_ctx->nestegg_ctx, &webm_ctx->pkt) <= 0 ||
+          nestegg_packet_track(webm_ctx->pkt, &track)) {
+        return 1;
+      }
+    } while (track != webm_ctx->video_track);
+
+    if (nestegg_packet_count(webm_ctx->pkt, &webm_ctx->chunks))
+      return 1;
+
+    webm_ctx->chunk = 0;
+  }
+
+  if (nestegg_packet_data(webm_ctx->pkt, webm_ctx->chunk,
+                          buffer, bytes_in_buffer)) {
+    return 1;
+  }
+
+  webm_ctx->chunk++;
+  return 0;
+}
+
+/* Estimates the stream's framerate from the timestamps of its first video
+ * packets and stores it in vpx_ctx->framerate.
+ *
+ * Returns 0 on success. That includes the cases where the stream cannot be
+ * seeked or too little timing information was found; the framerate is then
+ * set to 30/1 and a warning is printed. Returns 1 when seeking back to the
+ * start fails after sampling; the nestegg context is then destroyed,
+ * webm_ctx->nestegg_ctx is set to NULL and the file is rewound.
+ */
+int webm_guess_framerate(struct WebmInputContext *webm_ctx,
+                         struct VpxInputContext *vpx_ctx) {
+  uint32_t i;
+  uint64_t tstamp = 0;
+
+  /* Check to see if we can seek before we parse any data. */
+  if (nestegg_track_seek(webm_ctx->nestegg_ctx, webm_ctx->video_track, 0)) {
+    warn("Failed to guess framerate (no Cues), set to 30fps.\n");
+    vpx_ctx->framerate.numerator = 30;
+    vpx_ctx->framerate.denominator  = 1;
+    return 0;
+  }
+
+  /* Guess the framerate. Read up to 1 second, or 50 video packets,
+   * whichever comes first.
+   */
+  for (i = 0; tstamp < 1000000000 && i < 50;) {
+    nestegg_packet *pkt;
+    uint32_t track;
+
+    if (nestegg_read_packet(webm_ctx->nestegg_ctx, &pkt) <= 0)
+      break;
+
+    /* Only look at `track` when the lookup succeeded; otherwise it would be
+     * read uninitialized.
+     */
+    if (!nestegg_packet_track(pkt, &track) &&
+        track == webm_ctx->video_track) {
+      nestegg_packet_tstamp(pkt, &tstamp);
+      ++i;
+    }
+
+    nestegg_free_packet(pkt);
+  }
+
+  /* Return to the start of the track so decoding begins at the first frame. */
+  if (nestegg_track_seek(webm_ctx->nestegg_ctx, webm_ctx->video_track, 0))
+    goto fail;
+
+  /* At least two video packets and a timestamp of a microsecond or more are
+   * needed: with fewer, (i - 1) wraps around or is zero, and a smaller
+   * timestamp would produce a zero denominator.
+   */
+  if (i < 2 || tstamp / 1000 == 0) {
+    warn("Failed to guess framerate (too few timestamps), set to 30fps.\n");
+    vpx_ctx->framerate.numerator = 30;
+    vpx_ctx->framerate.denominator = 1;
+    return 0;
+  }
+
+  /* (i - 1) frame intervals over tstamp nanoseconds, expressed as
+   * frames * 1e6 over microseconds.
+   */
+  vpx_ctx->framerate.numerator = (i - 1) * 1000000;
+  vpx_ctx->framerate.denominator = (int)(tstamp / 1000);
+  return 0;
+
+ fail:
+  nestegg_destroy(webm_ctx->nestegg_ctx);
+  webm_ctx->nestegg_ctx = NULL;
+  rewind(vpx_ctx->file);
+  return 1;
+}
+
+/* Destroys the nestegg context owned by webm_ctx, if there is one.
+ * A NULL webm_ctx or a NULL context is ignored.
+ */
+void webm_free(struct WebmInputContext *webm_ctx) {
+  if (webm_ctx == NULL || webm_ctx->nestegg_ctx == NULL)
+    return;
+
+  nestegg_destroy(webm_ctx->nestegg_ctx);
+}

diff --git a/webmdec.h b/webmdec.h
new file mode 100644
index 0000000..002fbe6
--- /dev/null
+++ b/webmdec.h

@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef WEBMDEC_H_
+#define WEBMDEC_H_
+
+#include "./tools_common.h"
+
+struct nestegg;
+struct nestegg_packet;
+struct VpxInputContext;
+
+/* State of the WebM demuxer, shared by the functions declared below. */
+struct WebmInputContext {
+  /* Index of the next chunk of `pkt` that webm_read_frame() will return. */
+  uint32_t chunk;
+  /* Number of chunks in `pkt`, as reported by nestegg_packet_count(). */
+  uint32_t chunks;
+  /* Track number of the video track selected by file_is_webm(). */
+  uint32_t video_track;
+  /* nestegg parser context; set to NULL by the failure paths in webmdec.c. */
+  struct nestegg *nestegg_ctx;
+  /* Packet most recently read by webm_read_frame(). */
+  struct nestegg_packet *pkt;
+};
+
+/* Probes vpx_ctx->file for a WebM stream with a VP8/VP9 video track.
+ * Returns 1 and fills in the fourcc, width and height of vpx_ctx (and the
+ * parser state of webm_ctx) on success; returns 0 with the file rewound
+ * otherwise.
+ */
+int file_is_webm(struct WebmInputContext *webm_ctx,
+                 struct VpxInputContext *vpx_ctx);
+
+/* Fetches the next chunk of video data. Returns 0 with *buffer and
+ * *bytes_in_buffer set on success, and 1 when no further packet can be read
+ * or a nestegg call fails. buffer_size is not used by the implementation.
+ */
+int webm_read_frame(struct WebmInputContext *webm_ctx,
+                    uint8_t **buffer,
+                    size_t *bytes_in_buffer,
+                    size_t *buffer_size);
+
+/* Estimates the framerate from the first video packets and stores it in
+ * vpx_ctx->framerate, falling back to 30/1 when the stream cannot be seeked.
+ * Returns 0 on success and 1 on failure.
+ */
+int webm_guess_framerate(struct WebmInputContext *webm_ctx,
+                         struct VpxInputContext *vpx_ctx);
+
+/* Destroys the nestegg context held by webm_ctx, if any. */
+void webm_free(struct WebmInputContext *webm_ctx);
+
+#endif  // WEBMDEC_H_
commit	b00057c88a6c98472fd78a9957453ec012a08336	[log] [tgz]
author	Guillaume Martres <smarter3@gmail.com>	Wed Nov 20 08:13:28 2013 -0800
committer	Gerrit Code Review <gerrit@gerrit.golo.chromium.org>	Wed Nov 20 08:13:28 2013 -0800
tree	e9bf24d5a6c06d1a674b81923c5a8064f6c23c7c
parent	d486427cf1bb1b64ab1e0a746e5e2b7c5bb3a0e2 [diff]
parent	17084657e6da5b02ab1e492b237e52f2bd38ade3 [diff]