Merge branch 'master' into nextgenv2

Change-Id: Ia3c0f2103fd997613d9f16156795028f89f63265
diff --git a/README b/README
index 29072b9..460ad73 100644
--- a/README
+++ b/README
@@ -79,9 +79,6 @@
     x86-os2-gcc
     x86-solaris-gcc
     x86-win32-gcc
-    x86-win32-vs7
-    x86-win32-vs8
-    x86-win32-vs9
     x86-win32-vs10
     x86-win32-vs11
     x86-win32-vs12
@@ -98,8 +95,6 @@
     x86_64-linux-icc
     x86_64-solaris-gcc
     x86_64-win64-gcc
-    x86_64-win64-vs8
-    x86_64-win64-vs9
     x86_64-win64-vs10
     x86_64-win64-vs11
     x86_64-win64-vs12
diff --git a/build/make/Makefile b/build/make/Makefile
index 3e8c024..dfb7e4b 100644
--- a/build/make/Makefile
+++ b/build/make/Makefile
@@ -418,7 +418,6 @@
     DIST-SRCS-yes            += build/make/gen_asm_deps.sh
     DIST-SRCS-yes            += build/make/Makefile
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_def.sh
-    DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_proj.sh
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_sln.sh
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/gen_msvs_vcxproj.sh
     DIST-SRCS-$(CONFIG_MSVS)  += build/make/msvs_common.sh
diff --git a/build/make/gen_msvs_proj.sh b/build/make/gen_msvs_proj.sh
deleted file mode 100755
index 2b91fbf..0000000
--- a/build/make/gen_msvs_proj.sh
+++ /dev/null
@@ -1,490 +0,0 @@
-#!/bin/bash
-##
-##  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-##
-##  Use of this source code is governed by a BSD-style license
-##  that can be found in the LICENSE file in the root of the source
-##  tree. An additional intellectual property rights grant can be found
-##  in the file PATENTS.  All contributing project authors may
-##  be found in the AUTHORS file in the root of the source tree.
-##
-
-self=$0
-self_basename=${self##*/}
-self_dirname=$(dirname "$0")
-
-. "$self_dirname/msvs_common.sh"|| exit 127
-
-show_help() {
-    cat <<EOF
-Usage: ${self_basename} --name=projname [options] file1 [file2 ...]
-
-This script generates a Visual Studio project file from a list of source
-code files.
-
-Options:
-    --help                      Print this message
-    --exe                       Generate a project for building an Application
-    --lib                       Generate a project for creating a static library
-    --dll                       Generate a project for creating a dll
-    --static-crt                Use the static C runtime (/MT)
-    --target=isa-os-cc          Target specifier (required)
-    --out=filename              Write output to a file [stdout]
-    --name=project_name         Name of the project (required)
-    --proj-guid=GUID            GUID to use for the project
-    --module-def=filename       File containing export definitions (for DLLs)
-    --ver=version               Version (7,8,9) of visual studio to generate for
-    --src-path-bare=dir         Path to root of source tree
-    -Ipath/to/include           Additional include directories
-    -DFLAG[=value]              Preprocessor macros to define
-    -Lpath/to/lib               Additional library search paths
-    -llibname                   Library to link against
-EOF
-    exit 1
-}
-
-generate_filter() {
-    local var=$1
-    local name=$2
-    local pats=$3
-    local file_list_sz
-    local i
-    local f
-    local saveIFS="$IFS"
-    local pack
-    echo "generating filter '$name' from ${#file_list[@]} files" >&2
-    IFS=*
-
-    open_tag Filter \
-        Name=$name \
-        Filter=$pats \
-        UniqueIdentifier=`generate_uuid` \
-
-    file_list_sz=${#file_list[@]}
-    for i in ${!file_list[@]}; do
-        f=${file_list[i]}
-        for pat in ${pats//;/$IFS}; do
-            if [ "${f##*.}" == "$pat" ]; then
-                unset file_list[i]
-
-                objf=$(echo ${f%.*}.obj \
-                       | sed -e "s,$src_path_bare,," \
-                             -e 's/^[\./]\+//g' -e 's,[:/ ],_,g')
-                open_tag File RelativePath="$f"
-
-                if [ "$pat" == "asm" ] && $asm_use_custom_step; then
-                    # Avoid object file name collisions, i.e. vpx_config.c and
-                    # vpx_config.asm produce the same object file without
-                    # this additional suffix.
-                    objf=${objf%.obj}_asm.obj
-                    for plat in "${platforms[@]}"; do
-                        for cfg in Debug Release; do
-                            open_tag FileConfiguration \
-                                Name="${cfg}|${plat}" \
-
-                            tag Tool \
-                                Name="VCCustomBuildTool" \
-                                Description="Assembling \$(InputFileName)" \
-                                CommandLine="$(eval echo \$asm_${cfg}_cmdline) -o \$(IntDir)\\$objf" \
-                                Outputs="\$(IntDir)\\$objf" \
-
-                            close_tag FileConfiguration
-                        done
-                    done
-                fi
-                if [ "$pat" == "c" ] || \
-                   [ "$pat" == "cc" ] || [ "$pat" == "cpp" ]; then
-                    for plat in "${platforms[@]}"; do
-                        for cfg in Debug Release; do
-                            open_tag FileConfiguration \
-                                Name="${cfg}|${plat}" \
-
-                            tag Tool \
-                                Name="VCCLCompilerTool" \
-                                ObjectFile="\$(IntDir)\\$objf" \
-
-                            close_tag FileConfiguration
-                        done
-                    done
-                fi
-                close_tag File
-
-                break
-            fi
-        done
-    done
-
-    close_tag Filter
-    IFS="$saveIFS"
-}
-
-# Process command line
-unset target
-for opt in "$@"; do
-    optval="${opt#*=}"
-    case "$opt" in
-        --help|-h) show_help
-        ;;
-        --target=*) target="${optval}"
-        ;;
-        --out=*) outfile="$optval"
-        ;;
-        --name=*) name="${optval}"
-        ;;
-        --proj-guid=*) guid="${optval}"
-        ;;
-        --module-def=*) link_opts="${link_opts} ModuleDefinitionFile=${optval}"
-        ;;
-        --exe) proj_kind="exe"
-        ;;
-        --dll) proj_kind="dll"
-        ;;
-        --lib) proj_kind="lib"
-        ;;
-        --src-path-bare=*)
-            src_path_bare=$(fix_path "$optval")
-            src_path_bare=${src_path_bare%/}
-        ;;
-        --static-crt) use_static_runtime=true
-        ;;
-        --ver=*)
-            vs_ver="$optval"
-            case "$optval" in
-                [789])
-                ;;
-                *) die Unrecognized Visual Studio Version in $opt
-                ;;
-            esac
-        ;;
-        -I*)
-            opt=${opt##-I}
-            opt=$(fix_path "$opt")
-            opt="${opt%/}"
-            incs="${incs}${incs:+;}&quot;${opt}&quot;"
-            yasmincs="${yasmincs} -I&quot;${opt}&quot;"
-        ;;
-        -D*) defines="${defines}${defines:+;}${opt##-D}"
-        ;;
-        -L*) # fudge . to $(OutDir)
-            if [ "${opt##-L}" == "." ]; then
-                libdirs="${libdirs}${libdirs:+;}&quot;\$(OutDir)&quot;"
-            else
-                 # Also try directories for this platform/configuration
-                 opt=${opt##-L}
-                 opt=$(fix_path "$opt")
-                 libdirs="${libdirs}${libdirs:+;}&quot;${opt}&quot;"
-                 libdirs="${libdirs}${libdirs:+;}&quot;${opt}/\$(PlatformName)/\$(ConfigurationName)&quot;"
-                 libdirs="${libdirs}${libdirs:+;}&quot;${opt}/\$(PlatformName)&quot;"
-            fi
-        ;;
-        -l*) libs="${libs}${libs:+ }${opt##-l}.lib"
-        ;;
-        -*) die_unknown $opt
-        ;;
-        *)
-            # The paths in file_list are fixed outside of the loop.
-            file_list[${#file_list[@]}]="$opt"
-            case "$opt" in
-                 *.asm) uses_asm=true
-                 ;;
-            esac
-        ;;
-    esac
-done
-
-# Make one call to fix_path for file_list to improve performance.
-fix_file_list file_list
-
-outfile=${outfile:-/dev/stdout}
-guid=${guid:-`generate_uuid`}
-asm_use_custom_step=false
-uses_asm=${uses_asm:-false}
-case "${vs_ver:-8}" in
-    7) vs_ver_id="7.10"
-       asm_use_custom_step=$uses_asm
-       warn_64bit='Detect64BitPortabilityProblems=true'
-    ;;
-    8) vs_ver_id="8.00"
-       asm_use_custom_step=$uses_asm
-       warn_64bit='Detect64BitPortabilityProblems=true'
-    ;;
-    9) vs_ver_id="9.00"
-       asm_use_custom_step=$uses_asm
-       warn_64bit='Detect64BitPortabilityProblems=false'
-    ;;
-esac
-
-[ -n "$name" ] || die "Project name (--name) must be specified!"
-[ -n "$target" ] || die "Target (--target) must be specified!"
-
-if ${use_static_runtime:-false}; then
-    release_runtime=0
-    debug_runtime=1
-    lib_sfx=mt
-else
-    release_runtime=2
-    debug_runtime=3
-    lib_sfx=md
-fi
-
-# Calculate debug lib names: If a lib ends in ${lib_sfx}.lib, then rename
-# it to ${lib_sfx}d.lib. This precludes linking to release libs from a
-# debug exe, so this may need to be refactored later.
-for lib in ${libs}; do
-    if [ "$lib" != "${lib%${lib_sfx}.lib}" ]; then
-        lib=${lib%.lib}d.lib
-    fi
-    debug_libs="${debug_libs}${debug_libs:+ }${lib}"
-done
-
-
-# List Keyword for this target
-case "$target" in
-    x86*) keyword="ManagedCProj"
-    ;;
-    *) die "Unsupported target $target!"
-esac
-
-# List of all platforms supported for this target
-case "$target" in
-    x86_64*)
-        platforms[0]="x64"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win64 ${yasmincs} &quot;\$(InputPath)&quot;"
-        asm_Release_cmdline="yasm -Xvc -f win64 ${yasmincs} &quot;\$(InputPath)&quot;"
-    ;;
-    x86*)
-        platforms[0]="Win32"
-        asm_Debug_cmdline="yasm -Xvc -g cv8 -f win32 ${yasmincs} &quot;\$(InputPath)&quot;"
-        asm_Release_cmdline="yasm -Xvc -f win32 ${yasmincs} &quot;\$(InputPath)&quot;"
-    ;;
-    *) die "Unsupported target $target!"
-    ;;
-esac
-
-generate_vcproj() {
-    case "$proj_kind" in
-        exe) vs_ConfigurationType=1
-        ;;
-        dll) vs_ConfigurationType=2
-        ;;
-        *)   vs_ConfigurationType=4
-        ;;
-    esac
-
-    echo "<?xml version=\"1.0\" encoding=\"Windows-1252\"?>"
-    open_tag VisualStudioProject \
-        ProjectType="Visual C++" \
-        Version="${vs_ver_id}" \
-        Name="${name}" \
-        ProjectGUID="{${guid}}" \
-        RootNamespace="${name}" \
-        Keyword="${keyword}" \
-
-    open_tag Platforms
-    for plat in "${platforms[@]}"; do
-        tag Platform Name="$plat"
-    done
-    close_tag Platforms
-
-    open_tag Configurations
-    for plat in "${platforms[@]}"; do
-        plat_no_ws=`echo $plat | sed 's/[^A-Za-z0-9_]/_/g'`
-        open_tag Configuration \
-            Name="Debug|$plat" \
-            OutputDirectory="\$(SolutionDir)$plat_no_ws/\$(ConfigurationName)" \
-            IntermediateDirectory="$plat_no_ws/\$(ConfigurationName)/${name}" \
-            ConfigurationType="$vs_ConfigurationType" \
-            CharacterSet="1" \
-
-        case "$target" in
-            x86*)
-                case "$name" in
-                    vpx)
-                        tag Tool \
-                            Name="VCCLCompilerTool" \
-                            Optimization="0" \
-                            AdditionalIncludeDirectories="$incs" \
-                            PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
-                            RuntimeLibrary="$debug_runtime" \
-                            UsePrecompiledHeader="0" \
-                            WarningLevel="3" \
-                            DebugInformationFormat="2" \
-                            $warn_64bit \
-
-                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="true"
-                    ;;
-                    *)
-                        tag Tool \
-                            Name="VCCLCompilerTool" \
-                            Optimization="0" \
-                            AdditionalIncludeDirectories="$incs" \
-                            PreprocessorDefinitions="WIN32;_DEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
-                            RuntimeLibrary="$debug_runtime" \
-                            UsePrecompiledHeader="0" \
-                            WarningLevel="3" \
-                            DebugInformationFormat="2" \
-                            $warn_64bit \
-
-                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs" Debug="true"
-                    ;;
-                esac
-            ;;
-        esac
-
-        case "$proj_kind" in
-            exe)
-                case "$target" in
-                    x86*)
-                        case "$name" in
-                            *)
-                                tag Tool \
-                                    Name="VCLinkerTool" \
-                                    AdditionalDependencies="$debug_libs \$(NoInherit)" \
-                                    AdditionalLibraryDirectories="$libdirs" \
-                                    GenerateDebugInformation="true" \
-                                    ProgramDatabaseFile="\$(OutDir)/${name}.pdb" \
-                            ;;
-                        esac
-                    ;;
-                 esac
-            ;;
-            lib)
-                case "$target" in
-                    x86*)
-                        tag Tool \
-                            Name="VCLibrarianTool" \
-                            OutputFile="\$(OutDir)/${name}${lib_sfx}d.lib" \
-
-                    ;;
-                esac
-            ;;
-            dll)
-                tag Tool \
-                    Name="VCLinkerTool" \
-                    AdditionalDependencies="\$(NoInherit)" \
-                    LinkIncremental="2" \
-                    GenerateDebugInformation="true" \
-                    AssemblyDebug="1" \
-                    TargetMachine="1" \
-                    $link_opts \
-
-            ;;
-        esac
-
-        close_tag Configuration
-
-        open_tag Configuration \
-            Name="Release|$plat" \
-            OutputDirectory="\$(SolutionDir)$plat_no_ws/\$(ConfigurationName)" \
-            IntermediateDirectory="$plat_no_ws/\$(ConfigurationName)/${name}" \
-            ConfigurationType="$vs_ConfigurationType" \
-            CharacterSet="1" \
-            WholeProgramOptimization="0" \
-
-        case "$target" in
-            x86*)
-                case "$name" in
-                    vpx)
-                        tag Tool \
-                            Name="VCCLCompilerTool" \
-                            Optimization="2" \
-                            FavorSizeorSpeed="1" \
-                            AdditionalIncludeDirectories="$incs" \
-                            PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
-                            RuntimeLibrary="$release_runtime" \
-                            UsePrecompiledHeader="0" \
-                            WarningLevel="3" \
-                            DebugInformationFormat="0" \
-                            $warn_64bit \
-
-                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs"
-                    ;;
-                    *)
-                        tag Tool \
-                            Name="VCCLCompilerTool" \
-                            AdditionalIncludeDirectories="$incs" \
-                            Optimization="2" \
-                            FavorSizeorSpeed="1" \
-                            PreprocessorDefinitions="WIN32;NDEBUG;_CRT_SECURE_NO_WARNINGS;_CRT_SECURE_NO_DEPRECATE;$defines" \
-                            RuntimeLibrary="$release_runtime" \
-                            UsePrecompiledHeader="0" \
-                            WarningLevel="3" \
-                            DebugInformationFormat="0" \
-                            $warn_64bit \
-
-                        $uses_asm && tag Tool Name="YASM"  IncludePaths="$incs"
-                    ;;
-                esac
-            ;;
-        esac
-
-        case "$proj_kind" in
-            exe)
-                case "$target" in
-                    x86*)
-                        case "$name" in
-                            *)
-                                tag Tool \
-                                    Name="VCLinkerTool" \
-                                    AdditionalDependencies="$libs \$(NoInherit)" \
-                                    AdditionalLibraryDirectories="$libdirs" \
-
-                            ;;
-                        esac
-                    ;;
-                 esac
-            ;;
-            lib)
-                case "$target" in
-                    x86*)
-                        tag Tool \
-                            Name="VCLibrarianTool" \
-                            OutputFile="\$(OutDir)/${name}${lib_sfx}.lib" \
-
-                    ;;
-                esac
-            ;;
-            dll) # note differences to debug version: LinkIncremental, AssemblyDebug
-                tag Tool \
-                    Name="VCLinkerTool" \
-                    AdditionalDependencies="\$(NoInherit)" \
-                    LinkIncremental="1" \
-                    GenerateDebugInformation="true" \
-                    TargetMachine="1" \
-                    $link_opts \
-
-            ;;
-        esac
-
-        close_tag Configuration
-    done
-    close_tag Configurations
-
-    open_tag Files
-    generate_filter srcs   "Source Files"   "c;cc;cpp;def;odl;idl;hpj;bat;asm;asmx"
-    generate_filter hdrs   "Header Files"   "h;hm;inl;inc;xsd"
-    generate_filter resrcs "Resource Files" "rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav"
-    generate_filter resrcs "Build Files"    "mk"
-    close_tag Files
-
-    tag       Globals
-    close_tag VisualStudioProject
-
-    # This must be done from within the {} subshell
-    echo "Ignored files list (${#file_list[@]} items) is:" >&2
-    for f in "${file_list[@]}"; do
-        echo "    $f" >&2
-    done
-}
-
-generate_vcproj |
-    sed  -e '/"/s;\([^ "]\)/;\1\\;g' > ${outfile}
-
-exit
-<!--
-TODO: Add any files not captured by filters.
-                <File
-                        RelativePath=".\ReadMe.txt"
-                        >
-                </File>
--->
diff --git a/build/make/gen_msvs_sln.sh b/build/make/gen_msvs_sln.sh
index 664b404..7d5f468 100755
--- a/build/make/gen_msvs_sln.sh
+++ b/build/make/gen_msvs_sln.sh
@@ -55,16 +55,11 @@
 
 parse_project() {
     local file=$1
-    if [ "$sfx" = "vcproj" ]; then
-        local name=`grep Name "$file" | awk 'BEGIN {FS="\""}{if (NR==1) print $2}'`
-        local guid=`grep ProjectGUID "$file" | awk 'BEGIN {FS="\""}{if (NR==1) print $2}'`
-    else
-        local name=`grep RootNamespace "$file" | sed 's,.*<.*>\(.*\)</.*>.*,\1,'`
-        local guid=`grep ProjectGuid "$file" | sed 's,.*<.*>\(.*\)</.*>.*,\1,'`
-    fi
+    local name=`grep RootNamespace "$file" | sed 's,.*<.*>\(.*\)</.*>.*,\1,'`
+    local guid=`grep ProjectGuid "$file" | sed 's,.*<.*>\(.*\)</.*>.*,\1,'`
 
    # save the project GUID to a variable, normalizing to the basename of the
-    # vcproj file without the extension
+    # vcxproj file without the extension
     local var
     var=${file##*/}
     var=${var%%.${sfx}}
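    # e.g. an illustrative input "build/x64/vpx.vcxproj" yields var "vpx"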
@@ -72,13 +67,8 @@
     eval "${var}_name=$name"
     eval "${var}_guid=$guid"
 
-    if [ "$sfx" = "vcproj" ]; then
-        cur_config_list=`grep -A1 '<Configuration' $file |
-            grep Name | cut -d\" -f2`
-    else
-        cur_config_list=`grep -B1 'Label="Configuration"' $file |
-            grep Condition | cut -d\' -f4`
-    fi
+    cur_config_list=`grep -B1 'Label="Configuration"' $file |
+        grep Condition | cut -d\' -f4`
     new_config_list=$(for i in $config_list $cur_config_list; do
         echo $i
     done | sort | uniq)
@@ -103,25 +93,6 @@
     eval "${var}_guid=$guid"
 
     echo "Project(\"{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}\") = \"$name\", \"$file\", \"$guid\""
-    indent_push
-
-    eval "local deps=\"\${${var}_deps}\""
-    if [ -n "$deps" ] && [ "$sfx" = "vcproj" ]; then
-        echo "${indent}ProjectSection(ProjectDependencies) = postProject"
-        indent_push
-
-        for dep in $deps; do
-            eval "local dep_guid=\${${dep}_guid}"
-            [ -z "${dep_guid}" ] && die "Unknown GUID for $dep (dependency of $var)"
-            echo "${indent}$dep_guid = $dep_guid"
-        done
-
-        indent_pop
-        echo "${indent}EndProjectSection"
-
-    fi
-
-    indent_pop
     echo "EndProject"
 }
 
@@ -191,11 +162,7 @@
     IFS=$'\r'$'\n'
     local TAB=$'\t'
     cat <<EOF
-ifeq (\$(CONFIG_VS_VERSION),7)
-MSBUILD_TOOL := devenv.com
-else
 MSBUILD_TOOL := msbuild.exe
-endif
 found_devenv := \$(shell which \$(MSBUILD_TOOL) >/dev/null 2>&1 && echo yes)
 .nodevenv.once:
 ${TAB}@echo "  * \$(MSBUILD_TOOL) not found in path."
@@ -204,7 +171,7 @@
 ${TAB}@echo "  * Visual Studio IDE. To allow make to build them automatically,"
 ${TAB}@echo "  * add the Common7/IDE directory of your Visual Studio"
 ${TAB}@echo "  * installation to your path, eg:"
-${TAB}@echo "  *   C:\Program Files\Microsoft Visual Studio 8\Common7\IDE"
+${TAB}@echo "  *   C:\Program Files\Microsoft Visual Studio 10.0\Common7\IDE"
 ${TAB}@echo "  * "
 ${TAB}@touch \$@
 CLEAN-OBJS += \$(if \$(found_devenv),,.nodevenv.once)
@@ -221,16 +188,9 @@
 ${TAB}rm -rf "$platform"/"$config"
 .PHONY: $nows_sln_config
 ifneq (\$(found_devenv),)
-  ifeq (\$(CONFIG_VS_VERSION),7)
-$nows_sln_config: $outfile
-${TAB}\$(MSBUILD_TOOL) $outfile -build "$config"
-
-  else
 $nows_sln_config: $outfile
 ${TAB}\$(MSBUILD_TOOL) $outfile -m -t:Build \\
 ${TAB}${TAB}-p:Configuration="$config" -p:Platform="$platform"
-
-  endif
 else
 $nows_sln_config: $outfile .nodevenv.once
 ${TAB}@echo "  * Skipping build of $sln_config (\$(MSBUILD_TOOL) not in path)."
@@ -255,23 +215,12 @@
     ;;
     --ver=*) vs_ver="$optval"
              case $optval in
-             [789]|10|11|12|14)
+             10|11|12|14)
              ;;
              *) die Unrecognized Visual Studio Version in $opt
              ;;
              esac
     ;;
-    --ver=*) vs_ver="$optval"
-             case $optval in
-             7) sln_vers="8.00"
-                sln_vers_str="Visual Studio .NET 2003"
-             ;;
-             [89])
-             ;;
-             *) die "Unrecognized Visual Studio Version '$optval' in $opt"
-             ;;
-             esac
-    ;;
     --target=*) target="${optval}"
     ;;
     -*) die_unknown $opt
@@ -281,16 +230,7 @@
 done
 outfile=${outfile:-/dev/stdout}
 mkoutfile=${mkoutfile:-/dev/stdout}
-case "${vs_ver:-8}" in
-    7) sln_vers="8.00"
-       sln_vers_str="Visual Studio .NET 2003"
-    ;;
-    8) sln_vers="9.00"
-       sln_vers_str="Visual Studio 2005"
-    ;;
-    9) sln_vers="10.00"
-       sln_vers_str="Visual Studio 2008"
-    ;;
+case "${vs_ver:-10}" in
     10) sln_vers="11.00"
        sln_vers_str="Visual Studio 2010"
     ;;
@@ -304,14 +244,7 @@
        sln_vers_str="Visual Studio 2015"
     ;;
 esac
-case "${vs_ver:-8}" in
-    [789])
-    sfx=vcproj
-    ;;
-    10|11|12|14)
-    sfx=vcxproj
-    ;;
-esac
+sfx=vcxproj
 
 for f in "${file_list[@]}"; do
     parse_project $f
diff --git a/configure b/configure
index 73b0e0a..ae9bb5d 100755
--- a/configure
+++ b/configure
@@ -132,9 +132,6 @@
 all_platforms="${all_platforms} x86-os2-gcc"
 all_platforms="${all_platforms} x86-solaris-gcc"
 all_platforms="${all_platforms} x86-win32-gcc"
-all_platforms="${all_platforms} x86-win32-vs7"
-all_platforms="${all_platforms} x86-win32-vs8"
-all_platforms="${all_platforms} x86-win32-vs9"
 all_platforms="${all_platforms} x86-win32-vs10"
 all_platforms="${all_platforms} x86-win32-vs11"
 all_platforms="${all_platforms} x86-win32-vs12"
@@ -152,8 +149,6 @@
 all_platforms="${all_platforms} x86_64-linux-icc"
 all_platforms="${all_platforms} x86_64-solaris-gcc"
 all_platforms="${all_platforms} x86_64-win64-gcc"
-all_platforms="${all_platforms} x86_64-win64-vs8"
-all_platforms="${all_platforms} x86_64-win64-vs9"
 all_platforms="${all_platforms} x86_64-win64-vs10"
 all_platforms="${all_platforms} x86_64-win64-vs11"
 all_platforms="${all_platforms} x86_64-win64-vs12"
@@ -270,7 +265,26 @@
     spatial_svc
     fp_mb_stats
     emulate_hardware
-    misc_fixes
+    var_tx
+    ref_mv
+    dual_filter
+    ext_tx
+    ext_intra
+    ext_inter
+    ext_interp
+    ext_refs
+    global_motion
+    new_quant
+    supertx
+    ans
+    loop_restoration
+    ext_partition
+    ext_partition_types
+    ext_tile
+    obmc
+    warped_motion
+    entropy
+    bidir_pred
 "
 CONFIG_LIST="
     dependency_tracking
@@ -653,17 +667,9 @@
         vs*) enable_feature msvs
              enable_feature solution
              vs_version=${tgt_cc##vs}
-             case $vs_version in
-             [789])
-                 VCPROJ_SFX=vcproj
-                 gen_vcproj_cmd=${source_path}/build/make/gen_msvs_proj.sh
-                 ;;
-             10|11|12|14)
-                 VCPROJ_SFX=vcxproj
-                 gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh
-                 enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror"
-                 ;;
-             esac
+             VCPROJ_SFX=vcxproj
+             gen_vcproj_cmd=${source_path}/build/make/gen_msvs_vcxproj.sh
+             enabled werror && gen_vcproj_cmd="${gen_vcproj_cmd} --enable-werror"
              all_targets="${all_targets} solution"
              INLINE="__forceinline"
         ;;
diff --git a/examples.mk b/examples.mk
index c891a54..593c8ddd 100644
--- a/examples.mk
+++ b/examples.mk
@@ -215,6 +215,18 @@
 vp8cx_set_ref.GUID                  = C5E31F7F-96F6-48BD-BD3E-10EBF6E8057A
 vp8cx_set_ref.DESCRIPTION           = VP8 set encoder reference frame
 
+# If vp9 is enabled, $(CONFIG_VP9_ENCODER) is yes; otherwise it is blank.
+# Likewise for vp10 and $(CONFIG_VP10_ENCODER).
+ifneq (,$(filter yes,$(CONFIG_VP9_ENCODER) $(CONFIG_VP10_ENCODER)))
+ifeq ($(CONFIG_DECODERS),yes)
+EXAMPLES-yes                       += vpx_cx_set_ref.c
+vpx_cx_set_ref.SRCS                += ivfenc.h ivfenc.c
+vpx_cx_set_ref.SRCS                += tools_common.h tools_common.c
+vpx_cx_set_ref.SRCS                += video_common.h
+vpx_cx_set_ref.SRCS                += video_writer.h video_writer.c
+vpx_cx_set_ref.GUID                 = 65D7F14A-2EE6-4293-B958-AB5107A03B55
+vpx_cx_set_ref.DESCRIPTION          = VP9/VP10 set encoder reference frame
+endif
+endif
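+# (How the guard above works: $(filter yes,$(A) $(B)) keeps only the words
+# equal to "yes", so the ifneq (,...) test is true when at least one of the
+# two encoders is enabled, e.g. $(filter yes,yes no) -> "yes", non-empty.)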
 
 ifeq ($(CONFIG_MULTI_RES_ENCODING),yes)
 ifeq ($(CONFIG_LIBYUV),yes)
diff --git a/examples/vpx_cx_set_ref.c b/examples/vpx_cx_set_ref.c
new file mode 100644
index 0000000..61d81b8
--- /dev/null
+++ b/examples/vpx_cx_set_ref.c
@@ -0,0 +1,455 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+// VP9/VP10 Set Reference Frame
+// ============================
+//
+// This is an example demonstrating how to overwrite the VP9/VP10 encoder's
+// internal reference frame. In the sample we set the 'last' reference frame
+// to the current raw frame. This technique could be used to bounce between
+// two cameras.
+//
+// The decoder must perform the same reference frame update on the same
+// frame, or the video will become corrupt. The 'test_decode' variable is
+// set to 1 in this example to verify that the encoder and decoder results
+// match.
+//
+// Usage
+// -----
+// This example encodes a raw video; the sixth argument specifies the frame
+// number at which to update the reference frame. For example, run
+// examples/vpx_cx_set_ref vp10 352 288 in.yuv out.ivf 4 30
+// The arguments are parsed as:
+// <codec> <width> <height> <infile> <outfile> <frame> <limit(optional)>
+//
+// Extra Variables
+// ---------------
+// This example maintains the frame number passed on the command line
+// in the `update_frame_num` variable.
+//
+//
+// Configuration
+// -------------
+//
+// The reference frame is updated on the frame specified on the command
+// line.
+//
+// Observing The Effects
+// ---------------------
+// The encoder and decoder results should match when the same reference
+// frame update is performed in both the encoder and the decoder. Otherwise,
+// an encoder/decoder mismatch is reported.
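+//
+// A minimal sketch of the core control calls (the names below match the
+// code later in this file; `raw` is the frame copied into the LAST
+// reference slot):
+//
+//     vpx_ref_frame_t ref;
+//     ref.frame_type = VP8_LAST_FRAME;
+//     ref.img = raw;
+//     vpx_codec_control(&ecodec, VP8_SET_REFERENCE, &ref);  // encoder side
+//     vpx_codec_control(&dcodec, VP8_SET_REFERENCE, &ref);  // decoder side,
+//                                                           // same frame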
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_decoder.h"
+#include "vpx/vpx_encoder.h"
+
+#include "./tools_common.h"
+#include "./video_writer.h"
+
+static const char *exec_name;
+
+void usage_exit() {
+  fprintf(stderr, "Usage: %s <codec> <width> <height> <infile> <outfile> "
+          "<frame> <limit(optional)>\n",
+          exec_name);
+  exit(EXIT_FAILURE);
+}
+
+static int compare_img(const vpx_image_t *const img1,
+                       const vpx_image_t *const img2) {
+  const uint32_t l_w = img1->d_w;
+  const uint32_t c_w =
+      (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const uint32_t c_h =
+      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+  uint32_t i;
+  int match = 1;
+
+  match &= (img1->fmt == img2->fmt);
+  match &= (img1->d_w == img2->d_w);
+  match &= (img1->d_h == img2->d_h);
+
+  for (i = 0; i < img1->d_h; ++i)
+    match &= (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
+                     img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
+                     l_w) == 0);
+
+  for (i = 0; i < c_h; ++i)
+    match &= (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
+                     img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
+                     c_w) == 0);
+
+  for (i = 0; i < c_h; ++i)
+    match &= (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
+                     img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
+                     c_w) == 0);
+
+  return match;
+}
+
+#define mmin(a, b)  ((a) < (b) ? (a) : (b))
+static void find_mismatch(const vpx_image_t *const img1,
+                          const vpx_image_t *const img2,
+                          int yloc[4], int uloc[4], int vloc[4]) {
+  const uint32_t bsize = 64;
+  const uint32_t bsizey = bsize >> img1->y_chroma_shift;
+  const uint32_t bsizex = bsize >> img1->x_chroma_shift;
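+  // (bsize is the 64x64 luma search block; bsizey/bsizex scale it down to
+  // chroma resolution.)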
+  const uint32_t c_w =
+      (img1->d_w + img1->x_chroma_shift) >> img1->x_chroma_shift;
+  const uint32_t c_h =
+      (img1->d_h + img1->y_chroma_shift) >> img1->y_chroma_shift;
+  int match = 1;
+  uint32_t i, j;
+  yloc[0] = yloc[1] = yloc[2] = yloc[3] = -1;
+  for (i = 0, match = 1; match && i < img1->d_h; i += bsize) {
+    for (j = 0; match && j < img1->d_w; j += bsize) {
+      int k, l;
+      const int si = mmin(i + bsize, img1->d_h) - i;
+      const int sj = mmin(j + bsize, img1->d_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(img1->planes[VPX_PLANE_Y] +
+                (i + k) * img1->stride[VPX_PLANE_Y] + j + l) !=
+              *(img2->planes[VPX_PLANE_Y] +
+                (i + k) * img2->stride[VPX_PLANE_Y] + j + l)) {
+            yloc[0] = i + k;
+            yloc[1] = j + l;
+            yloc[2] = *(img1->planes[VPX_PLANE_Y] +
+                        (i + k) * img1->stride[VPX_PLANE_Y] + j + l);
+            yloc[3] = *(img2->planes[VPX_PLANE_Y] +
+                        (i + k) * img2->stride[VPX_PLANE_Y] + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  uloc[0] = uloc[1] = uloc[2] = uloc[3] = -1;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h) - i;
+      const int sj = mmin(j + bsizex, c_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(img1->planes[VPX_PLANE_U] +
+                (i + k) * img1->stride[VPX_PLANE_U] + j + l) !=
+              *(img2->planes[VPX_PLANE_U] +
+                (i + k) * img2->stride[VPX_PLANE_U] + j + l)) {
+            uloc[0] = i + k;
+            uloc[1] = j + l;
+            uloc[2] = *(img1->planes[VPX_PLANE_U] +
+                        (i + k) * img1->stride[VPX_PLANE_U] + j + l);
+            uloc[3] = *(img2->planes[VPX_PLANE_U] +
+                        (i + k) * img2->stride[VPX_PLANE_U] + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+  vloc[0] = vloc[1] = vloc[2] = vloc[3] = -1;
+  for (i = 0, match = 1; match && i < c_h; i += bsizey) {
+    for (j = 0; match && j < c_w; j += bsizex) {
+      int k, l;
+      const int si = mmin(i + bsizey, c_h) - i;
+      const int sj = mmin(j + bsizex, c_w) - j;
+      for (k = 0; match && k < si; ++k) {
+        for (l = 0; match && l < sj; ++l) {
+          if (*(img1->planes[VPX_PLANE_V] +
+                (i + k) * img1->stride[VPX_PLANE_V] + j + l) !=
+              *(img2->planes[VPX_PLANE_V] +
+                (i + k) * img2->stride[VPX_PLANE_V] + j + l)) {
+            vloc[0] = i + k;
+            vloc[1] = j + l;
+            vloc[2] = *(img1->planes[VPX_PLANE_V] +
+                        (i + k) * img1->stride[VPX_PLANE_V] + j + l);
+            vloc[3] = *(img2->planes[VPX_PLANE_V] +
+                        (i + k) * img2->stride[VPX_PLANE_V] + j + l);
+            match = 0;
+            break;
+          }
+        }
+      }
+    }
+  }
+}
+
+static void testing_decode(vpx_codec_ctx_t *encoder,
+                           vpx_codec_ctx_t *decoder,
+                           vpx_codec_enc_cfg_t *cfg,
+                           unsigned int frame_out,
+                           int *mismatch_seen) {
+  vpx_image_t enc_img, dec_img;
+  struct vp9_ref_frame ref_enc, ref_dec;
+
+  if (*mismatch_seen)
+    return;
+
+  ref_enc.idx = 0;
+  ref_dec.idx = 0;
+  if (vpx_codec_control(encoder, VP9_GET_REFERENCE, &ref_enc))
+    die_codec(encoder,  "Failed to get encoder reference frame");
+  enc_img = ref_enc.img;
+  if (vpx_codec_control(decoder, VP9_GET_REFERENCE, &ref_dec))
+    die_codec(decoder, "Failed to get decoder reference frame");
+  dec_img = ref_dec.img;
+
+  if (!compare_img(&enc_img, &dec_img)) {
+    int y[4], u[4], v[4];
+
+    *mismatch_seen = 1;
+
+    find_mismatch(&enc_img, &dec_img, y, u, v);
+    printf("Encode/decode mismatch on frame %d at"
+           " Y[%d, %d] {%d/%d},"
+           " U[%d, %d] {%d/%d},"
+           " V[%d, %d] {%d/%d}",
+           frame_out,
+           y[0], y[1], y[2], y[3],
+           u[0], u[1], u[2], u[3],
+           v[0], v[1], v[2], v[3]);
+  }
+
+  vpx_img_free(&enc_img);
+  vpx_img_free(&dec_img);
+}
+
+static int encode_frame(vpx_codec_ctx_t *ecodec,
+                        vpx_codec_enc_cfg_t *cfg,
+                        vpx_image_t *img,
+                        unsigned int frame_in,
+                        VpxVideoWriter *writer,
+                        int test_decode,
+                        vpx_codec_ctx_t *dcodec,
+                        unsigned int *frame_out,
+                        int *mismatch_seen) {
+  int got_pkts = 0;
+  vpx_codec_iter_t iter = NULL;
+  const vpx_codec_cx_pkt_t *pkt = NULL;
+  int got_data;
+  const vpx_codec_err_t res = vpx_codec_encode(ecodec, img, frame_in, 1,
+                                               0, VPX_DL_GOOD_QUALITY);
+  if (res != VPX_CODEC_OK)
+    die_codec(ecodec, "Failed to encode frame");
+
+  got_data = 0;
+
+  while ((pkt = vpx_codec_get_cx_data(ecodec, &iter)) != NULL) {
+    got_pkts = 1;
+
+    if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) {
+      const int keyframe = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) != 0;
+
+      if (!(pkt->data.frame.flags & VPX_FRAME_IS_FRAGMENT)) {
+        *frame_out += 1;
+      }
+
+      if (!vpx_video_writer_write_frame(writer,
+                                        pkt->data.frame.buf,
+                                        pkt->data.frame.sz,
+                                        pkt->data.frame.pts)) {
+        die_codec(ecodec, "Failed to write compressed frame");
+      }
+      printf(keyframe ? "K" : ".");
+      fflush(stdout);
+      got_data = 1;
+
+      // Decode 1 frame.
+      if (test_decode) {
+        if (vpx_codec_decode(dcodec, pkt->data.frame.buf,
+                             (unsigned int)pkt->data.frame.sz, NULL, 0))
+          die_codec(dcodec, "Failed to decode frame.");
+      }
+    }
+  }
+
+  // Mismatch checking
+  if (got_data && test_decode) {
+    testing_decode(ecodec, dcodec, cfg, *frame_out, mismatch_seen);
+  }
+
+  return got_pkts;
+}
+
+int main(int argc, char **argv) {
+  FILE *infile = NULL;
+  // Encoder
+  vpx_codec_ctx_t ecodec = {0};
+  vpx_codec_enc_cfg_t cfg = {0};
+  unsigned int frame_in = 0;
+  vpx_image_t raw;
+  vpx_codec_err_t res;
+  VpxVideoInfo info = {0};
+  VpxVideoWriter *writer = NULL;
+  const VpxInterface *encoder = NULL;
+
+  // Test encoder/decoder mismatch.
+  int test_decode = 1;
+  // Decoder
+  vpx_codec_ctx_t dcodec;
+  unsigned int frame_out = 0;
+
+  // The frame number to set reference frame on
+  int update_frame_num = 0;
+  int mismatch_seen = 0;
+
+  const int fps = 30;
+  const int bitrate = 500;
+
+  const char *codec_arg = NULL;
+  const char *width_arg = NULL;
+  const char *height_arg = NULL;
+  const char *infile_arg = NULL;
+  const char *outfile_arg = NULL;
+  int limit = 0;
+  exec_name = argv[0];
+
+  if (argc < 7)
+    die("Invalid number of arguments");
+
+  codec_arg = argv[1];
+  width_arg = argv[2];
+  height_arg = argv[3];
+  infile_arg = argv[4];
+  outfile_arg = argv[5];
+
+  encoder = get_vpx_encoder_by_name(codec_arg);
+  if (!encoder)
+    die("Unsupported codec.");
+
+  update_frame_num = atoi(argv[6]);
+  // In VP9, the reference buffers (cm->buffer_pool->frame_bufs[i].buf) are
+  // allocated while calling vpx_codec_encode(); thus, setting the reference
+  // for the 1st frame isn't supported.
+  if (update_frame_num <= 1)
+    die("Couldn't parse frame number '%s'\n", argv[6]);
+
+  if (argc > 7) {
+    limit = atoi(argv[7]);
+    if (update_frame_num > limit)
+      die("Update frame number couldn't larger than limit\n");
+  }
+
+  info.codec_fourcc = encoder->fourcc;
+  info.frame_width = strtol(width_arg, NULL, 0);
+  info.frame_height = strtol(height_arg, NULL, 0);
+  info.time_base.numerator = 1;
+  info.time_base.denominator = fps;
+
+  if (info.frame_width <= 0 ||
+      info.frame_height <= 0 ||
+      (info.frame_width % 2) != 0 ||
+      (info.frame_height % 2) != 0) {
+    die("Invalid frame size: %dx%d", info.frame_width, info.frame_height);
+  }
+
+  if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, info.frame_width,
+                     info.frame_height, 1)) {
+    die("Failed to allocate image.");
+  }
+
+  printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
+
+  res = vpx_codec_enc_config_default(encoder->codec_interface(), &cfg, 0);
+  if (res)
+    die_codec(&ecodec, "Failed to get default codec config.");
+
+  cfg.g_w = info.frame_width;
+  cfg.g_h = info.frame_height;
+  cfg.g_timebase.num = info.time_base.numerator;
+  cfg.g_timebase.den = info.time_base.denominator;
+  cfg.rc_target_bitrate = bitrate;
+  cfg.g_lag_in_frames = 3;
+
+  writer = vpx_video_writer_open(outfile_arg, kContainerIVF, &info);
+  if (!writer)
+    die("Failed to open %s for writing.", outfile_arg);
+
+  if (!(infile = fopen(infile_arg, "rb")))
+    die("Failed to open %s for reading.", infile_arg);
+
+  if (vpx_codec_enc_init(&ecodec, encoder->codec_interface(), &cfg, 0))
+    die_codec(&ecodec, "Failed to initialize encoder");
+
+  // Disable alt_ref.
+  if (vpx_codec_control(&ecodec, VP8E_SET_ENABLEAUTOALTREF, 0))
+    die_codec(&ecodec, "Failed to set enable auto alt ref");
+
+  if (test_decode) {
+    const VpxInterface *decoder = get_vpx_decoder_by_name(codec_arg);
+    if (vpx_codec_dec_init(&dcodec, decoder->codec_interface(), NULL, 0))
+      die_codec(&dcodec, "Failed to initialize decoder.");
+  }
+
+  // Encode frames.
+  while (vpx_img_read(&raw, infile)) {
+    if (limit && frame_in >= limit)
+      break;
+    if (update_frame_num > 1 && frame_out + 1 == update_frame_num) {
+      vpx_ref_frame_t ref;
+      ref.frame_type = VP8_LAST_FRAME;
+      ref.img = raw;
+      // Set reference frame in encoder.
+      if (vpx_codec_control(&ecodec, VP8_SET_REFERENCE, &ref))
+        die_codec(&ecodec, "Failed to set reference frame");
+      printf(" <SET_REF>");
+
+      // If set_reference in decoder is commented out, the enc/dec mismatch
+      // would be seen.
+      if (test_decode) {
+        if (vpx_codec_control(&dcodec, VP8_SET_REFERENCE, &ref))
+          die_codec(&dcodec, "Failed to set reference frame");
+      }
+    }
+
+    encode_frame(&ecodec, &cfg, &raw, frame_in, writer, test_decode,
+                 &dcodec, &frame_out, &mismatch_seen);
+    frame_in++;
+    if (mismatch_seen)
+      break;
+  }
+
+  // Flush encoder.
+  if (!mismatch_seen)
+    while (encode_frame(&ecodec, &cfg, NULL, frame_in, writer, test_decode,
+                        &dcodec, &frame_out, &mismatch_seen)) {}
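+  // (Passing a NULL image drains any remaining packets from the encoder.)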
+
+  printf("\n");
+  fclose(infile);
+  printf("Processed %d frames.\n", frame_out);
+
+  if (test_decode) {
+    if (!mismatch_seen)
+      printf("Encoder/decoder results are matching.\n");
+    else
+      printf("Encoder/decoder results are NOT matching.\n");
+  }
+
+  if (test_decode)
+    if (vpx_codec_destroy(&dcodec))
+      die_codec(&dcodec, "Failed to destroy decoder");
+
+  vpx_img_free(&raw);
+  if (vpx_codec_destroy(&ecodec))
+    die_codec(&ecodec, "Failed to destroy encoder.");
+
+  vpx_video_writer_close(writer);
+
+  return EXIT_SUCCESS;
+}
diff --git a/test/active_map_refresh_test.cc b/test/active_map_refresh_test.cc
index c945661..aeff301 100644
--- a/test/active_map_refresh_test.cc
+++ b/test/active_map_refresh_test.cc
@@ -113,8 +113,13 @@
   cfg_.rc_end_usage = VPX_CBR;
   cfg_.kf_max_dist = 90000;
 
-  ::libvpx_test::Y4mVideoSource video("desktop_credits.y4m", 0, 30);
-  ::libvpx_test::Y4mVideoSource video_holder("desktop_credits.y4m", 0, 30);
+#if CONFIG_VP10
+  const int nframes = codec_ == &libvpx_test::kVP10 ? 10 : 30;
+#else
+  const int nframes = 30;
+#endif  // CONFIG_VP10
+  ::libvpx_test::Y4mVideoSource video("desktop_credits.y4m", 0, nframes);
+  ::libvpx_test::Y4mVideoSource video_holder("desktop_credits.y4m", 0, nframes);
   video_holder.Begin();
   y4m_holder_ = &video_holder;
 
@@ -124,4 +129,9 @@
 VP9_INSTANTIATE_TEST_CASE(ActiveMapRefreshTest,
                           ::testing::Values(::libvpx_test::kRealTime),
                           ::testing::Range(5, 6));
+#if CONFIG_VP10
+VP10_INSTANTIATE_TEST_CASE(ActiveMapRefreshTest,
+                           ::testing::Values(::libvpx_test::kRealTime),
+                           ::testing::Range(5, 6));
+#endif  // CONFIG_VP10
 }  // namespace
diff --git a/test/active_map_test.cc b/test/active_map_test.cc
index dc3de72..095a820 100644
--- a/test/active_map_test.cc
+++ b/test/active_map_test.cc
@@ -65,25 +65,43 @@
     }
   }
 
+  void DoTest() {
+    // Validate that this clip, whose width is not a multiple of 64, encodes.
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_target_bitrate = 400;
+    cfg_.rc_resize_allowed = 0;
+    cfg_.g_pass = VPX_RC_ONE_PASS;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.kf_max_dist = 90000;
+    ::libvpx_test::I420VideoSource video("hantro_odd.yuv", kWidth, kHeight, 30,
+                                         1, 0, 20);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
   int cpu_used_;
 };
 
 TEST_P(ActiveMapTest, Test) {
-  // Validate that this non multiple of 64 wide clip encodes
-  cfg_.g_lag_in_frames = 0;
-  cfg_.rc_target_bitrate = 400;
-  cfg_.rc_resize_allowed = 0;
-  cfg_.g_pass = VPX_RC_ONE_PASS;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.kf_max_dist = 90000;
+  DoTest();
+}
 
-  ::libvpx_test::I420VideoSource video("hantro_odd.yuv", kWidth, kHeight, 30,
-                                       1, 0, 20);
+class ActiveMapTestLarge : public ActiveMapTest {};
 
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+TEST_P(ActiveMapTestLarge, Test) {
+  DoTest();
 }
 
 VP9_INSTANTIATE_TEST_CASE(ActiveMapTest,
                           ::testing::Values(::libvpx_test::kRealTime),
                           ::testing::Range(0, 9));
+
+VP10_INSTANTIATE_TEST_CASE(ActiveMapTestLarge,
+                           ::testing::Values(::libvpx_test::kRealTime),
+                           ::testing::Range(0, 5));
+
+VP10_INSTANTIATE_TEST_CASE(ActiveMapTest,
+                           ::testing::Values(::libvpx_test::kRealTime),
+                           ::testing::Range(5, 9));
+
 }  // namespace
diff --git a/test/aq_segment_test.cc b/test/aq_segment_test.cc
index 1b9c943..08ebd2b 100644
--- a/test/aq_segment_test.cc
+++ b/test/aq_segment_test.cc
@@ -38,6 +38,22 @@
     }
   }
 
+  void DoTest(int aq_mode) {
+    aq_mode_ = aq_mode;
+    cfg_.kf_max_dist = 12;
+    cfg_.rc_min_quantizer = 8;
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_lag_in_frames = 6;
+    cfg_.rc_buf_initial_sz = 500;
+    cfg_.rc_buf_optimal_sz = 500;
+    cfg_.rc_buf_sz = 1000;
+    cfg_.rc_target_bitrate = 300;
+    ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv",
+                                         352, 288, 30, 1, 0, 15);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
   int set_cpu_used_;
   int aq_mode_;
 };
@@ -45,65 +61,46 @@
 // Validate that this AQ segmentation mode (AQ=1, variance_aq)
 // encodes and decodes without a mismatch.
 TEST_P(AqSegmentTest, TestNoMisMatchAQ1) {
-  cfg_.rc_min_quantizer = 8;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_target_bitrate = 300;
-
-  aq_mode_ = 1;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                        30, 1, 0, 100);
-
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  DoTest(1);
 }
 
 // Validate that this AQ segmentation mode (AQ=2, complexity_aq)
 // encodes and decodes without a mismatch.
 TEST_P(AqSegmentTest, TestNoMisMatchAQ2) {
-  cfg_.rc_min_quantizer = 8;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_target_bitrate = 300;
-
-  aq_mode_ = 2;
-
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                        30, 1, 0, 100);
-
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  DoTest(2);
 }
 
 // Validate that this AQ segmentation mode (AQ=3, cyclic_refresh_aq)
 // encodes and decodes without a mismatch.
 TEST_P(AqSegmentTest, TestNoMisMatchAQ3) {
-  cfg_.rc_min_quantizer = 8;
-  cfg_.rc_max_quantizer = 56;
-  cfg_.rc_end_usage = VPX_CBR;
-  cfg_.g_lag_in_frames = 0;
-  cfg_.rc_buf_initial_sz = 500;
-  cfg_.rc_buf_optimal_sz = 500;
-  cfg_.rc_buf_sz = 1000;
-  cfg_.rc_target_bitrate = 300;
+  DoTest(3);
+}
 
-  aq_mode_ = 3;
+class AqSegmentTestLarge : public AqSegmentTest {};
 
-  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
-                                        30, 1, 0, 100);
+TEST_P(AqSegmentTestLarge, TestNoMisMatchAQ1) {
+  DoTest(1);
+}
 
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+TEST_P(AqSegmentTestLarge, TestNoMisMatchAQ2) {
+  DoTest(2);
+}
+
+TEST_P(AqSegmentTestLarge, TestNoMisMatchAQ3) {
+  DoTest(3);
 }
 
 VP9_INSTANTIATE_TEST_CASE(AqSegmentTest,
                           ::testing::Values(::libvpx_test::kRealTime,
                                             ::libvpx_test::kOnePassGood),
                           ::testing::Range(3, 9));
+
+VP10_INSTANTIATE_TEST_CASE(AqSegmentTest,
+                           ::testing::Values(::libvpx_test::kRealTime,
+                                             ::libvpx_test::kOnePassGood),
+                           ::testing::Range(5, 9));
+VP10_INSTANTIATE_TEST_CASE(AqSegmentTestLarge,
+                           ::testing::Values(::libvpx_test::kRealTime,
+                                             ::libvpx_test::kOnePassGood),
+                           ::testing::Range(3, 5));
 }  // namespace
diff --git a/test/blend_a64_mask_1d_test.cc b/test/blend_a64_mask_1d_test.cc
new file mode 100644
index 0000000..ba63afa
--- /dev/null
+++ b/test/blend_a64_mask_1d_test.cc
@@ -0,0 +1,367 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+
+#include "test/function_equivalence_test.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "./vp10_rtcd.h"
+
+#include "vp10/common/enums.h"
+
+#include "vpx_dsp/blend.h"
+
+using libvpx_test::FunctionEquivalenceTest;
+
+namespace {
+
+template<typename F, typename T>
+class BlendA64Mask1DTest : public FunctionEquivalenceTest<F> {
+ public:
+  static const int kIterations = 10000;
+  static const int kMaxWidth = MAX_SB_SIZE * 5;  // * 5 to cover longer strides
+  static const int kMaxHeight = MAX_SB_SIZE;
+  static const int kBufSize = kMaxWidth * kMaxHeight;
+  static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
+  static const int kMaxMaskSize = kMaxMaskWidth;
+
+  virtual ~BlendA64Mask1DTest() {}
+
+  virtual void Execute(const T *p_src0, const T *p_src1) = 0;
+
+  void Common() {
+    w_ = 1 << this->rng_(MAX_SB_SIZE_LOG2 + 1);
+    h_ = 1 << this->rng_(MAX_SB_SIZE_LOG2 + 1);
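+    // (w_ and h_ are random powers of two in [1, MAX_SB_SIZE].)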
+
+    dst_offset_ = this->rng_(33);
+    dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
+    src0_offset_ = this->rng_(33);
+    src0_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
+    src1_offset_ = this->rng_(33);
+    src1_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
+    T *p_src0;
+    T *p_src1;
+
+    switch (this->rng_(3)) {
+      case 0:   // Separate sources
+        p_src0 = src0_;
+        p_src1 = src1_;
+        break;
+      case 1:   // src0 == dst
+        p_src0 = dst_tst_;
+        src0_stride_ = dst_stride_;
+        src0_offset_ = dst_offset_;
+        p_src1 = src1_;
+        break;
+      case 2:   // src1 == dst
+        p_src0 = src0_;
+        p_src1 = dst_tst_;
+        src1_stride_ = dst_stride_;
+        src1_offset_ = dst_offset_;
+        break;
+      default:
+        FAIL();
+    }
+
+    Execute(p_src0, p_src1);
+
+    for (int r = 0 ; r < h_ ; ++r) {
+      for (int c = 0 ; c < w_ ; ++c) {
+        ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c],
+                  dst_tst_[dst_offset_ + r * dst_stride_ + c]);
+      }
+    }
+  }
+
+  T dst_ref_[kBufSize];
+  T dst_tst_[kBufSize];
+  size_t dst_stride_;
+  size_t dst_offset_;
+
+  T src0_[kBufSize];
+  size_t src0_stride_;
+  size_t src0_offset_;
+
+  T src1_[kBufSize];
+  size_t src1_stride_;
+  size_t src1_offset_;
+
+  uint8_t mask_[kMaxMaskSize];
+
+  int w_;
+  int h_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride,
+                    const uint8_t *src0, uint32_t src0_stride,
+                    const uint8_t *src1, uint32_t src1_stride,
+                    const uint8_t *mask, int h, int w);
+typedef libvpx_test::FuncParam<F8B> TestFuncs;
+
+class BlendA64Mask1DTest8B : public BlendA64Mask1DTest<F8B, uint8_t> {
+ protected:
+  void Execute(const uint8_t *p_src0, const uint8_t *p_src1) {
+    params_.ref_func(dst_ref_ + dst_offset_, dst_stride_,
+                     p_src0 + src0_offset_, src0_stride_,
+                     p_src1 + src1_offset_, src1_stride_, mask_, h_, w_);
+    ASM_REGISTER_STATE_CHECK(
+        params_.tst_func(dst_tst_ + dst_offset_, dst_stride_,
+                         p_src0 + src0_offset_, src0_stride_,
+                         p_src1 + src1_offset_, src1_stride_, mask_, h_, w_));
+  }
+};
+
+TEST_P(BlendA64Mask1DTest8B, RandomValues) {
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_.Rand8();
+      dst_tst_[i] = rng_.Rand8();
+
+      src0_[i] = rng_.Rand8();
+      src1_[i] = rng_.Rand8();
+    }
+
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
+
+    Common();
+  }
+}
+
+TEST_P(BlendA64Mask1DTest8B, ExtremeValues) {
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_(2) + 254;
+      dst_tst_[i] = rng_(2) + 254;
+      src0_[i] = rng_(2) + 254;
+      src1_[i] = rng_(2) + 254;
+    }
+
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
+
+    Common();
+  }
+}
+
+static void blend_a64_hmask_ref(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
+                [BlendA64Mask1DTest8B::kMaxMaskSize];
+
+  for (int row = 0 ; row < h ; ++row)
+    for (int col = 0 ; col < w ; ++col)
+      mask2d[row][col] = mask[col];
+
+  vpx_blend_a64_mask_c(dst, dst_stride,
+                       src0, src0_stride,
+                       src1, src1_stride,
+                       &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize,
+                       h, w, 0, 0);
+}
+
+static void blend_a64_vmask_ref(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  uint8_t mask2d[BlendA64Mask1DTest8B::kMaxMaskSize]
+                [BlendA64Mask1DTest8B::kMaxMaskSize];
+
+  for (int row = 0 ; row < h ; ++row)
+    for (int col = 0 ; col < w ; ++col)
+      mask2d[row][col] = mask[row];
+
+  vpx_blend_a64_mask_c(dst, dst_stride,
+                       src0, src0_stride,
+                       src1, src1_stride,
+                       &mask2d[0][0], BlendA64Mask1DTest8B::kMaxMaskSize,
+                       h, w, 0, 0);
+}
+
+INSTANTIATE_TEST_CASE_P(
+  C, BlendA64Mask1DTest8B,
+  ::testing::Values(TestFuncs(blend_a64_hmask_ref, vpx_blend_a64_hmask_c),
+                    TestFuncs(blend_a64_vmask_ref, vpx_blend_a64_vmask_c)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+  SSE4_1, BlendA64Mask1DTest8B,
+  ::testing::Values(
+      TestFuncs(blend_a64_hmask_ref, vpx_blend_a64_hmask_sse4_1),
+      TestFuncs(blend_a64_vmask_ref, vpx_blend_a64_vmask_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride,
+                     const uint8_t *src0, uint32_t src0_stride,
+                     const uint8_t *src1, uint32_t src1_stride,
+                     const uint8_t *mask, int h, int w, int bd);
+typedef libvpx_test::FuncParam<FHBD> TestFuncsHBD;
+
+class BlendA64Mask1DTestHBD : public BlendA64Mask1DTest<FHBD, uint16_t> {
+ protected:
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
+    params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+                     CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+                     CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+                     mask_, h_, w_, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(
+        CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
+        CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+        CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+        mask_, h_, w_, bit_depth_));
+  }
+
+  int bit_depth_;
+};
+
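+// Each iteration picks one of the supported bit depths (8, 10 or 12) at
+// random so that all three code paths are exercised.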
+TEST_P(BlendA64Mask1DTestHBD, RandomValues) {
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+      case 0:
+        bit_depth_ = 8;
+        break;
+      case 1:
+        bit_depth_ = 10;
+        break;
+      default:
+        bit_depth_ = 12;
+        break;
+    }
+
+    const int hi = 1 << bit_depth_;
+
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_(hi);
+      dst_tst_[i] = rng_(hi);
+      src0_[i] = rng_(hi);
+      src1_[i] = rng_(hi);
+    }
+
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
+
+    Common();
+  }
+}
+
+TEST_P(BlendA64Mask1DTestHBD, ExtremeValues) {
+  for (int iter = 0 ; iter < 1000 && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+      case 0:
+        bit_depth_ = 8;
+        break;
+      case 1:
+        bit_depth_ = 10;
+        break;
+      default:
+        bit_depth_ = 12;
+        break;
+    }
+
+    const int hi = 1 << bit_depth_;
+    const int lo = hi - 2;
+
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_(hi - lo) + lo;
+      dst_tst_[i] = rng_(hi - lo) + lo;
+      src0_[i] = rng_(hi - lo) + lo;
+      src1_[i] = rng_(hi - lo) + lo;
+    }
+
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
+
+    Common();
+  }
+}
+
+static void highbd_blend_a64_hmask_ref(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, int bd) {
+  uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize]
+                [BlendA64Mask1DTestHBD::kMaxMaskSize];
+
+  for (int row = 0 ; row < h ; ++row)
+    for (int col = 0 ; col < w ; ++col)
+      mask2d[row][col] = mask[col];
+
+  vpx_highbd_blend_a64_mask_c(dst, dst_stride,
+                              src0, src0_stride,
+                              src1, src1_stride,
+                              &mask2d[0][0],
+                              BlendA64Mask1DTestHBD::kMaxMaskSize,
+                              h, w, 0, 0, bd);
+}
+
+static void highbd_blend_a64_vmask_ref(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, int bd) {
+  uint8_t mask2d[BlendA64Mask1DTestHBD::kMaxMaskSize]
+                [BlendA64Mask1DTestHBD::kMaxMaskSize];
+
+  for (int row = 0 ; row < h ; ++row)
+    for (int col = 0 ; col < w ; ++col)
+      mask2d[row][col] = mask[row];
+
+  vpx_highbd_blend_a64_mask_c(dst, dst_stride,
+                              src0, src0_stride,
+                              src1, src1_stride,
+                              &mask2d[0][0],
+                              BlendA64Mask1DTestHBD::kMaxMaskSize,
+                              h, w, 0, 0, bd);
+}
+
+INSTANTIATE_TEST_CASE_P(
+  C, BlendA64Mask1DTestHBD,
+  ::testing::Values(
+      TestFuncsHBD(highbd_blend_a64_hmask_ref, vpx_highbd_blend_a64_hmask_c),
+      TestFuncsHBD(highbd_blend_a64_vmask_ref, vpx_highbd_blend_a64_vmask_c)));
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+  SSE4_1, BlendA64Mask1DTestHBD,
+  ::testing::Values(
+      TestFuncsHBD(highbd_blend_a64_hmask_ref,
+                   vpx_highbd_blend_a64_hmask_sse4_1),
+      TestFuncsHBD(highbd_blend_a64_vmask_ref,
+                   vpx_highbd_blend_a64_vmask_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/test/blend_a64_mask_test.cc b/test/blend_a64_mask_test.cc
new file mode 100644
index 0000000..f6c09bb
--- /dev/null
+++ b/test/blend_a64_mask_test.cc
@@ -0,0 +1,288 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/register_state_check.h"
+
+#include "test/function_equivalence_test.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#include "./vp10_rtcd.h"
+
+#include "vp10/common/enums.h"
+
+#include "vpx_dsp/blend.h"
+
+using libvpx_test::FunctionEquivalenceTest;
+
+namespace {
+
+template<typename F, typename T>
+class BlendA64MaskTest : public FunctionEquivalenceTest<F> {
+ protected:
+  static const int kIterations = 10000;
+  static const int kMaxWidth = MAX_SB_SIZE * 5;  // * 5 to cover longer strides
+  static const int kMaxHeight = MAX_SB_SIZE;
+  static const int kBufSize = kMaxWidth * kMaxHeight;
+  static const int kMaxMaskWidth = 2 * MAX_SB_SIZE;
+  static const int kMaxMaskSize = kMaxMaskWidth * kMaxMaskWidth;
+
+  virtual ~BlendA64MaskTest() {}
+
+  virtual void Execute(const T *p_src0, const T *p_src1) = 0;
+
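+  // Randomize the block size, strides and buffer offsets, optionally alias
+  // one source with the destination, then run both implementations and
+  // require bit-exact output.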
+  void Common() {
+    w_ = 1 << this->rng_(MAX_SB_SIZE_LOG2 + 1);
+    h_ = 1 << this->rng_(MAX_SB_SIZE_LOG2 + 1);
+
+    subx_ = this->rng_(2);
+    suby_ = this->rng_(2);
+
+    dst_offset_ = this->rng_(33);
+    dst_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
+    src0_offset_ = this->rng_(33);
+    src0_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
+    src1_offset_ = this->rng_(33);
+    src1_stride_ = this->rng_(kMaxWidth + 1 - w_) + w_;
+
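+    // When horizontally subsampled, the mask covers twice the block width.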
+    mask_stride_ = this->rng_(kMaxWidth + 1 - w_ * (subx_ ? 2 : 1)) +
+                   w_ * (subx_ ? 2 : 1);
+
+    T *p_src0;
+    T *p_src1;
+
+    switch (this->rng_(3)) {
+      case 0:   // Separate sources
+        p_src0 = src0_;
+        p_src1 = src1_;
+        break;
+      case 1:   // src0 == dst
+        p_src0 = dst_tst_;
+        src0_stride_ = dst_stride_;
+        src0_offset_ = dst_offset_;
+        p_src1 = src1_;
+        break;
+      case 2:   // src1 == dst
+        p_src0 = src0_;
+        p_src1 = dst_tst_;
+        src1_stride_ = dst_stride_;
+        src1_offset_ = dst_offset_;
+        break;
+      default:
+        FAIL();
+    }
+
+    Execute(p_src0, p_src1);
+
+    for (int r = 0 ; r < h_ ; ++r) {
+      for (int c = 0 ; c < w_ ; ++c) {
+        ASSERT_EQ(dst_ref_[dst_offset_ + r * dst_stride_ + c],
+                  dst_tst_[dst_offset_ + r * dst_stride_ + c]);
+      }
+    }
+  }
+
+  T dst_ref_[kBufSize];
+  T dst_tst_[kBufSize];
+  size_t dst_stride_;
+  size_t dst_offset_;
+
+  T src0_[kBufSize];
+  size_t src0_stride_;
+  size_t src0_offset_;
+
+  T src1_[kBufSize];
+  size_t src1_stride_;
+  size_t src1_offset_;
+
+  uint8_t mask_[kMaxMaskSize];
+  size_t mask_stride_;
+
+  int w_;
+  int h_;
+
+  bool suby_;
+  bool subx_;
+};
+
+//////////////////////////////////////////////////////////////////////////////
+// 8 bit version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*F8B)(uint8_t *dst, uint32_t dst_stride,
+                    const uint8_t *src0, uint32_t src0_stride,
+                    const uint8_t *src1, uint32_t src1_stride,
+                    const uint8_t *mask, uint32_t mask_stride,
+                    int h, int w, int suby, int subx);
+typedef libvpx_test::FuncParam<F8B> TestFuncs;
+
+class BlendA64MaskTest8B : public BlendA64MaskTest<F8B, uint8_t> {
+ protected:
+  void Execute(const uint8_t *p_src0, const uint8_t *p_src1) {
+    params_.ref_func(dst_ref_ + dst_offset_, dst_stride_,
+                     p_src0 + src0_offset_, src0_stride_,
+                     p_src1 + src1_offset_, src1_stride_,
+                     mask_, kMaxMaskWidth, h_, w_, suby_, subx_);
+    ASM_REGISTER_STATE_CHECK(
+        params_.tst_func(dst_tst_ + dst_offset_, dst_stride_,
+                         p_src0 + src0_offset_, src0_stride_,
+                         p_src1 + src1_offset_, src1_stride_,
+                         mask_, kMaxMaskWidth, h_, w_, suby_, subx_));
+  }
+};
+
+TEST_P(BlendA64MaskTest8B, RandomValues) {
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_.Rand8();
+      dst_tst_[i] = rng_.Rand8();
+
+      src0_[i] = rng_.Rand8();
+      src1_[i] = rng_.Rand8();
+    }
+
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
+
+    Common();
+  }
+}
+
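+// As in the 1-D test, use values at the extremes of the sample and alpha
+// ranges to stress rounding and saturation.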
+TEST_P(BlendA64MaskTest8B, ExtremeValues) {
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_(2) + 254;
+      dst_tst_[i] = rng_(2) + 254;
+      src0_[i] = rng_(2) + 254;
+      src1_[i] = rng_(2) + 254;
+    }
+
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+  SSE4_1_C_COMPARE, BlendA64MaskTest8B,
+  ::testing::Values(
+      TestFuncs(vpx_blend_a64_mask_c, vpx_blend_a64_mask_sse4_1)));
+#endif  // HAVE_SSE4_1
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// High bit-depth version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FHBD)(uint8_t *dst, uint32_t dst_stride,
+                     const uint8_t *src0, uint32_t src0_stride,
+                     const uint8_t *src1, uint32_t src1_stride,
+                     const uint8_t *mask, uint32_t mask_stride,
+                     int h, int w, int suby, int subx, int bd);
+typedef libvpx_test::FuncParam<FHBD> TestFuncsHBD;
+
+class BlendA64MaskTestHBD : public BlendA64MaskTest<FHBD, uint16_t> {
+ protected:
+  void Execute(const uint16_t *p_src0, const uint16_t *p_src1) {
+    params_.ref_func(CONVERT_TO_BYTEPTR(dst_ref_ + dst_offset_), dst_stride_,
+                     CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+                     CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+                     mask_, kMaxMaskWidth, h_, w_, suby_, subx_, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(
+        CONVERT_TO_BYTEPTR(dst_tst_ + dst_offset_), dst_stride_,
+        CONVERT_TO_BYTEPTR(p_src0 + src0_offset_), src0_stride_,
+        CONVERT_TO_BYTEPTR(p_src1 + src1_offset_), src1_stride_,
+        mask_, kMaxMaskWidth, h_, w_, suby_, subx_, bit_depth_));
+  }
+
+  int bit_depth_;
+};
+
+TEST_P(BlendA64MaskTestHBD, RandomValues) {
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+      case 0:
+        bit_depth_ = 8;
+        break;
+      case 1:
+        bit_depth_ = 10;
+        break;
+      default:
+        bit_depth_ = 12;
+        break;
+    }
+
+    const int hi = 1 << bit_depth_;
+
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_(hi);
+      dst_tst_[i] = rng_(hi);
+      src0_[i] = rng_(hi);
+      src1_[i] = rng_(hi);
+    }
+
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(VPX_BLEND_A64_MAX_ALPHA + 1);
+
+    Common();
+  }
+}
+
+TEST_P(BlendA64MaskTestHBD, ExtremeValues) {
+  for (int iter = 0 ; iter < 1000 && !HasFatalFailure(); ++iter) {
+    switch (rng_(3)) {
+      case 0:
+        bit_depth_ = 8;
+        break;
+      case 1:
+        bit_depth_ = 10;
+        break;
+      default:
+        bit_depth_ = 12;
+        break;
+    }
+
+    const int hi = 1 << bit_depth_;
+    const int lo = hi - 2;
+
+    for (int i = 0 ; i < kBufSize ; ++i) {
+      dst_ref_[i] = rng_(hi - lo) + lo;
+      dst_tst_[i] = rng_(hi - lo) + lo;
+      src0_[i] = rng_(hi - lo) + lo;
+      src1_[i] = rng_(hi - lo) + lo;
+    }
+
+    for (int i = 0 ; i < kMaxMaskSize ; ++i)
+      mask_[i] = rng_(2) + VPX_BLEND_A64_MAX_ALPHA - 1;
+
+    Common();
+  }
+}
+
+#if HAVE_SSE4_1
+INSTANTIATE_TEST_CASE_P(
+  SSE4_1_C_COMPARE, BlendA64MaskTestHBD,
+  ::testing::Values(
+      TestFuncsHBD(vpx_highbd_blend_a64_mask_c,
+                   vpx_highbd_blend_a64_mask_sse4_1)));
+#endif  // HAVE_SSE4_1
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/test/convolve_test.cc b/test/convolve_test.cc
index 73b0edb..21f185a 100644
--- a/test/convolve_test.cc
+++ b/test/convolve_test.cc
@@ -28,7 +28,7 @@
 
 namespace {
 
-static const unsigned int kMaxDimension = 64;
+static const unsigned int kMaxDimension = MAX_SB_SIZE;
 
 typedef void (*ConvolveFunc)(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
@@ -69,6 +69,25 @@
 
 typedef std::tr1::tuple<int, int, const ConvolveFunctions *> ConvolveParam;
 
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
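+// With CONFIG_EXT_PARTITION, also cover the 128x128, 128x64 and 64x128
+// superblock sizes.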
+#define ALL_SIZES(convolve_fn) \
+    make_tuple(128, 64, &convolve_fn),  \
+    make_tuple(64, 128, &convolve_fn),  \
+    make_tuple(128, 128, &convolve_fn), \
+    make_tuple(4, 4, &convolve_fn),     \
+    make_tuple(8, 4, &convolve_fn),     \
+    make_tuple(4, 8, &convolve_fn),     \
+    make_tuple(8, 8, &convolve_fn),     \
+    make_tuple(16, 8, &convolve_fn),    \
+    make_tuple(8, 16, &convolve_fn),    \
+    make_tuple(16, 16, &convolve_fn),   \
+    make_tuple(32, 16, &convolve_fn),   \
+    make_tuple(16, 32, &convolve_fn),   \
+    make_tuple(32, 32, &convolve_fn),   \
+    make_tuple(64, 32, &convolve_fn),   \
+    make_tuple(32, 64, &convolve_fn),   \
+    make_tuple(64, 64, &convolve_fn)
+#else
 #define ALL_SIZES(convolve_fn) \
     make_tuple(4, 4, &convolve_fn),     \
     make_tuple(8, 4, &convolve_fn),     \
@@ -83,6 +102,7 @@
     make_tuple(64, 32, &convolve_fn),   \
     make_tuple(32, 64, &convolve_fn),   \
     make_tuple(64, 64, &convolve_fn)
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
 
 // Reference 8-tap subpixel filter, slightly modified to fit into this test.
 #define VP9_FILTER_WEIGHT 128
@@ -117,7 +137,7 @@
   //                               = 23
   // and filter_max_width          = 16
   //
-  uint8_t intermediate_buffer[71 * kMaxDimension];
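+  // The 8-tap filter's intermediate output can be up to 7 rows taller than
+  // the block, so allow kMaxDimension + 8 rows.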
+  uint8_t intermediate_buffer[(kMaxDimension + 8) * kMaxDimension];
   const int intermediate_next_stride =
       1 - static_cast<int>(intermediate_height * output_width);
 
@@ -199,9 +219,9 @@
 
   assert(output_width <= kMaxDimension);
   assert(output_height <= kMaxDimension);
-  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
+  filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, kMaxDimension,
                      output_width, output_height);
-  block2d_average_c(tmp, 64, dst_ptr, dst_stride,
+  block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
                     output_width, output_height);
 }
 
@@ -230,7 +250,7 @@
    *                               = 23
    * and filter_max_width = 16
    */
-  uint16_t intermediate_buffer[71 * kMaxDimension];
+  uint16_t intermediate_buffer[(kMaxDimension + 8) * kMaxDimension];
   const int intermediate_next_stride =
       1 - static_cast<int>(intermediate_height * output_width);
 
@@ -319,9 +339,10 @@
 
   assert(output_width <= kMaxDimension);
   assert(output_height <= kMaxDimension);
-  highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter, tmp, 64,
+  highbd_filter_block2d_8_c(src_ptr, src_stride, HFilter, VFilter,
+                            tmp, kMaxDimension,
                             output_width, output_height, bd);
-  highbd_block2d_average_c(tmp, 64, dst_ptr, dst_stride,
+  highbd_block2d_average_c(tmp, kMaxDimension, dst_ptr, dst_stride,
                            output_width, output_height);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -368,7 +389,7 @@
 
  protected:
   static const int kDataAlignment = 16;
-  static const int kOuterBlockSize = 256;
+  static const int kOuterBlockSize = 4 * kMaxDimension;
   static const int kInputStride = kOuterBlockSize;
   static const int kOutputStride = kOuterBlockSize;
   static const int kInputBufferSize = kOuterBlockSize * kOuterBlockSize;
@@ -431,7 +452,8 @@
   void CopyOutputToRef() {
     memcpy(output_ref_, output_, kOutputBufferSize);
 #if CONFIG_VP9_HIGHBITDEPTH
-    memcpy(output16_ref_, output16_, kOutputBufferSize);
+    memcpy(output16_ref_, output16_,
+           kOutputBufferSize * sizeof(*output16_ref_));
 #endif
   }
 
@@ -443,41 +465,41 @@
   }
 
   uint8_t *input() const {
+    const int index = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+      return input_ + index;
     } else {
-      return CONVERT_TO_BYTEPTR(input16_ + BorderTop() * kOuterBlockSize +
-                                BorderLeft());
+      return CONVERT_TO_BYTEPTR(input16_) + index;
     }
 #else
-    return input_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    return input_ + index;
 #endif
   }
 
   uint8_t *output() const {
+    const int index = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+      return output_ + index;
     } else {
-      return CONVERT_TO_BYTEPTR(output16_ + BorderTop() * kOuterBlockSize +
-                                BorderLeft());
+      return CONVERT_TO_BYTEPTR(output16_ + index);
     }
 #else
-    return output_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    return output_ + index;
 #endif
   }
 
   uint8_t *output_ref() const {
+    const int index = BorderTop() * kOuterBlockSize + BorderLeft();
 #if CONFIG_VP9_HIGHBITDEPTH
     if (UUT_->use_highbd_ == 0) {
-      return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+      return output_ref_ + index;
     } else {
-      return CONVERT_TO_BYTEPTR(output16_ref_ + BorderTop() * kOuterBlockSize +
-                                BorderLeft());
+      return CONVERT_TO_BYTEPTR(output16_ref_ + index);
     }
 #else
-    return output_ref_ + BorderTop() * kOuterBlockSize + BorderLeft();
+    return output_ref_ + index;
 #endif
   }
 
@@ -1180,7 +1202,8 @@
                         ::testing::ValuesIn(kArrayConvolve8_avx2));
 #endif  // HAVE_AVX2 && HAVE_SSSE3
 
-#if HAVE_NEON
+// TODO(any): Make NEON versions support 128x128, 128x64, 64x128 block sizes.
+#if HAVE_NEON && !(CONFIG_VP10 && CONFIG_EXT_PARTITION)
 #if HAVE_NEON_ASM
 const ConvolveFunctions convolve8_neon(
     vpx_convolve_copy_neon, vpx_convolve_avg_neon,
@@ -1206,7 +1229,8 @@
                         ::testing::ValuesIn(kArrayConvolve8_neon));
 #endif  // HAVE_NEON
 
-#if HAVE_DSPR2
+// TODO(any): Make DSPR2 versions support 128x128, 128x64, 64x128 block sizes.
+#if HAVE_DSPR2 && !(CONFIG_VP10 && CONFIG_EXT_PARTITION)
 const ConvolveFunctions convolve8_dspr2(
     vpx_convolve_copy_dspr2, vpx_convolve_avg_dspr2,
     vpx_convolve8_horiz_dspr2, vpx_convolve8_avg_horiz_dspr2,
@@ -1221,7 +1245,8 @@
                         ::testing::ValuesIn(kArrayConvolve8_dspr2));
 #endif  // HAVE_DSPR2
 
-#if HAVE_MSA
+// TODO(any): Make MSA versions support 128x128, 128x64, 64x128 block sizes.
+#if HAVE_MSA && !(CONFIG_VP10 && CONFIG_EXT_PARTITION)
 const ConvolveFunctions convolve8_msa(
     vpx_convolve_copy_msa, vpx_convolve_avg_msa,
     vpx_convolve8_horiz_msa, vpx_convolve8_avg_horiz_msa,
diff --git a/test/cpu_speed_test.cc b/test/cpu_speed_test.cc
index 572834c..8e36666 100644
--- a/test/cpu_speed_test.cc
+++ b/test/cpu_speed_test.cc
@@ -83,7 +83,7 @@
   cfg_.rc_min_quantizer = 0;
 
   ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
-                                       20);
+                                       10);
 
   init_flags_ = VPX_CODEC_USE_PSNR;
 
@@ -92,7 +92,7 @@
 }
 
 TEST_P(CpuSpeedTest, TestScreencastQ0) {
-  ::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 25);
+  ::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 10);
   cfg_.g_timebase = video.timebase();
   cfg_.rc_2pass_vbr_minsection_pct = 5;
   cfg_.rc_2pass_vbr_maxsection_pct = 2000;
@@ -107,7 +107,7 @@
 }
 
 TEST_P(CpuSpeedTest, TestTuneScreen) {
-  ::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 25);
+  ::libvpx_test::Y4mVideoSource video("screendata.y4m", 0, 10);
   cfg_.g_timebase = video.timebase();
   cfg_.rc_2pass_vbr_minsection_pct = 5;
-  cfg_.rc_2pass_vbr_minsection_pct = 2000;
+  cfg_.rc_2pass_vbr_maxsection_pct = 2000;
@@ -133,7 +133,7 @@
   cfg_.rc_min_quantizer = 0;
 
   ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
-                                       20);
+                                       10);
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
@@ -148,7 +148,7 @@
   cfg_.rc_min_quantizer = 40;
 
   ::libvpx_test::I420VideoSource video("hantro_odd.yuv", 208, 144, 30, 1, 0,
-                                       20);
+                                       10);
 
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 }
diff --git a/test/vp8cx_set_ref.sh b/test/cx_set_ref.sh
similarity index 60%
rename from test/vp8cx_set_ref.sh
rename to test/cx_set_ref.sh
index 5d760bc..c21894e 100755
--- a/test/vp8cx_set_ref.sh
+++ b/test/cx_set_ref.sh
@@ -8,30 +8,27 @@
 ##  in the file PATENTS.  All contributing project authors may
 ##  be found in the AUTHORS file in the root of the source tree.
 ##
-##  This file tests the libvpx vp8cx_set_ref example. To add new tests to this
+##  This file tests the libvpx cx_set_ref example. To add new tests to this
 ##  file, do the following:
 ##    1. Write a shell function (this is your test).
-##    2. Add the function to vp8cx_set_ref_tests (on a new line).
+##    2. Add the function to cx_set_ref_tests (on a new line).
 ##
 . $(dirname $0)/tools_common.sh
 
 # Environment check: $YUV_RAW_INPUT is required.
-vp8cx_set_ref_verify_environment() {
+cx_set_ref_verify_environment() {
   if [ ! -e "${YUV_RAW_INPUT}" ]; then
     echo "Libvpx test data must exist in LIBVPX_TEST_DATA_PATH."
     return 1
   fi
 }
 
-# Runs vp8cx_set_ref and updates the reference frame before encoding frame 90.
-# $1 is the codec name, which vp8cx_set_ref does not support at present: It's
-# currently used only to name the output file.
-# TODO(tomfinegan): Pass the codec param once the example is updated to support
-# VP9.
+# Runs cx_set_ref and updates the reference frame before encoding frame 90.
+# $1 is the codec name.
 vpx_set_ref() {
-  local encoder="${LIBVPX_BIN_PATH}/vp8cx_set_ref${VPX_TEST_EXE_SUFFIX}"
   local codec="$1"
-  local output_file="${VPX_TEST_OUTPUT_DIR}/vp8cx_set_ref_${codec}.ivf"
+  local encoder="${LIBVPX_BIN_PATH}/${codec}cx_set_ref${VPX_TEST_EXE_SUFFIX}"
+  local output_file="${VPX_TEST_OUTPUT_DIR}/${codec}cx_set_ref_${codec}.ivf"
   local ref_frame_num=90
 
   if [ ! -x "${encoder}" ]; then
@@ -46,12 +43,24 @@
   [ -e "${output_file}" ] || return 1
 }
 
-vp8cx_set_ref_vp8() {
+cx_set_ref_vp8() {
   if [ "$(vp8_encode_available)" = "yes" ]; then
     vpx_set_ref vp8 || return 1
   fi
 }
 
-vp8cx_set_ref_tests="vp8cx_set_ref_vp8"
+cx_set_ref_vp9() {
+  if [ "$(vp9_encode_available)" = "yes" ]; then
+    vpx_set_ref vp9 || return 1
+  fi
+}
 
-run_tests vp8cx_set_ref_verify_environment "${vp8cx_set_ref_tests}"
+cx_set_ref_vp10() {
+  if [ "$(vp10_encode_available)" = "yes" ]; then
+    vpx_set_ref vp10 || return 1
+  fi
+}
+
+cx_set_ref_tests="cx_set_ref_vp8 cx_set_ref_vp9 cx_set_ref_vp10"
+
+run_tests cx_set_ref_verify_environment "${cx_set_ref_tests}"
diff --git a/test/dct16x16_test.cc b/test/dct16x16_test.cc
index ddaf939..e6224b2 100644
--- a/test/dct16x16_test.cc
+++ b/test/dct16x16_test.cc
@@ -25,20 +25,12 @@
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
+#include "vpx_ports/msvc.h"  // for round()
 
 using libvpx_test::ACMRandom;
 
 namespace {
 
-#ifdef _MSC_VER
-static int round(double x) {
-  if (x < 0)
-    return static_cast<int>(ceil(x - 0.5));
-  else
-    return static_cast<int>(floor(x + 0.5));
-}
-#endif
-
 const int kNumCoeffs = 256;
 const double C1 = 0.995184726672197;
 const double C2 = 0.98078528040323;
diff --git a/test/dct32x32_test.cc b/test/dct32x32_test.cc
index 16d8825..278d72d 100644
--- a/test/dct32x32_test.cc
+++ b/test/dct32x32_test.cc
@@ -25,18 +25,11 @@
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
+#include "vpx_ports/msvc.h"  // for round()
 
 using libvpx_test::ACMRandom;
 
 namespace {
-#ifdef _MSC_VER
-static int round(double x) {
-  if (x < 0)
-    return static_cast<int>(ceil(x - 0.5));
-  else
-    return static_cast<int>(floor(x + 0.5));
-}
-#endif
 
 const int kNumCoeffs = 1024;
 const double kPi = 3.141592653589793238462643383279502884;
diff --git a/test/decode_test_driver.cc b/test/decode_test_driver.cc
index ad861c3..7fb3e37 100644
--- a/test/decode_test_driver.cc
+++ b/test/decode_test_driver.cc
@@ -18,6 +18,7 @@
 namespace libvpx_test {
 
 const char kVP8Name[] = "WebM Project VP8";
+const char kVP10Name[] = "WebM Project VP10";
 
 vpx_codec_err_t Decoder::PeekStream(const uint8_t *cxdata, size_t size,
                                     vpx_codec_stream_info_t *stream_info) {
@@ -46,6 +47,11 @@
   return strncmp(kVP8Name, codec_name, sizeof(kVP8Name) - 1) == 0;
 }
 
+bool Decoder::IsVP10() const {
+  const char *codec_name = GetDecoderName();
+  return strncmp(kVP10Name, codec_name, sizeof(kVP10Name) - 1) == 0;
+}
+
 void DecoderTest::HandlePeekResult(Decoder *const decoder,
                                    CompressedVideoSource *video,
                                    const vpx_codec_err_t res_peek) {
diff --git a/test/decode_test_driver.h b/test/decode_test_driver.h
index f566c53..1492c5a 100644
--- a/test/decode_test_driver.h
+++ b/test/decode_test_driver.h
@@ -107,6 +107,8 @@
 
   bool IsVP8() const;
 
+  bool IsVP10() const;
+
   vpx_codec_ctx_t * GetDecoder() {
     return &decoder_;
   }
diff --git a/test/encode_test_driver.cc b/test/encode_test_driver.cc
index 128436e..753a7e4 100644
--- a/test/encode_test_driver.cc
+++ b/test/encode_test_driver.cc
@@ -13,6 +13,7 @@
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "./vpx_config.h"
+#include "vpx_ports/mem.h"
 #include "test/codec_factory.h"
 #include "test/decode_test_driver.h"
 #include "test/encode_test_driver.h"
@@ -45,17 +46,22 @@
 #endif
 #if CONFIG_VP10_ENCODER
     if (CodecInterface() == &vpx_codec_vp10_cx_algo) {
-      // Default to 1 tile column for VP10.
+      // Default to 1 tile column for VP10. With CONFIG_EXT_TILE, the
+      // default is already the largest possible tile size.
+#if !CONFIG_EXT_TILE
       const int log2_tile_columns = 0;
       res = vpx_codec_control_(&encoder_, VP9E_SET_TILE_COLUMNS,
                                log2_tile_columns);
       ASSERT_EQ(VPX_CODEC_OK, res) << EncoderError();
+#endif  // !CONFIG_EXT_TILE
     } else
 #endif
     {
 #if CONFIG_VP8_ENCODER
-      ASSERT_EQ(&vpx_codec_vp8_cx_algo, CodecInterface())
-          << "Unknown Codec Interface";
+      if (CodecInterface() == &vpx_codec_vp8_cx_algo) {
+        ASSERT_EQ(&vpx_codec_vp8_cx_algo, CodecInterface())
+            << "Unknown Codec Interface";
+      }
 #endif
     }
   }
@@ -138,38 +144,120 @@
   else
     passes_ = 1;
 }
+
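+// Compare two planes pixel by pixel. On the first mismatch, report its
+// position and the two pixel values through the optional out-params (each
+// may be NULL) and return false; return true if the planes match.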
+static bool compare_plane(const uint8_t *const buf1, const int stride1,
+                          const uint8_t *const buf2, const int stride2,
+                          const int w, const int h,
+                          int *const mismatch_row,
+                          int *const mismatch_col,
+                          int *const mismatch_pix1,
+                          int *const mismatch_pix2) {
+  int r, c;
+
+  for (r = 0; r < h; ++r) {
+    for (c = 0; c < w; ++c) {
+      const int pix1 = buf1[r * stride1 + c];
+      const int pix2 = buf2[r * stride2 + c];
+
+      if (pix1 != pix2) {
+        if (mismatch_row != NULL)
+          *mismatch_row = r;
+        if (mismatch_col != NULL)
+          *mismatch_col = c;
+        if (mismatch_pix1 != NULL)
+          *mismatch_pix1 = pix1;
+        if (mismatch_pix2 != NULL)
+          *mismatch_pix2 = pix2;
+        return false;
+      }
+    }
+  }
+
+  return true;
+}
+
-// The function should return "true" most of the time, therefore no early
-// break-out is implemented within the match checking process.
+// Images are expected to match; the comparison stops at the first differing
+// pixel and reports its location through the optional out-params.
 static bool compare_img(const vpx_image_t *img1,
-                        const vpx_image_t *img2) {
-  bool match = (img1->fmt == img2->fmt) &&
-               (img1->cs == img2->cs) &&
-               (img1->d_w == img2->d_w) &&
-               (img1->d_h == img2->d_h);
+                        const vpx_image_t *img2,
+                        int *const mismatch_row,
+                        int *const mismatch_col,
+                        int *const mismatch_plane,
+                        int *const mismatch_pix1,
+                        int *const mismatch_pix2) {
 
-  const unsigned int width_y  = img1->d_w;
-  const unsigned int height_y = img1->d_h;
-  unsigned int i;
-  for (i = 0; i < height_y; ++i)
-    match = (memcmp(img1->planes[VPX_PLANE_Y] + i * img1->stride[VPX_PLANE_Y],
-                    img2->planes[VPX_PLANE_Y] + i * img2->stride[VPX_PLANE_Y],
-                    width_y) == 0) && match;
-  const unsigned int width_uv  = (img1->d_w + 1) >> 1;
-  const unsigned int height_uv = (img1->d_h + 1) >> 1;
-  for (i = 0; i <  height_uv; ++i)
-    match = (memcmp(img1->planes[VPX_PLANE_U] + i * img1->stride[VPX_PLANE_U],
-                    img2->planes[VPX_PLANE_U] + i * img2->stride[VPX_PLANE_U],
-                    width_uv) == 0) && match;
-  for (i = 0; i < height_uv; ++i)
-    match = (memcmp(img1->planes[VPX_PLANE_V] + i * img1->stride[VPX_PLANE_V],
-                    img2->planes[VPX_PLANE_V] + i * img2->stride[VPX_PLANE_V],
-                    width_uv) == 0) && match;
-  return match;
+  const unsigned int w_y = img1->d_w;
+  const unsigned int h_y = img1->d_h;
+  const unsigned int w_uv = ROUND_POWER_OF_TWO(w_y, img1->x_chroma_shift);
+  const unsigned int h_uv = ROUND_POWER_OF_TWO(h_y, img1->y_chroma_shift);
+
+  if (img1->fmt != img2->fmt
+      || img1->cs != img2->cs
+      || img1->d_w != img2->d_w
+      || img1->d_h != img2->d_h) {
+    if (mismatch_row != NULL)
+      *mismatch_row = -1;
+    if (mismatch_col != NULL)
+      *mismatch_col = -1;
+    return false;
+  }
+
+  if (!compare_plane(img1->planes[VPX_PLANE_Y],  img1->stride[VPX_PLANE_Y],
+                     img2->planes[VPX_PLANE_Y],  img2->stride[VPX_PLANE_Y],
+                     w_y, h_y,
+                     mismatch_row, mismatch_col,
+                     mismatch_pix1, mismatch_pix2)) {
+    if (mismatch_plane != NULL)
+      *mismatch_plane = VPX_PLANE_Y;
+    return false;
+  }
+
+  if (!compare_plane(img1->planes[VPX_PLANE_U],  img1->stride[VPX_PLANE_U],
+                     img2->planes[VPX_PLANE_U],  img2->stride[VPX_PLANE_U],
+                     w_uv, h_uv,
+                     mismatch_row, mismatch_col,
+                     mismatch_pix1, mismatch_pix2)) {
+    if (mismatch_plane != NULL)
+      *mismatch_plane = VPX_PLANE_U;
+    return false;
+  }
+
+  if (!compare_plane(img1->planes[VPX_PLANE_V],  img1->stride[VPX_PLANE_V],
+                     img2->planes[VPX_PLANE_V],  img2->stride[VPX_PLANE_V],
+                     w_uv, h_uv,
+                     mismatch_row, mismatch_col,
+                     mismatch_pix1, mismatch_pix2)) {
+    if (mismatch_plane != NULL)
+      *mismatch_plane = VPX_PLANE_V;
+    return false;
+  }
+
+  return true;
 }
 
-void EncoderTest::MismatchHook(const vpx_image_t* /*img1*/,
-                               const vpx_image_t* /*img2*/) {
-  ASSERT_TRUE(0) << "Encode/Decode mismatch found";
+void EncoderTest::MismatchHook(const vpx_image_t* img_enc,
+                               const vpx_image_t* img_dec) {
+  int mismatch_row = 0;
+  int mismatch_col = 0;
+  int mismatch_plane = 0;
+  int mismatch_pix_enc = 0;
+  int mismatch_pix_dec = 0;
+
+  ASSERT_FALSE(compare_img(img_enc, img_dec,
+                           &mismatch_row, &mismatch_col,
+                           &mismatch_plane,
+                           &mismatch_pix_enc,
+                           &mismatch_pix_dec));
+
+  GTEST_FAIL()
+    << "Encode/Decode mismatch found:"
+    << std::endl
+    << "  pixel value enc/dec: "  << mismatch_pix_enc << "/" << mismatch_pix_dec
+    << std::endl
+    << "                plane: " << mismatch_plane
+    << std::endl
+    << "              row/col: " << mismatch_row << "/" << mismatch_col
+    << std::endl;
 }
 
 void EncoderTest::RunLoop(VideoSource *video) {
@@ -203,6 +291,15 @@
     if (init_flags_ & VPX_CODEC_USE_OUTPUT_PARTITION)
       dec_init_flags |= VPX_CODEC_USE_INPUT_FRAGMENTS;
     Decoder* const decoder = codec_->CreateDecoder(dec_cfg, dec_init_flags, 0);
+#if CONFIG_VP10 && CONFIG_EXT_TILE
+    if (decoder->IsVP10()) {
+      // Set dec_cfg.tile_row = -1 and dec_cfg.tile_col = -1 so that the whole
+      // frame is decoded.
+      decoder->Control(VP10_SET_DECODE_TILE_ROW, -1);
+      decoder->Control(VP10_SET_DECODE_TILE_COL, -1);
+    }
+#endif
+
     bool again;
     for (again = true; again; video->Next()) {
       again = (video->img() != NULL);
@@ -256,7 +353,8 @@
         DxDataIterator dec_iter = decoder->GetDxData();
         const vpx_image_t *img_dec = dec_iter.Next();
         if (img_enc && img_dec) {
-          const bool res = compare_img(img_enc, img_dec);
+          const bool res = compare_img(img_enc, img_dec,
+                                       NULL, NULL, NULL, NULL, NULL);
           if (!res) {  // Mismatch
             MismatchHook(img_enc, img_dec);
           }
diff --git a/test/vp9_end_to_end_test.cc b/test/end_to_end_test.cc
similarity index 88%
rename from test/vp9_end_to_end_test.cc
rename to test/end_to_end_test.cc
index be1fa68..e9c4296 100644
--- a/test/vp9_end_to_end_test.cc
+++ b/test/end_to_end_test.cc
@@ -25,6 +25,20 @@
 const int kBitrate = 500;
 // List of psnr thresholds for speed settings 0-7 and 5 encoding modes
 const double kPsnrThreshold[][5] = {
+// Note:
+// VP10 HBD average PSNR is slightly lower than VP9, so a separate set of
+// thresholds is used here to keep the tests enabled while still guarding
+// picture quality.
+#if CONFIG_VP10_ENCODER && CONFIG_VP9_HIGHBITDEPTH
+  { 36.0, 37.0, 37.0, 37.0, 37.0 },
+  { 31.0, 36.0, 36.0, 36.0, 36.0 },
+  { 31.0, 35.0, 35.0, 35.0, 35.0 },
+  { 31.0, 34.0, 34.0, 34.0, 34.0 },
+  { 31.0, 33.0, 33.0, 33.0, 33.0 },
+  { 31.0, 32.0, 32.0, 32.0, 32.0 },
+  { 30.0, 31.0, 31.0, 31.0, 31.0 },
+  { 29.0, 30.0, 30.0, 30.0, 30.0 },
+#else
   { 36.0, 37.0, 37.0, 37.0, 37.0 },
   { 35.0, 36.0, 36.0, 36.0, 36.0 },
   { 34.0, 35.0, 35.0, 35.0, 35.0 },
@@ -33,6 +47,7 @@
   { 31.0, 32.0, 32.0, 32.0, 32.0 },
   { 30.0, 31.0, 31.0, 31.0, 31.0 },
   { 29.0, 30.0, 30.0, 30.0, 30.0 },
+#endif  // CONFIG_VP10_ENCODER && CONFIG_VP9_HIGHBITDEPTH
 };
 
 typedef struct {
@@ -126,6 +141,11 @@
       encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
       encoder->Control(VP9E_SET_TILE_COLUMNS, 4);
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      // Test screen content coding tools when cpu_used == 1 and the
+      // encoding mode is two-pass.
+      if (cpu_used_ == 1 && encoding_mode_ == ::libvpx_test::kTwoPassGood)
+        encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_SCREEN);
+      else
+        encoder->Control(VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_DEFAULT);
       if (encoding_mode_ != ::libvpx_test::kRealTime) {
         encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
         encoder->Control(VP8E_SET_ARNR_MAXFRAMES, 7);
@@ -187,23 +207,9 @@
     ::testing::ValuesIn(kTestVectors),
     ::testing::ValuesIn(kCpuUsedVectors));
 
-#if CONFIG_VP9_HIGHBITDEPTH
-# if CONFIG_VP10_ENCODER
-// TODO(angiebird): many fail in high bitdepth mode.
-INSTANTIATE_TEST_CASE_P(
-    DISABLED_VP10, EndToEndTestLarge,
-    ::testing::Combine(
-        ::testing::Values(static_cast<const libvpx_test::CodecFactory *>(
-            &libvpx_test::kVP10)),
-        ::testing::ValuesIn(kEncodingModeVectors),
-        ::testing::ValuesIn(kTestVectors),
-        ::testing::ValuesIn(kCpuUsedVectors)));
-# endif  // CONFIG_VP10_ENCODER
-#else
 VP10_INSTANTIATE_TEST_CASE(
     EndToEndTestLarge,
     ::testing::ValuesIn(kEncodingModeVectors),
     ::testing::ValuesIn(kTestVectors),
     ::testing::ValuesIn(kCpuUsedVectors));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 }  // namespace
diff --git a/test/error_resilience_test.cc b/test/error_resilience_test.cc
index cd0dca2..c4e2721 100644
--- a/test/error_resilience_test.cc
+++ b/test/error_resilience_test.cc
@@ -164,6 +164,7 @@
     mismatch_psnr_ += mismatch_psnr;
     ++mismatch_nframes_;
     // std::cout << "Mismatch frame psnr: " << mismatch_psnr << "\n";
+    ::libvpx_test::EncoderTest::MismatchHook(img1, img2);
   }
 
   void SetErrorFrames(int num, unsigned int *list) {
diff --git a/test/fdct4x4_test.cc b/test/fdct4x4_test.cc
index 735cccf..f6b6567 100644
--- a/test/fdct4x4_test.cc
+++ b/test/fdct4x4_test.cc
@@ -19,8 +19,8 @@
 #include "test/acm_random.h"
 #include "test/clear_system_state.h"
 #include "test/register_state_check.h"
+#include "test/transform_test_base.h"
 #include "test/util.h"
-#include "vp9/common/vp9_entropy.h"
 #include "vpx/vpx_codec.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
@@ -28,16 +28,16 @@
 using libvpx_test::ACMRandom;
 
 namespace {
-const int kNumCoeffs = 16;
 typedef void (*FdctFunc)(const int16_t *in, tran_low_t *out, int stride);
 typedef void (*IdctFunc)(const tran_low_t *in, uint8_t *out, int stride);
-typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
-                        int tx_type);
 typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
                         int tx_type);
+using libvpx_test::FhtFunc;
 
-typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t> Dct4x4Param;
-typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t> Ht4x4Param;
+typedef std::tr1::tuple<FdctFunc, IdctFunc, int, vpx_bit_depth_t, int>
+    Dct4x4Param;
+typedef std::tr1::tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t, int>
+    Ht4x4Param;
 
 void fdct4x4_ref(const int16_t *in, tran_low_t *out, int stride,
                  int /*tx_type*/) {
@@ -89,197 +89,9 @@
 #endif  // HAVE_SSE2
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-class Trans4x4TestBase {
- public:
-  virtual ~Trans4x4TestBase() {}
-
- protected:
-  virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;
-
-  virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;
-
-  void RunAccuracyCheck(int limit) {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    uint32_t max_error = 0;
-    int64_t total_error = 0;
-    const int count_test_block = 10000;
-    for (int i = 0; i < count_test_block; ++i) {
-      DECLARE_ALIGNED(16, int16_t, test_input_block[kNumCoeffs]);
-      DECLARE_ALIGNED(16, tran_low_t, test_temp_block[kNumCoeffs]);
-      DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
-      DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_VP9_HIGHBITDEPTH
-      DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
-      DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-#endif
-
-      // Initialize a test block with input range [-255, 255].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        if (bit_depth_ == VPX_BITS_8) {
-          src[j] = rnd.Rand8();
-          dst[j] = rnd.Rand8();
-          test_input_block[j] = src[j] - dst[j];
-#if CONFIG_VP9_HIGHBITDEPTH
-        } else {
-          src16[j] = rnd.Rand16() & mask_;
-          dst16[j] = rnd.Rand16() & mask_;
-          test_input_block[j] = src16[j] - dst16[j];
-#endif
-        }
-      }
-
-      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
-                                          test_temp_block, pitch_));
-      if (bit_depth_ == VPX_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
-#if CONFIG_VP9_HIGHBITDEPTH
-      } else {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block,
-                                            CONVERT_TO_BYTEPTR(dst16), pitch_));
-#endif
-      }
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_VP9_HIGHBITDEPTH
-        const int diff =
-            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-        ASSERT_EQ(VPX_BITS_8, bit_depth_);
-        const int diff = dst[j] - src[j];
-#endif
-        const uint32_t error = diff * diff;
-        if (max_error < error)
-          max_error = error;
-        total_error += error;
-      }
-    }
-
-    EXPECT_GE(static_cast<uint32_t>(limit), max_error)
-        << "Error: 4x4 FHT/IHT has an individual round trip error > "
-        << limit;
-
-    EXPECT_GE(count_test_block * limit, total_error)
-        << "Error: 4x4 FHT/IHT has average round trip error > " << limit
-        << " per block";
-  }
-
-  void RunCoeffCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 5000;
-    DECLARE_ALIGNED(16, int16_t, input_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j)
-        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
-
-      fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
-      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
-
-      // The minimum quant value is 4.
-      for (int j = 0; j < kNumCoeffs; ++j)
-        EXPECT_EQ(output_block[j], output_ref_block[j]);
-    }
-  }
-
-  void RunMemCheck() {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 5000;
-    DECLARE_ALIGNED(16, int16_t, input_extreme_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_ref_block[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, output_block[kNumCoeffs]);
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
-      }
-      if (i == 0) {
-        for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = mask_;
-      } else if (i == 1) {
-        for (int j = 0; j < kNumCoeffs; ++j)
-          input_extreme_block[j] = -mask_;
-      }
-
-      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
-      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
-                                          output_block, pitch_));
-
-      // The minimum quant value is 4.
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        EXPECT_EQ(output_block[j], output_ref_block[j]);
-        EXPECT_GE(4 * DCT_MAX_VALUE << (bit_depth_ - 8), abs(output_block[j]))
-            << "Error: 4x4 FDCT has coefficient larger than 4*DCT_MAX_VALUE";
-      }
-    }
-  }
-
-  void RunInvAccuracyCheck(int limit) {
-    ACMRandom rnd(ACMRandom::DeterministicSeed());
-    const int count_test_block = 1000;
-    DECLARE_ALIGNED(16, int16_t, in[kNumCoeffs]);
-    DECLARE_ALIGNED(16, tran_low_t, coeff[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, dst[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint8_t, src[kNumCoeffs]);
-#if CONFIG_VP9_HIGHBITDEPTH
-    DECLARE_ALIGNED(16, uint16_t, dst16[kNumCoeffs]);
-    DECLARE_ALIGNED(16, uint16_t, src16[kNumCoeffs]);
-#endif
-
-    for (int i = 0; i < count_test_block; ++i) {
-      // Initialize a test block with input range [-mask_, mask_].
-      for (int j = 0; j < kNumCoeffs; ++j) {
-        if (bit_depth_ == VPX_BITS_8) {
-          src[j] = rnd.Rand8();
-          dst[j] = rnd.Rand8();
-          in[j] = src[j] - dst[j];
-#if CONFIG_VP9_HIGHBITDEPTH
-        } else {
-          src16[j] = rnd.Rand16() & mask_;
-          dst16[j] = rnd.Rand16() & mask_;
-          in[j] = src16[j] - dst16[j];
-#endif
-        }
-      }
-
-      fwd_txfm_ref(in, coeff, pitch_, tx_type_);
-
-      if (bit_depth_ == VPX_BITS_8) {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
-#if CONFIG_VP9_HIGHBITDEPTH
-      } else {
-        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
-                                            pitch_));
-#endif
-      }
-
-      for (int j = 0; j < kNumCoeffs; ++j) {
-#if CONFIG_VP9_HIGHBITDEPTH
-        const int diff =
-            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
-#else
-        const int diff = dst[j] - src[j];
-#endif
-        const uint32_t error = diff * diff;
-        EXPECT_GE(static_cast<uint32_t>(limit), error)
-            << "Error: 4x4 IDCT has error " << error
-            << " at index " << j;
-      }
-    }
-  }
-
-  int pitch_;
-  int tx_type_;
-  FhtFunc fwd_txfm_ref;
-  vpx_bit_depth_t bit_depth_;
-  int mask_;
-};
 
 class Trans4x4DCT
-    : public Trans4x4TestBase,
+    : public libvpx_test::TransformTestBase,
       public ::testing::TestWithParam<Dct4x4Param> {
  public:
   virtual ~Trans4x4DCT() {}
@@ -292,6 +104,7 @@
     fwd_txfm_ref = fdct4x4_ref;
     bit_depth_ = GET_PARAM(3);
     mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = GET_PARAM(4);
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
@@ -324,7 +137,7 @@
 }
 
 class Trans4x4HT
-    : public Trans4x4TestBase,
+    : public libvpx_test::TransformTestBase,
       public ::testing::TestWithParam<Ht4x4Param> {
  public:
   virtual ~Trans4x4HT() {}
@@ -337,6 +150,7 @@
     fwd_txfm_ref = fht4x4_ref;
     bit_depth_ = GET_PARAM(3);
     mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = GET_PARAM(4);
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
@@ -370,7 +184,7 @@
 }
 
 class Trans4x4WHT
-    : public Trans4x4TestBase,
+    : public libvpx_test::TransformTestBase,
       public ::testing::TestWithParam<Dct4x4Param> {
  public:
   virtual ~Trans4x4WHT() {}
@@ -383,6 +197,7 @@
     fwd_txfm_ref = fwht4x4_ref;
     bit_depth_ = GET_PARAM(3);
     mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = GET_PARAM(4);
   }
   virtual void TearDown() { libvpx_test::ClearSystemState(); }
 
@@ -419,54 +234,54 @@
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4DCT,
     ::testing::Values(
-        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12),
-        make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8)));
+        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_10, 0, VPX_BITS_10, 16),
+        make_tuple(&vpx_highbd_fdct4x4_c, &idct4x4_12, 0, VPX_BITS_12, 16),
+        make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8, 16)));
 #else
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4DCT,
     ::testing::Values(
-        make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8)));
+        make_tuple(&vpx_fdct4x4_c, &vpx_idct4x4_16_add_c, 0, VPX_BITS_8, 16)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12),
-        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 0, VPX_BITS_10, 16),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 1, VPX_BITS_10, 16),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 2, VPX_BITS_10, 16),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_10, 3, VPX_BITS_10, 16),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 0, VPX_BITS_12, 16),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 1, VPX_BITS_12, 16),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 2, VPX_BITS_12, 16),
+        make_tuple(&vp9_highbd_fht4x4_c, &iht4x4_12, 3, VPX_BITS_12, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8, 16)));
 #else
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8, 16)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if CONFIG_VP9_HIGHBITDEPTH
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4WHT,
     ::testing::Values(
-        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10),
-        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12),
-        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
+        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_10, 0, VPX_BITS_10, 16),
+        make_tuple(&vp9_highbd_fwht4x4_c, &iwht4x4_12, 0, VPX_BITS_12, 16),
+        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8, 16)));
 #else
 INSTANTIATE_TEST_CASE_P(
     C, Trans4x4WHT,
     ::testing::Values(
-        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8)));
+        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8, 16)));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -474,25 +289,25 @@
     NEON, Trans4x4DCT,
     ::testing::Values(
         make_tuple(&vpx_fdct4x4_c,
-                   &vpx_idct4x4_16_add_neon, 0, VPX_BITS_8)));
+                   &vpx_idct4x4_16_add_neon, 0, VPX_BITS_8, 16)));
 #endif  // HAVE_NEON_ASM && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     NEON, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8)));
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 0, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 1, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 2, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_c, &vp9_iht4x4_16_add_neon, 3, VPX_BITS_8, 16)));
 #endif  // HAVE_NEON && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if CONFIG_USE_X86INC && HAVE_SSE2 && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4WHT,
     ::testing::Values(
-        make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0, VPX_BITS_8),
-        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0, VPX_BITS_8)));
+        make_tuple(&vp9_fwht4x4_sse2, &vpx_iwht4x4_16_add_c, 0,
+                   VPX_BITS_8, 16),
+        make_tuple(&vp9_fwht4x4_c, &vpx_iwht4x4_16_add_sse2, 0,
+                   VPX_BITS_8, 16)));
 #endif
 
 #if HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
@@ -500,47 +315,60 @@
     SSE2, Trans4x4DCT,
     ::testing::Values(
         make_tuple(&vpx_fdct4x4_sse2,
-                   &vpx_idct4x4_16_add_sse2, 0, VPX_BITS_8)));
+                   &vpx_idct4x4_16_add_sse2, 0, VPX_BITS_8, 16)));
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3, VPX_BITS_8)));
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 0,
+                   VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 1,
+                   VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 2,
+                   VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_sse2, 3,
+                   VPX_BITS_8, 16)));
 #endif  // HAVE_SSE2 && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4DCT,
     ::testing::Values(
-        make_tuple(&vpx_highbd_fdct4x4_c,    &idct4x4_10_sse2, 0, VPX_BITS_10),
-        make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0, VPX_BITS_10),
-        make_tuple(&vpx_highbd_fdct4x4_c,    &idct4x4_12_sse2, 0, VPX_BITS_12),
-        make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0, VPX_BITS_12),
+        make_tuple(&vpx_highbd_fdct4x4_c,    &idct4x4_10_sse2, 0,
+                   VPX_BITS_10, 16),
+        make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_10_sse2, 0,
+                   VPX_BITS_10, 16),
+        make_tuple(&vpx_highbd_fdct4x4_c,    &idct4x4_12_sse2, 0,
+                   VPX_BITS_12, 16),
+        make_tuple(&vpx_highbd_fdct4x4_sse2, &idct4x4_12_sse2, 0,
+                   VPX_BITS_12, 16),
         make_tuple(&vpx_fdct4x4_sse2,      &vpx_idct4x4_16_add_c, 0,
-                   VPX_BITS_8)));
+                   VPX_BITS_8, 16)));
 
 INSTANTIATE_TEST_CASE_P(
     SSE2, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8)));
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 0, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 1, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 2, VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_sse2, &vp9_iht4x4_16_add_c, 3, VPX_BITS_8, 16)));
 #endif  // HAVE_SSE2 && CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 
 #if HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 INSTANTIATE_TEST_CASE_P(
     MSA, Trans4x4DCT,
     ::testing::Values(
-        make_tuple(&vpx_fdct4x4_msa, &vpx_idct4x4_16_add_msa, 0, VPX_BITS_8)));
+        make_tuple(&vpx_fdct4x4_msa, &vpx_idct4x4_16_add_msa, 0,
+                   VPX_BITS_8, 16)));
 INSTANTIATE_TEST_CASE_P(
     MSA, Trans4x4HT,
     ::testing::Values(
-        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 0, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 1, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 2, VPX_BITS_8),
-        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 3, VPX_BITS_8)));
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 0,
+                   VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 1,
+                   VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 2,
+                   VPX_BITS_8, 16),
+        make_tuple(&vp9_fht4x4_msa, &vp9_iht4x4_16_add_msa, 3,
+                   VPX_BITS_8, 16)));
 #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
diff --git a/test/function_equivalence_test.h b/test/function_equivalence_test.h
new file mode 100644
index 0000000..70f33d1
--- /dev/null
+++ b/test/function_equivalence_test.h
@@ -0,0 +1,65 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef TEST_FUNCTION_EQUIVALENCE_TEST_H_
+#define TEST_FUNCTION_EQUIVALENCE_TEST_H_
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/util.h"
+
+using libvpx_test::ACMRandom;
+
+namespace libvpx_test {
+// Base class for tests that compare two implementations of the same function
+// for equivalence. The template parameter is the pointer type of the
+// function under test.
+//
+// Each test is parameterized by a three-field struct 'FuncParam' containing:
+//   - a pointer to the reference function,
+//   - a pointer to the function under test,
+//   - an integer bit depth (defaults to 0).
+//
+// These values are accessible in the tests as members of params_:
+// params_.ref_func, params_.tst_func, and params_.bit_depth.
+//
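+// A minimal usage sketch (illustrative only: 'MyFunc', 'my_func_c' and
+// 'my_func_opt' are hypothetical names, not part of the library):
+//
+//   typedef int (*MyFunc)(const uint8_t *src, int stride);
+//
+//   class MyFuncTest : public FunctionEquivalenceTest<MyFunc> {};
+//
+//   TEST_P(MyFuncTest, Match) {
+//     // Fill a buffer using rng_, then compare the two implementations:
+//     // EXPECT_EQ(params_.ref_func(buf, stride),
+//     //           params_.tst_func(buf, stride));
+//   }
+//
+//   INSTANTIATE_TEST_CASE_P(
+//       C_VS_OPT, MyFuncTest,
+//       ::testing::Values(FuncParam<MyFunc>(&my_func_c, &my_func_opt)));
+//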
+
+template <typename T>
+struct FuncParam {
+  FuncParam(T ref = NULL, T tst = NULL, int bit_depth = 0)
+      : ref_func(ref), tst_func(tst), bit_depth(bit_depth) {}
+  T ref_func;
+  T tst_func;
+  int bit_depth;
+};
+
+template <typename T>
+class FunctionEquivalenceTest : public ::testing::TestWithParam<FuncParam<T> > {
+ public:
+  FunctionEquivalenceTest() : rng_(ACMRandom::DeterministicSeed()) {}
+
+  virtual ~FunctionEquivalenceTest() {}
+
+  virtual void SetUp() {
+    params_ = this->GetParam();
+  }
+
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  ACMRandom rng_;
+  FuncParam<T> params_;
+};
+
+}  // namespace libvpx_test
+#endif  // TEST_FUNCTION_EQUIVALENCE_TEST_H_
diff --git a/test/hbd_metrics_test.cc b/test/hbd_metrics_test.cc
new file mode 100644
index 0000000..14a8815
--- /dev/null
+++ b/test/hbd_metrics_test.cc
@@ -0,0 +1,251 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <new>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "./vpx_config.h"
+#include "vpx_dsp/psnr.h"
+#include "vpx_dsp/ssim.h"
+#include "vpx_ports/mem.h"
+#include "vpx_ports/msvc.h"
+#include "vpx_scale/yv12config.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+
+typedef double (*LBDMetricFunc)(const YV12_BUFFER_CONFIG *source,
+                                const YV12_BUFFER_CONFIG *dest);
+typedef double (*HBDMetricFunc)(const YV12_BUFFER_CONFIG *source,
+                                const YV12_BUFFER_CONFIG *dest,
+                                uint32_t in_bd, uint32_t bd);
+
+double compute_hbd_psnr(const YV12_BUFFER_CONFIG *source,
+                        const YV12_BUFFER_CONFIG *dest,
+                        uint32_t in_bd, uint32_t bd) {
+  PSNR_STATS psnr;
+  vpx_calc_highbd_psnr(source, dest, &psnr, bd, in_bd);
+  return psnr.psnr[0];
+}
+
+double compute_psnr(const YV12_BUFFER_CONFIG *source,
+                    const YV12_BUFFER_CONFIG *dest) {
+  PSNR_STATS psnr;
+  vpx_calc_psnr(source, dest, &psnr);
+  return psnr.psnr[0];
+}
+
+double compute_hbd_psnrhvs(const YV12_BUFFER_CONFIG *source,
+                           const YV12_BUFFER_CONFIG *dest,
+                           uint32_t in_bd, uint32_t bd) {
+  double tempy, tempu, tempv;
+  return vpx_psnrhvs(source, dest,
+                     &tempy, &tempu, &tempv, bd, in_bd);
+}
+
+double compute_psnrhvs(const YV12_BUFFER_CONFIG *source,
+                       const YV12_BUFFER_CONFIG *dest) {
+  double tempy, tempu, tempv;
+  return vpx_psnrhvs(source, dest,
+                     &tempy, &tempu, &tempv, 8, 8);
+}
+
+double compute_hbd_fastssim(const YV12_BUFFER_CONFIG *source,
+                            const YV12_BUFFER_CONFIG *dest,
+                            uint32_t in_bd, uint32_t bd) {
+  double tempy, tempu, tempv;
+  return vpx_calc_fastssim(source, dest,
+                           &tempy, &tempu, &tempv, bd, in_bd);
+}
+
+double compute_fastssim(const YV12_BUFFER_CONFIG *source,
+                        const YV12_BUFFER_CONFIG *dest) {
+  double tempy, tempu, tempv;
+  return vpx_calc_fastssim(source, dest,
+                           &tempy, &tempu, &tempv, 8, 8);
+}
+
+double compute_hbd_vpxssim(const YV12_BUFFER_CONFIG *source,
+                           const YV12_BUFFER_CONFIG *dest,
+                           uint32_t in_bd, uint32_t bd) {
+  double ssim, weight;
+  ssim = vpx_highbd_calc_ssim(source, dest, &weight, bd, in_bd);
+  return 100 * pow(ssim / weight, 8.0);
+}
+
+double compute_vpxssim(const YV12_BUFFER_CONFIG *source,
+                       const YV12_BUFFER_CONFIG *dest) {
+  double ssim, weight;
+  ssim = vpx_calc_ssim(source, dest, &weight);
+  return 100 * pow(ssim / weight, 8.0);
+}
+
+class HBDMetricsTestBase {
+ public:
+  virtual ~HBDMetricsTestBase() {}
+
+ protected:
+  void RunAccuracyCheck() {
+    const int width = 1920;
+    const int height = 1080;
+    int i = 0;
+    const uint8_t kPixFiller = 128;
+    YV12_BUFFER_CONFIG lbd_src, lbd_dst;
+    YV12_BUFFER_CONFIG hbd_src, hbd_dst;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    double lbd_db, hbd_db;
+
+    memset(&lbd_src, 0, sizeof(lbd_src));
+    memset(&lbd_dst, 0, sizeof(lbd_dst));
+    memset(&hbd_src, 0, sizeof(hbd_src));
+    memset(&hbd_dst, 0, sizeof(hbd_dst));
+
+    vpx_alloc_frame_buffer(&lbd_src, width, height, 1, 1, 0, 32, 16);
+    vpx_alloc_frame_buffer(&lbd_dst, width, height, 1, 1, 0, 32, 16);
+    vpx_alloc_frame_buffer(&hbd_src, width, height, 1, 1, 1, 32, 16);
+    vpx_alloc_frame_buffer(&hbd_dst, width, height, 1, 1, 1, 32, 16);
+
+    memset(lbd_src.buffer_alloc, kPixFiller, lbd_src.buffer_alloc_sz);
+    while (i < lbd_src.buffer_alloc_sz) {
+      uint16_t spel, dpel;
+      spel = lbd_src.buffer_alloc[i];
+      // Create some distortion for dst buffer.
+      dpel = rnd.Rand8();
+      lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
+      ((uint16_t*)(hbd_src.buffer_alloc))[i] = spel << (bit_depth_ - 8);
+      ((uint16_t*)(hbd_dst.buffer_alloc))[i] = dpel << (bit_depth_ - 8);
+      i++;
+    }
+
+    lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
+    hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
+    EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
+
+    i = 0;
+    while (i < lbd_src.buffer_alloc_sz) {
+      uint16_t dpel;
+      // Create some small distortion for dst buffer.
+      dpel = 120 + (rnd.Rand8() >> 4);
+      lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
+      ((uint16_t*)(hbd_dst.buffer_alloc))[i] = dpel << (bit_depth_ - 8);
+      i++;
+    }
+
+    lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
+    hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
+    EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
+
+    i = 0;
+    while (i < lbd_src.buffer_alloc_sz) {
+      uint16_t dpel;
+      // Create some small distortion for dst buffer.
+      dpel = 126 + (rnd.Rand8() >> 6);
+      lbd_dst.buffer_alloc[i] = (uint8_t)dpel;
+      ((uint16_t*)(hbd_dst.buffer_alloc))[i] = dpel << (bit_depth_ - 8);
+      i++;
+    }
+
+    lbd_db = lbd_metric_(&lbd_src, &lbd_dst);
+    hbd_db = hbd_metric_(&hbd_src, &hbd_dst, input_bit_depth_, bit_depth_);
+    EXPECT_LE(fabs(lbd_db - hbd_db), threshold_);
+
+    vpx_free_frame_buffer(&lbd_src);
+    vpx_free_frame_buffer(&lbd_dst);
+    vpx_free_frame_buffer(&hbd_src);
+    vpx_free_frame_buffer(&hbd_dst);
+  }
+
+  int input_bit_depth_;
+  int bit_depth_;
+  double threshold_;
+  LBDMetricFunc lbd_metric_;
+  HBDMetricFunc hbd_metric_;
+};
+
+typedef std::tr1::tuple<LBDMetricFunc,
+                        HBDMetricFunc, int, int, double> MetricTestTParam;
+class HBDMetricsTest
+    : public HBDMetricsTestBase,
+      public ::testing::TestWithParam<MetricTestTParam> {
+ public:
+  virtual void SetUp() {
+    lbd_metric_ = GET_PARAM(0);
+    hbd_metric_ = GET_PARAM(1);
+    input_bit_depth_ = GET_PARAM(2);
+    bit_depth_ = GET_PARAM(3);
+    threshold_ = GET_PARAM(4);
+  }
+  virtual void TearDown() {}
+};
+
+TEST_P(HBDMetricsTest, RunAccuracyCheck) {
+  RunAccuracyCheck();
+}
+
+// Allow small variation due to floating point operations.
+static const double kSsim_thresh = 0.001;
+// Allow some additional errors accumulated in floating point operations.
+static const double kFSsim_thresh = 0.03;
+// Allow some extra variation due to rounding error accumulated in dct.
+static const double kPhvs_thresh = 0.3;
+
+INSTANTIATE_TEST_CASE_P(
+    VPXSSIM, HBDMetricsTest,
+    ::testing::Values(
+        MetricTestTParam(&compute_vpxssim, &compute_hbd_vpxssim, 8, 10,
+                         kSsim_thresh),
+        MetricTestTParam(&compute_vpxssim, &compute_hbd_vpxssim, 10, 10,
+                         kPhvs_thresh),
+        MetricTestTParam(&compute_vpxssim, &compute_hbd_vpxssim, 8, 12,
+                         kSsim_thresh),
+        MetricTestTParam(&compute_vpxssim, &compute_hbd_vpxssim, 12, 12,
+                         kPhvs_thresh)));
+INSTANTIATE_TEST_CASE_P(
+    FASTSSIM, HBDMetricsTest,
+    ::testing::Values(
+        MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim, 8, 10,
+                         kFSsim_thresh),
+        MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim, 10, 10,
+                         kFSsim_thresh),
+        MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim, 8, 12,
+                         kFSsim_thresh),
+        MetricTestTParam(&compute_fastssim, &compute_hbd_fastssim, 12, 12,
+                         kFSsim_thresh)));
+INSTANTIATE_TEST_CASE_P(
+    PSNRHVS, HBDMetricsTest,
+    ::testing::Values(
+        MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs, 8, 10,
+                         kPhvs_thresh),
+        MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs, 10, 10,
+                         kPhvs_thresh),
+        MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs, 8, 12,
+                         kPhvs_thresh),
+        MetricTestTParam(&compute_psnrhvs, &compute_hbd_psnrhvs, 12, 12,
+                         kPhvs_thresh)));
+INSTANTIATE_TEST_CASE_P(
+    PSNR, HBDMetricsTest,
+    ::testing::Values(
+        MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 8, 10,
+                         kPhvs_thresh),
+        MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 10, 10,
+                         kPhvs_thresh),
+        MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 8, 12,
+                         kPhvs_thresh),
+        MetricTestTParam(&compute_psnr, &compute_hbd_psnr, 12, 12,
+                         kPhvs_thresh)));
+}  // namespace
diff --git a/test/idct8x8_test.cc b/test/idct8x8_test.cc
index 7f9d751..04487c4 100644
--- a/test/idct8x8_test.cc
+++ b/test/idct8x8_test.cc
@@ -17,20 +17,12 @@
 #include "./vpx_dsp_rtcd.h"
 #include "test/acm_random.h"
 #include "vpx/vpx_integer.h"
+#include "vpx_ports/msvc.h"  // for round()
 
 using libvpx_test::ACMRandom;
 
 namespace {
 
-#ifdef _MSC_VER
-static int round(double x) {
-  if (x < 0)
-    return static_cast<int>(ceil(x - 0.5));
-  else
-    return static_cast<int>(floor(x + 0.5));
-}
-#endif
-
 void reference_dct_1d(double input[8], double output[8]) {
   const double kPi = 3.141592653589793238462643383279502884;
   const double kInvSqrt2 = 0.707106781186547524400844362104;
@@ -86,7 +78,7 @@
 
     reference_dct_2d(input, output_r);
     for (int j = 0; j < 64; ++j)
-      coeff[j] = round(output_r[j]);
+      coeff[j] = static_cast<tran_low_t>(round(output_r[j]));
     vpx_idct8x8_64_add_c(coeff, dst, 8);
     for (int j = 0; j < 64; ++j) {
       const int diff = dst[j] - src[j];
diff --git a/test/masked_sad_test.cc b/test/masked_sad_test.cc
new file mode 100644
index 0000000..13fff0f
--- /dev/null
+++ b/test/masked_sad_test.cc
@@ -0,0 +1,225 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+const int number_of_iterations = 500;
+
+typedef unsigned int (*MaskedSADFunc)(const uint8_t *a, int a_stride,
+                                      const uint8_t *b, int b_stride,
+                                      const uint8_t *m, int m_stride);
+typedef std::tr1::tuple<MaskedSADFunc, MaskedSADFunc> MaskedSADParam;
+
+class MaskedSADTest : public ::testing::TestWithParam<MaskedSADParam> {
+ public:
+  virtual ~MaskedSADTest() {}
+  virtual void SetUp() {
+    maskedSAD_op_   = GET_PARAM(0);
+    ref_maskedSAD_op_ = GET_PARAM(1);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  MaskedSADFunc maskedSAD_op_;
+  MaskedSADFunc ref_maskedSAD_op_;
+};
+
+TEST_P(MaskedSADTest, OperationCheck) {
+  unsigned int ref_ret, ret;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint8_t, src_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t, ref_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  int err_count = 0;
+  int first_failure = -1;
+  int src_stride = MAX_SB_SIZE;
+  int ref_stride = MAX_SB_SIZE;
+  int msk_stride = MAX_SB_SIZE;
+  for (int i = 0; i < number_of_iterations; ++i) {
+    for (int j = 0; j < MAX_SB_SIZE*MAX_SB_SIZE; j++) {
+      src_ptr[j] = rnd.Rand8();
+      ref_ptr[j] = rnd.Rand8();
+      msk_ptr[j] = ((rnd.Rand8()&0x7f) > 64) ? rnd.Rand8()&0x3f : 64;
+      assert(msk_ptr[j] <= 64);
+    }
+
+    ref_ret = ref_maskedSAD_op_(src_ptr, src_stride, ref_ptr, ref_stride,
+                                msk_ptr, msk_stride);
+    ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src_ptr, src_stride,
+                                                 ref_ptr, ref_stride,
+                                                 msk_ptr, msk_stride));
+    if (ret != ref_ret) {
+      err_count++;
+      if (first_failure == -1)
+        first_failure = i;
+    }
+  }
+  EXPECT_EQ(0, err_count)
+    << "Error: Masked SAD Test, C output doesn't match SSSE3 output. "
+    << "First failed at test case " << first_failure;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef unsigned int (*HighbdMaskedSADFunc)(const uint8_t *a, int a_stride,
+                                            const uint8_t *b, int b_stride,
+                                            const uint8_t *m, int m_stride);
+typedef std::tr1::tuple<HighbdMaskedSADFunc, HighbdMaskedSADFunc>
+    HighbdMaskedSADParam;
+
+class HighbdMaskedSADTest
+    : public ::testing::TestWithParam<HighbdMaskedSADParam> {
+ public:
+  virtual ~HighbdMaskedSADTest() {}
+  virtual void SetUp() {
+    maskedSAD_op_   = GET_PARAM(0);
+    ref_maskedSAD_op_ = GET_PARAM(1);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  HighbdMaskedSADFunc maskedSAD_op_;
+  HighbdMaskedSADFunc ref_maskedSAD_op_;
+};
+
+TEST_P(HighbdMaskedSADTest, OperationCheck) {
+  unsigned int ref_ret, ret;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t, msk_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+  uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+  int err_count = 0;
+  int first_failure = -1;
+  int src_stride = MAX_SB_SIZE;
+  int ref_stride = MAX_SB_SIZE;
+  int msk_stride = MAX_SB_SIZE;
+  for (int i = 0; i < number_of_iterations; ++i) {
+    for (int j = 0; j < MAX_SB_SIZE*MAX_SB_SIZE; j++) {
+      src_ptr[j] = rnd.Rand16()&0xfff;
+      ref_ptr[j] = rnd.Rand16()&0xfff;
+      msk_ptr[j] = ((rnd.Rand8()&0x7f) > 64) ? rnd.Rand8()&0x3f : 64;
+    }
+
+    ref_ret = ref_maskedSAD_op_(src8_ptr, src_stride, ref8_ptr, ref_stride,
+                                msk_ptr, msk_stride);
+    ASM_REGISTER_STATE_CHECK(ret = maskedSAD_op_(src8_ptr, src_stride,
+                                                 ref8_ptr, ref_stride,
+                                                 msk_ptr, msk_stride));
+    if (ret != ref_ret) {
+      err_count++;
+      if (first_failure == -1)
+        first_failure = i;
+    }
+  }
+  EXPECT_EQ(0, err_count)
+    << "Error: High BD Masked SAD Test, C output doesn't match SSSE3 output. "
+    << "First failed at test case " << first_failure;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+  SSSE3_C_COMPARE, MaskedSADTest,
+  ::testing::Values(
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_masked_sad128x128_ssse3,
+               &vpx_masked_sad128x128_c),
+    make_tuple(&vpx_masked_sad128x64_ssse3,
+               &vpx_masked_sad128x64_c),
+    make_tuple(&vpx_masked_sad64x128_ssse3,
+               &vpx_masked_sad64x128_c),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_masked_sad64x64_ssse3,
+               &vpx_masked_sad64x64_c),
+    make_tuple(&vpx_masked_sad64x32_ssse3,
+               &vpx_masked_sad64x32_c),
+    make_tuple(&vpx_masked_sad32x64_ssse3,
+               &vpx_masked_sad32x64_c),
+    make_tuple(&vpx_masked_sad32x32_ssse3,
+               &vpx_masked_sad32x32_c),
+    make_tuple(&vpx_masked_sad32x16_ssse3,
+               &vpx_masked_sad32x16_c),
+    make_tuple(&vpx_masked_sad16x32_ssse3,
+               &vpx_masked_sad16x32_c),
+    make_tuple(&vpx_masked_sad16x16_ssse3,
+               &vpx_masked_sad16x16_c),
+    make_tuple(&vpx_masked_sad16x8_ssse3,
+               &vpx_masked_sad16x8_c),
+    make_tuple(&vpx_masked_sad8x16_ssse3,
+               &vpx_masked_sad8x16_c),
+    make_tuple(&vpx_masked_sad8x8_ssse3,
+               &vpx_masked_sad8x8_c),
+    make_tuple(&vpx_masked_sad8x4_ssse3,
+               &vpx_masked_sad8x4_c),
+    make_tuple(&vpx_masked_sad4x8_ssse3,
+               &vpx_masked_sad4x8_c),
+    make_tuple(&vpx_masked_sad4x4_ssse3,
+               &vpx_masked_sad4x4_c)));
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+  SSSE3_C_COMPARE, HighbdMaskedSADTest,
+  ::testing::Values(
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_sad128x128_ssse3,
+               &vpx_highbd_masked_sad128x128_c),
+    make_tuple(&vpx_highbd_masked_sad128x64_ssse3,
+               &vpx_highbd_masked_sad128x64_c),
+    make_tuple(&vpx_highbd_masked_sad64x128_ssse3,
+               &vpx_highbd_masked_sad64x128_c),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_sad64x64_ssse3,
+               &vpx_highbd_masked_sad64x64_c),
+    make_tuple(&vpx_highbd_masked_sad64x32_ssse3,
+               &vpx_highbd_masked_sad64x32_c),
+    make_tuple(&vpx_highbd_masked_sad32x64_ssse3,
+               &vpx_highbd_masked_sad32x64_c),
+    make_tuple(&vpx_highbd_masked_sad32x32_ssse3,
+               &vpx_highbd_masked_sad32x32_c),
+    make_tuple(&vpx_highbd_masked_sad32x16_ssse3,
+               &vpx_highbd_masked_sad32x16_c),
+    make_tuple(&vpx_highbd_masked_sad16x32_ssse3,
+               &vpx_highbd_masked_sad16x32_c),
+    make_tuple(&vpx_highbd_masked_sad16x16_ssse3,
+               &vpx_highbd_masked_sad16x16_c),
+    make_tuple(&vpx_highbd_masked_sad16x8_ssse3,
+               &vpx_highbd_masked_sad16x8_c),
+    make_tuple(&vpx_highbd_masked_sad8x16_ssse3,
+               &vpx_highbd_masked_sad8x16_c),
+    make_tuple(&vpx_highbd_masked_sad8x8_ssse3,
+               &vpx_highbd_masked_sad8x8_c),
+    make_tuple(&vpx_highbd_masked_sad8x4_ssse3,
+               &vpx_highbd_masked_sad8x4_c),
+    make_tuple(&vpx_highbd_masked_sad4x8_ssse3,
+               &vpx_highbd_masked_sad4x8_c),
+    make_tuple(&vpx_highbd_masked_sad4x4_ssse3,
+               &vpx_highbd_masked_sad4x4_c)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // HAVE_SSSE3
+}  // namespace
diff --git a/test/masked_variance_test.cc b/test/masked_variance_test.cc
new file mode 100644
index 0000000..1710285
--- /dev/null
+++ b/test/masked_variance_test.cc
@@ -0,0 +1,816 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_codec.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_mem/vpx_mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+const int number_of_iterations = 500;
+
+typedef unsigned int (*MaskedVarianceFunc)(const uint8_t *a, int a_stride,
+                                           const uint8_t *b, int b_stride,
+                                           const uint8_t *m, int m_stride,
+                                           unsigned int *sse);
+
+typedef std::tr1::tuple<MaskedVarianceFunc,
+                        MaskedVarianceFunc> MaskedVarianceParam;
+
+class MaskedVarianceTest
+    : public ::testing::TestWithParam<MaskedVarianceParam> {
+ public:
+  virtual ~MaskedVarianceTest() {}
+  virtual void SetUp() {
+    opt_func_ = GET_PARAM(0);
+    ref_func_ = GET_PARAM(1);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  MaskedVarianceFunc opt_func_;
+  MaskedVarianceFunc ref_func_;
+};
+
+TEST_P(MaskedVarianceTest, OperationCheck) {
+  unsigned int ref_ret, opt_ret;
+  unsigned int ref_sse, opt_sse;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint8_t,  src_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  int err_count = 0;
+  int first_failure = -1;
+  int src_stride = MAX_SB_SIZE;
+  int ref_stride = MAX_SB_SIZE;
+  int msk_stride = MAX_SB_SIZE;
+
+  for (int i = 0; i < number_of_iterations; ++i) {
+    for (int j = 0; j < MAX_SB_SIZE*MAX_SB_SIZE; j++) {
+      src_ptr[j] = rnd.Rand8();
+      ref_ptr[j] = rnd.Rand8();
+      msk_ptr[j] = rnd(65);
+    }
+
+    ref_ret = ref_func_(src_ptr, src_stride,
+                        ref_ptr, ref_stride,
+                        msk_ptr, msk_stride,
+                        &ref_sse);
+    ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src_ptr, src_stride,
+                                                 ref_ptr, ref_stride,
+                                                 msk_ptr, msk_stride,
+                                                 &opt_sse));
+
+    if (opt_ret != ref_ret || opt_sse != ref_sse) {
+      err_count++;
+      if (first_failure == -1)
+        first_failure = i;
+    }
+  }
+
+  EXPECT_EQ(0, err_count)
+    << "Error: Masked Variance Test OperationCheck, "
+    << "C output doesn't match SSSE3 output. "
+    << "First failed at test case " << first_failure;
+}
+
+TEST_P(MaskedVarianceTest, ExtremeValues) {
+  unsigned int ref_ret, opt_ret;
+  unsigned int ref_sse, opt_sse;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint8_t,  src_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  int err_count = 0;
+  int first_failure = -1;
+  int src_stride = MAX_SB_SIZE;
+  int ref_stride = MAX_SB_SIZE;
+  int msk_stride = MAX_SB_SIZE;
+
+  for (int i = 0; i < 8; ++i) {
+    memset(src_ptr, (i & 0x1) ? 255 : 0, MAX_SB_SIZE*MAX_SB_SIZE);
+    memset(ref_ptr, (i & 0x2) ? 255 : 0, MAX_SB_SIZE*MAX_SB_SIZE);
+    memset(msk_ptr, (i & 0x4) ?  64 : 0, MAX_SB_SIZE*MAX_SB_SIZE);
+
+    ref_ret = ref_func_(src_ptr, src_stride,
+                        ref_ptr, ref_stride,
+                        msk_ptr, msk_stride,
+                        &ref_sse);
+    ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src_ptr, src_stride,
+                                                 ref_ptr, ref_stride,
+                                                 msk_ptr, msk_stride,
+                                                 &opt_sse));
+
+    if (opt_ret != ref_ret || opt_sse != ref_sse) {
+      err_count++;
+      if (first_failure == -1)
+        first_failure = i;
+    }
+  }
+
+  EXPECT_EQ(0, err_count)
+    << "Error: Masked Variance Test ExtremeValues, "
+    << "C output doesn't match SSSE3 output. "
+    << "First failed at test case " << first_failure;
+}
+
+typedef unsigned int (*MaskedSubPixelVarianceFunc)(
+    const uint8_t *a, int a_stride,
+    int xoffset, int yoffset,
+    const uint8_t *b, int b_stride,
+    const uint8_t *m, int m_stride,
+    unsigned int *sse);
+
+typedef std::tr1::tuple<MaskedSubPixelVarianceFunc,
+                        MaskedSubPixelVarianceFunc> MaskedSubPixelVarianceParam;
+
+class MaskedSubPixelVarianceTest
+    : public ::testing::TestWithParam<MaskedSubPixelVarianceParam> {
+ public:
+  virtual ~MaskedSubPixelVarianceTest() {}
+  virtual void SetUp() {
+    opt_func_ = GET_PARAM(0);
+    ref_func_ = GET_PARAM(1);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  MaskedSubPixelVarianceFunc opt_func_;
+  MaskedSubPixelVarianceFunc ref_func_;
+};
+
+TEST_P(MaskedSubPixelVarianceTest, OperationCheck) {
+  unsigned int ref_ret, opt_ret;
+  unsigned int ref_sse, opt_sse;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint8_t,  src_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]);
+  int err_count = 0;
+  int first_failure = -1;
+  int src_stride = (MAX_SB_SIZE+1);
+  int ref_stride = (MAX_SB_SIZE+1);
+  int msk_stride = (MAX_SB_SIZE+1);
+  int xoffset;
+  int yoffset;
+
+  for (int i = 0; i < number_of_iterations; ++i) {
+    int xoffsets[] = {0, 4, rnd(BIL_SUBPEL_SHIFTS)};
+    int yoffsets[] = {0, 4, rnd(BIL_SUBPEL_SHIFTS)};
+    for (int j = 0; j < (MAX_SB_SIZE+1)*(MAX_SB_SIZE+1); j++) {
+      src_ptr[j] = rnd.Rand8();
+      ref_ptr[j] = rnd.Rand8();
+      msk_ptr[j] = rnd(65);
+    }
+    for (int k = 0; k < 3; k++) {
+      for (int l = 0; l < 3; l++) {
+        xoffset = xoffsets[k];
+        yoffset = yoffsets[l];
+
+        ref_ret = ref_func_(src_ptr, src_stride,
+                            xoffset, yoffset,
+                            ref_ptr, ref_stride,
+                            msk_ptr, msk_stride,
+                            &ref_sse);
+        ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src_ptr, src_stride,
+                                                     xoffset, yoffset,
+                                                     ref_ptr, ref_stride,
+                                                     msk_ptr, msk_stride,
+                                                     &opt_sse));
+
+        if (opt_ret != ref_ret || opt_sse != ref_sse) {
+          err_count++;
+          if (first_failure == -1)
+            first_failure = i;
+        }
+      }
+    }
+  }
+
+  EXPECT_EQ(0, err_count)
+    << "Error: Masked Sub Pixel Variance Test OperationCheck, "
+    << "C output doesn't match SSSE3 output. "
+    << "First failed at test case " << first_failure;
+}
+
+TEST_P(MaskedSubPixelVarianceTest, ExtremeValues) {
+  unsigned int ref_ret, opt_ret;
+  unsigned int ref_sse, opt_sse;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint8_t,  src_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  ref_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]);
+  int first_failure_x = -1;
+  int first_failure_y = -1;
+  int err_count = 0;
+  int first_failure = -1;
+  int src_stride = (MAX_SB_SIZE+1);
+  int ref_stride = (MAX_SB_SIZE+1);
+  int msk_stride = (MAX_SB_SIZE+1);
+
+  for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
+    for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
+      for (int i = 0; i < 8; ++i) {
+        memset(src_ptr, (i & 0x1) ? 255 : 0, (MAX_SB_SIZE+1)*(MAX_SB_SIZE+1));
+        memset(ref_ptr, (i & 0x2) ? 255 : 0, (MAX_SB_SIZE+1)*(MAX_SB_SIZE+1));
+        memset(msk_ptr, (i & 0x4) ?  64 : 0, (MAX_SB_SIZE+1)*(MAX_SB_SIZE+1));
+
+        ref_ret = ref_func_(src_ptr, src_stride,
+                            xoffset, yoffset,
+                            ref_ptr, ref_stride,
+                            msk_ptr, msk_stride,
+                            &ref_sse);
+        ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src_ptr, src_stride,
+                                                     xoffset, yoffset,
+                                                     ref_ptr, ref_stride,
+                                                     msk_ptr, msk_stride,
+                                                     &opt_sse));
+
+        if (opt_ret != ref_ret || opt_sse != ref_sse) {
+          err_count++;
+          if (first_failure == -1) {
+            first_failure = i;
+            first_failure_x = xoffset;
+            first_failure_y = yoffset;
+          }
+        }
+      }
+    }
+  }
+
+  EXPECT_EQ(0, err_count)
+    << "Error: Masked Sub Pixel Variance Test ExtremeValues, "
+    << "C output doesn't match SSSE3 output. "
+    << "First failed at test case " << first_failure
+    << " x_offset = " << first_failure_x
+    << " y_offset = " << first_failure_y;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef std::tr1::tuple<MaskedVarianceFunc,
+                        MaskedVarianceFunc,
+                        vpx_bit_depth_t> HighbdMaskedVarianceParam;
+
+class HighbdMaskedVarianceTest
+    : public ::testing::TestWithParam<HighbdMaskedVarianceParam> {
+ public:
+  virtual ~HighbdMaskedVarianceTest() {}
+  virtual void SetUp() {
+    opt_func_ = GET_PARAM(0);
+    ref_func_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  MaskedVarianceFunc opt_func_;
+  MaskedVarianceFunc ref_func_;
+  vpx_bit_depth_t bit_depth_;
+};
+
+TEST_P(HighbdMaskedVarianceTest, OperationCheck) {
+  unsigned int ref_ret, opt_ret;
+  unsigned int ref_sse, opt_sse;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+  uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+  int err_count = 0;
+  int first_failure = -1;
+  int src_stride = MAX_SB_SIZE;
+  int ref_stride = MAX_SB_SIZE;
+  int msk_stride = MAX_SB_SIZE;
+
+  for (int i = 0; i < number_of_iterations; ++i) {
+    for (int j = 0; j < MAX_SB_SIZE*MAX_SB_SIZE; j++) {
+      src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+      ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+      msk_ptr[j] = rnd(65);
+    }
+
+    ref_ret = ref_func_(src8_ptr, src_stride,
+                        ref8_ptr, ref_stride,
+                        msk_ptr, msk_stride,
+                        &ref_sse);
+    ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src8_ptr, src_stride,
+                                                 ref8_ptr, ref_stride,
+                                                 msk_ptr, msk_stride,
+                                                 &opt_sse));
+
+    if (opt_ret != ref_ret || opt_sse != ref_sse) {
+      err_count++;
+      if (first_failure == -1)
+        first_failure = i;
+    }
+  }
+
+  EXPECT_EQ(0, err_count)
+    << "Error: High BD Masked Variance Test OperationCheck, "
+    << "C output doesn't match SSSE3 output. "
+    << "First failed at test case " << first_failure;
+}
+
+TEST_P(HighbdMaskedVarianceTest, ExtremeValues) {
+  unsigned int ref_ret, opt_ret;
+  unsigned int ref_sse, opt_sse;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[MAX_SB_SIZE*MAX_SB_SIZE]);
+  uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+  uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+  int err_count = 0;
+  int first_failure = -1;
+  int src_stride = MAX_SB_SIZE;
+  int ref_stride = MAX_SB_SIZE;
+  int msk_stride = MAX_SB_SIZE;
+
+  for (int i = 0; i < 8; ++i) {
+    vpx_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
+                 MAX_SB_SIZE*MAX_SB_SIZE);
+    vpx_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
+                 MAX_SB_SIZE*MAX_SB_SIZE);
+    memset(msk_ptr, (i & 0x4) ?  64 : 0, MAX_SB_SIZE*MAX_SB_SIZE);
+
+    ref_ret = ref_func_(src8_ptr, src_stride,
+                        ref8_ptr, ref_stride,
+                        msk_ptr, msk_stride,
+                        &ref_sse);
+    ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src8_ptr, src_stride,
+                                                 ref8_ptr, ref_stride,
+                                                 msk_ptr, msk_stride,
+                                                 &opt_sse));
+
+    if (opt_ret != ref_ret || opt_sse != ref_sse) {
+      err_count++;
+      if (first_failure == -1)
+        first_failure = i;
+    }
+  }
+
+  EXPECT_EQ(0, err_count)
+    << "Error: High BD Masked Variance Test ExtremeValues, "
+    << "C output doesn't match SSSE3 output. "
+    << "First failed at test case " << first_failure;
+}
+
+typedef std::tr1::tuple<MaskedSubPixelVarianceFunc,
+                        MaskedSubPixelVarianceFunc,
+                        vpx_bit_depth_t> HighbdMaskedSubPixelVarianceParam;
+
+class HighbdMaskedSubPixelVarianceTest
+    : public ::testing::TestWithParam<HighbdMaskedSubPixelVarianceParam> {
+ public:
+  virtual ~HighbdMaskedSubPixelVarianceTest() {}
+  virtual void SetUp() {
+    opt_func_ = GET_PARAM(0);
+    ref_func_ = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  MaskedSubPixelVarianceFunc opt_func_;
+  MaskedSubPixelVarianceFunc ref_func_;
+  vpx_bit_depth_t bit_depth_;
+};
+
+TEST_P(HighbdMaskedSubPixelVarianceTest, OperationCheck) {
+  unsigned int ref_ret, opt_ret;
+  unsigned int ref_sse, opt_sse;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]);
+  uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+  uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+  int err_count = 0;
+  int first_failure = -1;
+  int first_failure_x = -1;
+  int first_failure_y = -1;
+  int src_stride = (MAX_SB_SIZE+1);
+  int ref_stride = (MAX_SB_SIZE+1);
+  int msk_stride = (MAX_SB_SIZE+1);
+  int xoffset, yoffset;
+
+  for (int i = 0; i < number_of_iterations; ++i) {
+    for (xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
+      for (yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
+        for (int j = 0; j < (MAX_SB_SIZE+1)*(MAX_SB_SIZE+1); j++) {
+          src_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+          ref_ptr[j] = rnd.Rand16() & ((1 << bit_depth_) - 1);
+          msk_ptr[j] = rnd(65);
+        }
+
+        ref_ret = ref_func_(src8_ptr, src_stride,
+                            xoffset, yoffset,
+                            ref8_ptr, ref_stride,
+                            msk_ptr, msk_stride,
+                            &ref_sse);
+        ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src8_ptr, src_stride,
+                                                     xoffset, yoffset,
+                                                     ref8_ptr, ref_stride,
+                                                     msk_ptr, msk_stride,
+                                                     &opt_sse));
+
+        if (opt_ret != ref_ret || opt_sse != ref_sse) {
+          err_count++;
+          if (first_failure == -1) {
+            first_failure = i;
+            first_failure_x = xoffset;
+            first_failure_y = yoffset;
+          }
+        }
+      }
+    }
+  }
+
+  EXPECT_EQ(0, err_count)
+    << "Error: High BD Masked Sub Pixel Variance Test OperationCheck, "
+    << "C output doesn't match SSSE3 output. "
+    << "First failed at test case " << first_failure
+    << " x_offset = " << first_failure_x
+    << " y_offset = " << first_failure_y;
+}
+
+TEST_P(HighbdMaskedSubPixelVarianceTest, ExtremeValues) {
+  unsigned int ref_ret, opt_ret;
+  unsigned int ref_sse, opt_sse;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, uint16_t, src_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint16_t, ref_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]);
+  DECLARE_ALIGNED(16, uint8_t,  msk_ptr[(MAX_SB_SIZE+1)*(MAX_SB_SIZE+1)]);
+  uint8_t* src8_ptr = CONVERT_TO_BYTEPTR(src_ptr);
+  uint8_t* ref8_ptr = CONVERT_TO_BYTEPTR(ref_ptr);
+  int first_failure_x = -1;
+  int first_failure_y = -1;
+  int err_count = 0;
+  int first_failure = -1;
+  int src_stride = (MAX_SB_SIZE+1);
+  int ref_stride = (MAX_SB_SIZE+1);
+  int msk_stride = (MAX_SB_SIZE+1);
+
+  for (int xoffset = 0; xoffset < BIL_SUBPEL_SHIFTS; xoffset++) {
+    for (int yoffset = 0; yoffset < BIL_SUBPEL_SHIFTS; yoffset++) {
+      for (int i = 0; i < 8; ++i) {
+        vpx_memset16(src_ptr, (i & 0x1) ? ((1 << bit_depth_) - 1) : 0,
+                     (MAX_SB_SIZE+1)*(MAX_SB_SIZE+1));
+        vpx_memset16(ref_ptr, (i & 0x2) ? ((1 << bit_depth_) - 1) : 0,
+                     (MAX_SB_SIZE+1)*(MAX_SB_SIZE+1));
+        memset(msk_ptr, (i & 0x4) ?  64 : 0, (MAX_SB_SIZE+1)*(MAX_SB_SIZE+1));
+
+        ref_ret = ref_func_(src8_ptr, src_stride,
+                            xoffset, yoffset,
+                            ref8_ptr, ref_stride,
+                            msk_ptr, msk_stride,
+                            &ref_sse);
+        ASM_REGISTER_STATE_CHECK(opt_ret = opt_func_(src8_ptr, src_stride,
+                                                     xoffset, yoffset,
+                                                     ref8_ptr, ref_stride,
+                                                     msk_ptr, msk_stride,
+                                                     &opt_sse));
+
+        if (opt_ret != ref_ret || opt_sse != ref_sse) {
+          err_count++;
+          if (first_failure == -1) {
+            first_failure = i;
+            first_failure_x = xoffset;
+            first_failure_y = yoffset;
+          }
+        }
+      }
+    }
+  }
+
+  EXPECT_EQ(0, err_count)
+    << "Error: High BD Masked Sub Pixel Variance Test ExtremeValues, "
+    << "C output doesn't match SSSE3 output. "
+    << "First failed at test case " << first_failure
+    << " x_offset = " << first_failure_x
+    << " y_offset = " << first_failure_y;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSSE3
+INSTANTIATE_TEST_CASE_P(
+  SSSE3_C_COMPARE, MaskedVarianceTest,
+  ::testing::Values(
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_masked_variance128x128_ssse3,
+               &vpx_masked_variance128x128_c),
+    make_tuple(&vpx_masked_variance128x64_ssse3,
+               &vpx_masked_variance128x64_c),
+    make_tuple(&vpx_masked_variance64x128_ssse3,
+               &vpx_masked_variance64x128_c),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_masked_variance64x64_ssse3,
+               &vpx_masked_variance64x64_c),
+    make_tuple(&vpx_masked_variance64x32_ssse3,
+               &vpx_masked_variance64x32_c),
+    make_tuple(&vpx_masked_variance32x64_ssse3,
+               &vpx_masked_variance32x64_c),
+    make_tuple(&vpx_masked_variance32x32_ssse3,
+               &vpx_masked_variance32x32_c),
+    make_tuple(&vpx_masked_variance32x16_ssse3,
+               &vpx_masked_variance32x16_c),
+    make_tuple(&vpx_masked_variance16x32_ssse3,
+               &vpx_masked_variance16x32_c),
+    make_tuple(&vpx_masked_variance16x16_ssse3,
+               &vpx_masked_variance16x16_c),
+    make_tuple(&vpx_masked_variance16x8_ssse3,
+               &vpx_masked_variance16x8_c),
+    make_tuple(&vpx_masked_variance8x16_ssse3,
+               &vpx_masked_variance8x16_c),
+    make_tuple(&vpx_masked_variance8x8_ssse3,
+               &vpx_masked_variance8x8_c),
+    make_tuple(&vpx_masked_variance8x4_ssse3,
+               &vpx_masked_variance8x4_c),
+    make_tuple(&vpx_masked_variance4x8_ssse3,
+               &vpx_masked_variance4x8_c),
+    make_tuple(&vpx_masked_variance4x4_ssse3,
+               &vpx_masked_variance4x4_c)));
+
+INSTANTIATE_TEST_CASE_P(
+  SSSE3_C_COMPARE, MaskedSubPixelVarianceTest,
+  ::testing::Values(
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_masked_sub_pixel_variance128x128_ssse3,
+               &vpx_masked_sub_pixel_variance128x128_c),
+    make_tuple(&vpx_masked_sub_pixel_variance128x64_ssse3,
+               &vpx_masked_sub_pixel_variance128x64_c),
+    make_tuple(&vpx_masked_sub_pixel_variance64x128_ssse3,
+               &vpx_masked_sub_pixel_variance64x128_c),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_masked_sub_pixel_variance64x64_ssse3,
+               &vpx_masked_sub_pixel_variance64x64_c),
+    make_tuple(&vpx_masked_sub_pixel_variance64x32_ssse3,
+               &vpx_masked_sub_pixel_variance64x32_c),
+    make_tuple(&vpx_masked_sub_pixel_variance32x64_ssse3,
+               &vpx_masked_sub_pixel_variance32x64_c),
+    make_tuple(&vpx_masked_sub_pixel_variance32x32_ssse3,
+               &vpx_masked_sub_pixel_variance32x32_c),
+    make_tuple(&vpx_masked_sub_pixel_variance32x16_ssse3,
+               &vpx_masked_sub_pixel_variance32x16_c),
+    make_tuple(&vpx_masked_sub_pixel_variance16x32_ssse3,
+               &vpx_masked_sub_pixel_variance16x32_c),
+    make_tuple(&vpx_masked_sub_pixel_variance16x16_ssse3,
+               &vpx_masked_sub_pixel_variance16x16_c),
+    make_tuple(&vpx_masked_sub_pixel_variance16x8_ssse3,
+               &vpx_masked_sub_pixel_variance16x8_c),
+    make_tuple(&vpx_masked_sub_pixel_variance8x16_ssse3,
+               &vpx_masked_sub_pixel_variance8x16_c),
+    make_tuple(&vpx_masked_sub_pixel_variance8x8_ssse3,
+               &vpx_masked_sub_pixel_variance8x8_c),
+    make_tuple(&vpx_masked_sub_pixel_variance8x4_ssse3,
+               &vpx_masked_sub_pixel_variance8x4_c),
+    make_tuple(&vpx_masked_sub_pixel_variance4x8_ssse3,
+               &vpx_masked_sub_pixel_variance4x8_c),
+    make_tuple(&vpx_masked_sub_pixel_variance4x4_ssse3,
+               &vpx_masked_sub_pixel_variance4x4_c)));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+  SSSE3_C_COMPARE, HighbdMaskedVarianceTest,
+  ::testing::Values(
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_variance128x128_ssse3,
+               &vpx_highbd_masked_variance128x128_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance128x64_ssse3,
+               &vpx_highbd_masked_variance128x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance64x128_ssse3,
+               &vpx_highbd_masked_variance64x128_c, VPX_BITS_8),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_variance64x64_ssse3,
+               &vpx_highbd_masked_variance64x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance64x32_ssse3,
+               &vpx_highbd_masked_variance64x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance32x64_ssse3,
+               &vpx_highbd_masked_variance32x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance32x32_ssse3,
+               &vpx_highbd_masked_variance32x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance32x16_ssse3,
+               &vpx_highbd_masked_variance32x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance16x32_ssse3,
+               &vpx_highbd_masked_variance16x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance16x16_ssse3,
+               &vpx_highbd_masked_variance16x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance16x8_ssse3,
+               &vpx_highbd_masked_variance16x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance8x16_ssse3,
+               &vpx_highbd_masked_variance8x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance8x8_ssse3,
+               &vpx_highbd_masked_variance8x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance8x4_ssse3,
+               &vpx_highbd_masked_variance8x4_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance4x8_ssse3,
+               &vpx_highbd_masked_variance4x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_variance4x4_ssse3,
+               &vpx_highbd_masked_variance4x4_c, VPX_BITS_8),
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_10_masked_variance128x128_ssse3,
+               &vpx_highbd_10_masked_variance128x128_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance128x64_ssse3,
+               &vpx_highbd_10_masked_variance128x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance64x128_ssse3,
+               &vpx_highbd_10_masked_variance64x128_c, VPX_BITS_10),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_10_masked_variance64x64_ssse3,
+               &vpx_highbd_10_masked_variance64x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance64x32_ssse3,
+               &vpx_highbd_10_masked_variance64x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance32x64_ssse3,
+               &vpx_highbd_10_masked_variance32x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance32x32_ssse3,
+               &vpx_highbd_10_masked_variance32x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance32x16_ssse3,
+               &vpx_highbd_10_masked_variance32x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance16x32_ssse3,
+               &vpx_highbd_10_masked_variance16x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance16x16_ssse3,
+               &vpx_highbd_10_masked_variance16x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance16x8_ssse3,
+               &vpx_highbd_10_masked_variance16x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance8x16_ssse3,
+               &vpx_highbd_10_masked_variance8x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance8x8_ssse3,
+               &vpx_highbd_10_masked_variance8x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance8x4_ssse3,
+               &vpx_highbd_10_masked_variance8x4_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance4x8_ssse3,
+               &vpx_highbd_10_masked_variance4x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_variance4x4_ssse3,
+               &vpx_highbd_10_masked_variance4x4_c, VPX_BITS_10),
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_12_masked_variance128x128_ssse3,
+               &vpx_highbd_12_masked_variance128x128_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance128x64_ssse3,
+               &vpx_highbd_12_masked_variance128x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance64x128_ssse3,
+               &vpx_highbd_12_masked_variance64x128_c, VPX_BITS_12),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_12_masked_variance64x64_ssse3,
+               &vpx_highbd_12_masked_variance64x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance64x32_ssse3,
+               &vpx_highbd_12_masked_variance64x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance32x64_ssse3,
+               &vpx_highbd_12_masked_variance32x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance32x32_ssse3,
+               &vpx_highbd_12_masked_variance32x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance32x16_ssse3,
+               &vpx_highbd_12_masked_variance32x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance16x32_ssse3,
+               &vpx_highbd_12_masked_variance16x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance16x16_ssse3,
+               &vpx_highbd_12_masked_variance16x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance16x8_ssse3,
+               &vpx_highbd_12_masked_variance16x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance8x16_ssse3,
+               &vpx_highbd_12_masked_variance8x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance8x8_ssse3,
+               &vpx_highbd_12_masked_variance8x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance8x4_ssse3,
+               &vpx_highbd_12_masked_variance8x4_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance4x8_ssse3,
+               &vpx_highbd_12_masked_variance4x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_variance4x4_ssse3,
+               &vpx_highbd_12_masked_variance4x4_c, VPX_BITS_12)));
+
+INSTANTIATE_TEST_CASE_P(
+  SSSE3_C_COMPARE, HighbdMaskedSubPixelVarianceTest,
+  ::testing::Values(
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance128x128_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance128x128_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance128x64_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance128x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance64x128_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance64x128_c, VPX_BITS_8),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance64x64_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance64x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance64x32_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance64x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance32x64_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance32x64_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance32x32_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance32x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance32x16_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance32x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance16x32_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance16x32_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance16x16_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance16x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance16x8_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance16x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance8x16_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance8x16_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance8x8_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance8x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance8x4_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance8x4_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance4x8_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance4x8_c, VPX_BITS_8),
+    make_tuple(&vpx_highbd_masked_sub_pixel_variance4x4_ssse3,
+               &vpx_highbd_masked_sub_pixel_variance4x4_c, VPX_BITS_8),
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance128x128_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance128x128_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance128x64_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance128x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance64x128_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance64x128_c, VPX_BITS_10),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance64x64_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance64x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance64x32_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance64x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance32x64_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance32x64_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance32x32_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance32x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance32x16_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance32x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance16x32_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance16x32_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance16x16_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance16x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance16x8_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance16x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance8x16_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance8x16_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance8x8_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance8x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance8x4_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance8x4_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance4x8_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance4x8_c, VPX_BITS_10),
+    make_tuple(&vpx_highbd_10_masked_sub_pixel_variance4x4_ssse3,
+               &vpx_highbd_10_masked_sub_pixel_variance4x4_c, VPX_BITS_10),
+#if CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance128x128_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance128x128_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance128x64_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance128x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance64x128_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance64x128_c, VPX_BITS_12),
+#endif  // CONFIG_EXT_PARTITION
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance64x64_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance64x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance64x32_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance64x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance32x64_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance32x64_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance32x32_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance32x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance32x16_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance32x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance16x32_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance16x32_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance16x16_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance16x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance16x8_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance16x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance8x16_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance8x16_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance8x8_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance8x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance8x4_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance8x4_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance4x8_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance4x8_c, VPX_BITS_12),
+    make_tuple(&vpx_highbd_12_masked_sub_pixel_variance4x4_ssse3,
+               &vpx_highbd_12_masked_sub_pixel_variance4x4_c, VPX_BITS_12)));
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // HAVE_SSSE3
+}  // namespace
diff --git a/test/obmc_sad_test.cc b/test/obmc_sad_test.cc
new file mode 100644
index 0000000..beb7106
--- /dev/null
+++ b/test/obmc_sad_test.cc
@@ -0,0 +1,192 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
+
+using libvpx_test::FunctionEquivalenceTest;
+
+namespace {
+
+static const int kIterations = 1000;
+static const int kMaskMax = 64;
+
+typedef unsigned int (*ObmcSadF)(const uint8_t *pre, int pre_stride,
+                                 const int32_t *wsrc, const int32_t *mask);
+typedef libvpx_test::FuncParam<ObmcSadF> TestFuncs;
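+// A sketch of the equivalence under test (an assumption inferred from how
+// the inputs are generated below, not a copy of the library code): each
+// kernel is expected to accumulate, over a width x height block,
+//   sad += ROUND_POWER_OF_TWO(abs(wsrc[i] - pre[i] * mask[i]), 12);
+// i.e. the difference between a pre-weighted source and a mask-weighted
+// prediction, rounded back from the mask precision (kMaskMax^2 == 1 << 12).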
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+class ObmcSadTest : public FunctionEquivalenceTest<ObmcSadF> {};
+
+TEST_P(ObmcSadTest, RandomValues) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    const int pre_stride = rng_(MAX_SB_SIZE + 1);
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = rng_.Rand8();
+      wsrc[i] = rng_.Rand8() * rng_(kMaskMax * kMaskMax + 1);
+      mask[i] = rng_(kMaskMax * kMaskMax + 1);
+    }
+
+    const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(tst_res =
+        params_.tst_func(pre, pre_stride, wsrc, mask));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
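+// Saturated inputs: each |wsrc - pre * mask| term is at most
+// UINT8_MAX * kMaskMax * kMaskMax (< 1 << 21), so the int32_t inputs and
+// the unsigned accumulator have plenty of headroom even at 128x128.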
+TEST_P(ObmcSadTest, ExtremeValues) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+    const int pre_stride = iter;
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = UINT8_MAX;
+      wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax;
+      mask[i] = kMaskMax * kMaskMax;
+    }
+
+    const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(tst_res =
+        params_.tst_func(pre, pre_stride, wsrc, mask));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+#if HAVE_SSE4_1
+const ObmcSadTest::ParamType sse4_functions[] = {
+#if CONFIG_EXT_PARTITION
+  TestFuncs(vpx_obmc_sad128x128_c, vpx_obmc_sad128x128_sse4_1),
+  TestFuncs(vpx_obmc_sad128x64_c, vpx_obmc_sad128x64_sse4_1),
+  TestFuncs(vpx_obmc_sad64x128_c, vpx_obmc_sad64x128_sse4_1),
+#endif  // CONFIG_EXT_PARTITION
+  TestFuncs(vpx_obmc_sad64x64_c, vpx_obmc_sad64x64_sse4_1),
+  TestFuncs(vpx_obmc_sad64x32_c, vpx_obmc_sad64x32_sse4_1),
+  TestFuncs(vpx_obmc_sad32x64_c, vpx_obmc_sad32x64_sse4_1),
+  TestFuncs(vpx_obmc_sad32x32_c, vpx_obmc_sad32x32_sse4_1),
+  TestFuncs(vpx_obmc_sad32x16_c, vpx_obmc_sad32x16_sse4_1),
+  TestFuncs(vpx_obmc_sad16x32_c, vpx_obmc_sad16x32_sse4_1),
+  TestFuncs(vpx_obmc_sad16x16_c, vpx_obmc_sad16x16_sse4_1),
+  TestFuncs(vpx_obmc_sad16x8_c, vpx_obmc_sad16x8_sse4_1),
+  TestFuncs(vpx_obmc_sad8x16_c, vpx_obmc_sad8x16_sse4_1),
+  TestFuncs(vpx_obmc_sad8x8_c, vpx_obmc_sad8x8_sse4_1),
+  TestFuncs(vpx_obmc_sad8x4_c, vpx_obmc_sad8x4_sse4_1),
+  TestFuncs(vpx_obmc_sad4x8_c, vpx_obmc_sad4x8_sse4_1),
+  TestFuncs(vpx_obmc_sad4x4_c, vpx_obmc_sad4x4_sse4_1)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1_C_COMPARE, ObmcSadTest,
+                        ::testing::ValuesIn(sse4_functions));
+#endif  // HAVE_SSE4_1
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+#if CONFIG_VP9_HIGHBITDEPTH
+class ObmcSadHBDTest : public FunctionEquivalenceTest<ObmcSadF> {};
+
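+// High bit-depth inputs are held in uint16_t buffers and passed through
+// CONVERT_TO_BYTEPTR(); samples are drawn from rng_(1 << 12), i.e. capped
+// at the deepest (12-bit) depth exercised here.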
+TEST_P(ObmcSadHBDTest, RandomValues) {
+  DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    const int pre_stride = rng_(MAX_SB_SIZE + 1);
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = rng_(1<<12);
+      wsrc[i] = rng_(1<<12) * rng_(kMaskMax * kMaskMax + 1);
+      mask[i] = rng_(kMaskMax * kMaskMax + 1);
+    }
+
+    const unsigned int ref_res =
+        params_.ref_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(tst_res =
+        params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+TEST_P(ObmcSadHBDTest, ExtremeValues) {
+  DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+    const int pre_stride = iter;
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = (1 << 12) - 1;
+      wsrc[i] = ((1 << 12) - 1) * kMaskMax * kMaskMax;
+      mask[i] = kMaskMax * kMaskMax;
+    }
+
+    const unsigned int ref_res =
+        params_.ref_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(tst_res =
+        params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride, wsrc, mask));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+#if HAVE_SSE4_1
+ObmcSadHBDTest::ParamType sse4_functions_hbd[] = {
+#if CONFIG_EXT_PARTITION
+  TestFuncs(vpx_highbd_obmc_sad128x128_c, vpx_highbd_obmc_sad128x128_sse4_1),
+  TestFuncs(vpx_highbd_obmc_sad128x64_c, vpx_highbd_obmc_sad128x64_sse4_1),
+  TestFuncs(vpx_highbd_obmc_sad64x128_c, vpx_highbd_obmc_sad64x128_sse4_1),
+#endif  // CONFIG_EXT_PARTITION
+  TestFuncs(vpx_highbd_obmc_sad64x64_c, vpx_highbd_obmc_sad64x64_sse4_1),
+  TestFuncs(vpx_highbd_obmc_sad64x32_c, vpx_highbd_obmc_sad64x32_sse4_1),
+  TestFuncs(vpx_highbd_obmc_sad32x64_c, vpx_highbd_obmc_sad32x64_sse4_1),
+  TestFuncs(vpx_highbd_obmc_sad32x32_c, vpx_highbd_obmc_sad32x32_sse4_1),
+  TestFuncs(vpx_highbd_obmc_sad32x16_c, vpx_highbd_obmc_sad32x16_sse4_1),
+  TestFuncs(vpx_highbd_obmc_sad16x32_c, vpx_highbd_obmc_sad16x32_sse4_1),
+  TestFuncs(vpx_highbd_obmc_sad16x16_c, vpx_highbd_obmc_sad16x16_sse4_1),
+  TestFuncs(vpx_highbd_obmc_sad16x8_c, vpx_highbd_obmc_sad16x8_sse4_1),
+  TestFuncs(vpx_highbd_obmc_sad8x16_c, vpx_highbd_obmc_sad8x16_sse4_1),
+  TestFuncs(vpx_highbd_obmc_sad8x8_c, vpx_highbd_obmc_sad8x8_sse4_1),
+  TestFuncs(vpx_highbd_obmc_sad8x4_c, vpx_highbd_obmc_sad8x4_sse4_1),
+  TestFuncs(vpx_highbd_obmc_sad4x8_c, vpx_highbd_obmc_sad4x8_sse4_1),
+  TestFuncs(vpx_highbd_obmc_sad4x4_c, vpx_highbd_obmc_sad4x4_sse4_1)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1_C_COMPARE, ObmcSadHBDTest,
+                        ::testing::ValuesIn(sse4_functions_hbd));
+#endif  // HAVE_SSE4_1
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/test/obmc_variance_test.cc b/test/obmc_variance_test.cc
new file mode 100644
index 0000000..40295f2
--- /dev/null
+++ b/test/obmc_variance_test.cc
@@ -0,0 +1,292 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/acm_random.h"
+
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+
+#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
+
+using libvpx_test::ACMRandom;
+using libvpx_test::FunctionEquivalenceTest;
+
+namespace {
+
+static const int kIterations = 1000;
+static const int kMaskMax = 64;
+
+typedef unsigned int (*ObmcVarF)(const uint8_t *pre, int pre_stride,
+                                 const int32_t *wsrc, const int32_t *mask,
+                                 unsigned int *sse);
+typedef libvpx_test::FuncParam<ObmcVarF> TestFuncs;
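+// Sketch of what the kernels are assumed to compute (inferred from the
+// weighted inputs below; see vpx_dsp for the authoritative version):
+//   diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[i] - pre[i] * mask[i], 12);
+//   sum += diff;  sse += diff * diff;
+// per pixel, returning the usual variance, sse - sum * sum / (w * h).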
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+class ObmcVarianceTest : public FunctionEquivalenceTest<ObmcVarF> {};
+
+TEST_P(ObmcVarianceTest, RandomValues) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    const int pre_stride = this->rng_(MAX_SB_SIZE + 1);
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = this->rng_.Rand8();
+      wsrc[i] = this->rng_.Rand8() * this->rng_(kMaskMax * kMaskMax + 1);
+      mask[i] = this->rng_(kMaskMax * kMaskMax + 1);
+    }
+
+    unsigned int ref_sse, tst_sse;
+    const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask,
+                                                  &ref_sse);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(
+        tst_res = params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse));
+
+    ASSERT_EQ(ref_res, tst_res);
+    ASSERT_EQ(ref_sse, tst_sse);
+  }
+}
+
+TEST_P(ObmcVarianceTest, ExtremeValues) {
+  DECLARE_ALIGNED(32, uint8_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+    const int pre_stride = iter;
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = UINT8_MAX;
+      wsrc[i] = UINT8_MAX * kMaskMax * kMaskMax;
+      mask[i] = kMaskMax * kMaskMax;
+    }
+
+    unsigned int ref_sse, tst_sse;
+    const unsigned int ref_res = params_.ref_func(pre, pre_stride, wsrc, mask,
+                                                  &ref_sse);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(
+        tst_res = params_.tst_func(pre, pre_stride, wsrc, mask, &tst_sse));
+
+    ASSERT_EQ(ref_res, tst_res);
+    ASSERT_EQ(ref_sse, tst_sse);
+  }
+}
+
+#if HAVE_SSE4_1
+const ObmcVarianceTest::ParamType sse4_functions[] = {
+#if CONFIG_EXT_PARTITION
+  TestFuncs(vpx_obmc_variance128x128_c, vpx_obmc_variance128x128_sse4_1),
+  TestFuncs(vpx_obmc_variance128x64_c, vpx_obmc_variance128x64_sse4_1),
+  TestFuncs(vpx_obmc_variance64x128_c, vpx_obmc_variance64x128_sse4_1),
+#endif  // CONFIG_EXT_PARTITION
+  TestFuncs(vpx_obmc_variance64x64_c, vpx_obmc_variance64x64_sse4_1),
+  TestFuncs(vpx_obmc_variance64x32_c, vpx_obmc_variance64x32_sse4_1),
+  TestFuncs(vpx_obmc_variance32x64_c, vpx_obmc_variance32x64_sse4_1),
+  TestFuncs(vpx_obmc_variance32x32_c, vpx_obmc_variance32x32_sse4_1),
+  TestFuncs(vpx_obmc_variance32x16_c, vpx_obmc_variance32x16_sse4_1),
+  TestFuncs(vpx_obmc_variance16x32_c, vpx_obmc_variance16x32_sse4_1),
+  TestFuncs(vpx_obmc_variance16x16_c, vpx_obmc_variance16x16_sse4_1),
+  TestFuncs(vpx_obmc_variance16x8_c, vpx_obmc_variance16x8_sse4_1),
+  TestFuncs(vpx_obmc_variance8x16_c, vpx_obmc_variance8x16_sse4_1),
+  TestFuncs(vpx_obmc_variance8x8_c, vpx_obmc_variance8x8_sse4_1),
+  TestFuncs(vpx_obmc_variance8x4_c, vpx_obmc_variance8x4_sse4_1),
+  TestFuncs(vpx_obmc_variance4x8_c, vpx_obmc_variance4x8_sse4_1),
+  TestFuncs(vpx_obmc_variance4x4_c, vpx_obmc_variance4x4_sse4_1)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1_C_COMPARE, ObmcVarianceTest,
+                        ::testing::ValuesIn(sse4_functions));
+#endif  // HAVE_SSE4_1
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+#if CONFIG_VP9_HIGHBITDEPTH
+class ObmcVarianceHBDTest : public FunctionEquivalenceTest<ObmcVarF> {};
+
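+// The third TestFuncs argument is the bit depth; it is read back through
+// params_.bit_depth to scale the random inputs below.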
+TEST_P(ObmcVarianceHBDTest, RandomValues) {
+  DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    const int pre_stride = this->rng_(MAX_SB_SIZE + 1);
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = this->rng_(1 << params_.bit_depth);
+      wsrc[i] = this->rng_(1 << params_.bit_depth) *
+                this->rng_(kMaskMax * kMaskMax + 1);
+      mask[i] = this->rng_(kMaskMax * kMaskMax + 1);
+    }
+
+    unsigned int ref_sse, tst_sse;
+    const unsigned int ref_res = params_.ref_func(CONVERT_TO_BYTEPTR(pre),
+                                                  pre_stride, wsrc, mask,
+                                                  &ref_sse);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(
+        tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride,
+                                   wsrc, mask, &tst_sse));
+
+    ASSERT_EQ(ref_res, tst_res);
+    ASSERT_EQ(ref_sse, tst_sse);
+  }
+}
+
+TEST_P(ObmcVarianceHBDTest, ExtremeValues) {
+  DECLARE_ALIGNED(32, uint16_t, pre[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, wsrc[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int32_t, mask[MAX_SB_SQUARE]);
+
+  for (int iter = 0; iter < MAX_SB_SIZE && !HasFatalFailure(); ++iter) {
+    const int pre_stride = iter;
+
+    for (int i = 0; i < MAX_SB_SQUARE; ++i) {
+      pre[i] = (1 << params_.bit_depth) - 1;
+      wsrc[i] = ((1 << params_.bit_depth) - 1) * kMaskMax * kMaskMax;
+      mask[i] = kMaskMax * kMaskMax;
+    }
+
+    unsigned int ref_sse, tst_sse;
+    const unsigned int ref_res = params_.ref_func(CONVERT_TO_BYTEPTR(pre),
+                                                  pre_stride, wsrc, mask,
+                                                  &ref_sse);
+    unsigned int tst_res;
+    ASM_REGISTER_STATE_CHECK(
+        tst_res = params_.tst_func(CONVERT_TO_BYTEPTR(pre), pre_stride,
+                                   wsrc, mask, &tst_sse));
+
+    ASSERT_EQ(ref_res, tst_res);
+    ASSERT_EQ(ref_sse, tst_sse);
+  }
+}
+
+#if HAVE_SSE4_1
+ObmcVarianceHBDTest::ParamType sse4_functions_hbd[] = {
+#if CONFIG_EXT_PARTITION
+  TestFuncs(vpx_highbd_obmc_variance128x128_c,
+            vpx_highbd_obmc_variance128x128_sse4_1, 8),
+  TestFuncs(vpx_highbd_obmc_variance128x64_c,
+            vpx_highbd_obmc_variance128x64_sse4_1, 8),
+  TestFuncs(vpx_highbd_obmc_variance64x128_c,
+            vpx_highbd_obmc_variance64x128_sse4_1, 8),
+#endif  // CONFIG_EXT_PARTITION
+  TestFuncs(vpx_highbd_obmc_variance64x64_c,
+            vpx_highbd_obmc_variance64x64_sse4_1, 8),
+  TestFuncs(vpx_highbd_obmc_variance64x32_c,
+            vpx_highbd_obmc_variance64x32_sse4_1, 8),
+  TestFuncs(vpx_highbd_obmc_variance32x64_c,
+            vpx_highbd_obmc_variance32x64_sse4_1, 8),
+  TestFuncs(vpx_highbd_obmc_variance32x32_c,
+            vpx_highbd_obmc_variance32x32_sse4_1, 8),
+  TestFuncs(vpx_highbd_obmc_variance32x16_c,
+            vpx_highbd_obmc_variance32x16_sse4_1, 8),
+  TestFuncs(vpx_highbd_obmc_variance16x32_c,
+            vpx_highbd_obmc_variance16x32_sse4_1, 8),
+  TestFuncs(vpx_highbd_obmc_variance16x16_c,
+            vpx_highbd_obmc_variance16x16_sse4_1, 8),
+  TestFuncs(vpx_highbd_obmc_variance16x8_c,
+            vpx_highbd_obmc_variance16x8_sse4_1, 8),
+  TestFuncs(vpx_highbd_obmc_variance8x16_c,
+            vpx_highbd_obmc_variance8x16_sse4_1, 8),
+  TestFuncs(vpx_highbd_obmc_variance8x8_c,
+            vpx_highbd_obmc_variance8x8_sse4_1, 8),
+  TestFuncs(vpx_highbd_obmc_variance8x4_c,
+            vpx_highbd_obmc_variance8x4_sse4_1, 8),
+  TestFuncs(vpx_highbd_obmc_variance4x8_c,
+            vpx_highbd_obmc_variance4x8_sse4_1, 8),
+  TestFuncs(vpx_highbd_obmc_variance4x4_c,
+            vpx_highbd_obmc_variance4x4_sse4_1, 8),
+#if CONFIG_EXT_PARTITION
+  TestFuncs(vpx_highbd_10_obmc_variance128x128_c,
+            vpx_highbd_10_obmc_variance128x128_sse4_1, 10),
+  TestFuncs(vpx_highbd_10_obmc_variance128x64_c,
+            vpx_highbd_10_obmc_variance128x64_sse4_1, 10),
+  TestFuncs(vpx_highbd_10_obmc_variance64x128_c,
+            vpx_highbd_10_obmc_variance64x128_sse4_1, 10),
+#endif  // CONFIG_EXT_PARTITION
+  TestFuncs(vpx_highbd_10_obmc_variance64x64_c,
+            vpx_highbd_10_obmc_variance64x64_sse4_1, 10),
+  TestFuncs(vpx_highbd_10_obmc_variance64x32_c,
+            vpx_highbd_10_obmc_variance64x32_sse4_1, 10),
+  TestFuncs(vpx_highbd_10_obmc_variance32x64_c,
+            vpx_highbd_10_obmc_variance32x64_sse4_1, 10),
+  TestFuncs(vpx_highbd_10_obmc_variance32x32_c,
+            vpx_highbd_10_obmc_variance32x32_sse4_1, 10),
+  TestFuncs(vpx_highbd_10_obmc_variance32x16_c,
+            vpx_highbd_10_obmc_variance32x16_sse4_1, 10),
+  TestFuncs(vpx_highbd_10_obmc_variance16x32_c,
+            vpx_highbd_10_obmc_variance16x32_sse4_1, 10),
+  TestFuncs(vpx_highbd_10_obmc_variance16x16_c,
+            vpx_highbd_10_obmc_variance16x16_sse4_1, 10),
+  TestFuncs(vpx_highbd_10_obmc_variance16x8_c,
+            vpx_highbd_10_obmc_variance16x8_sse4_1, 10),
+  TestFuncs(vpx_highbd_10_obmc_variance8x16_c,
+            vpx_highbd_10_obmc_variance8x16_sse4_1, 10),
+  TestFuncs(vpx_highbd_10_obmc_variance8x8_c,
+            vpx_highbd_10_obmc_variance8x8_sse4_1, 10),
+  TestFuncs(vpx_highbd_10_obmc_variance8x4_c,
+            vpx_highbd_10_obmc_variance8x4_sse4_1, 10),
+  TestFuncs(vpx_highbd_10_obmc_variance4x8_c,
+            vpx_highbd_10_obmc_variance4x8_sse4_1, 10),
+  TestFuncs(vpx_highbd_10_obmc_variance4x4_c,
+            vpx_highbd_10_obmc_variance4x4_sse4_1, 10),
+#if CONFIG_EXT_PARTITION
+  TestFuncs(vpx_highbd_12_obmc_variance128x128_c,
+            vpx_highbd_12_obmc_variance128x128_sse4_1, 12),
+  TestFuncs(vpx_highbd_12_obmc_variance128x64_c,
+            vpx_highbd_12_obmc_variance128x64_sse4_1, 12),
+  TestFuncs(vpx_highbd_12_obmc_variance64x128_c,
+            vpx_highbd_12_obmc_variance64x128_sse4_1, 12),
+#endif  // CONFIG_EXT_PARTITION
+  TestFuncs(vpx_highbd_12_obmc_variance64x64_c,
+            vpx_highbd_12_obmc_variance64x64_sse4_1, 12),
+  TestFuncs(vpx_highbd_12_obmc_variance64x32_c,
+            vpx_highbd_12_obmc_variance64x32_sse4_1, 12),
+  TestFuncs(vpx_highbd_12_obmc_variance32x64_c,
+            vpx_highbd_12_obmc_variance32x64_sse4_1, 12),
+  TestFuncs(vpx_highbd_12_obmc_variance32x32_c,
+            vpx_highbd_12_obmc_variance32x32_sse4_1, 12),
+  TestFuncs(vpx_highbd_12_obmc_variance32x16_c,
+            vpx_highbd_12_obmc_variance32x16_sse4_1, 12),
+  TestFuncs(vpx_highbd_12_obmc_variance16x32_c,
+            vpx_highbd_12_obmc_variance16x32_sse4_1, 12),
+  TestFuncs(vpx_highbd_12_obmc_variance16x16_c,
+            vpx_highbd_12_obmc_variance16x16_sse4_1, 12),
+  TestFuncs(vpx_highbd_12_obmc_variance16x8_c,
+            vpx_highbd_12_obmc_variance16x8_sse4_1, 12),
+  TestFuncs(vpx_highbd_12_obmc_variance8x16_c,
+            vpx_highbd_12_obmc_variance8x16_sse4_1, 12),
+  TestFuncs(vpx_highbd_12_obmc_variance8x8_c,
+            vpx_highbd_12_obmc_variance8x8_sse4_1, 12),
+  TestFuncs(vpx_highbd_12_obmc_variance8x4_c,
+            vpx_highbd_12_obmc_variance8x4_sse4_1, 12),
+  TestFuncs(vpx_highbd_12_obmc_variance4x8_c,
+            vpx_highbd_12_obmc_variance4x8_sse4_1, 12),
+  TestFuncs(vpx_highbd_12_obmc_variance4x4_c,
+            vpx_highbd_12_obmc_variance4x4_sse4_1, 12)
+};
+
+INSTANTIATE_TEST_CASE_P(SSE4_1_C_COMPARE, ObmcVarianceHBDTest,
+                        ::testing::ValuesIn(sse4_functions_hbd));
+#endif  // HAVE_SSE4_1
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/test/sad_test.cc b/test/sad_test.cc
index e6bd0d7..f277294 100644
--- a/test/sad_test.cc
+++ b/test/sad_test.cc
@@ -59,13 +59,13 @@
     reference_data8_ = reinterpret_cast<uint8_t*>(
         vpx_memalign(kDataAlignment, kDataBufferSize));
     second_pred8_ = reinterpret_cast<uint8_t*>(
-        vpx_memalign(kDataAlignment, 64*64));
+        vpx_memalign(kDataAlignment, 128*128));
     source_data16_ = reinterpret_cast<uint16_t*>(
         vpx_memalign(kDataAlignment, kDataBlockSize*sizeof(uint16_t)));
     reference_data16_ = reinterpret_cast<uint16_t*>(
         vpx_memalign(kDataAlignment, kDataBufferSize*sizeof(uint16_t)));
     second_pred16_ = reinterpret_cast<uint16_t*>(
-        vpx_memalign(kDataAlignment, 64*64*sizeof(uint16_t)));
+        vpx_memalign(kDataAlignment, 128*128*sizeof(uint16_t)));
   }
 
   static void TearDownTestCase() {
@@ -88,9 +88,9 @@
   }
 
  protected:
-  // Handle blocks up to 4 blocks 64x64 with stride up to 128
+  // Handle up to 4 128x128 blocks, with stride up to 256
   static const int kDataAlignment = 16;
-  static const int kDataBlockSize = 64 * 128;
+  static const int kDataBlockSize = 128 * 256;
   static const int kDataBufferSize = 4 * kDataBlockSize;
 
   virtual void SetUp() {
@@ -485,6 +485,11 @@
 //------------------------------------------------------------------------------
 // C functions
 const SadMxNParam c_tests[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_sad128x128_c, -1),
+  make_tuple(128, 64, &vpx_sad128x64_c, -1),
+  make_tuple(64, 128, &vpx_sad64x128_c, -1),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_sad64x64_c, -1),
   make_tuple(64, 32, &vpx_sad64x32_c, -1),
   make_tuple(32, 64, &vpx_sad32x64_c, -1),
@@ -499,6 +504,11 @@
   make_tuple(4, 8, &vpx_sad4x8_c, -1),
   make_tuple(4, 4, &vpx_sad4x4_c, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_c, 8),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_c, 8),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_c, 8),
   make_tuple(64, 32, &vpx_highbd_sad64x32_c, 8),
   make_tuple(32, 64, &vpx_highbd_sad32x64_c, 8),
@@ -512,6 +522,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4_c, 8),
   make_tuple(4, 8, &vpx_highbd_sad4x8_c, 8),
   make_tuple(4, 4, &vpx_highbd_sad4x4_c, 8),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_c, 10),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_c, 10),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_c, 10),
   make_tuple(64, 32, &vpx_highbd_sad64x32_c, 10),
   make_tuple(32, 64, &vpx_highbd_sad32x64_c, 10),
@@ -525,6 +540,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4_c, 10),
   make_tuple(4, 8, &vpx_highbd_sad4x8_c, 10),
   make_tuple(4, 4, &vpx_highbd_sad4x4_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_c, 12),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_c, 12),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_c, 12),
   make_tuple(64, 32, &vpx_highbd_sad64x32_c, 12),
   make_tuple(32, 64, &vpx_highbd_sad32x64_c, 12),
@@ -543,6 +563,11 @@
 INSTANTIATE_TEST_CASE_P(C, SADTest, ::testing::ValuesIn(c_tests));
 
 const SadMxNAvgParam avg_c_tests[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_sad128x128_avg_c, -1),
+  make_tuple(128, 64, &vpx_sad128x64_avg_c, -1),
+  make_tuple(64, 128, &vpx_sad64x128_avg_c, -1),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_sad64x64_avg_c, -1),
   make_tuple(64, 32, &vpx_sad64x32_avg_c, -1),
   make_tuple(32, 64, &vpx_sad32x64_avg_c, -1),
@@ -557,6 +582,11 @@
   make_tuple(4, 8, &vpx_sad4x8_avg_c, -1),
   make_tuple(4, 4, &vpx_sad4x4_avg_c, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_avg_c, 8),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_avg_c, 8),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_avg_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_avg_c, 8),
   make_tuple(64, 32, &vpx_highbd_sad64x32_avg_c, 8),
   make_tuple(32, 64, &vpx_highbd_sad32x64_avg_c, 8),
@@ -570,6 +600,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4_avg_c, 8),
   make_tuple(4, 8, &vpx_highbd_sad4x8_avg_c, 8),
   make_tuple(4, 4, &vpx_highbd_sad4x4_avg_c, 8),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_avg_c, 10),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_avg_c, 10),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_avg_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_avg_c, 10),
   make_tuple(64, 32, &vpx_highbd_sad64x32_avg_c, 10),
   make_tuple(32, 64, &vpx_highbd_sad32x64_avg_c, 10),
@@ -583,6 +618,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4_avg_c, 10),
   make_tuple(4, 8, &vpx_highbd_sad4x8_avg_c, 10),
   make_tuple(4, 4, &vpx_highbd_sad4x4_avg_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128_avg_c, 12),
+  make_tuple(128, 64, &vpx_highbd_sad128x64_avg_c, 12),
+  make_tuple(64, 128, &vpx_highbd_sad64x128_avg_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64_avg_c, 12),
   make_tuple(64, 32, &vpx_highbd_sad64x32_avg_c, 12),
   make_tuple(32, 64, &vpx_highbd_sad32x64_avg_c, 12),
@@ -601,6 +641,11 @@
 INSTANTIATE_TEST_CASE_P(C, SADavgTest, ::testing::ValuesIn(avg_c_tests));
 
 const SadMxNx4Param x4d_c_tests[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_sad128x128x4d_c, -1),
+  make_tuple(128, 64, &vpx_sad128x64x4d_c, -1),
+  make_tuple(64, 128, &vpx_sad64x128x4d_c, -1),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_sad64x64x4d_c, -1),
   make_tuple(64, 32, &vpx_sad64x32x4d_c, -1),
   make_tuple(32, 64, &vpx_sad32x64x4d_c, -1),
@@ -615,6 +660,11 @@
   make_tuple(4, 8, &vpx_sad4x8x4d_c, -1),
   make_tuple(4, 4, &vpx_sad4x4x4d_c, -1),
 #if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128x4d_c, 8),
+  make_tuple(128, 64, &vpx_highbd_sad128x64x4d_c, 8),
+  make_tuple(64, 128, &vpx_highbd_sad64x128x4d_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64x4d_c, 8),
   make_tuple(64, 32, &vpx_highbd_sad64x32x4d_c, 8),
   make_tuple(32, 64, &vpx_highbd_sad32x64x4d_c, 8),
@@ -628,6 +678,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4x4d_c, 8),
   make_tuple(4, 8, &vpx_highbd_sad4x8x4d_c, 8),
   make_tuple(4, 4, &vpx_highbd_sad4x4x4d_c, 8),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128x4d_c, 10),
+  make_tuple(128, 64, &vpx_highbd_sad128x64x4d_c, 10),
+  make_tuple(64, 128, &vpx_highbd_sad64x128x4d_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64x4d_c, 10),
   make_tuple(64, 32, &vpx_highbd_sad64x32x4d_c, 10),
   make_tuple(32, 64, &vpx_highbd_sad32x64x4d_c, 10),
@@ -641,6 +696,11 @@
   make_tuple(8, 4, &vpx_highbd_sad8x4x4d_c, 10),
   make_tuple(4, 8, &vpx_highbd_sad4x8x4d_c, 10),
   make_tuple(4, 4, &vpx_highbd_sad4x4x4d_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_highbd_sad128x128x4d_c, 12),
+  make_tuple(128, 64, &vpx_highbd_sad128x64x4d_c, 12),
+  make_tuple(64, 128, &vpx_highbd_sad64x128x4d_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_highbd_sad64x64x4d_c, 12),
   make_tuple(64, 32, &vpx_highbd_sad64x32x4d_c, 12),
   make_tuple(32, 64, &vpx_highbd_sad32x64x4d_c, 12),
@@ -692,6 +752,11 @@
 #if HAVE_SSE2
 #if CONFIG_USE_X86INC
 const SadMxNParam sse2_tests[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_sad128x128_sse2, -1),
+  make_tuple(128, 64, &vpx_sad128x64_sse2, -1),
+  make_tuple(64, 128, &vpx_sad64x128_sse2, -1),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_sad64x64_sse2, -1),
   make_tuple(64, 32, &vpx_sad64x32_sse2, -1),
   make_tuple(32, 64, &vpx_sad32x64_sse2, -1),
@@ -744,6 +809,11 @@
 INSTANTIATE_TEST_CASE_P(SSE2, SADTest, ::testing::ValuesIn(sse2_tests));
 
 const SadMxNAvgParam avg_sse2_tests[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_sad128x128_avg_sse2, -1),
+  make_tuple(128, 64, &vpx_sad128x64_avg_sse2, -1),
+  make_tuple(64, 128, &vpx_sad64x128_avg_sse2, -1),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_sad64x64_avg_sse2, -1),
   make_tuple(64, 32, &vpx_sad64x32_avg_sse2, -1),
   make_tuple(32, 64, &vpx_sad32x64_avg_sse2, -1),
@@ -796,6 +866,11 @@
 INSTANTIATE_TEST_CASE_P(SSE2, SADavgTest, ::testing::ValuesIn(avg_sse2_tests));
 
 const SadMxNx4Param x4d_sse2_tests[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  make_tuple(128, 128, &vpx_sad128x128x4d_sse2, -1),
+  make_tuple(128, 64, &vpx_sad128x64x4d_sse2, -1),
+  make_tuple(64, 128, &vpx_sad64x128x4d_sse2, -1),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
   make_tuple(64, 64, &vpx_sad64x64x4d_sse2, -1),
   make_tuple(64, 32, &vpx_sad64x32x4d_sse2, -1),
   make_tuple(32, 64, &vpx_sad32x64x4d_sse2, -1),
diff --git a/test/subtract_test.cc b/test/subtract_test.cc
new file mode 100644
index 0000000..48edf1e
--- /dev/null
+++ b/test/subtract_test.cc
@@ -0,0 +1,262 @@
+/*
+ *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#if CONFIG_VP10
+#include "vp10/common/blockd.h"
+#elif CONFIG_VP9
+#include "vp9/common/vp9_blockd.h"
+#endif
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#define USE_SPEED_TEST (0)
+
+typedef void (*SubtractFunc)(int rows, int cols,
+                             int16_t *diff_ptr, ptrdiff_t diff_stride,
+                             const uint8_t *src_ptr, ptrdiff_t src_stride,
+                             const uint8_t *pred_ptr, ptrdiff_t pred_stride);
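+// A minimal reference sketch of the behaviour these kernels must match:
+//   for (int r = 0; r < rows; ++r)
+//     for (int c = 0; c < cols; ++c)
+//       diff_ptr[r * diff_stride + c] =
+//           src_ptr[r * src_stride + c] - pred_ptr[r * pred_stride + c];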
+
+namespace {
+
+class VP9SubtractBlockTest : public ::testing::TestWithParam<SubtractFunc> {
+ public:
+  virtual void TearDown() {
+    libvpx_test::ClearSystemState();
+  }
+};
+
+using libvpx_test::ACMRandom;
+
+TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  // FIXME(rbultje) split in its own file
+  for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
+       bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
+    const int block_width = 4 * num_4x4_blocks_wide_lookup[bsize];
+    const int block_height = 4 * num_4x4_blocks_high_lookup[bsize];
+    int16_t *diff = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(*diff) * block_width * block_height * 2));
+    uint8_t *pred = reinterpret_cast<uint8_t *>(
+        vpx_memalign(16, block_width * block_height * 2));
+    uint8_t *src  = reinterpret_cast<uint8_t *>(
+        vpx_memalign(16, block_width * block_height * 2));
+
+    for (int n = 0; n < 100; n++) {
+      for (int r = 0; r < block_height; ++r) {
+        for (int c = 0; c < block_width * 2; ++c) {
+          src[r * block_width * 2 + c] = rnd.Rand8();
+          pred[r * block_width * 2 + c] = rnd.Rand8();
+        }
+      }
+
+      GetParam()(block_height, block_width, diff, block_width,
+                 src, block_width, pred, block_width);
+
+      for (int r = 0; r < block_height; ++r) {
+        for (int c = 0; c < block_width; ++c) {
+          EXPECT_EQ(diff[r * block_width + c],
+                    (src[r * block_width + c] -
+                     pred[r * block_width + c])) << "r = " << r
+                                                 << ", c = " << c
+                                                 << ", bs = " << bsize;
+        }
+      }
+
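+      // Repeat with doubled strides so rows are no longer contiguous; this
+      // exercises the stride handling rather than just the arithmetic.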
+      GetParam()(block_height, block_width, diff, block_width * 2,
+                 src, block_width * 2, pred, block_width * 2);
+
+      for (int r = 0; r < block_height; ++r) {
+        for (int c = 0; c < block_width; ++c) {
+          EXPECT_EQ(diff[r * block_width * 2 + c],
+                    (src[r * block_width * 2 + c] -
+                     pred[r * block_width * 2 + c])) << "r = " << r
+                                                     << ", c = " << c
+                                                     << ", bs = " << bsize;
+        }
+      }
+    }
+    vpx_free(diff);
+    vpx_free(pred);
+    vpx_free(src);
+  }
+}
+
+INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
+                        ::testing::Values(vpx_subtract_block_c));
+
+#if HAVE_SSE2 && CONFIG_USE_X86INC
+INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
+                        ::testing::Values(vpx_subtract_block_sse2));
+#endif
+#if HAVE_NEON
+INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest,
+                        ::testing::Values(vpx_subtract_block_neon));
+#endif
+#if HAVE_MSA
+INSTANTIATE_TEST_CASE_P(MSA, VP9SubtractBlockTest,
+                        ::testing::Values(vpx_subtract_block_msa));
+#endif
+
+typedef void (*HBDSubtractFunc)(int rows, int cols,
+                                int16_t *diff_ptr, ptrdiff_t diff_stride,
+                                const uint8_t *src_ptr, ptrdiff_t src_stride,
+                                const uint8_t *pred_ptr, ptrdiff_t pred_stride,
+                                int bd);
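+// Same elementwise subtraction as above, except src/pred point at uint16_t
+// pixel data wrapped with CONVERT_TO_BYTEPTR() and bd gives the bit depth,
+// which the tests below use to mask generated samples to (1 << bd) - 1.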
+
+using ::std::tr1::get;
+using ::std::tr1::make_tuple;
+using ::std::tr1::tuple;
+
+// <width, height, bit_depth, subtract>
+typedef tuple<int, int, int, HBDSubtractFunc> Params;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+class VP10HBDSubtractBlockTest : public ::testing::TestWithParam<Params> {
+ public:
+  virtual void SetUp() {
+    block_width_ = GET_PARAM(0);
+    block_height_ = GET_PARAM(1);
+    bit_depth_ = static_cast<vpx_bit_depth_t>(GET_PARAM(2));
+    func_ = GET_PARAM(3);
+
+    rnd_.Reset(ACMRandom::DeterministicSeed());
+
+    const size_t max_width = 128;
+    const size_t max_block_size = max_width * max_width;
+    src_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+        vpx_memalign(16, max_block_size * sizeof(uint16_t))));
+    pred_ = CONVERT_TO_BYTEPTR(reinterpret_cast<uint16_t *>(
+        vpx_memalign(16, max_block_size * sizeof(uint16_t))));
+    diff_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, max_block_size * sizeof(int16_t)));
+  }
+
+  virtual void TearDown() {
+    vpx_free(CONVERT_TO_SHORTPTR(src_));
+    vpx_free(CONVERT_TO_SHORTPTR(pred_));
+    vpx_free(diff_);
+  }
+
+ protected:
+  void RunForSpeed();
+  void CheckResult();
+
+ private:
+  ACMRandom rnd_;
+  int block_height_;
+  int block_width_;
+  vpx_bit_depth_t bit_depth_;
+  HBDSubtractFunc func_;
+  uint8_t *src_;
+  uint8_t *pred_;
+  int16_t *diff_;
+};
+
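+// Invoked only from the CheckSpeed test, which is compiled out unless
+// USE_SPEED_TEST is set: one random fill, then test_num back-to-back calls.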
+void VP10HBDSubtractBlockTest::RunForSpeed() {
+  const int test_num = 200000;
+  const int max_width = 128;
+  const int max_block_size = max_width * max_width;
+  const int mask = (1 << bit_depth_) - 1;
+  int i, j;
+
+  for (j = 0; j < max_block_size; ++j) {
+    CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+    CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+  }
+
+  for (i = 0; i < test_num; ++i) {
+    func_(block_height_, block_width_, diff_, block_width_,
+          src_, block_width_, pred_, block_width_, bit_depth_);
+  }
+}
+
+void VP10HBDSubtractBlockTest::CheckResult() {
+  const int test_num = 100;
+  const int max_width = 128;
+  const int max_block_size = max_width * max_width;
+  const int mask = (1 << bit_depth_) - 1;
+  int i, j;
+
+  for (i = 0; i < test_num; ++i) {
+    for (j = 0; j < max_block_size; ++j) {
+      CONVERT_TO_SHORTPTR(src_)[j] = rnd_.Rand16() & mask;
+      CONVERT_TO_SHORTPTR(pred_)[j] = rnd_.Rand16() & mask;
+    }
+
+    func_(block_height_, block_width_, diff_, block_width_,
+          src_, block_width_, pred_, block_width_, bit_depth_);
+
+    for (int r = 0; r < block_height_; ++r) {
+      for (int c = 0; c < block_width_; ++c) {
+        EXPECT_EQ(diff_[r * block_width_ + c],
+                  (CONVERT_TO_SHORTPTR(src_)[r * block_width_ + c] -
+                   CONVERT_TO_SHORTPTR(pred_)[r * block_width_ + c]))
+            << "r = " << r << ", c = " << c << ", test: " << i;
+      }
+    }
+  }
+}
+
+TEST_P(VP10HBDSubtractBlockTest, CheckResult) {
+  CheckResult();
+}
+
+#if USE_SPEED_TEST
+TEST_P(VP10HBDSubtractBlockTest, CheckSpeed) {
+  RunForSpeed();
+}
+#endif  // USE_SPEED_TEST
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(SSE2, VP10HBDSubtractBlockTest, ::testing::Values(
+    make_tuple(4, 4, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(4, 4, 12, vpx_highbd_subtract_block_c),
+    make_tuple(4, 8, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(4, 8, 12, vpx_highbd_subtract_block_c),
+    make_tuple(8, 4, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(8, 4, 12, vpx_highbd_subtract_block_c),
+    make_tuple(8, 8, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(8, 8, 12, vpx_highbd_subtract_block_c),
+    make_tuple(8, 16, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(8, 16, 12, vpx_highbd_subtract_block_c),
+    make_tuple(16, 8, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(16, 8, 12, vpx_highbd_subtract_block_c),
+    make_tuple(16, 16, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(16, 16, 12, vpx_highbd_subtract_block_c),
+    make_tuple(16, 32, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(16, 32, 12, vpx_highbd_subtract_block_c),
+    make_tuple(32, 16, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(32, 16, 12, vpx_highbd_subtract_block_c),
+    make_tuple(32, 32, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(32, 32, 12, vpx_highbd_subtract_block_c),
+    make_tuple(32, 64, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(32, 64, 12, vpx_highbd_subtract_block_c),
+    make_tuple(64, 32, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(64, 32, 12, vpx_highbd_subtract_block_c),
+    make_tuple(64, 64, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(64, 64, 12, vpx_highbd_subtract_block_c),
+    make_tuple(64, 128, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(64, 128, 12, vpx_highbd_subtract_block_c),
+    make_tuple(128, 64, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(128, 64, 12, vpx_highbd_subtract_block_c),
+    make_tuple(128, 128, 12, vpx_highbd_subtract_block_sse2),
+    make_tuple(128, 128, 12, vpx_highbd_subtract_block_c)));
+#endif  // HAVE_SSE2
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/test/sum_squares_test.cc b/test/sum_squares_test.cc
new file mode 100644
index 0000000..9adb86e
--- /dev/null
+++ b/test/sum_squares_test.cc
@@ -0,0 +1,192 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cmath>
+#include <cstdlib>
+#include <string>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "test/function_equivalence_test.h"
+
+using libvpx_test::ACMRandom;
+using libvpx_test::FunctionEquivalenceTest;
+
+namespace {
+const int kNumIterations = 10000;
+
+static const int16_t kInt13Max = (1 << 12) - 1;
+
+typedef uint64_t (*SSI16Func)(const int16_t *src, int stride, int size);
+typedef libvpx_test::FuncParam<SSI16Func> TestFuncs;
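+// Reference behaviour being checked (a sketch, not the library code):
+// a 64-bit sum of squares over a size x size block,
+//   for (int r = 0; r < size; ++r)
+//     for (int c = 0; c < size; ++c) {
+//       const int v = src[r * stride + c];  // |v| < 1 << 12 here
+//       ss += (uint64_t)(v * v);
+//     }
+// With 12-bit inputs and size up to 128 the total stays below 1 << 40.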
+
+class SumSquaresTest : public ::testing::TestWithParam<TestFuncs> {
+ public:
+  virtual ~SumSquaresTest() {}
+  virtual void SetUp() {
+    params_ = this->GetParam();
+  }
+
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  TestFuncs params_;
+};
+
+TEST_P(SumSquaresTest, OperationCheck) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, int16_t, src[256*256]);
+
+  int failed = 0;
+
+  const int msb = 11;   // Up to 12 bit input
+  const int limit = 1 << (msb+1);
+
+  for (int k = 0; k < kNumIterations; k++) {
+    int size = 4 << rnd(6);     // Up to 128x128
+    int stride = 4 << rnd(7);   // Up to 256 stride
+    while (stride < size) {     // Make sure it's valid
+      stride = 4 << rnd(7);
+    }
+
+    for (int ii = 0; ii < size; ii++) {
+      for (int jj = 0; jj < size; jj++) {
+        src[ii*stride+jj] = rnd(2) ? rnd(limit) : -rnd(limit);
+      }
+    }
+
+    const uint64_t res_ref = params_.ref_func(src, stride, size);
+    uint64_t res_tst;
+    ASM_REGISTER_STATE_CHECK(res_tst = params_.tst_func(src, stride, size));
+
+    if (!failed) {
+      failed = res_ref != res_tst;
+      EXPECT_EQ(res_ref, res_tst)
+        << "Error: Sum Squares Test"
+        << " C output does not match optimized output.";
+    }
+  }
+}
+
+TEST_P(SumSquaresTest, ExtremeValues) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  DECLARE_ALIGNED(16, int16_t, src[256*256]);
+
+  int failed = 0;
+
+  const int msb = 11;   // Up to 12 bit input
+  const int limit = 1 << (msb+1);
+
+  for (int k = 0; k < kNumIterations; k++) {
+    int size = 4 << rnd(6);     // Up to 128x128
+    int stride = 4 << rnd(7);   // Up to 256 stride
+    while (stride < size) {     // Make sure it's valid
+      stride = 4 << rnd(7);
+    }
+
+    int val = rnd(2) ? limit-1 : -(limit-1);
+    for (int ii = 0; ii < size; ii++) {
+      for (int jj = 0; jj < size; jj++) {
+        src[ii*stride+jj] = val;
+      }
+    }
+
+    const uint64_t res_ref = params_.ref_func(src, stride, size);
+    uint64_t res_tst;
+    ASM_REGISTER_STATE_CHECK(res_tst = params_.tst_func(src, stride, size));
+
+    if (!failed) {
+      failed = res_ref != res_tst;
+      EXPECT_EQ(res_ref, res_tst)
+        << "Error: Sum Squares Test"
+        << " C output does not match optimized output.";
+    }
+  }
+}
+
+#if HAVE_SSE2
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, SumSquaresTest,
+    ::testing::Values(
+        TestFuncs(&vpx_sum_squares_2d_i16_c, &vpx_sum_squares_2d_i16_sse2)));
+
+#endif  // HAVE_SSE2
+
+//////////////////////////////////////////////////////////////////////////////
+// 1D version
+//////////////////////////////////////////////////////////////////////////////
+
+typedef uint64_t (*F1D)(const int16_t *src, uint32_t N);
+typedef libvpx_test::FuncParam<F1D> TestFuncs1D;
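+// 1D variant: presumably the same accumulation over a flat array,
+// ss += src[i] * src[i] for i in [0, N).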
+
+class SumSquares1DTest : public FunctionEquivalenceTest<F1D> {
+ protected:
+  static const int kIterations = 1000;
+  static const int kMaxSize = 256;
+};
+
+TEST_P(SumSquares1DTest, RandomValues) {
+  DECLARE_ALIGNED(16, int16_t, src[kMaxSize * kMaxSize]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0; i < kMaxSize * kMaxSize; ++i)
+      src[i] = rng_(kInt13Max * 2 + 1) - kInt13Max;
+
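+    // Pick either a long run (>= kMaxSize) or a short one so that both the
+    // main loop and any tail handling in the optimized code are exercised.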
+    const int N = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize
+                          : rng_(kMaxSize) + 1;
+
+    const uint64_t ref_res = params_.ref_func(src, N);
+    uint64_t tst_res;
+    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+TEST_P(SumSquares1DTest, ExtremeValues) {
+  DECLARE_ALIGNED(16, int16_t, src[kMaxSize * kMaxSize]);
+
+  for (int iter = 0; iter < kIterations && !HasFatalFailure(); ++iter) {
+    if (rng_(2)) {
+      for (int i = 0; i < kMaxSize * kMaxSize; ++i)
+        src[i] = kInt13Max;
+    } else {
+      for (int i = 0; i < kMaxSize * kMaxSize; ++i)
+        src[i] = -kInt13Max;
+    }
+
+    const int N = rng_(2) ? rng_(kMaxSize * kMaxSize + 1 - kMaxSize) + kMaxSize
+                          : rng_(kMaxSize) + 1;
+
+    const uint64_t ref_res = params_.ref_func(src, N);
+    uint64_t tst_res;
+    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(src, N));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, SumSquares1DTest,
+    ::testing::Values(
+        TestFuncs1D(vpx_sum_squares_i16_c, vpx_sum_squares_i16_sse2)));
+
+#endif  // HAVE_SSE2
+}  // namespace
diff --git a/test/superframe_test.cc b/test/superframe_test.cc
index 90aa75b..3e2f49a 100644
--- a/test/superframe_test.cc
+++ b/test/superframe_test.cc
@@ -18,8 +18,11 @@
 
 const int kTestMode = 0;
 const int kSuperframeSyntax = 1;
+const int kTileCols = 2;
+const int kTileRows = 3;
 
-typedef std::tr1::tuple<libvpx_test::TestMode,int> SuperframeTestParam;
+typedef std::tr1::tuple<libvpx_test::TestMode, int,
+                        int, int> SuperframeTestParam;
 
 class SuperframeTest : public ::libvpx_test::EncoderTest,
     public ::libvpx_test::CodecTestWithParam<SuperframeTestParam> {
@@ -37,6 +40,8 @@
     sf_count_ = 0;
     sf_count_max_ = INT_MAX;
     is_vp10_style_superframe_ = syntax;
+    n_tile_cols_ = std::tr1::get<kTileCols>(input);
+    n_tile_rows_ = std::tr1::get<kTileRows>(input);
   }
 
   virtual void TearDown() {
@@ -47,6 +52,9 @@
                                   libvpx_test::Encoder *encoder) {
     if (video->frame() == 1) {
       encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
+      encoder->Control(VP8E_SET_CPUUSED, 2);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, n_tile_cols_);
+      encoder->Control(VP9E_SET_TILE_ROWS, n_tile_rows_);
     }
   }
 
@@ -91,6 +99,10 @@
   vpx_codec_cx_pkt_t modified_pkt_;
   uint8_t *modified_buf_;
   vpx_codec_pts_t last_sf_pts_;
+
+ private:
+  int n_tile_cols_;
+  int n_tile_rows_;
 };
 
 TEST_P(SuperframeTest, TestSuperframeIndexIsOptional) {
@@ -100,14 +112,39 @@
   ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 352, 288,
                                        30, 1, 0, 40);
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+#if CONFIG_BIDIR_PRED
+  // NOTE: The use of BWDREF_FRAME will enable the coding of more non-show
+  //       frames besides ALTREF_FRAME.
+  EXPECT_GE(sf_count_, 1);
+#else
   EXPECT_EQ(sf_count_, 1);
+#endif  // CONFIG_BIDIR_PRED
 }
 
 VP9_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Combine(
     ::testing::Values(::libvpx_test::kTwoPassGood),
-    ::testing::Values(0)));
+    ::testing::Values(0), ::testing::Values(0), ::testing::Values(0)));
 
+// The superframe index is currently mandatory with ANS due to the decoder
+// starting at the end of the buffer.
+#if CONFIG_EXT_TILE
+// Single tile does not work with ANS (see comment above).
+#if CONFIG_ANS
+const int tile_col_values[] = { 1, 2 };
+#else
+const int tile_col_values[] = { 1, 2, 32 };
+#endif
+const int tile_row_values[] = { 1, 2, 32 };
 VP10_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Combine(
     ::testing::Values(::libvpx_test::kTwoPassGood),
-    ::testing::Values(CONFIG_MISC_FIXES)));
+    ::testing::Values(1),
+    ::testing::ValuesIn(tile_col_values),
+    ::testing::ValuesIn(tile_row_values)));
+#else
+#if !CONFIG_ANS
+VP10_INSTANTIATE_TEST_CASE(SuperframeTest, ::testing::Combine(
+    ::testing::Values(::libvpx_test::kTwoPassGood),
+    ::testing::Values(1), ::testing::Values(0), ::testing::Values(0)));
+#endif  // !CONFIG_ANS
+#endif  // CONFIG_EXT_TILE
 }  // namespace
diff --git a/test/svc_test.cc b/test/svc_test.cc
index b955cee..1ad17be 100644
--- a/test/svc_test.cc
+++ b/test/svc_test.cc
@@ -63,6 +63,12 @@
     vpx_codec_dec_cfg_t dec_cfg = vpx_codec_dec_cfg_t();
     VP9CodecFactory codec_factory;
     decoder_ = codec_factory.CreateDecoder(dec_cfg, 0);
+#if CONFIG_VP10 && CONFIG_EXT_TILE
+    if (decoder_->IsVP10()) {
+      decoder_->Control(VP10_SET_DECODE_TILE_ROW, -1);
+      decoder_->Control(VP10_SET_DECODE_TILE_COL, -1);
+    }
+#endif
 
     tile_columns_ = 0;
     tile_rows_ = 0;
diff --git a/test/test-data.mk b/test/test-data.mk
index 05a0885..695c54e 100644
--- a/test/test-data.mk
+++ b/test/test-data.mk
@@ -23,6 +23,8 @@
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += rush_hour_444.y4m
 LIBVPX_TEST_DATA-$(CONFIG_VP9_ENCODER) += screendata.y4m
 
+LIBVPX_TEST_DATA-$(CONFIG_VP10_ENCODER) += desktop_credits.y4m
+
 # Test vectors
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf
 LIBVPX_TEST_DATA-$(CONFIG_VP8_DECODER) += vp80-00-comprehensive-001.ivf.md5
diff --git a/test/test.mk b/test/test.mk
index 04acd96..cdef53c 100644
--- a/test/test.mk
+++ b/test/test.mk
@@ -9,6 +9,7 @@
 LIBVPX_TEST_SRCS-yes += test_vectors.h
 LIBVPX_TEST_SRCS-yes += util.h
 LIBVPX_TEST_SRCS-yes += video_source.h
+LIBVPX_TEST_SRCS-yes += transform_test_base.h
 
 ##
 ## BLACK BOX TESTS
@@ -38,13 +39,10 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += invalid_file_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += user_priv_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_DECODER) += vp9_frame_parallel_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_refresh_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += active_map_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += borders_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += cpu_speed_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += frame_size_tests.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_lossless_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_end_to_end_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ethread_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += level_test.cc
 
@@ -88,6 +86,13 @@
 LIBVPX_TEST_SRCS-yes += encode_perf_test.cc
 endif
 
+## Multi-codec / unconditional black box tests.
+ifeq ($(findstring yes,$(CONFIG_VP9_ENCODER)$(CONFIG_VP10_ENCODER)),yes)
+LIBVPX_TEST_SRCS-yes += active_map_refresh_test.cc
+LIBVPX_TEST_SRCS-yes += active_map_test.cc
+LIBVPX_TEST_SRCS-yes += end_to_end_test.cc
+endif
+
 ##
 ## WHITE BOX TESTS
 ##
@@ -151,7 +156,7 @@
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += variance_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_error_block_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_quantize_test.cc
-LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_ENCODER) += subtract_test.cc
 
 ifeq ($(CONFIG_VP9_ENCODER),yes)
 LIBVPX_TEST_SRCS-$(CONFIG_SPATIAL_SVC) += svc_test.cc
@@ -168,19 +173,51 @@
 
 ## VP10
 ifeq ($(CONFIG_VP10),yes)
-
 LIBVPX_TEST_SRCS-yes                    += vp10_inv_txfm_test.cc
 LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_dct_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht4x4_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht8x8_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_fht16x16_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_ANS)          += vp10_ans_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_EXT_TILE)     += vp10_ext_tile_test.cc
 
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += sum_squares_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += subtract_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_a64_mask_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += blend_a64_mask_1d_test.cc
+
+ifeq ($(CONFIG_EXT_INTER),yes)
+LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_variance_test.cc
+LIBVPX_TEST_SRCS-$(HAVE_SSSE3) += masked_sad_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += vp10_wedge_utils_test.cc
+endif
+
+ifeq ($(CONFIG_OBMC),yes)
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += obmc_sad_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10_ENCODER) += obmc_variance_test.cc
+endif
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+LIBVPX_TEST_SRCS-$(HAVE_SSE4_1) += vp10_highbd_iht_test.cc
+endif # CONFIG_VP9_HIGHBITDEPTH
 endif # VP10
 
 ## Multi-codec / unconditional whitebox tests.
-
 ifeq ($(findstring yes,$(CONFIG_VP9_ENCODER)$(CONFIG_VP10_ENCODER)),yes)
 LIBVPX_TEST_SRCS-yes += avg_test.cc
 endif
-
+ifeq ($(CONFIG_INTERNAL_STATS),yes)
+LIBVPX_TEST_SRCS-$(CONFIG_VP9_HIGHBITDEPTH) += hbd_metrics_test.cc
+endif
 LIBVPX_TEST_SRCS-$(CONFIG_ENCODERS) += sad_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_txfm_test.h
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_txfm_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_fwd_txfm1d_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm1d_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_fwd_txfm2d_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_inv_txfm2d_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_convolve_test.cc
+LIBVPX_TEST_SRCS-$(CONFIG_VP10) += vp10_convolve_optimz_test.cc
 
 TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
 TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
diff --git a/test/test_libvpx.cc b/test/test_libvpx.cc
index 005ea8d..d374bd0 100644
--- a/test/test_libvpx.cc
+++ b/test/test_libvpx.cc
@@ -22,6 +22,9 @@
 #if CONFIG_VP9
 extern void vp9_rtcd();
 #endif  // CONFIG_VP9
+#if CONFIG_VP10
+extern void vp10_rtcd();
+#endif  // CONFIG_VP10
 extern void vpx_dsp_rtcd();
 extern void vpx_scale_rtcd();
 }
@@ -69,6 +72,9 @@
 #if CONFIG_VP9
   vp9_rtcd();
 #endif  // CONFIG_VP9
+#if CONFIG_VP10
+  vp10_rtcd();
+#endif  // CONFIG_VP10
   vpx_dsp_rtcd();
   vpx_scale_rtcd();
 #endif  // !CONFIG_SHARED
diff --git a/test/tile_independence_test.cc b/test/tile_independence_test.cc
index 193bd45..adc8a8a 100644
--- a/test/tile_independence_test.cc
+++ b/test/tile_independence_test.cc
@@ -21,13 +21,15 @@
 
 namespace {
 class TileIndependenceTest : public ::libvpx_test::EncoderTest,
-                             public ::libvpx_test::CodecTestWithParam<int> {
+                             public ::libvpx_test::CodecTestWith2Params<int,
+                                                                        int> {
  protected:
   TileIndependenceTest()
       : EncoderTest(GET_PARAM(0)),
         md5_fw_order_(),
         md5_inv_order_(),
-        n_tiles_(GET_PARAM(1)) {
+        n_tile_cols_(GET_PARAM(1)),
+        n_tile_rows_(GET_PARAM(2)) {
     init_flags_ = VPX_CODEC_USE_PSNR;
     vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
     cfg.w = 704;
@@ -36,6 +38,15 @@
     fw_dec_ = codec_->CreateDecoder(cfg, 0);
     inv_dec_ = codec_->CreateDecoder(cfg, 0);
     inv_dec_->Control(VP9_INVERT_TILE_DECODE_ORDER, 1);
+
+#if CONFIG_VP10 && CONFIG_EXT_TILE
+    if (fw_dec_->IsVP10() && inv_dec_->IsVP10()) {
+      fw_dec_->Control(VP10_SET_DECODE_TILE_ROW, -1);
+      fw_dec_->Control(VP10_SET_DECODE_TILE_COL, -1);
+      inv_dec_->Control(VP10_SET_DECODE_TILE_ROW, -1);
+      inv_dec_->Control(VP10_SET_DECODE_TILE_COL, -1);
+    }
+#endif
   }
 
   virtual ~TileIndependenceTest() {
@@ -51,10 +62,17 @@
   virtual void PreEncodeFrameHook(libvpx_test::VideoSource *video,
                                   libvpx_test::Encoder *encoder) {
     if (video->frame() == 1) {
-      encoder->Control(VP9E_SET_TILE_COLUMNS, n_tiles_);
+      encoder->Control(VP9E_SET_TILE_COLUMNS, n_tile_cols_);
+      encoder->Control(VP9E_SET_TILE_ROWS, n_tile_rows_);
+      SetCpuUsed(encoder);
     }
   }
 
+  virtual void SetCpuUsed(libvpx_test::Encoder *encoder) {
+    static const int kCpuUsed = 3;
+    encoder->Control(VP8E_SET_CPUUSED, kCpuUsed);
+  }
+
   void UpdateMD5(::libvpx_test::Decoder *dec, const vpx_codec_cx_pkt_t *pkt,
                  ::libvpx_test::MD5 *md5) {
     const vpx_codec_err_t res = dec->DecodeFrame(
@@ -72,37 +90,64 @@
     UpdateMD5(inv_dec_, pkt, &md5_inv_order_);
   }
 
+  void DoTest() {
+    const vpx_rational timebase = { 33333333, 1000000000 };
+    cfg_.g_timebase = timebase;
+    cfg_.rc_target_bitrate = 500;
+    cfg_.g_lag_in_frames = 12;
+    cfg_.rc_end_usage = VPX_VBR;
+
+    libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 576,
+                                       timebase.den, timebase.num, 0, 5);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+    const char *md5_fw_str = md5_fw_order_.Get();
+    const char *md5_inv_str = md5_inv_order_.Get();
+    ASSERT_STREQ(md5_fw_str, md5_inv_str);
+  }
+
   ::libvpx_test::MD5 md5_fw_order_, md5_inv_order_;
   ::libvpx_test::Decoder *fw_dec_, *inv_dec_;
 
  private:
-  int n_tiles_;
+  int n_tile_cols_;
+  int n_tile_rows_;
 };
 
 // Run an encode with multiple tile configurations, and do the decode both in
 // normal and inverted tile ordering. Ensure that the MD5 of the output in
 // both cases is identical. If so, tiles are considered independent and the
 // test passes.
 TEST_P(TileIndependenceTest, MD5Match) {
-  const vpx_rational timebase = { 33333333, 1000000000 };
-  cfg_.g_timebase = timebase;
-  cfg_.rc_target_bitrate = 500;
-  cfg_.g_lag_in_frames = 25;
-  cfg_.rc_end_usage = VPX_VBR;
-
-  libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv", 704, 144,
-                                     timebase.den, timebase.num, 0, 30);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-
-  const char *md5_fw_str = md5_fw_order_.Get();
-  const char *md5_inv_str = md5_inv_order_.Get();
-
-  // could use ASSERT_EQ(!memcmp(.., .., 16) here, but this gives nicer
-  // output if it fails. Not sure if it's helpful since it's really just
-  // a MD5...
-  ASSERT_STREQ(md5_fw_str, md5_inv_str);
+  DoTest();
 }
 
-VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1));
+class TileIndependenceTestLarge : public TileIndependenceTest {
+  virtual void SetCpuUsed(libvpx_test::Encoder *encoder) {
+    static const int kCpuUsed = 0;
+    encoder->Control(VP8E_SET_CPUUSED, kCpuUsed);
+  }
+};
 
-VP10_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Range(0, 2, 1));
+TEST_P(TileIndependenceTestLarge, MD5Match) {
+  DoTest();
+}
+
+VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Values(0, 1),
+                                                ::testing::Values(0));
+VP9_INSTANTIATE_TEST_CASE(TileIndependenceTestLarge, ::testing::Values(0, 1),
+                                                     ::testing::Values(0));
+
+#if CONFIG_EXT_TILE
+VP10_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Values(1, 2, 32),
+                                                 ::testing::Values(1, 2, 32));
+VP10_INSTANTIATE_TEST_CASE(TileIndependenceTestLarge,
+                           ::testing::Values(1, 2, 32),
+                           ::testing::Values(1, 2, 32));
+#else
+VP10_INSTANTIATE_TEST_CASE(TileIndependenceTest, ::testing::Values(0, 1),
+                                                 ::testing::Values(0, 1));
+VP10_INSTANTIATE_TEST_CASE(TileIndependenceTestLarge,
+                           ::testing::Values(0, 1),
+                           ::testing::Values(0, 1));
+#endif  // CONFIG_EXT_TILE
 }  // namespace
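
The VPn_INSTANTIATE_TEST_CASE macros above wrap googletest's Combine, binding the codec factory as GET_PARAM(0) ahead of the listed value generators. Note that VP9E_SET_TILE_COLUMNS/VP9E_SET_TILE_ROWS take log2 tile counts, so the VP9 arguments (0, 1) exercise one and two tile columns. As a rough sketch of what the VP9 instantiation expands to (the exact macro lives in test/codec_factory.h; the cast and kVP9 wiring here are assumptions based on that pattern, for illustration only):

    // Approximate expansion of VP9_INSTANTIATE_TEST_CASE(TileIndependenceTest,
    // ::testing::Values(0, 1), ::testing::Values(0)).
    INSTANTIATE_TEST_CASE_P(
        VP9, TileIndependenceTest,
        ::testing::Combine(
            ::testing::Values(static_cast<const libvpx_test::CodecFactory *>(
                &libvpx_test::kVP9)),
            ::testing::Values(0, 1),  // log2(tile columns): 1 or 2 tiles
            ::testing::Values(0)));   // log2(tile rows): a single tile row
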
diff --git a/test/transform_test_base.h b/test/transform_test_base.h
new file mode 100644
index 0000000..cf2facd
--- /dev/null
+++ b/test/transform_test_base.h
@@ -0,0 +1,291 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef TEST_TRANSFORM_TEST_BASE_H_
+#define TEST_TRANSFORM_TEST_BASE_H_
+
+#include "./vpx_config.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx/vpx_codec.h"
+
+
+namespace libvpx_test {
+
+//  Note:
+//   The same constant is defined in vp9/common/vp9_entropy.h and
+//   vp10/common/entropy.h. The goal is for this base class to serve
+//   transform testing across codecs, but including either header would
+//   break compilation when unit-testing the other codec. The definition
+//   should eventually move to a shared vpx header.
+const int kDctMaxValue = 16384;
+
+typedef void (*FhtFunc)(const int16_t *in, tran_low_t *out, int stride,
+                        int tx_type);
+
+class TransformTestBase {
+ public:
+  virtual ~TransformTestBase() {}
+
+ protected:
+  virtual void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) = 0;
+
+  virtual void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) = 0;
+
+  void RunAccuracyCheck(int limit) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    uint32_t max_error = 0;
+    int64_t total_error = 0;
+    const int count_test_block = 10000;
+
+    int16_t *test_input_block = reinterpret_cast<int16_t *>
+        (vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
+    tran_low_t *test_temp_block = reinterpret_cast<tran_low_t *>
+        (vpx_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+    uint8_t *dst = reinterpret_cast<uint8_t *>
+        (vpx_memalign(16, sizeof(uint8_t) * num_coeffs_));
+    uint8_t *src = reinterpret_cast<uint8_t *>
+        (vpx_memalign(16, sizeof(uint8_t) * num_coeffs_));
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint16_t *dst16 = reinterpret_cast<uint16_t *>
+        (vpx_memalign(16, sizeof(uint16_t) * num_coeffs_));
+    uint16_t *src16 = reinterpret_cast<uint16_t *>
+        (vpx_memalign(16, sizeof(uint16_t) * num_coeffs_));
+#endif
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-255, 255].
+      for (int j = 0; j < num_coeffs_; ++j) {
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          test_input_block[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          test_input_block[j] = src16[j] - dst16[j];
+#endif
+        }
+      }
+
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(test_input_block,
+                                          test_temp_block, pitch_));
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(test_temp_block,
+                                            CONVERT_TO_BYTEPTR(dst16), pitch_));
+#endif
+      }
+
+      for (int j = 0; j < num_coeffs_; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+        ASSERT_EQ(VPX_BITS_8, bit_depth_);
+        const uint32_t diff = dst[j] - src[j];
+#endif
+        const uint32_t error = diff * diff;
+        if (max_error < error)
+          max_error = error;
+        total_error += error;
+      }
+    }
+
+    EXPECT_GE(static_cast<uint32_t>(limit), max_error)
+        << "Error: 4x4 FHT/IHT has an individual round trip error > "
+        << limit;
+
+    EXPECT_GE(count_test_block * limit, total_error)
+        << "Error: 4x4 FHT/IHT has average round trip error > " << limit
+        << " per block";
+
+    vpx_free(test_input_block);
+    vpx_free(test_temp_block);
+    vpx_free(dst);
+    vpx_free(src);
+#if CONFIG_VP9_HIGHBITDEPTH
+    vpx_free(dst16);
+    vpx_free(src16);
+#endif
+  }
+
+  void RunCoeffCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 5000;
+
+    int16_t *input_block = reinterpret_cast<int16_t *>
+        (vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
+    tran_low_t *output_ref_block = reinterpret_cast<tran_low_t *>
+        (vpx_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+    tran_low_t *output_block = reinterpret_cast<tran_low_t *>
+        (vpx_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-mask_, mask_].
+      for (int j = 0; j < num_coeffs_; ++j)
+        input_block[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+
+      fwd_txfm_ref(input_block, output_ref_block, pitch_, tx_type_);
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_block, output_block, pitch_));
+
+      // The minimum quant value is 4.
+      for (int j = 0; j < num_coeffs_; ++j) {
+        EXPECT_EQ(output_block[j], output_ref_block[j])
+            << "Error: not bit-exact result at index: " << j
+            << " at test block: " << i;
+      }
+    }
+    vpx_free(input_block);
+    vpx_free(output_ref_block);
+    vpx_free(output_block);
+  }
+
+  void RunMemCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 5000;
+
+    int16_t *input_extreme_block = reinterpret_cast<int16_t *>
+        (vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
+    tran_low_t *output_ref_block = reinterpret_cast<tran_low_t *>
+        (vpx_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+    tran_low_t *output_block = reinterpret_cast<tran_low_t *>
+        (vpx_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-mask_, mask_].
+      for (int j = 0; j < num_coeffs_; ++j) {
+        input_extreme_block[j] = rnd.Rand8() % 2 ? mask_ : -mask_;
+      }
+      if (i == 0) {
+        for (int j = 0; j < num_coeffs_; ++j)
+          input_extreme_block[j] = mask_;
+      } else if (i == 1) {
+        for (int j = 0; j < num_coeffs_; ++j)
+          input_extreme_block[j] = -mask_;
+      }
+
+      fwd_txfm_ref(input_extreme_block, output_ref_block, pitch_, tx_type_);
+      ASM_REGISTER_STATE_CHECK(RunFwdTxfm(input_extreme_block,
+                                          output_block, pitch_));
+
+      int row_length = FindRowLength();
+      // The minimum quant value is 4.
+      for (int j = 0; j < num_coeffs_; ++j) {
+        EXPECT_EQ(output_block[j], output_ref_block[j]);
+        EXPECT_GE(row_length * kDctMaxValue << (bit_depth_ - 8),
+                  abs(output_block[j]))
+            << "Error: NxN FDCT has coefficient larger than N*DCT_MAX_VALUE";
+      }
+    }
+    vpx_free(input_extreme_block);
+    vpx_free(output_ref_block);
+    vpx_free(output_block);
+  }
+
+  void RunInvAccuracyCheck(int limit) {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    const int count_test_block = 1000;
+
+    int16_t *in = reinterpret_cast<int16_t *>
+        (vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
+    tran_low_t *coeff = reinterpret_cast<tran_low_t *>
+        (vpx_memalign(16, sizeof(tran_low_t) * num_coeffs_));
+    uint8_t *dst = reinterpret_cast<uint8_t *>
+        (vpx_memalign(16, sizeof(uint8_t) * num_coeffs_));
+    uint8_t *src = reinterpret_cast<uint8_t *>
+        (vpx_memalign(16, sizeof(uint8_t) * num_coeffs_));
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint16_t *dst16 = reinterpret_cast<uint16_t *>
+        (vpx_memalign(16, sizeof(uint16_t) * num_coeffs_));
+    uint16_t *src16 = reinterpret_cast<uint16_t *>
+        (vpx_memalign(16, sizeof(uint16_t) * num_coeffs_));
+#endif
+
+    for (int i = 0; i < count_test_block; ++i) {
+      // Initialize a test block with input range [-mask_, mask_].
+      for (int j = 0; j < num_coeffs_; ++j) {
+        if (bit_depth_ == VPX_BITS_8) {
+          src[j] = rnd.Rand8();
+          dst[j] = rnd.Rand8();
+          in[j] = src[j] - dst[j];
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          src16[j] = rnd.Rand16() & mask_;
+          dst16[j] = rnd.Rand16() & mask_;
+          in[j] = src16[j] - dst16[j];
+#endif
+        }
+      }
+
+      fwd_txfm_ref(in, coeff, pitch_, tx_type_);
+
+      if (bit_depth_ == VPX_BITS_8) {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, dst, pitch_));
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        ASM_REGISTER_STATE_CHECK(RunInvTxfm(coeff, CONVERT_TO_BYTEPTR(dst16),
+                                            pitch_));
+#endif
+      }
+
+      for (int j = 0; j < num_coeffs_; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        const uint32_t diff =
+            bit_depth_ == VPX_BITS_8 ? dst[j] - src[j] : dst16[j] - src16[j];
+#else
+        const uint32_t diff = dst[j] - src[j];
+#endif
+        const uint32_t error = diff * diff;
+        EXPECT_GE(static_cast<uint32_t>(limit), error)
+            << "Error: 4x4 IDCT has error " << error
+            << " at index " << j;
+      }
+    }
+    vpx_free(in);
+    vpx_free(coeff);
+    vpx_free(dst);
+    vpx_free(src);
+#if CONFIG_VP9_HIGHBITDEPTH
+    vpx_free(src16);
+    vpx_free(dst16);
+#endif
+  }
+
+  int pitch_;
+  int tx_type_;
+  FhtFunc fwd_txfm_ref;
+  vpx_bit_depth_t bit_depth_;
+  int mask_;
+  int num_coeffs_;
+
+ private:
+  //  Assumes the transform size is one of 4x4, 8x8, 16x16, ...
+  int FindRowLength() const {
+    int row = 4;
+    if (16 == num_coeffs_) {
+      row = 4;
+    } else if (64 == num_coeffs_) {
+      row = 8;
+    } else if (256 == num_coeffs_) {
+      row = 16;
+    } else if (1024 == num_coeffs_) {
+      row = 32;
+    }
+    return row;
+  }
+};
+
+}  // namespace libvpx_test
+
+#endif  // TEST_TRANSFORM_TEST_BASE_H_
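
To show how this base class is meant to be consumed, here is a sketch of a derived 4x4 hybrid-transform test. The wiring below is illustrative only: it assumes the vp10_fht4x4_c/vp10_iht4x4_16_add_c reference functions with the signatures implied by FhtFunc above, and that the including .cc pulls in gtest, test/acm_random.h and test/clear_system_state.h, as existing transform tests do.

    // Sketch: bind a forward/inverse pair plus the block geometry, then
    // reuse the base class round-trip checks.
    class Trans4x4HT : public libvpx_test::TransformTestBase,
                       public ::testing::TestWithParam<int> {
     public:
      virtual void SetUp() {
        pitch_ = 4;
        num_coeffs_ = 16;       // 4x4 block
        tx_type_ = GetParam();  // DCT_DCT .. ADST_ADST
        bit_depth_ = VPX_BITS_8;
        mask_ = (1 << bit_depth_) - 1;
        fwd_txfm_ref = vp10_fht4x4_c;  // C reference used by RunCoeffCheck()
      }
      virtual void TearDown() { libvpx_test::ClearSystemState(); }

     protected:
      void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
        vp10_fht4x4_c(in, out, stride, tx_type_);
      }
      void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
        vp10_iht4x4_16_add_c(out, dst, stride, tx_type_);
      }
    };

    TEST_P(Trans4x4HT, AccuracyCheck) { RunAccuracyCheck(1); }

    INSTANTIATE_TEST_CASE_P(C, Trans4x4HT, ::testing::Range(0, 4));
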
diff --git a/test/variance_test.cc b/test/variance_test.cc
index cb63390..7eaed27 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -217,6 +217,7 @@
     : public ::testing::TestWithParam<tuple<int, int,
                                             VarianceFunctionType, int> > {
  public:
+  typedef tuple<int, int, VarianceFunctionType, int> ParamType;
   virtual void SetUp() {
     const tuple<int, int, VarianceFunctionType, int>& params = this->GetParam();
     log2width_  = get<0>(params);
@@ -765,53 +766,77 @@
                                           make_tuple(3, 4, &vpx_mse8x16_c),
                                           make_tuple(3, 3, &vpx_mse8x8_c)));
 
+const VpxVarianceTest::ParamType kArrayVariance_c[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(7, 7, &vpx_variance128x128_c, 0),
+    make_tuple(7, 6, &vpx_variance128x64_c, 0),
+    make_tuple(6, 7, &vpx_variance64x128_c, 0),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(6, 6, &vpx_variance64x64_c, 0),
+    make_tuple(6, 5, &vpx_variance64x32_c, 0),
+    make_tuple(5, 6, &vpx_variance32x64_c, 0),
+    make_tuple(5, 5, &vpx_variance32x32_c, 0),
+    make_tuple(5, 4, &vpx_variance32x16_c, 0),
+    make_tuple(4, 5, &vpx_variance16x32_c, 0),
+    make_tuple(4, 4, &vpx_variance16x16_c, 0),
+    make_tuple(4, 3, &vpx_variance16x8_c, 0),
+    make_tuple(3, 4, &vpx_variance8x16_c, 0),
+    make_tuple(3, 3, &vpx_variance8x8_c, 0),
+    make_tuple(3, 2, &vpx_variance8x4_c, 0),
+    make_tuple(2, 3, &vpx_variance4x8_c, 0),
+    make_tuple(2, 2, &vpx_variance4x4_c, 0)
+};
 INSTANTIATE_TEST_CASE_P(
     C, VpxVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_variance64x64_c, 0),
-                      make_tuple(6, 5, &vpx_variance64x32_c, 0),
-                      make_tuple(5, 6, &vpx_variance32x64_c, 0),
-                      make_tuple(5, 5, &vpx_variance32x32_c, 0),
-                      make_tuple(5, 4, &vpx_variance32x16_c, 0),
-                      make_tuple(4, 5, &vpx_variance16x32_c, 0),
-                      make_tuple(4, 4, &vpx_variance16x16_c, 0),
-                      make_tuple(4, 3, &vpx_variance16x8_c, 0),
-                      make_tuple(3, 4, &vpx_variance8x16_c, 0),
-                      make_tuple(3, 3, &vpx_variance8x8_c, 0),
-                      make_tuple(3, 2, &vpx_variance8x4_c, 0),
-                      make_tuple(2, 3, &vpx_variance4x8_c, 0),
-                      make_tuple(2, 2, &vpx_variance4x4_c, 0)));
+    ::testing::ValuesIn(kArrayVariance_c));
 
+const VpxSubpelVarianceTest::ParamType kArraySubpelVariance_c[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(7, 7, &vpx_sub_pixel_variance128x128_c, 0),
+    make_tuple(7, 6, &vpx_sub_pixel_variance128x64_c, 0),
+    make_tuple(6, 7, &vpx_sub_pixel_variance64x128_c, 0),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
+    make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0),
+    make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0),
+    make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0),
+    make_tuple(5, 4, &vpx_sub_pixel_variance32x16_c, 0),
+    make_tuple(4, 5, &vpx_sub_pixel_variance16x32_c, 0),
+    make_tuple(4, 4, &vpx_sub_pixel_variance16x16_c, 0),
+    make_tuple(4, 3, &vpx_sub_pixel_variance16x8_c, 0),
+    make_tuple(3, 4, &vpx_sub_pixel_variance8x16_c, 0),
+    make_tuple(3, 3, &vpx_sub_pixel_variance8x8_c, 0),
+    make_tuple(3, 2, &vpx_sub_pixel_variance8x4_c, 0),
+    make_tuple(2, 3, &vpx_sub_pixel_variance4x8_c, 0),
+    make_tuple(2, 2, &vpx_sub_pixel_variance4x4_c, 0)
+};
 INSTANTIATE_TEST_CASE_P(
     C, VpxSubpelVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_variance64x64_c, 0),
-                      make_tuple(6, 5, &vpx_sub_pixel_variance64x32_c, 0),
-                      make_tuple(5, 6, &vpx_sub_pixel_variance32x64_c, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_variance32x32_c, 0),
-                      make_tuple(5, 4, &vpx_sub_pixel_variance32x16_c, 0),
-                      make_tuple(4, 5, &vpx_sub_pixel_variance16x32_c, 0),
-                      make_tuple(4, 4, &vpx_sub_pixel_variance16x16_c, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_variance16x8_c, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_variance8x16_c, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_variance8x8_c, 0),
-                      make_tuple(3, 2, &vpx_sub_pixel_variance8x4_c, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_variance4x8_c, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_variance4x4_c, 0)));
+    ::testing::ValuesIn(kArraySubpelVariance_c));
 
+const VpxSubpelAvgVarianceTest::ParamType kArraySubpelAvgVariance_c[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(7, 7, &vpx_sub_pixel_avg_variance128x128_c, 0),
+    make_tuple(7, 6, &vpx_sub_pixel_avg_variance128x64_c, 0),
+    make_tuple(6, 7, &vpx_sub_pixel_avg_variance64x128_c, 0),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
+    make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0),
+    make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0),
+    make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0),
+    make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0),
+    make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0),
+    make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0),
+    make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0),
+    make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0),
+    make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0),
+    make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0),
+    make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0),
+    make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)
+};
 INSTANTIATE_TEST_CASE_P(
     C, VpxSubpelAvgVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_sub_pixel_avg_variance64x64_c, 0),
-                      make_tuple(6, 5, &vpx_sub_pixel_avg_variance64x32_c, 0),
-                      make_tuple(5, 6, &vpx_sub_pixel_avg_variance32x64_c, 0),
-                      make_tuple(5, 5, &vpx_sub_pixel_avg_variance32x32_c, 0),
-                      make_tuple(5, 4, &vpx_sub_pixel_avg_variance32x16_c, 0),
-                      make_tuple(4, 5, &vpx_sub_pixel_avg_variance16x32_c, 0),
-                      make_tuple(4, 4, &vpx_sub_pixel_avg_variance16x16_c, 0),
-                      make_tuple(4, 3, &vpx_sub_pixel_avg_variance16x8_c, 0),
-                      make_tuple(3, 4, &vpx_sub_pixel_avg_variance8x16_c, 0),
-                      make_tuple(3, 3, &vpx_sub_pixel_avg_variance8x8_c, 0),
-                      make_tuple(3, 2, &vpx_sub_pixel_avg_variance8x4_c, 0),
-                      make_tuple(2, 3, &vpx_sub_pixel_avg_variance4x8_c, 0),
-                      make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_c, 0)));
+    ::testing::ValuesIn(kArraySubpelAvgVariance_c));
 
 #if CONFIG_VP9_HIGHBITDEPTH
 typedef MseTest<VarianceMxNFunc> VpxHBDMseTest;
@@ -847,133 +872,194 @@
                       make_tuple(4, 4, &vpx_highbd_8_mse8x8_c)));
 */
 
+const VpxHBDVarianceTest::ParamType kArrayHBDVariance_c[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(7, 7, &vpx_highbd_12_variance128x128_c, 12),
+    make_tuple(7, 6, &vpx_highbd_12_variance128x64_c, 12),
+    make_tuple(6, 7, &vpx_highbd_12_variance64x128_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(6, 6, &vpx_highbd_12_variance64x64_c, 12),
+    make_tuple(6, 5, &vpx_highbd_12_variance64x32_c, 12),
+    make_tuple(5, 6, &vpx_highbd_12_variance32x64_c, 12),
+    make_tuple(5, 5, &vpx_highbd_12_variance32x32_c, 12),
+    make_tuple(5, 4, &vpx_highbd_12_variance32x16_c, 12),
+    make_tuple(4, 5, &vpx_highbd_12_variance16x32_c, 12),
+    make_tuple(4, 4, &vpx_highbd_12_variance16x16_c, 12),
+    make_tuple(4, 3, &vpx_highbd_12_variance16x8_c, 12),
+    make_tuple(3, 4, &vpx_highbd_12_variance8x16_c, 12),
+    make_tuple(3, 3, &vpx_highbd_12_variance8x8_c, 12),
+    make_tuple(3, 2, &vpx_highbd_12_variance8x4_c, 12),
+    make_tuple(2, 3, &vpx_highbd_12_variance4x8_c, 12),
+    make_tuple(2, 2, &vpx_highbd_12_variance4x4_c, 12),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(7, 7, &vpx_highbd_10_variance128x128_c, 10),
+    make_tuple(7, 6, &vpx_highbd_10_variance128x64_c, 10),
+    make_tuple(6, 7, &vpx_highbd_10_variance64x128_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(6, 6, &vpx_highbd_10_variance64x64_c, 10),
+    make_tuple(6, 5, &vpx_highbd_10_variance64x32_c, 10),
+    make_tuple(5, 6, &vpx_highbd_10_variance32x64_c, 10),
+    make_tuple(5, 5, &vpx_highbd_10_variance32x32_c, 10),
+    make_tuple(5, 4, &vpx_highbd_10_variance32x16_c, 10),
+    make_tuple(4, 5, &vpx_highbd_10_variance16x32_c, 10),
+    make_tuple(4, 4, &vpx_highbd_10_variance16x16_c, 10),
+    make_tuple(4, 3, &vpx_highbd_10_variance16x8_c, 10),
+    make_tuple(3, 4, &vpx_highbd_10_variance8x16_c, 10),
+    make_tuple(3, 3, &vpx_highbd_10_variance8x8_c, 10),
+    make_tuple(3, 2, &vpx_highbd_10_variance8x4_c, 10),
+    make_tuple(2, 3, &vpx_highbd_10_variance4x8_c, 10),
+    make_tuple(2, 2, &vpx_highbd_10_variance4x4_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(7, 7, &vpx_highbd_8_variance128x128_c, 8),
+    make_tuple(7, 6, &vpx_highbd_8_variance128x64_c, 8),
+    make_tuple(6, 7, &vpx_highbd_8_variance64x128_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(6, 6, &vpx_highbd_8_variance64x64_c, 8),
+    make_tuple(6, 5, &vpx_highbd_8_variance64x32_c, 8),
+    make_tuple(5, 6, &vpx_highbd_8_variance32x64_c, 8),
+    make_tuple(5, 5, &vpx_highbd_8_variance32x32_c, 8),
+    make_tuple(5, 4, &vpx_highbd_8_variance32x16_c, 8),
+    make_tuple(4, 5, &vpx_highbd_8_variance16x32_c, 8),
+    make_tuple(4, 4, &vpx_highbd_8_variance16x16_c, 8),
+    make_tuple(4, 3, &vpx_highbd_8_variance16x8_c, 8),
+    make_tuple(3, 4, &vpx_highbd_8_variance8x16_c, 8),
+    make_tuple(3, 3, &vpx_highbd_8_variance8x8_c, 8),
+    make_tuple(3, 2, &vpx_highbd_8_variance8x4_c, 8),
+    make_tuple(2, 3, &vpx_highbd_8_variance4x8_c, 8),
+    make_tuple(2, 2, &vpx_highbd_8_variance4x4_c, 8)
+};
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDVarianceTest,
-    ::testing::Values(make_tuple(6, 6, &vpx_highbd_12_variance64x64_c, 12),
-                      make_tuple(6, 5, &vpx_highbd_12_variance64x32_c, 12),
-                      make_tuple(5, 6, &vpx_highbd_12_variance32x64_c, 12),
-                      make_tuple(5, 5, &vpx_highbd_12_variance32x32_c, 12),
-                      make_tuple(5, 4, &vpx_highbd_12_variance32x16_c, 12),
-                      make_tuple(4, 5, &vpx_highbd_12_variance16x32_c, 12),
-                      make_tuple(4, 4, &vpx_highbd_12_variance16x16_c, 12),
-                      make_tuple(4, 3, &vpx_highbd_12_variance16x8_c, 12),
-                      make_tuple(3, 4, &vpx_highbd_12_variance8x16_c, 12),
-                      make_tuple(3, 3, &vpx_highbd_12_variance8x8_c, 12),
-                      make_tuple(3, 2, &vpx_highbd_12_variance8x4_c, 12),
-                      make_tuple(2, 3, &vpx_highbd_12_variance4x8_c, 12),
-                      make_tuple(2, 2, &vpx_highbd_12_variance4x4_c, 12),
-                      make_tuple(6, 6, &vpx_highbd_10_variance64x64_c, 10),
-                      make_tuple(6, 5, &vpx_highbd_10_variance64x32_c, 10),
-                      make_tuple(5, 6, &vpx_highbd_10_variance32x64_c, 10),
-                      make_tuple(5, 5, &vpx_highbd_10_variance32x32_c, 10),
-                      make_tuple(5, 4, &vpx_highbd_10_variance32x16_c, 10),
-                      make_tuple(4, 5, &vpx_highbd_10_variance16x32_c, 10),
-                      make_tuple(4, 4, &vpx_highbd_10_variance16x16_c, 10),
-                      make_tuple(4, 3, &vpx_highbd_10_variance16x8_c, 10),
-                      make_tuple(3, 4, &vpx_highbd_10_variance8x16_c, 10),
-                      make_tuple(3, 3, &vpx_highbd_10_variance8x8_c, 10),
-                      make_tuple(3, 2, &vpx_highbd_10_variance8x4_c, 10),
-                      make_tuple(2, 3, &vpx_highbd_10_variance4x8_c, 10),
-                      make_tuple(2, 2, &vpx_highbd_10_variance4x4_c, 10),
-                      make_tuple(6, 6, &vpx_highbd_8_variance64x64_c, 8),
-                      make_tuple(6, 5, &vpx_highbd_8_variance64x32_c, 8),
-                      make_tuple(5, 6, &vpx_highbd_8_variance32x64_c, 8),
-                      make_tuple(5, 5, &vpx_highbd_8_variance32x32_c, 8),
-                      make_tuple(5, 4, &vpx_highbd_8_variance32x16_c, 8),
-                      make_tuple(4, 5, &vpx_highbd_8_variance16x32_c, 8),
-                      make_tuple(4, 4, &vpx_highbd_8_variance16x16_c, 8),
-                      make_tuple(4, 3, &vpx_highbd_8_variance16x8_c, 8),
-                      make_tuple(3, 4, &vpx_highbd_8_variance8x16_c, 8),
-                      make_tuple(3, 3, &vpx_highbd_8_variance8x8_c, 8),
-                      make_tuple(3, 2, &vpx_highbd_8_variance8x4_c, 8),
-                      make_tuple(2, 3, &vpx_highbd_8_variance4x8_c, 8),
-                      make_tuple(2, 2, &vpx_highbd_8_variance4x4_c, 8)));
+    ::testing::ValuesIn(kArrayHBDVariance_c));
 
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VpxHBDVarianceTest,
+    ::testing::Values(
+         make_tuple(2, 2, &vpx_highbd_8_variance4x4_sse4_1, 8),
+         make_tuple(2, 2, &vpx_highbd_10_variance4x4_sse4_1, 10),
+         make_tuple(2, 2, &vpx_highbd_12_variance4x4_sse4_1, 12)));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
+const VpxHBDSubpelVarianceTest::ParamType kArrayHBDSubpelVariance_c[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(7, 7, &vpx_highbd_8_sub_pixel_variance128x128_c, 8),
+    make_tuple(7, 6, &vpx_highbd_8_sub_pixel_variance128x64_c, 8),
+    make_tuple(6, 7, &vpx_highbd_8_sub_pixel_variance64x128_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
+    make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8),
+    make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8),
+    make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8),
+    make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8),
+    make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8),
+    make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8),
+    make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8),
+    make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8),
+    make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8),
+    make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8),
+    make_tuple(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8),
+    make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(7, 7, &vpx_highbd_10_sub_pixel_variance128x128_c, 10),
+    make_tuple(7, 6, &vpx_highbd_10_sub_pixel_variance128x64_c, 10),
+    make_tuple(6, 7, &vpx_highbd_10_sub_pixel_variance64x128_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, 10),
+    make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, 10),
+    make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, 10),
+    make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, 10),
+    make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, 10),
+    make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, 10),
+    make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, 10),
+    make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10),
+    make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10),
+    make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10),
+    make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10),
+    make_tuple(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10),
+    make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(7, 7, &vpx_highbd_12_sub_pixel_variance128x128_c, 12),
+    make_tuple(7, 6, &vpx_highbd_12_sub_pixel_variance128x64_c, 12),
+    make_tuple(6, 7, &vpx_highbd_12_sub_pixel_variance64x128_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, 12),
+    make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, 12),
+    make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, 12),
+    make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, 12),
+    make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, 12),
+    make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, 12),
+    make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, 12),
+    make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12),
+    make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12),
+    make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12),
+    make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12),
+    make_tuple(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12),
+    make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, 12)
+};
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDSubpelVarianceTest,
-    ::testing::Values(
-        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_variance64x64_c, 8),
-        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_variance64x32_c, 8),
-        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_variance32x64_c, 8),
-        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_variance32x32_c, 8),
-        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_variance32x16_c, 8),
-        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_variance16x32_c, 8),
-        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_variance16x16_c, 8),
-        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_variance16x8_c, 8),
-        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_variance8x16_c, 8),
-        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_variance8x8_c, 8),
-        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_variance8x4_c, 8),
-        make_tuple(2, 3, &vpx_highbd_8_sub_pixel_variance4x8_c, 8),
-        make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_c, 8),
-        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_variance64x64_c, 10),
-        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_variance64x32_c, 10),
-        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_variance32x64_c, 10),
-        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_variance32x32_c, 10),
-        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_variance32x16_c, 10),
-        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_variance16x32_c, 10),
-        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_variance16x16_c, 10),
-        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_variance16x8_c, 10),
-        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_variance8x16_c, 10),
-        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_variance8x8_c, 10),
-        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_variance8x4_c, 10),
-        make_tuple(2, 3, &vpx_highbd_10_sub_pixel_variance4x8_c, 10),
-        make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_c, 10),
-        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_variance64x64_c, 12),
-        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_variance64x32_c, 12),
-        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_variance32x64_c, 12),
-        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_variance32x32_c, 12),
-        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_variance32x16_c, 12),
-        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_variance16x32_c, 12),
-        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_variance16x16_c, 12),
-        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_variance16x8_c, 12),
-        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_variance8x16_c, 12),
-        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_variance8x8_c, 12),
-        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_variance8x4_c, 12),
-        make_tuple(2, 3, &vpx_highbd_12_sub_pixel_variance4x8_c, 12),
-        make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_c, 12)));
+    ::testing::ValuesIn(kArrayHBDSubpelVariance_c));
 
+const VpxHBDSubpelAvgVarianceTest::ParamType kArrayHBDSubpelAvgVariance_c[] = {
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(7, 7, &vpx_highbd_8_sub_pixel_avg_variance128x128_c, 8),
+    make_tuple(7, 6, &vpx_highbd_8_sub_pixel_avg_variance128x64_c, 8),
+    make_tuple(6, 7, &vpx_highbd_8_sub_pixel_avg_variance64x128_c, 8),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8),
+    make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8),
+    make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8),
+    make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8),
+    make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8),
+    make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8),
+    make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8),
+    make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8),
+    make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8),
+    make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, 8),
+    make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, 8),
+    make_tuple(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, 8),
+    make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, 8),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(7, 7, &vpx_highbd_10_sub_pixel_avg_variance128x128_c, 10),
+    make_tuple(7, 6, &vpx_highbd_10_sub_pixel_avg_variance128x64_c, 10),
+    make_tuple(6, 7, &vpx_highbd_10_sub_pixel_avg_variance64x128_c, 10),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_c, 10),
+    make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_c, 10),
+    make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_c, 10),
+    make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_c, 10),
+    make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_c, 10),
+    make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_c, 10),
+    make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_c, 10),
+    make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_c, 10),
+    make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_c, 10),
+    make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10),
+    make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10),
+    make_tuple(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10),
+    make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10),
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(7, 7, &vpx_highbd_12_sub_pixel_avg_variance128x128_c, 12),
+    make_tuple(7, 6, &vpx_highbd_12_sub_pixel_avg_variance128x64_c, 12),
+    make_tuple(6, 7, &vpx_highbd_12_sub_pixel_avg_variance64x128_c, 12),
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+    make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_c, 12),
+    make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_c, 12),
+    make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_c, 12),
+    make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_c, 12),
+    make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_c, 12),
+    make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_c, 12),
+    make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_c, 12),
+    make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_c, 12),
+    make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_c, 12),
+    make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12),
+    make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12),
+    make_tuple(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12),
+    make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_c, 12)
+};
 INSTANTIATE_TEST_CASE_P(
     C, VpxHBDSubpelAvgVarianceTest,
-    ::testing::Values(
-        make_tuple(6, 6, &vpx_highbd_8_sub_pixel_avg_variance64x64_c, 8),
-        make_tuple(6, 5, &vpx_highbd_8_sub_pixel_avg_variance64x32_c, 8),
-        make_tuple(5, 6, &vpx_highbd_8_sub_pixel_avg_variance32x64_c, 8),
-        make_tuple(5, 5, &vpx_highbd_8_sub_pixel_avg_variance32x32_c, 8),
-        make_tuple(5, 4, &vpx_highbd_8_sub_pixel_avg_variance32x16_c, 8),
-        make_tuple(4, 5, &vpx_highbd_8_sub_pixel_avg_variance16x32_c, 8),
-        make_tuple(4, 4, &vpx_highbd_8_sub_pixel_avg_variance16x16_c, 8),
-        make_tuple(4, 3, &vpx_highbd_8_sub_pixel_avg_variance16x8_c, 8),
-        make_tuple(3, 4, &vpx_highbd_8_sub_pixel_avg_variance8x16_c, 8),
-        make_tuple(3, 3, &vpx_highbd_8_sub_pixel_avg_variance8x8_c, 8),
-        make_tuple(3, 2, &vpx_highbd_8_sub_pixel_avg_variance8x4_c, 8),
-        make_tuple(2, 3, &vpx_highbd_8_sub_pixel_avg_variance4x8_c, 8),
-        make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_c, 8),
-        make_tuple(6, 6, &vpx_highbd_10_sub_pixel_avg_variance64x64_c, 10),
-        make_tuple(6, 5, &vpx_highbd_10_sub_pixel_avg_variance64x32_c, 10),
-        make_tuple(5, 6, &vpx_highbd_10_sub_pixel_avg_variance32x64_c, 10),
-        make_tuple(5, 5, &vpx_highbd_10_sub_pixel_avg_variance32x32_c, 10),
-        make_tuple(5, 4, &vpx_highbd_10_sub_pixel_avg_variance32x16_c, 10),
-        make_tuple(4, 5, &vpx_highbd_10_sub_pixel_avg_variance16x32_c, 10),
-        make_tuple(4, 4, &vpx_highbd_10_sub_pixel_avg_variance16x16_c, 10),
-        make_tuple(4, 3, &vpx_highbd_10_sub_pixel_avg_variance16x8_c, 10),
-        make_tuple(3, 4, &vpx_highbd_10_sub_pixel_avg_variance8x16_c, 10),
-        make_tuple(3, 3, &vpx_highbd_10_sub_pixel_avg_variance8x8_c, 10),
-        make_tuple(3, 2, &vpx_highbd_10_sub_pixel_avg_variance8x4_c, 10),
-        make_tuple(2, 3, &vpx_highbd_10_sub_pixel_avg_variance4x8_c, 10),
-        make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_c, 10),
-        make_tuple(6, 6, &vpx_highbd_12_sub_pixel_avg_variance64x64_c, 12),
-        make_tuple(6, 5, &vpx_highbd_12_sub_pixel_avg_variance64x32_c, 12),
-        make_tuple(5, 6, &vpx_highbd_12_sub_pixel_avg_variance32x64_c, 12),
-        make_tuple(5, 5, &vpx_highbd_12_sub_pixel_avg_variance32x32_c, 12),
-        make_tuple(5, 4, &vpx_highbd_12_sub_pixel_avg_variance32x16_c, 12),
-        make_tuple(4, 5, &vpx_highbd_12_sub_pixel_avg_variance16x32_c, 12),
-        make_tuple(4, 4, &vpx_highbd_12_sub_pixel_avg_variance16x16_c, 12),
-        make_tuple(4, 3, &vpx_highbd_12_sub_pixel_avg_variance16x8_c, 12),
-        make_tuple(3, 4, &vpx_highbd_12_sub_pixel_avg_variance8x16_c, 12),
-        make_tuple(3, 3, &vpx_highbd_12_sub_pixel_avg_variance8x8_c, 12),
-        make_tuple(3, 2, &vpx_highbd_12_sub_pixel_avg_variance8x4_c, 12),
-        make_tuple(2, 3, &vpx_highbd_12_sub_pixel_avg_variance4x8_c, 12),
-        make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_c, 12)));
+    ::testing::ValuesIn(kArrayHBDSubpelAvgVariance_c));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #if HAVE_SSE2
@@ -1037,6 +1123,22 @@
         make_tuple(2, 2, &vpx_sub_pixel_avg_variance4x4_sse2, 0)));
 #endif  // CONFIG_USE_X86INC
 
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VpxSubpelVarianceTest,
+    ::testing::Values(
+         make_tuple(2, 2, &vpx_highbd_8_sub_pixel_variance4x4_sse4_1, 8),
+         make_tuple(2, 2, &vpx_highbd_10_sub_pixel_variance4x4_sse4_1, 10),
+         make_tuple(2, 2, &vpx_highbd_12_sub_pixel_variance4x4_sse4_1, 12)));
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VpxSubpelAvgVarianceTest,
+    ::testing::Values(
+        make_tuple(2, 2, &vpx_highbd_8_sub_pixel_avg_variance4x4_sse4_1, 8),
+        make_tuple(2, 2, &vpx_highbd_10_sub_pixel_avg_variance4x4_sse4_1, 10),
+        make_tuple(2, 2, &vpx_highbd_12_sub_pixel_avg_variance4x4_sse4_1, 12)));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
 #if CONFIG_VP9_HIGHBITDEPTH
 /* TODO(debargha): This test does not support the highbd version
 INSTANTIATE_TEST_CASE_P(
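
The recurring change in variance_test.cc replaces long ::testing::Values(...) argument lists with named arrays consumed through ::testing::ValuesIn. The reason is visible in the arrays themselves: preprocessor conditionals are not permitted inside a macro's argument list (INSTANTIATE_TEST_CASE_P is a macro), but are fine inside a braced array initializer, so the CONFIG_EXT_PARTITION 128-pixel cases can be guarded in place. A minimal sketch of the pattern, with hypothetical names:

    const VpxVarianceTest::ParamType kSketch[] = {
    #if CONFIG_VP10 && CONFIG_EXT_PARTITION
        make_tuple(7, 7, &vpx_variance128x128_c, 0),  // only with ext-partition
    #endif
        make_tuple(2, 2, &vpx_variance4x4_c, 0),
    };
    INSTANTIATE_TEST_CASE_P(Sketch, VpxVarianceTest,
                            ::testing::ValuesIn(kSketch));
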
diff --git a/test/vp10_ans_test.cc b/test/vp10_ans_test.cc
new file mode 100644
index 0000000..20aedba
--- /dev/null
+++ b/test/vp10_ans_test.cc
@@ -0,0 +1,339 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#define VP10_FORCE_VPXBOOL_TREEWRITER
+
+#include <assert.h>
+#include <math.h>
+#include <stdio.h>
+#include <ctime>
+#include <utility>
+#include <vector>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "vp10/common/ans.h"
+#include "vp10/encoder/treewriter.h"
+#include "vpx_dsp/bitreader.h"
+#include "vpx_dsp/bitwriter.h"
+
+namespace {
+typedef std::vector<std::pair<uint8_t, bool> > PvVec;
+
+PvVec abs_encode_build_vals(int iters) {
+  PvVec ret;
+  libvpx_test::ACMRandom gen(0x30317076);
+  double entropy = 0;
+  for (int i = 0; i < iters; ++i) {
+    uint8_t p;
+    do {
+      p = gen.Rand8();
+    } while (p == 0);  // zero is not a valid coding probability
+    bool b = gen.Rand8() < p;
+    ret.push_back(std::make_pair(static_cast<uint8_t>(p), b));
+    double d = p / 256.;
+    entropy += -d * log2(d) - (1 - d) * log2(1 - d);
+  }
+  printf("entropy %f\n", entropy);
+  return ret;
+}
+
+bool check_rabs(const PvVec &pv_vec, uint8_t *buf) {
+  AnsCoder a;
+  ans_write_init(&a, buf);
+
+  std::clock_t start = std::clock();
+  for (PvVec::const_reverse_iterator it = pv_vec.rbegin(); it != pv_vec.rend();
+       ++it) {
+    rabs_write(&a, it->second, 256 - it->first);
+  }
+  std::clock_t enc_time = std::clock() - start;
+  int offset = ans_write_end(&a);
+  bool okay = true;
+  AnsDecoder d;
+  if (ans_read_init(&d, buf, offset)) return false;
+  start = std::clock();
+  for (PvVec::const_iterator it = pv_vec.begin(); it != pv_vec.end(); ++it) {
+    okay &= rabs_read(&d, 256 - it->first) == it->second;
+  }
+  std::clock_t dec_time = std::clock() - start;
+  if (!okay) return false;
+  printf("rABS size %d enc_time %f dec_time %f\n", offset,
+         static_cast<float>(enc_time) / CLOCKS_PER_SEC,
+         static_cast<float>(dec_time) / CLOCKS_PER_SEC);
+  return ans_read_end(&d);
+}
+
+bool check_rabs_asc(const PvVec &pv_vec, uint8_t *buf) {
+  AnsCoder a;
+  ans_write_init(&a, buf);
+
+  std::clock_t start = std::clock();
+  for (PvVec::const_reverse_iterator it = pv_vec.rbegin(); it != pv_vec.rend();
+       ++it) {
+    rabs_asc_write(&a, it->second, 256 - it->first);
+  }
+  std::clock_t enc_time = std::clock() - start;
+  int offset = ans_write_end(&a);
+  bool okay = true;
+  AnsDecoder d;
+  if (ans_read_init(&d, buf, offset)) return false;
+  start = std::clock();
+  for (PvVec::const_iterator it = pv_vec.begin(); it != pv_vec.end(); ++it) {
+    okay &= rabs_asc_read(&d, 256 - it->first) == it->second;
+  }
+  std::clock_t dec_time = std::clock() - start;
+  if (!okay) return false;
+  printf("rABS (asc) size %d enc_time %f dec_time %f\n", offset,
+         static_cast<float>(enc_time) / CLOCKS_PER_SEC,
+         static_cast<float>(dec_time) / CLOCKS_PER_SEC);
+  return ans_read_end(&d);
+}
+
+bool check_uabs(const PvVec &pv_vec, uint8_t *buf) {
+  AnsCoder a;
+  ans_write_init(&a, buf);
+
+  std::clock_t start = std::clock();
+  for (PvVec::const_reverse_iterator it = pv_vec.rbegin(); it != pv_vec.rend();
+       ++it) {
+    uabs_write(&a, it->second, 256 - it->first);
+  }
+  std::clock_t enc_time = std::clock() - start;
+  int offset = ans_write_end(&a);
+  bool okay = true;
+  AnsDecoder d;
+  if (ans_read_init(&d, buf, offset)) return false;
+  start = std::clock();
+  for (PvVec::const_iterator it = pv_vec.begin(); it != pv_vec.end(); ++it) {
+    okay &= uabs_read(&d, 256 - it->first) == it->second;
+  }
+  std::clock_t dec_time = std::clock() - start;
+  if (!okay) return false;
+  printf("uABS size %d enc_time %f dec_time %f\n", offset,
+         static_cast<float>(enc_time) / CLOCKS_PER_SEC,
+         static_cast<float>(dec_time) / CLOCKS_PER_SEC);
+  return ans_read_end(&d);
+}
+
+bool check_vpxbool(const PvVec &pv_vec, uint8_t *buf) {
+  vpx_writer w;
+  vpx_reader r;
+  vpx_start_encode(&w, buf);
+
+  std::clock_t start = std::clock();
+  for (PvVec::const_iterator it = pv_vec.begin(); it != pv_vec.end(); ++it) {
+    vpx_write(&w, it->second, 256 - it->first);
+  }
+  std::clock_t enc_time = std::clock() - start;
+  vpx_stop_encode(&w);
+  bool okay = true;
+  vpx_reader_init(&r, buf, w.pos, NULL, NULL);
+  start = std::clock();
+  for (PvVec::const_iterator it = pv_vec.begin(); it != pv_vec.end(); ++it) {
+    okay &= vpx_read(&r, 256 - it->first) == it->second;
+  }
+  std::clock_t dec_time = std::clock() - start;
+  printf("VPX size %d enc_time %f dec_time %f\n", w.pos,
+         static_cast<float>(enc_time) / CLOCKS_PER_SEC,
+         static_cast<float>(dec_time) / CLOCKS_PER_SEC);
+  return okay;
+}
+
+// TODO(aconverse): replace this with a more representative distribution from
+// the codec.
+const rans_sym rans_sym_tab[] = {
+    {16 * 4, 0 * 4}, {100 * 4, 16 * 4}, {70 * 4, 116 * 4}, {70 * 4, 186 * 4},
+};
+const int kDistinctSyms = sizeof(rans_sym_tab) / sizeof(rans_sym_tab[0]);
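+// Illustrative invariant (assuming rans_precision in ans.h is 1024, which
+// the "* 4" scaling of these 8-bit probabilities suggests): the probs sum
+// to (16 + 100 + 70 + 70) * 4 == 1024, and each cum_prob is the running
+// sum of the preceding probs -- ans_encode_build_vals() below relies on
+// this when it tiles p_to_sym across the full precision range.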
+
+std::vector<int> ans_encode_build_vals(const rans_sym *tab, int iters) {
+  std::vector<int> p_to_sym;
+  int i = 0;
+  while (p_to_sym.size() < rans_precision) {
+    p_to_sym.insert(p_to_sym.end(), tab[i].prob, i);
+    ++i;
+  }
+  assert(p_to_sym.size() == rans_precision);
+  std::vector<int> ret;
+  libvpx_test::ACMRandom gen(18543637);
+  for (int i = 0; i < iters; ++i) {
+    int sym = p_to_sym[gen.Rand8() * 4];
+    ret.push_back(sym);
+  }
+  return ret;
+}
+
+void rans_build_dec_tab(const struct rans_sym sym_tab[],
+                        rans_dec_lut dec_tab) {
+  dec_tab[0] = 0;
+  for (int i = 1; dec_tab[i - 1] < rans_precision; ++i) {
+    dec_tab[i] = dec_tab[i - 1] + sym_tab[i - 1].prob;
+  }
+}
+
+bool check_rans(const std::vector<int> &sym_vec, const rans_sym *const tab,
+                uint8_t *buf) {
+  AnsCoder a;
+  ans_write_init(&a, buf);
+  rans_dec_lut dec_tab;
+  rans_build_dec_tab(tab, dec_tab);
+
+  std::clock_t start = std::clock();
+  for (std::vector<int>::const_reverse_iterator it = sym_vec.rbegin();
+       it != sym_vec.rend(); ++it) {
+    rans_write(&a, &tab[*it]);
+  }
+  std::clock_t enc_time = std::clock() - start;
+  int offset = ans_write_end(&a);
+  bool okay = true;
+  AnsDecoder d;
+  if (ans_read_init(&d, buf, offset)) return false;
+  start = std::clock();
+  for (std::vector<int>::const_iterator it = sym_vec.begin();
+       it != sym_vec.end(); ++it) {
+    okay &= rans_read(&d, dec_tab) == *it;
+  }
+  std::clock_t dec_time = std::clock() - start;
+  if (!okay) return false;
+  printf("rANS size %d enc_time %f dec_time %f\n", offset,
+         static_cast<float>(enc_time) / CLOCKS_PER_SEC,
+         static_cast<float>(dec_time) / CLOCKS_PER_SEC);
+  return ans_read_end(&d);
+}
+
+void build_tree(vpx_tree_index *tree, int num_syms) {
+  vpx_tree_index i;
+  int sym = 0;
+  for (i = 0; i < num_syms - 1; ++i) {
+    tree[2 * i] = sym--;
+    tree[2 * i + 1] = 2 * (i + 1);
+  }
+  tree[2 * i - 1] = sym;
+}
+
+/* The treep array contains the probabilities of nodes of a tree structured
+ * like:
+ *          *
+ *         / \
+ *    -sym0   *
+ *           / \
+ *       -sym1  *
+ *             / \
+ *        -sym2  -sym3
+ */
+void tab2tree(const rans_sym *tab, int tab_size, vpx_prob *treep) {
+  const unsigned basep = rans_precision;
+  unsigned pleft = basep;
+  for (int i = 0; i < tab_size - 1; ++i) {
+    unsigned prob = (tab[i].prob * basep + basep * 2) / (pleft * 4);
+    assert(prob > 0 && prob < 256);
+    treep[i] = prob;
+    pleft -= tab[i].prob;
+  }
+}
+
+struct sym_bools {
+  unsigned bits;
+  int len;
+};
+
+static void make_tree_bits_tab(sym_bools *tab, int num_syms) {
+  unsigned bits = 0;
+  int len = 0;
+  int i;
+  for (i = 0; i < num_syms - 1; ++i) {
+    bits *= 2;
+    ++len;
+    tab[i].bits = bits;
+    tab[i].len = len;
+    ++bits;
+  }
+  tab[i].bits = bits;
+  tab[i].len = len;
+}
+
+void build_tpb(vpx_prob probs[/*num_syms*/],
+               vpx_tree_index tree[/*2*num_syms*/],
+               sym_bools bit_len[/*num_syms*/],
+               const rans_sym sym_tab[/*num_syms*/], int num_syms) {
+  tab2tree(sym_tab, num_syms, probs);
+  build_tree(tree, num_syms);
+  make_tree_bits_tab(bit_len, num_syms);
+}
+
+bool check_vpxtree(const std::vector<int> &sym_vec, const rans_sym *sym_tab,
+                   uint8_t *buf) {
+  vpx_writer w;
+  vpx_reader r;
+  vpx_start_encode(&w, buf);
+
+  vpx_prob probs[kDistinctSyms];
+  vpx_tree_index tree[2 * kDistinctSyms];
+  sym_bools bit_len[kDistinctSyms];
+  build_tpb(probs, tree, bit_len, sym_tab, kDistinctSyms);
+
+  std::clock_t start = std::clock();
+  for (std::vector<int>::const_iterator it = sym_vec.begin();
+       it != sym_vec.end(); ++it) {
+    vp10_write_tree(&w, tree, probs, bit_len[*it].bits, bit_len[*it].len, 0);
+  }
+  std::clock_t enc_time = std::clock() - start;
+  vpx_stop_encode(&w);
+  vpx_reader_init(&r, buf, w.pos, NULL, NULL);
+  start = std::clock();
+  for (std::vector<int>::const_iterator it = sym_vec.begin();
+       it != sym_vec.end(); ++it) {
+    if (vpx_read_tree(&r, tree, probs) != *it) return false;
+  }
+  std::clock_t dec_time = std::clock() - start;
+  printf("VPXtree size %u enc_time %f dec_time %f\n", w.pos,
+         static_cast<float>(enc_time) / CLOCKS_PER_SEC,
+         static_cast<float>(dec_time) / CLOCKS_PER_SEC);
+  return true;
+}
+
+class Vp10AbsTest : public ::testing::Test {
+ protected:
+  static void SetUpTestCase() { pv_vec_ = abs_encode_build_vals(kNumBools); }
+  virtual void SetUp() { buf_ = new uint8_t[kNumBools / 8]; }
+  virtual void TearDown() { delete[] buf_; }
+  static const int kNumBools = 100000000;
+  static PvVec pv_vec_;
+  uint8_t *buf_;
+};
+PvVec Vp10AbsTest::pv_vec_;
+
+class Vp10AnsTest : public ::testing::Test {
+ protected:
+  static void SetUpTestCase() {
+    sym_vec_ = ans_encode_build_vals(rans_sym_tab, kNumSyms);
+  }
+  virtual void SetUp() { buf_ = new uint8_t[kNumSyms / 2]; }
+  virtual void TearDown() { delete[] buf_; }
+  static const int kNumSyms = 25000000;
+  static std::vector<int> sym_vec_;
+  uint8_t *buf_;
+};
+std::vector<int> Vp10AnsTest::sym_vec_;
+
+TEST_F(Vp10AbsTest, Vpxbool) { EXPECT_TRUE(check_vpxbool(pv_vec_, buf_)); }
+TEST_F(Vp10AbsTest, Rabs) { EXPECT_TRUE(check_rabs(pv_vec_, buf_)); }
+TEST_F(Vp10AbsTest, RabsAsc) { EXPECT_TRUE(check_rabs_asc(pv_vec_, buf_)); }
+TEST_F(Vp10AbsTest, Uabs) { EXPECT_TRUE(check_uabs(pv_vec_, buf_)); }
+
+TEST_F(Vp10AnsTest, Rans) {
+  EXPECT_TRUE(check_rans(sym_vec_, rans_sym_tab, buf_));
+}
+TEST_F(Vp10AnsTest, Vpxtree) {
+  EXPECT_TRUE(check_vpxtree(sym_vec_, rans_sym_tab, buf_));
+}
+}  // namespace
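
check_rabs(), check_rabs_asc() and check_uabs() above are identical apart from the write/read pair they exercise. Assuming ans.h's AnsP8 probability type and the call shapes used above, the shared scaffolding could be factored roughly like this (timing elided; names hypothetical):

    typedef void (*AbsWriteFn)(struct AnsCoder *, int, AnsP8);
    typedef int (*AbsReadFn)(struct AnsDecoder *, AnsP8);

    // Round-trips pv_vec through one binary-symbol coder variant. Returns
    // true iff every decoded bool matches and the decoder ends cleanly.
    bool CheckAbsRoundTrip(const PvVec &pv_vec, uint8_t *buf,
                           AbsWriteFn write_fn, AbsReadFn read_fn) {
      AnsCoder a;
      ans_write_init(&a, buf);
      // ANS is last-in-first-out: encode in reverse so decode runs forward.
      for (PvVec::const_reverse_iterator it = pv_vec.rbegin();
           it != pv_vec.rend(); ++it) {
        write_fn(&a, it->second, 256 - it->first);
      }
      const int offset = ans_write_end(&a);
      AnsDecoder d;
      if (ans_read_init(&d, buf, offset)) return false;
      bool okay = true;
      for (PvVec::const_iterator it = pv_vec.begin(); it != pv_vec.end();
           ++it) {
        okay &= read_fn(&d, 256 - it->first) == it->second;
      }
      return okay && ans_read_end(&d);
    }

With this, check_uabs(pv_vec, buf) would reduce to CheckAbsRoundTrip(pv_vec, buf, &uabs_write, &uabs_read), and likewise for the rabs variants.
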
diff --git a/test/vp10_convolve_optimz_test.cc b/test/vp10_convolve_optimz_test.cc
new file mode 100644
index 0000000..ec77035
--- /dev/null
+++ b/test/vp10_convolve_optimz_test.cc
@@ -0,0 +1,407 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp10_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+
+namespace {
+
+using std::tr1::tuple;
+using libvpx_test::ACMRandom;
+
+typedef void (*conv_filter_t)(const uint8_t*, int, uint8_t*, int,
+                              int, int, const InterpFilterParams,
+                              const int, int, int);
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*hbd_conv_filter_t)(const uint16_t*, int, uint16_t*, int,
+                                  int, int, const InterpFilterParams,
+                                  const int, int, int, int);
+#endif
+
+// Test parameter list:
+//  <convolve_horiz_func, convolve_vert_func,
+//  <width, height>, filter_params, subpel_x_q4, avg>
+typedef tuple<int, int> BlockDimension;
+typedef tuple<conv_filter_t, conv_filter_t, BlockDimension, INTERP_FILTER,
+              int, int> ConvParams;
+#if CONFIG_VP9_HIGHBITDEPTH
+// Test parameter list:
+//  <convolve_horiz_func, convolve_vert_func,
+//  <width, height>, filter_params, subpel_x_q4, avg, bit_depth>
+typedef tuple<hbd_conv_filter_t, hbd_conv_filter_t, BlockDimension,
+              INTERP_FILTER, int, int, int> HbdConvParams;
+#endif
+
+// Note:
+//  src_ and src_ref_ have a special boundary requirement;
+//  dst_ and dst_ref_ do not.
+const size_t maxWidth = 256;
+const size_t maxHeight = 256;
+const size_t maxBlockSize = maxWidth * maxHeight;
+const int horizOffset = 32;
+const int vertiOffset = 32;
+const size_t testMaxBlk = 128;
+const int stride = 128;
+const int x_step_q4 = 16;
+
+class VP10ConvolveOptimzTest : public ::testing::TestWithParam<ConvParams> {
+ public:
+  virtual ~VP10ConvolveOptimzTest() {}
+  virtual void SetUp() {
+    conv_horiz_ = GET_PARAM(0);
+    conv_vert_ = GET_PARAM(1);
+    BlockDimension block = GET_PARAM(2);
+    width_ = std::tr1::get<0>(block);
+    height_ = std::tr1::get<1>(block);
+    filter_ = GET_PARAM(3);
+    subpel_ = GET_PARAM(4);
+    avg_ = GET_PARAM(5);
+
+    alloc_ = new uint8_t[maxBlockSize * 4];
+    src_ = alloc_ + (vertiOffset * maxWidth);
+    src_ += horizOffset;
+    src_ref_ = src_ + maxBlockSize;
+
+    dst_ = alloc_ + 2 * maxBlockSize;
+    dst_ref_ = alloc_ + 3 * maxBlockSize;
+  }
+
+  virtual void TearDown() {
+    delete[] alloc_;
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void RunHorizFilterBitExactCheck();
+  void RunVertFilterBitExactCheck();
+
+ private:
+  void PrepFilterBuffer(int w, int h);
+  void DiffFilterBuffer();
+  conv_filter_t conv_horiz_;
+  conv_filter_t conv_vert_;
+  uint8_t *alloc_;
+  uint8_t *src_;
+  uint8_t *dst_;
+  uint8_t *src_ref_;
+  uint8_t *dst_ref_;
+  int width_;
+  int height_;
+  int filter_;
+  int subpel_;
+  int avg_;
+};
+
+void VP10ConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
+  int r, c;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0]));
+
+  uint8_t *src_ptr = src_;
+  uint8_t *dst_ptr = dst_;
+  uint8_t *src_ref_ptr = src_ref_;
+  uint8_t *dst_ref_ptr = dst_ref_;
+
+  for (r = 0; r < height_; ++r) {
+    for (c = 0; c < width_; ++c) {
+      src_ptr[c] = rnd.Rand8();
+      src_ref_ptr[c] = src_ptr[c];
+      dst_ptr[c] = rnd.Rand8();
+      dst_ref_ptr[c] = dst_ptr[c];
+    }
+    src_ptr += stride;
+    src_ref_ptr += stride;
+    dst_ptr += stride;
+    dst_ref_ptr += stride;
+  }
+}
+
+void VP10ConvolveOptimzTest::DiffFilterBuffer() {
+  int r, c;
+  const uint8_t *dst_ptr = dst_;
+  const uint8_t *dst_ref_ptr = dst_ref_;
+  for (r = 0; r < height_; ++r) {
+    for (c = 0; c < width_; ++c) {
+      EXPECT_EQ((uint8_t)dst_ref_ptr[c], (uint8_t)dst_ptr[c])
+      << "Error at row: " << r << " col: " << c << " "
+      << "w = " << width_ << " " << "h = " << height_ << " "
+      << "filter group index = " << filter_ << " "
+      << "filter index = " << subpel_;
+    }
+    dst_ptr += stride;
+    dst_ref_ptr += stride;
+  }
+}
+
+void VP10ConvolveOptimzTest::RunHorizFilterBitExactCheck() {
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+
+  InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
+
+  vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_, height_,
+                        filter_params, subpel_, x_step_q4, avg_);
+
+  conv_horiz_(src_, stride, dst_, stride, width_, height_,
+              filter_params, subpel_, x_step_q4, avg_);
+
+  DiffFilterBuffer();
+
+  // Note:
+  // Here we need to calculate a height different from the specified one and
+  // test again: the 2-D convolve filters a taller intermediate block
+  // horizontally before running the vertical pass.
+  int intermediate_height =
+      (((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
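+  // For example, height_ = 32 and subpel_ = 8 with a 12-tap filter give
+  // ((31 * 16 + 8) >> SUBPEL_BITS) + 12 = 31 + 12 = 43 rows.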
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+
+  vp10_convolve_horiz_c(src_ref_, stride, dst_ref_, stride, width_,
+                        intermediate_height, filter_params, subpel_, x_step_q4,
+                        avg_);
+
+  conv_horiz_(src_, stride, dst_, stride, width_,
+              intermediate_height, filter_params, subpel_, x_step_q4,
+              avg_);
+
+  DiffFilterBuffer();
+}
+
+void VP10ConvolveOptimzTest::RunVertFilterBitExactCheck() {
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+
+  InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
+
+  vp10_convolve_vert_c(src_ref_, stride, dst_ref_, stride, width_, height_,
+                       filter_params, subpel_, x_step_q4, avg_);
+
+  conv_vert_(src_, stride, dst_, stride, width_, height_,
+             filter_params, subpel_, x_step_q4, avg_);
+
+  DiffFilterBuffer();
+}
+
+TEST_P(VP10ConvolveOptimzTest, HorizBitExactCheck) {
+  RunHorizFilterBitExactCheck();
+}
+TEST_P(VP10ConvolveOptimzTest, VerticalBitExactCheck) {
+  RunVertFilterBitExactCheck();
+}
+
+using std::tr1::make_tuple;
+
+#if (HAVE_SSSE3 || HAVE_SSE4_1) && CONFIG_EXT_INTERP
+const BlockDimension kBlockDim[] = {
+  make_tuple(2, 2),
+  make_tuple(2, 4),
+  make_tuple(4, 4),
+  make_tuple(4, 8),
+  make_tuple(8, 4),
+  make_tuple(8, 8),
+  make_tuple(8, 16),
+  make_tuple(16, 8),
+  make_tuple(16, 16),
+  make_tuple(16, 32),
+  make_tuple(32, 16),
+  make_tuple(32, 32),
+  make_tuple(32, 64),
+  make_tuple(64, 32),
+  make_tuple(64, 64),
+  make_tuple(64, 128),
+  make_tuple(128, 64),
+  make_tuple(128, 128),
+};
+
+// 10/12-tap filters
+const INTERP_FILTER kFilter[] = {6, 4, 2};
+
+const int kSubpelQ4[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
+
+const int kAvg[] = {0, 1};
+#endif
+
+#if HAVE_SSSE3 && CONFIG_EXT_INTERP
+INSTANTIATE_TEST_CASE_P(
+    SSSE3, VP10ConvolveOptimzTest,
+    ::testing::Combine(
+         ::testing::Values(vp10_convolve_horiz_ssse3),
+         ::testing::Values(vp10_convolve_vert_ssse3),
+         ::testing::ValuesIn(kBlockDim),
+         ::testing::ValuesIn(kFilter),
+         ::testing::ValuesIn(kSubpelQ4),
+         ::testing::ValuesIn(kAvg)));
+#endif  // HAVE_SSSE3 && CONFIG_EXT_INTERP
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef ::testing::TestWithParam<HbdConvParams> TestWithHbdConvParams;
+class VP10HbdConvolveOptimzTest : public TestWithHbdConvParams {
+ public:
+  virtual ~VP10HbdConvolveOptimzTest() {}
+  virtual void SetUp() {
+    conv_horiz_ = GET_PARAM(0);
+    conv_vert_ = GET_PARAM(1);
+    BlockDimension block = GET_PARAM(2);
+    width_ = std::tr1::get<0>(block);
+    height_ = std::tr1::get<1>(block);
+    filter_ = GET_PARAM(3);
+    subpel_ = GET_PARAM(4);
+    avg_ = GET_PARAM(5);
+    bit_depth_ = GET_PARAM(6);
+
+    alloc_ = new uint16_t[maxBlockSize * 4];
+    src_ = alloc_ + (vertiOffset * maxWidth);
+    src_ += horizOffset;
+    src_ref_ = src_ + maxBlockSize;
+
+    dst_ = alloc_ + 2 * maxBlockSize;
+    dst_ref_ = alloc_ + 3 * maxBlockSize;
+  }
+
+  virtual void TearDown() {
+    delete[] alloc_;
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void RunHorizFilterBitExactCheck();
+  void RunVertFilterBitExactCheck();
+
+ private:
+  void PrepFilterBuffer(int w, int h);
+  void DiffFilterBuffer();
+  hbd_conv_filter_t conv_horiz_;
+  hbd_conv_filter_t conv_vert_;
+  uint16_t *alloc_;
+  uint16_t *src_;
+  uint16_t *dst_;
+  uint16_t *src_ref_;
+  uint16_t *dst_ref_;
+  int width_;
+  int height_;
+  int filter_;
+  int subpel_;
+  int avg_;
+  int bit_depth_;
+};
+
+void VP10HbdConvolveOptimzTest::PrepFilterBuffer(int w, int h) {
+  int r, c;
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+
+  memset(alloc_, 0, 4 * maxBlockSize * sizeof(alloc_[0]));
+
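+  // src_ref_ stays zeroed here: the high bit-depth checks pass src_ to both
+  // the reference and the optimized function.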
+  uint16_t *src_ptr = src_;
+  uint16_t *dst_ptr = dst_;
+  uint16_t *dst_ref_ptr = dst_ref_;
+  uint16_t hbd_mask = (1 << bit_depth_) - 1;
+
+  for (r = 0; r < height_; ++r) {
+    for (c = 0; c < width_; ++c) {
+      src_ptr[c] = rnd.Rand16() & hbd_mask;
+      dst_ptr[c] = rnd.Rand16() & hbd_mask;
+      dst_ref_ptr[c] = dst_ptr[c];
+    }
+    src_ptr += stride;
+    dst_ptr += stride;
+    dst_ref_ptr += stride;
+  }
+}
+
+void VP10HbdConvolveOptimzTest::DiffFilterBuffer() {
+  int r, c;
+  const uint16_t *dst_ptr = dst_;
+  const uint16_t *dst_ref_ptr = dst_ref_;
+  for (r = 0; r < height_; ++r) {
+    for (c = 0; c < width_; ++c) {
+      EXPECT_EQ((uint16_t)dst_ref_ptr[c], (uint16_t)dst_ptr[c])
+      << "Error at row: " << r << " col: " << c << " "
+      << "w = " << width_ << " " << "h = " << height_ << " "
+      << "filter group index = " << filter_ << " "
+      << "filter index = " << subpel_ << " "
+      << "bit depth = " << bit_depth_;
+    }
+    dst_ptr += stride;
+    dst_ref_ptr += stride;
+  }
+}
+
+void VP10HbdConvolveOptimzTest::RunHorizFilterBitExactCheck() {
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+
+  InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
+
+  vp10_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_,
+                               height_, filter_params, subpel_, x_step_q4,
+                               avg_, bit_depth_);
+
+  conv_horiz_(src_, stride, dst_, stride, width_, height_,
+              filter_params, subpel_, x_step_q4, avg_, bit_depth_);
+
+  DiffFilterBuffer();
+
+  // Note:
+  // Here we need to calculate a height different from the specified one and
+  // test again: the 2-D convolve filters a taller intermediate block
+  // horizontally before running the vertical pass.
+  int intermediate_height =
+      (((height_ - 1) * 16 + subpel_) >> SUBPEL_BITS) + filter_params.taps;
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+
+  vp10_highbd_convolve_horiz_c(src_, stride, dst_ref_, stride, width_,
+                               intermediate_height, filter_params, subpel_,
+                               x_step_q4, avg_, bit_depth_);
+
+  conv_horiz_(src_, stride, dst_, stride, width_, intermediate_height,
+              filter_params, subpel_, x_step_q4, avg_, bit_depth_);
+
+  DiffFilterBuffer();
+}
+
+void VP10HbdConvolveOptimzTest::RunVertFilterBitExactCheck() {
+  PrepFilterBuffer(testMaxBlk, testMaxBlk);
+
+  InterpFilterParams filter_params = vp10_get_interp_filter_params(filter_);
+
+  vp10_highbd_convolve_vert_c(src_, stride, dst_ref_, stride, width_, height_,
+                              filter_params, subpel_, x_step_q4, avg_,
+                              bit_depth_);
+
+  conv_vert_(src_, stride, dst_, stride, width_, height_,
+             filter_params, subpel_, x_step_q4, avg_, bit_depth_);
+
+  DiffFilterBuffer();
+}
+
+TEST_P(VP10HbdConvolveOptimzTest, HorizBitExactCheck) {
+  RunHorizFilterBitExactCheck();
+}
+TEST_P(VP10HbdConvolveOptimzTest, VertBitExactCheck) {
+  RunVertFilterBitExactCheck();
+}
+
+#if HAVE_SSE4_1 && CONFIG_EXT_INTERP
+
+const int kBitdepth[] = {10, 12};
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VP10HbdConvolveOptimzTest,
+    ::testing::Combine(
+         ::testing::Values(vp10_highbd_convolve_horiz_sse4_1),
+         ::testing::Values(vp10_highbd_convolve_vert_sse4_1),
+         ::testing::ValuesIn(kBlockDim),
+         ::testing::ValuesIn(kFilter),
+         ::testing::ValuesIn(kSubpelQ4),
+         ::testing::ValuesIn(kAvg),
+         ::testing::ValuesIn(kBitdepth)));
+#endif  // HAVE_SSE4_1 && CONFIG_EXT_INTERP
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/test/vp10_convolve_test.cc b/test/vp10_convolve_test.cc
new file mode 100644
index 0000000..0d6bbcd
--- /dev/null
+++ b/test/vp10_convolve_test.cc
@@ -0,0 +1,461 @@
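+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+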
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+#include "test/acm_random.h"
+#include "vp10/common/filter.h"
+#include "vp10/common/vp10_convolve.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
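+// vp10_convolve() dispatches through the vp10_convolve_horiz/vert RTCD
+// pointers; point them at the C implementations so its output matches the
+// plain-C references computed in these tests.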
+void setup_convolve() {
+#if HAVE_SSSE3
+  vp10_convolve_horiz = vp10_convolve_horiz_c;
+  vp10_convolve_vert = vp10_convolve_vert_c;
+#endif
+}
+
+TEST(VP10ConvolveTest, vp10_convolve8) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+#if CONFIG_DUAL_FILTER
+  INTERP_FILTER interp_filter[4] = {
+      EIGHTTAP_REGULAR, EIGHTTAP_REGULAR,
+      EIGHTTAP_REGULAR, EIGHTTAP_REGULAR
+  };
+  InterpFilterParams filter_params =
+      vp10_get_interp_filter_params(interp_filter[0]);
+#else
+  INTERP_FILTER interp_filter = EIGHTTAP_REGULAR;
+  InterpFilterParams filter_params =
+      vp10_get_interp_filter_params(interp_filter);
+#endif
+  ptrdiff_t filter_size = filter_params.taps;
+  int filter_center = filter_size / 2 - 1;
+  uint8_t src[12 * 12];
+  int src_stride = filter_size;
+  uint8_t dst[1] = {0};
+  uint8_t dst1[1] = {0};
+  int dst_stride = 1;
+  int x_step_q4 = 16;
+  int y_step_q4 = 16;
+  int subpel_x_q4 = 3;
+  int subpel_y_q4 = 2;
+  int avg = 0;
+
+  int w = 1;
+  int h = 1;
+
+  setup_convolve();
+
+  for (int i = 0; i < filter_size * filter_size; i++) {
+    src[i] = rnd.Rand16() % (1 << 8);
+  }
+
+  vp10_convolve(src + src_stride * filter_center + filter_center, src_stride,
+                dst, dst_stride, w, h, interp_filter, subpel_x_q4, x_step_q4,
+                subpel_y_q4, y_step_q4, avg);
+
+  const int16_t* x_filter =
+      vp10_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
+  const int16_t* y_filter =
+      vp10_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
+
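+  // Cross-check against the existing vpx_convolve8_c path using the same
+  // subpel kernels.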
+  vpx_convolve8_c(src + src_stride * filter_center + filter_center, src_stride,
+                  dst1, dst_stride, x_filter, 16, y_filter, 16, w, h);
+  EXPECT_EQ(dst[0], dst1[0]);
+}
+TEST(VP10ConvolveTest, vp10_convolve) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+#if CONFIG_DUAL_FILTER
+  INTERP_FILTER interp_filter[4] = {
+      EIGHTTAP_REGULAR, EIGHTTAP_REGULAR,
+      EIGHTTAP_REGULAR, EIGHTTAP_REGULAR
+  };
+  InterpFilterParams filter_params =
+      vp10_get_interp_filter_params(interp_filter[0]);
+#else
+  INTERP_FILTER interp_filter = EIGHTTAP_REGULAR;
+  InterpFilterParams filter_params =
+      vp10_get_interp_filter_params(interp_filter);
+#endif
+  ptrdiff_t filter_size = filter_params.taps;
+  int filter_center = filter_size / 2 - 1;
+  uint8_t src[12 * 12];
+  int src_stride = filter_size;
+  uint8_t dst[1] = {0};
+  int dst_stride = 1;
+  int x_step_q4 = 16;
+  int y_step_q4 = 16;
+  int avg = 0;
+  int w = 1;
+  int h = 1;
+
+  int subpel_x_q4;
+  int subpel_y_q4;
+
+  setup_convolve();
+
+  for (int i = 0; i < filter_size * filter_size; i++) {
+    src[i] = rnd.Rand16() % (1 << 8);
+  }
+
+  for (subpel_x_q4 = 0; subpel_x_q4 < 16; subpel_x_q4++) {
+    for (subpel_y_q4 = 0; subpel_y_q4 < 16; subpel_y_q4++) {
+      vp10_convolve(src + src_stride * filter_center + filter_center,
+                    src_stride, dst, dst_stride, w, h, interp_filter,
+                    subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, avg);
+
+      const int16_t* x_filter =
+          vp10_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
+      const int16_t* y_filter =
+          vp10_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
+
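+      // Reference result: filter each of the filter_size rows horizontally
+      // into temp[], then filter temp[] vertically to produce the single
+      // output pixel.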
+      int temp[12];
+      int dst_ref = 0;
+      for (int r = 0; r < filter_size; r++) {
+        temp[r] = 0;
+        for (int c = 0; c < filter_size; c++) {
+          temp[r] += x_filter[c] * src[r * filter_size + c];
+        }
+        temp[r] = clip_pixel(ROUND_POWER_OF_TWO(temp[r], FILTER_BITS));
+        dst_ref += temp[r] * y_filter[r];
+      }
+      dst_ref = clip_pixel(ROUND_POWER_OF_TWO(dst_ref, FILTER_BITS));
+      EXPECT_EQ(dst[0], dst_ref);
+    }
+  }
+}
+
+TEST(VP10ConvolveTest, vp10_convolve_avg) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+#if CONFIG_DUAL_FILTER
+  INTERP_FILTER interp_filter[4] = {
+      EIGHTTAP_REGULAR, EIGHTTAP_REGULAR,
+      EIGHTTAP_REGULAR, EIGHTTAP_REGULAR
+  };
+  InterpFilterParams filter_params =
+      vp10_get_interp_filter_params(interp_filter[0]);
+#else
+  INTERP_FILTER interp_filter = EIGHTTAP_REGULAR;
+  InterpFilterParams filter_params =
+      vp10_get_interp_filter_params(interp_filter);
+#endif
+  ptrdiff_t filter_size = filter_params.taps;
+  int filter_center = filter_size / 2 - 1;
+  uint8_t src0[12 * 12];
+  uint8_t src1[12 * 12];
+  int src_stride = filter_size;
+  uint8_t dst0[1] = {0};
+  uint8_t dst1[1] = {0};
+  uint8_t dst[1] = {0};
+  int dst_stride = 1;
+  int x_step_q4 = 16;
+  int y_step_q4 = 16;
+  int avg = 0;
+
+  int w = 1;
+  int h = 1;
+
+  int subpel_x_q4;
+  int subpel_y_q4;
+
+  setup_convolve();
+
+  for (int i = 0; i < filter_size * filter_size; i++) {
+    src0[i] = rnd.Rand16() % (1 << 8);
+    src1[i] = rnd.Rand16() % (1 << 8);
+  }
+
+  int offset = filter_size * filter_center + filter_center;
+
+  for (subpel_x_q4 = 0; subpel_x_q4 < 16; subpel_x_q4++) {
+    for (subpel_y_q4 = 0; subpel_y_q4 < 16; subpel_y_q4++) {
+      avg = 0;
+      vp10_convolve(src0 + offset, src_stride, dst0, dst_stride, w, h,
+                    interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+                    y_step_q4, avg);
+      avg = 0;
+      vp10_convolve(src1 + offset, src_stride, dst1, dst_stride, w, h,
+                    interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+                    y_step_q4, avg);
+
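+      // Write src0's result into dst, then average src1's result in with
+      // avg = 1; dst should equal the rounded mean of dst0 and dst1.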
+      avg = 0;
+      vp10_convolve(src0 + offset, src_stride, dst, dst_stride, w, h,
+                    interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+                    y_step_q4, avg);
+      avg = 1;
+      vp10_convolve(src1 + offset, src_stride, dst, dst_stride, w, h,
+                    interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+                    y_step_q4, avg);
+
+      EXPECT_EQ(dst[0], ROUND_POWER_OF_TWO(dst0[0] + dst1[0], 1));
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+TEST(VP10ConvolveTest, vp10_highbd_convolve) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+#if CONFIG_DUAL_FILTER
+  INTERP_FILTER interp_filter[4] = {
+      EIGHTTAP_REGULAR, EIGHTTAP_REGULAR,
+      EIGHTTAP_REGULAR, EIGHTTAP_REGULAR
+  };
+  InterpFilterParams filter_params =
+      vp10_get_interp_filter_params(interp_filter[0]);
+#else
+  INTERP_FILTER interp_filter = EIGHTTAP_REGULAR;
+  InterpFilterParams filter_params =
+      vp10_get_interp_filter_params(interp_filter);
+#endif
+  ptrdiff_t filter_size = filter_params.taps;
+  int filter_center = filter_size / 2 - 1;
+  uint16_t src[12 * 12];
+  int src_stride = filter_size;
+  uint16_t dst[1] = {0};
+  int dst_stride = 1;
+  int x_step_q4 = 16;
+  int y_step_q4 = 16;
+  int avg = 0;
+  int bd = 10;
+  int w = 1;
+  int h = 1;
+
+  int subpel_x_q4;
+  int subpel_y_q4;
+
+  for (int i = 0; i < filter_size * filter_size; i++) {
+    src[i] = rnd.Rand16() % (1 << bd);
+  }
+
+  for (subpel_x_q4 = 0; subpel_x_q4 < 16; subpel_x_q4++) {
+    for (subpel_y_q4 = 0; subpel_y_q4 < 16; subpel_y_q4++) {
+      vp10_highbd_convolve(
+          CONVERT_TO_BYTEPTR(src + src_stride * filter_center + filter_center),
+          src_stride, CONVERT_TO_BYTEPTR(dst), dst_stride, w, h, interp_filter,
+          subpel_x_q4, x_step_q4, subpel_y_q4, y_step_q4, avg, bd);
+
+      const int16_t* x_filter =
+          vp10_get_interp_filter_subpel_kernel(filter_params, subpel_x_q4);
+      const int16_t* y_filter =
+          vp10_get_interp_filter_subpel_kernel(filter_params, subpel_y_q4);
+
+      int temp[12];
+      int dst_ref = 0;
+      for (int r = 0; r < filter_size; r++) {
+        temp[r] = 0;
+        for (int c = 0; c < filter_size; c++) {
+          temp[r] += x_filter[c] * src[r * filter_size + c];
+        }
+        temp[r] =
+            clip_pixel_highbd(ROUND_POWER_OF_TWO(temp[r], FILTER_BITS), bd);
+        dst_ref += temp[r] * y_filter[r];
+      }
+      dst_ref = clip_pixel_highbd(ROUND_POWER_OF_TWO(dst_ref, FILTER_BITS), bd);
+      EXPECT_EQ(dst[0], dst_ref);
+    }
+  }
+}
+
+TEST(VP10ConvolveTest, vp10_highbd_convolve_avg) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+#if CONFIG_DUAL_FILTER
+  INTERP_FILTER interp_filter[4] = {
+      EIGHTTAP_REGULAR, EIGHTTAP_REGULAR,
+      EIGHTTAP_REGULAR, EIGHTTAP_REGULAR
+  };
+  InterpFilterParams filter_params =
+      vp10_get_interp_filter_params(interp_filter[0]);
+#else
+  INTERP_FILTER interp_filter = EIGHTTAP_REGULAR;
+  InterpFilterParams filter_params =
+      vp10_get_interp_filter_params(interp_filter);
+#endif
+  ptrdiff_t filter_size = filter_params.taps;
+  int filter_center = filter_size / 2 - 1;
+  uint16_t src0[12 * 12];
+  uint16_t src1[12 * 12];
+  int src_stride = filter_size;
+  uint16_t dst0[1] = {0};
+  uint16_t dst1[1] = {0};
+  uint16_t dst[1] = {0};
+  int dst_stride = 1;
+  int x_step_q4 = 16;
+  int y_step_q4 = 16;
+  int avg = 0;
+  int bd = 10;
+
+  int w = 1;
+  int h = 1;
+
+  int subpel_x_q4;
+  int subpel_y_q4;
+
+  for (int i = 0; i < filter_size * filter_size; i++) {
+    src0[i] = rnd.Rand16() % (1 << bd);
+    src1[i] = rnd.Rand16() % (1 << bd);
+  }
+
+  for (subpel_x_q4 = 0; subpel_x_q4 < 16; subpel_x_q4++) {
+    for (subpel_y_q4 = 0; subpel_y_q4 < 16; subpel_y_q4++) {
+      int offset = filter_size * filter_center + filter_center;
+
+      avg = 0;
+      vp10_highbd_convolve(CONVERT_TO_BYTEPTR(src0 + offset), src_stride,
+                           CONVERT_TO_BYTEPTR(dst0), dst_stride, w, h,
+                           interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+                           y_step_q4, avg, bd);
+      avg = 0;
+      vp10_highbd_convolve(CONVERT_TO_BYTEPTR(src1 + offset), src_stride,
+                           CONVERT_TO_BYTEPTR(dst1), dst_stride, w, h,
+                           interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+                           y_step_q4, avg, bd);
+
+      avg = 0;
+      vp10_highbd_convolve(CONVERT_TO_BYTEPTR(src0 + offset), src_stride,
+                           CONVERT_TO_BYTEPTR(dst), dst_stride, w, h,
+                           interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+                           y_step_q4, avg, bd);
+      avg = 1;
+      vp10_highbd_convolve(CONVERT_TO_BYTEPTR(src1 + offset), src_stride,
+                           CONVERT_TO_BYTEPTR(dst), dst_stride, w, h,
+                           interp_filter, subpel_x_q4, x_step_q4, subpel_y_q4,
+                           y_step_q4, avg, bd);
+
+      EXPECT_EQ(dst[0], ROUND_POWER_OF_TWO(dst0[0] + dst1[0], 1));
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
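+// Set CONVOLVE_SPEED_TEST to 1 to compile the speed tests below; each test
+// runs 100000 convolve calls so that run times can be compared.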
+#define CONVOLVE_SPEED_TEST 0
+#if CONVOLVE_SPEED_TEST
+#define highbd_convolve_speed(func, block_size, frame_size)                  \
+  TEST(VP10ConvolveTest, func##_speed_##block_size##_##frame_size) {         \
+    ACMRandom rnd(ACMRandom::DeterministicSeed());                           \
+    INTERP_FILTER interp_filter = EIGHTTAP;                                  \
+    InterpFilterParams filter_params =                                       \
+        vp10_get_interp_filter_params(interp_filter);                        \
+    ptrdiff_t filter_size = filter_params.taps;                              \
+    int filter_center = filter_size / 2 - 1;                                 \
+    DECLARE_ALIGNED(16, uint16_t,                                            \
+                    src[(frame_size + 7) * (frame_size + 7)]) = {0};         \
+    int src_stride = frame_size + 7;                                         \
+    DECLARE_ALIGNED(16, uint16_t, dst[frame_size * frame_size]) = {0};       \
+    int dst_stride = frame_size;                                             \
+    int x_step_q4 = 16;                                                      \
+    int y_step_q4 = 16;                                                      \
+    int subpel_x_q4 = 8;                                                     \
+    int subpel_y_q4 = 6;                                                     \
+    int bd = 10;                                                             \
+                                                                             \
+    int w = block_size;                                                      \
+    int h = block_size;                                                      \
+                                                                             \
+    const int16_t* filter_x =                                                \
+        vp10_get_interp_filter_kernel(filter_params, subpel_x_q4);           \
+    const int16_t* filter_y =                                                \
+        vp10_get_interp_filter_kernel(filter_params, subpel_y_q4);           \
+                                                                             \
+    for (int i = 0; i < src_stride * src_stride; i++) {                      \
+      src[i] = rnd.Rand16() % (1 << bd);                                     \
+    }                                                                        \
+                                                                             \
+    int offset = filter_center * src_stride + filter_center;                 \
+    int row_offset = 0;                                                      \
+    int col_offset = 0;                                                      \
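+    /* Walk the block position across the frame between iterations,       */ \
+    /* presumably so the timing is not dominated by a fully cached block. */ \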
+    for (int i = 0; i < 100000; i++) {                                       \
+      int src_total_offset = offset + col_offset * src_stride + row_offset;  \
+      int dst_total_offset = col_offset * dst_stride + row_offset;           \
+      func(CONVERT_TO_BYTEPTR(src + src_total_offset), src_stride,           \
+           CONVERT_TO_BYTEPTR(dst + dst_total_offset), dst_stride, filter_x, \
+           x_step_q4, filter_y, y_step_q4, w, h, bd);                        \
+      if (offset + w + w < frame_size) {                                     \
+        row_offset += w;                                                     \
+      } else {                                                               \
+        row_offset = 0;                                                      \
+        col_offset += h;                                                     \
+      }                                                                      \
+      if (col_offset + h >= frame_size) {                                    \
+        col_offset = 0;                                                      \
+      }                                                                      \
+    }                                                                        \
+  }
+
+#define lowbd_convolve_speed(func, block_size, frame_size)                  \
+  TEST(VP10ConvolveTest, func##_speed_l_##block_size##_##frame_size) {      \
+    ACMRandom rnd(ACMRandom::DeterministicSeed());                          \
+    INTERP_FILTER interp_filter = EIGHTTAP;                                 \
+    InterpFilterParams filter_params =                                      \
+        vp10_get_interp_filter_params(interp_filter);                       \
+    ptrdiff_t filter_size = filter_params.taps;                             \
+    int filter_center = filter_size / 2 - 1;                                \
+    DECLARE_ALIGNED(16, uint8_t, src[(frame_size + 7) * (frame_size + 7)]); \
+    int src_stride = frame_size + 7;                                        \
+    DECLARE_ALIGNED(16, uint8_t, dst[frame_size * frame_size]);             \
+    int dst_stride = frame_size;                                            \
+    int x_step_q4 = 16;                                                     \
+    int y_step_q4 = 16;                                                     \
+    int subpel_x_q4 = 8;                                                    \
+    int subpel_y_q4 = 6;                                                    \
+    int bd = 8;                                                             \
+                                                                            \
+    int w = block_size;                                                     \
+    int h = block_size;                                                     \
+                                                                            \
+    const int16_t* filter_x =                                               \
+        vp10_get_interp_filter_kernel(filter_params, subpel_x_q4);          \
+    const int16_t* filter_y =                                               \
+        vp10_get_interp_filter_kernel(filter_params, subpel_y_q4);          \
+                                                                            \
+    for (int i = 0; i < src_stride * src_stride; i++) {                     \
+      src[i] = rnd.Rand16() % (1 << bd);                                    \
+    }                                                                       \
+                                                                            \
+    int offset = filter_center * src_stride + filter_center;                \
+    int row_offset = 0;                                                     \
+    int col_offset = 0;                                                     \
+    for (int i = 0; i < 100000; i++) {                                      \
+      func(src + offset, src_stride, dst, dst_stride, filter_x, x_step_q4,  \
+           filter_y, y_step_q4, w, h);                                      \
+      if (offset + w + w < frame_size) {                                    \
+        row_offset += w;                                                    \
+      } else {                                                              \
+        row_offset = 0;                                                     \
+        col_offset += h;                                                    \
+      }                                                                     \
+      if (col_offset + h >= frame_size) {                                   \
+        col_offset = 0;                                                     \
+      }                                                                     \
+    }                                                                       \
+  }
+
+// This experiment shows that when the frame size is 64x64,
+// vpx_highbd_convolve8_sse2 and vpx_convolve8_sse2 run at similar speeds.
+// However, when the frame size grows to 1024x1024,
+// vpx_highbd_convolve8_sse2 is around 50% slower than vpx_convolve8_sse2;
+// we believe the bottleneck is memory I/O.
+highbd_convolve_speed(vpx_highbd_convolve8_sse2, 8, 64);
+highbd_convolve_speed(vpx_highbd_convolve8_sse2, 16, 64);
+highbd_convolve_speed(vpx_highbd_convolve8_sse2, 32, 64);
+highbd_convolve_speed(vpx_highbd_convolve8_sse2, 64, 64);
+
+lowbd_convolve_speed(vpx_convolve8_sse2, 8, 64);
+lowbd_convolve_speed(vpx_convolve8_sse2, 16, 64);
+lowbd_convolve_speed(vpx_convolve8_sse2, 32, 64);
+lowbd_convolve_speed(vpx_convolve8_sse2, 64, 64);
+
+highbd_convolve_speed(vpx_highbd_convolve8_sse2, 8, 1024);
+highbd_convolve_speed(vpx_highbd_convolve8_sse2, 16, 1024);
+highbd_convolve_speed(vpx_highbd_convolve8_sse2, 32, 1024);
+highbd_convolve_speed(vpx_highbd_convolve8_sse2, 64, 1024);
+
+lowbd_convolve_speed(vpx_convolve8_sse2, 8, 1024);
+lowbd_convolve_speed(vpx_convolve8_sse2, 16, 1024);
+lowbd_convolve_speed(vpx_convolve8_sse2, 32, 1024);
+lowbd_convolve_speed(vpx_convolve8_sse2, 64, 1024);
+#endif  // CONVOLVE_SPEED_TEST
+}  // namespace
diff --git a/test/vp10_dct_test.cc b/test/vp10_dct_test.cc
index b2c301a..8cf034f 100644
--- a/test/vp10_dct_test.cc
+++ b/test/vp10_dct_test.cc
@@ -26,7 +26,6 @@
 
 namespace {
 void reference_dct_1d(const double *in, double *out, int size) {
-  const double PI = 3.141592653589793238462643383279502884;
   const double kInvSqrt2 = 0.707106781186547524400844362104;
   for (int k = 0; k < size; ++k) {
     out[k] = 0;
diff --git a/test/vp10_ext_tile_test.cc b/test/vp10_ext_tile_test.cc
new file mode 100644
index 0000000..ad04eeb
--- /dev/null
+++ b/test/vp10_ext_tile_test.cc
@@ -0,0 +1,201 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <string>
+#include <vector>
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/md5_helper.h"
+#include "test/util.h"
+
+namespace {
+// The number of frames to be encoded/decoded
+const int kLimit = 8;
+// Skip one frame (the frame at index kSkip) to check that frames decode
+// independently.
+const int kSkip = 5;
+const int kTileSize = 1;
+const int kTileSizeInPixels = (kTileSize << 6);
+// Fake width and height so that they can be multiples of the tile size.
+const int kImgWidth = 704;
+const int kImgHeight = 576;
+
+class VP10ExtTileTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+ protected:
+  VP10ExtTileTest()
+      : EncoderTest(GET_PARAM(0)),
+        encoding_mode_(GET_PARAM(1)),
+        set_cpu_used_(GET_PARAM(2)) {
+    init_flags_ = VPX_CODEC_USE_PSNR;
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+    cfg.w = kImgWidth;
+    cfg.h = kImgHeight;
+
+    decoder_ = codec_->CreateDecoder(cfg, 0);
+    decoder_->Control(VP10_SET_DECODE_TILE_ROW, -1);
+    decoder_->Control(VP10_SET_DECODE_TILE_COL, -1);
+
+    // Allocate buffer to store tile image.
+    vpx_img_alloc(&tile_img_, VPX_IMG_FMT_I420, kImgWidth, kImgHeight, 32);
+
+    md5_.clear();
+    tile_md5_.clear();
+  }
+
+  virtual ~VP10ExtTileTest() {
+    vpx_img_free(&tile_img_);
+    delete decoder_;
+  }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(encoding_mode_);
+
+    cfg_.g_lag_in_frames = 0;
+    cfg_.rc_end_usage = VPX_VBR;
+    cfg_.g_error_resilient = 1;
+
+    cfg_.rc_max_quantizer = 56;
+    cfg_.rc_min_quantizer = 0;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      // Encode setting
+      encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
+      encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 0);
+      encoder->Control(VP9E_SET_FRAME_PARALLEL_DECODING, 1);
+
+      // The tile size is 64x64.
+      encoder->Control(VP9E_SET_TILE_COLUMNS, kTileSize);
+      encoder->Control(VP9E_SET_TILE_ROWS, kTileSize);
+#if CONFIG_EXT_PARTITION
+      // Always use 64x64 max partition.
+      encoder->Control(VP10E_SET_SUPERBLOCK_SIZE, VPX_SUPERBLOCK_SIZE_64X64);
+#endif
+    }
+
+    if (video->frame() == 1) {
+      frame_flags_ = VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF |
+          VP8_EFLAG_NO_UPD_ARF;
+    }
+  }
+
+  virtual void DecompressedFrameHook(const vpx_image_t &img,
+                                     vpx_codec_pts_t pts) {
+    // Skip the frame that decoder_ skips in FramePktHook so that md5_ and
+    // tile_md5_ stay comparable.
+    if (pts == (vpx_codec_pts_t)kSkip)
+      return;
+
+    // Calculate MD5 as the reference.
+    ::libvpx_test::MD5 md5_res;
+    md5_res.Add(&img);
+    md5_.push_back(md5_res.Get());
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    // Skip decoding 1 frame.
+    if (pkt->data.frame.pts == (vpx_codec_pts_t)kSkip)
+      return;
+
+    bool IsLastFrame = (pkt->data.frame.pts == (vpx_codec_pts_t)(kLimit - 1));
+
+    // Decode the first (kLimit - 1) frames as whole frames, and decode the
+    // last frame tile by tile.
+    for (int r = 0; r < kImgHeight / kTileSizeInPixels; ++r) {
+      for (int c = 0; c < kImgWidth / kTileSizeInPixels; ++c) {
+        if (!IsLastFrame) {
+          decoder_->Control(VP10_SET_DECODE_TILE_ROW, -1);
+          decoder_->Control(VP10_SET_DECODE_TILE_COL, -1);
+        } else {
+          decoder_->Control(VP10_SET_DECODE_TILE_ROW, r);
+          decoder_->Control(VP10_SET_DECODE_TILE_COL, c);
+        }
+
+        const vpx_codec_err_t res = decoder_->DecodeFrame(
+            reinterpret_cast<uint8_t*>(pkt->data.frame.buf),
+            pkt->data.frame.sz);
+        if (res != VPX_CODEC_OK) {
+          abort_ = true;
+          ASSERT_EQ(VPX_CODEC_OK, res);
+        }
+        const vpx_image_t *img = decoder_->GetDxData().Next();
+
+        if (!IsLastFrame) {
+          if (img) {
+            ::libvpx_test::MD5 md5_res;
+            md5_res.Add(img);
+            tile_md5_.push_back(md5_res.Get());
+          }
+          break;
+        }
+
+        const int kMaxMBPlane = 3;
+        for (int plane = 0; plane < kMaxMBPlane; ++plane) {
+          const int shift = (plane == 0) ? 0 : 1;
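+          // I420 chroma planes are subsampled by 2 in each dimension.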
+          int tile_height = kTileSizeInPixels >> shift;
+          int tile_width = kTileSizeInPixels >> shift;
+
+          for (int tr = 0; tr < tile_height; ++tr) {
+            memcpy(tile_img_.planes[plane] +
+                   tile_img_.stride[plane] * (r * tile_height + tr) +
+                   c * tile_width,
+                   img->planes[plane] + img->stride[plane] * tr, tile_width);
+          }
+        }
+      }
+
+      if (!IsLastFrame)
+        break;
+    }
+
+    if (IsLastFrame) {
+      ::libvpx_test::MD5 md5_res;
+      md5_res.Add(&tile_img_);
+      tile_md5_.push_back(md5_res.Get());
+    }
+  }
+
+  ::libvpx_test::TestMode encoding_mode_;
+  int set_cpu_used_;
+  ::libvpx_test::Decoder *decoder_;
+  vpx_image_t tile_img_;
+  std::vector<std::string> md5_;
+  std::vector<std::string> tile_md5_;
+};
+
+TEST_P(VP10ExtTileTest, DecoderResultTest) {
+  ::libvpx_test::I420VideoSource video("hantro_collage_w352h288.yuv",
+                                       kImgWidth, kImgHeight, 30, 1, 0, kLimit);
+  cfg_.rc_target_bitrate = 500;
+  cfg_.g_error_resilient = VPX_ERROR_RESILIENT_DEFAULT;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.g_threads = 1;
+
+  // Tile encoding
+  init_flags_ = VPX_CODEC_USE_PSNR;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+
+  // Compare to check if two vectors are equal.
+  ASSERT_EQ(md5_, tile_md5_);
+}
+
+// For now, only test 2-pass mode.
+VP10_INSTANTIATE_TEST_CASE(
+    VP10ExtTileTest,
+    ::testing::Values(::libvpx_test::kTwoPassGood),
+    ::testing::Range(0, 4));
+}  // namespace
diff --git a/test/vp10_fht16x16_test.cc b/test/vp10_fht16x16_test.cc
new file mode 100644
index 0000000..deccc81
--- /dev/null
+++ b/test/vp10_fht16x16_test.cc
@@ -0,0 +1,229 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "vpx_ports/mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                        int tx_type);
+using std::tr1::tuple;
+using libvpx_test::FhtFunc;
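+// <fwd_txfm_func, inv_txfm_func, tx_type, bit_depth, num_coeffs>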
+typedef tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t, int> Ht16x16Param;
+
+void fht16x16_ref(const int16_t *in, tran_low_t *out, int stride,
+                int tx_type) {
+  vp10_fht16x16_c(in, out, stride, tx_type);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                           int tx_type, int bd);
+typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
+                          int tx_type, int bd);
+
+// Target optimized function, tx_type, bit depth
+typedef tuple<HbdHtFunc, int, int> HighbdHt16x16Param;
+
+void highbd_fht16x16_ref(const int16_t *in, int32_t *out, int stride,
+                         int tx_type, int bd) {
+  vp10_fwd_txfm2d_16x16_c(in, out, stride, tx_type, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+class VP10Trans16x16HT
+    : public libvpx_test::TransformTestBase,
+      public ::testing::TestWithParam<Ht16x16Param> {
+ public:
+  virtual ~VP10Trans16x16HT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_  = GET_PARAM(2);
+    pitch_    = 16;
+    fwd_txfm_ref = fht16x16_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = GET_PARAM(4);
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+    fwd_txfm_(in, out, stride, tx_type_);
+  }
+
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride, tx_type_);
+  }
+
+  FhtFunc fwd_txfm_;
+  IhtFunc inv_txfm_;
+};
+
+TEST_P(VP10Trans16x16HT, CoeffCheck) {
+  RunCoeffCheck();
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+class VP10HighbdTrans16x16HT
+    : public ::testing::TestWithParam<HighbdHt16x16Param> {
+ public:
+  virtual ~VP10HighbdTrans16x16HT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    fwd_txfm_ref_ = highbd_fht16x16_ref;
+    tx_type_  = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = 256;
+
+    input_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
+    output_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+    output_ref_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+  }
+
+  virtual void TearDown() {
+    vpx_free(input_);
+    vpx_free(output_);
+    vpx_free(output_ref_);
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void RunBitexactCheck();
+
+ private:
+  HbdHtFunc fwd_txfm_;
+  HbdHtFunc fwd_txfm_ref_;
+  int tx_type_;
+  int bit_depth_;
+  int mask_;
+  int num_coeffs_;
+  int16_t *input_;
+  int32_t *output_;
+  int32_t *output_ref_;
+};
+
+void VP10HighbdTrans16x16HT::RunBitexactCheck() {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int i, j;
+  const int stride = 16;
+  const int num_tests = 1000;
+
+  for (i = 0; i < num_tests; ++i) {
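+    // Random residual in [-mask_, mask_], the full signed range of a
+    // bit_depth_-bit input.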
+    for (j = 0; j < num_coeffs_; ++j) {
+      input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+    }
+
+    fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_, output_, stride, tx_type_,
+                                       bit_depth_));
+
+    for (j = 0; j < num_coeffs_; ++j) {
+      EXPECT_EQ(output_ref_[j], output_[j])
+          << "Not bit-exact result at index: " << j
+          << " at test block: " << i;
+    }
+  }
+}
+
+TEST_P(VP10HighbdTrans16x16HT, HighbdCoeffCheck) {
+  RunBitexactCheck();
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+const Ht16x16Param kArrayHt16x16Param_sse2[] = {
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 0,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 1,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 2,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 3,
+                 VPX_BITS_8, 256),
+#if CONFIG_EXT_TX
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 4,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 5,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 6,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 7,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 8,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 10,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 11,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 12,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 13,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 14,
+                 VPX_BITS_8, 256),
+      make_tuple(&vp10_fht16x16_sse2, &vp10_iht16x16_256_add_sse2, 15,
+                 VPX_BITS_8, 256)
+#endif  // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP10Trans16x16HT,
+    ::testing::ValuesIn(kArrayHt16x16Param_sse2));
+#endif  // HAVE_SSE2
+
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+const HighbdHt16x16Param kArrayHBDHt16x16Param_sse4_1[] = {
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 0, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 0, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 1, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 1, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 2, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 2, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 3, 12),
+#if CONFIG_EXT_TX
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 4, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 4, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 5, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 5, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 6, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 6, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 7, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 7, 12),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 8, 10),
+    make_tuple(&vp10_fwd_txfm2d_16x16_sse4_1, 8, 12),
+#endif  // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VP10HighbdTrans16x16HT,
+    ::testing::ValuesIn(kArrayHBDHt16x16Param_sse4_1));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
+}  // namespace
diff --git a/test/vp10_fht4x4_test.cc b/test/vp10_fht4x4_test.cc
new file mode 100644
index 0000000..c5a4382
--- /dev/null
+++ b/test/vp10_fht4x4_test.cc
@@ -0,0 +1,231 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "vpx_ports/mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                        int tx_type);
+using std::tr1::tuple;
+using libvpx_test::FhtFunc;
+typedef tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t, int> Ht4x4Param;
+
+void fht4x4_ref(const int16_t *in, tran_low_t *out, int stride,
+                int tx_type) {
+  vp10_fht4x4_c(in, out, stride, tx_type);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*IhighbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                              int tx_type, int bd);
+typedef void (*HBDFhtFunc)(const int16_t *input, int32_t *output, int stride,
+                           int tx_type, int bd);
+
+// HighbdHt4x4Param argument list:
+// <Target optimized function, tx_type, bit depth>
+typedef tuple<HBDFhtFunc, int, int> HighbdHt4x4Param;
+
+void highbd_fht4x4_ref(const int16_t *in, int32_t *out, int stride,
+                       int tx_type, int bd) {
+  vp10_fwd_txfm2d_4x4_c(in, out, stride, tx_type, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+class VP10Trans4x4HT
+    : public libvpx_test::TransformTestBase,
+      public ::testing::TestWithParam<Ht4x4Param> {
+ public:
+  virtual ~VP10Trans4x4HT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_  = GET_PARAM(2);
+    pitch_    = 4;
+    fwd_txfm_ref = fht4x4_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = GET_PARAM(4);
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+    fwd_txfm_(in, out, stride, tx_type_);
+  }
+
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride, tx_type_);
+  }
+
+  FhtFunc fwd_txfm_;
+  IhtFunc inv_txfm_;
+};
+
+TEST_P(VP10Trans4x4HT, CoeffCheck) {
+  RunCoeffCheck();
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+class VP10HighbdTrans4x4HT : public ::testing::TestWithParam<HighbdHt4x4Param> {
+ public:
+  virtual ~VP10HighbdTrans4x4HT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    fwd_txfm_ref_ = highbd_fht4x4_ref;
+    tx_type_  = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = 16;
+
+    input_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
+    output_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+    output_ref_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+  }
+
+  virtual void TearDown() {
+    vpx_free(input_);
+    vpx_free(output_);
+    vpx_free(output_ref_);
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void RunBitexactCheck();
+
+ private:
+  HBDFhtFunc fwd_txfm_;
+  HBDFhtFunc fwd_txfm_ref_;
+  int tx_type_;
+  int bit_depth_;
+  int mask_;
+  int num_coeffs_;
+  int16_t *input_;
+  int32_t *output_;
+  int32_t *output_ref_;
+};
+
+void VP10HighbdTrans4x4HT::RunBitexactCheck() {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int i, j;
+  const int stride = 4;
+  const int num_tests = 1000;
+  const int num_coeffs = 16;
+
+  for (i = 0; i < num_tests; ++i) {
+    for (j = 0; j < num_coeffs; ++j) {
+      input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+    }
+
+    fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_);
+    fwd_txfm_(input_, output_, stride, tx_type_, bit_depth_);
+
+    for (j = 0; j < num_coeffs; ++j) {
+      EXPECT_EQ(output_[j], output_ref_[j])
+          << "Not bit-exact result at index: " << j
+          << " at test block: " << i;
+    }
+  }
+}
+
+TEST_P(VP10HighbdTrans4x4HT, HighbdCoeffCheck) {
+  RunBitexactCheck();
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+const Ht4x4Param kArrayHt4x4Param_sse2[] = {
+      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 0,
+                 VPX_BITS_8, 16),
+      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 1,
+                 VPX_BITS_8, 16),
+      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 2,
+                 VPX_BITS_8, 16),
+      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 3,
+                 VPX_BITS_8, 16),
+#if CONFIG_EXT_TX
+      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 4,
+                 VPX_BITS_8, 16),
+      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 5,
+                 VPX_BITS_8, 16),
+      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 6,
+                 VPX_BITS_8, 16),
+      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 7,
+                 VPX_BITS_8, 16),
+      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 8,
+                 VPX_BITS_8, 16),
+      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 10,
+                 VPX_BITS_8, 16),
+      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 11,
+                 VPX_BITS_8, 16),
+      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 12,
+                 VPX_BITS_8, 16),
+      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 13,
+                 VPX_BITS_8, 16),
+      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 14,
+                 VPX_BITS_8, 16),
+      make_tuple(&vp10_fht4x4_sse2, &vp10_iht4x4_16_add_sse2, 15,
+                 VPX_BITS_8, 16)
+#endif  // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP10Trans4x4HT,
+    ::testing::ValuesIn(kArrayHt4x4Param_sse2));
+#endif  // HAVE_SSE2
+
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+const HighbdHt4x4Param kArrayHighbdHt4x4Param[] = {
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 0, 10),
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 0, 12),
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 1, 10),
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 1, 12),
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 2, 10),
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 2, 12),
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 3, 10),
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 3, 12),
+#if CONFIG_EXT_TX
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 4, 10),
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 4, 12),
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 5, 10),
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 5, 12),
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 6, 10),
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 6, 12),
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 7, 10),
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 7, 12),
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 8, 10),
+    make_tuple(&vp10_fwd_txfm2d_4x4_sse4_1, 8, 12),
+#endif  // CONFIG_EXT_TX
+};
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VP10HighbdTrans4x4HT,
+    ::testing::ValuesIn(kArrayHighbdHt4x4Param));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
+}  // namespace
diff --git a/test/vp10_fht8x8_test.cc b/test/vp10_fht8x8_test.cc
new file mode 100644
index 0000000..da278c4
--- /dev/null
+++ b/test/vp10_fht8x8_test.cc
@@ -0,0 +1,229 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp10_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/transform_test_base.h"
+#include "test/util.h"
+#include "vpx_ports/mem.h"
+
+using libvpx_test::ACMRandom;
+
+namespace {
+typedef void (*IhtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                        int tx_type);
+
+using libvpx_test::FhtFunc;
+using std::tr1::tuple;
+typedef tuple<FhtFunc, IhtFunc, int, vpx_bit_depth_t, int> Ht8x8Param;
+
+void fht8x8_ref(const int16_t *in, tran_low_t *out, int stride,
+                int tx_type) {
+  vp10_fht8x8_c(in, out, stride, tx_type);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*IHbdHtFunc)(const tran_low_t *in, uint8_t *out, int stride,
+                           int tx_type, int bd);
+typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
+                          int tx_type, int bd);
+// Target optimized function, tx_type, bit depth
+typedef tuple<HbdHtFunc, int, int> HighbdHt8x8Param;
+
+void highbd_fht8x8_ref(const int16_t *in, int32_t *out, int stride,
+                       int tx_type, int bd) {
+  vp10_fwd_txfm2d_8x8_c(in, out, stride, tx_type, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+class VP10Trans8x8HT
+    : public libvpx_test::TransformTestBase,
+      public ::testing::TestWithParam<Ht8x8Param> {
+ public:
+  virtual ~VP10Trans8x8HT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    tx_type_  = GET_PARAM(2);
+    pitch_    = 8;
+    fwd_txfm_ref = fht8x8_ref;
+    bit_depth_ = GET_PARAM(3);
+    mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = GET_PARAM(4);
+  }
+  virtual void TearDown() { libvpx_test::ClearSystemState(); }
+
+ protected:
+  void RunFwdTxfm(const int16_t *in, tran_low_t *out, int stride) {
+    fwd_txfm_(in, out, stride, tx_type_);
+  }
+
+  void RunInvTxfm(const tran_low_t *out, uint8_t *dst, int stride) {
+    inv_txfm_(out, dst, stride, tx_type_);
+  }
+
+  FhtFunc fwd_txfm_;
+  IhtFunc inv_txfm_;
+};
+
+TEST_P(VP10Trans8x8HT, CoeffCheck) {
+  RunCoeffCheck();
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+class VP10HighbdTrans8x8HT : public ::testing::TestWithParam<HighbdHt8x8Param> {
+ public:
+  virtual ~VP10HighbdTrans8x8HT() {}
+
+  virtual void SetUp() {
+    fwd_txfm_ = GET_PARAM(0);
+    fwd_txfm_ref_ = highbd_fht8x8_ref;
+    tx_type_  = GET_PARAM(1);
+    bit_depth_ = GET_PARAM(2);
+    mask_ = (1 << bit_depth_) - 1;
+    num_coeffs_ = 64;
+
+    input_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(int16_t) * num_coeffs_));
+    output_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+    output_ref_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * num_coeffs_));
+  }
+
+  virtual void TearDown() {
+    vpx_free(input_);
+    vpx_free(output_);
+    vpx_free(output_ref_);
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void RunBitexactCheck();
+
+ private:
+  HbdHtFunc fwd_txfm_;
+  HbdHtFunc fwd_txfm_ref_;
+  int tx_type_;
+  int bit_depth_;
+  int mask_;
+  int num_coeffs_;
+  int16_t *input_;
+  int32_t *output_;
+  int32_t *output_ref_;
+};
+
+void VP10HighbdTrans8x8HT::RunBitexactCheck() {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  int i, j;
+  const int stride = 8;
+  const int num_tests = 1000;
+  const int num_coeffs = 64;
+
+  for (i = 0; i < num_tests; ++i) {
+    for (j = 0; j < num_coeffs; ++j) {
+      input_[j] = (rnd.Rand16() & mask_) - (rnd.Rand16() & mask_);
+    }
+
+    fwd_txfm_ref_(input_, output_ref_, stride, tx_type_, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(fwd_txfm_(input_, output_, stride, tx_type_,
+                                       bit_depth_));
+
+    for (j = 0; j < num_coeffs; ++j) {
+      EXPECT_EQ(output_ref_[j], output_[j])
+          << "Not bit-exact result at index: " << j
+          << " at test block: " << i;
+    }
+  }
+}
+
+TEST_P(VP10HighbdTrans8x8HT, HighbdCoeffCheck) {
+  RunBitexactCheck();
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE2
+const Ht8x8Param kArrayHt8x8Param_sse2[] = {
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 0,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 1,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 2,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 3,
+                 VPX_BITS_8, 64),
+#if CONFIG_EXT_TX
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 4,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 5,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 6,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 7,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 8,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 10,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 11,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 12,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 13,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 14,
+                 VPX_BITS_8, 64),
+      make_tuple(&vp10_fht8x8_sse2, &vp10_iht8x8_64_add_sse2, 15,
+                 VPX_BITS_8, 64)
+#endif  // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(
+    SSE2, VP10Trans8x8HT,
+    ::testing::ValuesIn(kArrayHt8x8Param_sse2));
+#endif  // HAVE_SSE2
+
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+const HighbdHt8x8Param kArrayHBDHt8x8Param_sse4_1[] = {
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 0, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 0, 12),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 1, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 1, 12),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 2, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 2, 12),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 3, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 3, 12),
+#if CONFIG_EXT_TX
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 4, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 4, 12),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 5, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 5, 12),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 6, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 6, 12),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 7, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 7, 12),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 8, 10),
+    make_tuple(&vp10_fwd_txfm2d_8x8_sse4_1, 8, 12),
+#endif  // CONFIG_EXT_TX
+};
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VP10HighbdTrans8x8HT,
+    ::testing::ValuesIn(kArrayHBDHt8x8Param_sse4_1));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
+}  // namespace
diff --git a/test/vp10_fwd_txfm1d_test.cc b/test/vp10_fwd_txfm1d_test.cc
new file mode 100644
index 0000000..f8dc0b6
--- /dev/null
+++ b/test/vp10_fwd_txfm1d_test.cc
@@ -0,0 +1,128 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/common/vp10_fwd_txfm1d.h"
+#include "test/vp10_txfm_test.h"
+
+using libvpx_test::ACMRandom;
+using libvpx_test::input_base;
+using libvpx_test::reference_hybrid_1d;
+using libvpx_test::TYPE_TXFM;
+using libvpx_test::TYPE_DCT;
+using libvpx_test::TYPE_ADST;
+
+namespace {
+const int txfm_type_num = 2;
+const TYPE_TXFM txfm_type_ls[2] = {TYPE_DCT, TYPE_ADST};
+
+const int txfm_size_num = 5;
+const int txfm_size_ls[5] = {4, 8, 16, 32, 64};
+
+const TxfmFunc fwd_txfm_func_ls[2][5] = {
+    {vp10_fdct4_new, vp10_fdct8_new, vp10_fdct16_new, vp10_fdct32_new,
+     vp10_fdct64_new},
+    {vp10_fadst4_new, vp10_fadst8_new, vp10_fadst16_new, vp10_fadst32_new,
+     NULL}};
+
+// the maximum stage number of fwd/inv 1d dct/adst txfm is 12
+const int8_t cos_bit[12] = {14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
+const int8_t range_bit[12] = {32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
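+// cos_bit sets the fixed-point precision of the per-stage cosine constants;
+// range_bit bounds the bit range of intermediate values (32 effectively
+// disables any clamping for int32 intermediates).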
+
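+// round_shift(x, n) is expected to behave like (x + (1 << (n - 1))) >> n:
+// round x / 2^n to the nearest integer, with ties broken toward +infinity
+// (so -7 / 2 = -3.5 rounds to -3).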
+TEST(vp10_fwd_txfm1d, round_shift) {
+  EXPECT_EQ(round_shift(7, 1), 4);
+  EXPECT_EQ(round_shift(-7, 1), -3);
+
+  EXPECT_EQ(round_shift(7, 2), 2);
+  EXPECT_EQ(round_shift(-7, 2), -2);
+
+  EXPECT_EQ(round_shift(8, 2), 2);
+  EXPECT_EQ(round_shift(-8, 2), -2);
+}
+
+TEST(vp10_fwd_txfm1d, get_max_bit) {
+  int max_bit = get_max_bit(8);
+  EXPECT_EQ(max_bit, 3);
+}
+
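+// cospi_arr[i][j] should hold cos(PI * j / 128) scaled to (cos_bit_min + i)
+// fractional bits and rounded to nearest; the loop below checks the first
+// 7 precision rows against a double-precision reference.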
+TEST(vp10_fwd_txfm1d, cospi_arr) {
+  for (int i = 0; i < 7; i++) {
+    for (int j = 0; j < 64; j++) {
+      EXPECT_EQ(cospi_arr[i][j],
+                (int32_t)round(cos(M_PI * j / 128) * (1 << (cos_bit_min + i))));
+    }
+  }
+}
+
+TEST(vp10_fwd_txfm1d, clamp_block) {
+  int16_t block[5][5] = {{7, -5, 6, -3, 9},
+                         {7, -5, 6, -3, 9},
+                         {7, -5, 6, -3, 9},
+                         {7, -5, 6, -3, 9},
+                         {7, -5, 6, -3, 9}};
+
+  int16_t ref_block[5][5] = {{7, -5, 6, -3, 9},
+                             {7, -5, 6, -3, 9},
+                             {7, -4, 2, -3, 9},
+                             {7, -4, 2, -3, 9},
+                             {7, -4, 2, -3, 9}};
+
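+  // clamp_block is applied to the 3x3 sub-block at (row, col) = (2, 1) with
+  // range [-4, 2]; every entry outside that window must stay untouched.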
+  int row = 2;
+  int col = 1;
+  int block_size = 3;
+  int stride = 5;
+  clamp_block(block[row] + col, block_size, stride, -4, 2);
+  for (int r = 0; r < stride; r++) {
+    for (int c = 0; c < stride; c++) {
+      EXPECT_EQ(block[r][c], ref_block[r][c]);
+    }
+  }
+}
+
+TEST(vp10_fwd_txfm1d, accuracy) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  for (int si = 0; si < txfm_size_num; ++si) {
+    int txfm_size = txfm_size_ls[si];
+    int32_t *input = new int32_t[txfm_size];
+    int32_t *output = new int32_t[txfm_size];
+    double *ref_input = new double[txfm_size];
+    double *ref_output = new double[txfm_size];
+
+    for (int ti = 0; ti < txfm_type_num; ++ti) {
+      TYPE_TXFM txfm_type = txfm_type_ls[ti];
+      TxfmFunc fwd_txfm_func = fwd_txfm_func_ls[ti][si];
+      int max_error = 7;
+
+      const int count_test_block = 5000;
+      if (fwd_txfm_func != NULL) {
+        for (int ci = 0; ci < count_test_block; ++ci) {
+          for (int ni = 0; ni < txfm_size; ++ni) {
+            input[ni] = rnd.Rand16() % input_base - rnd.Rand16() % input_base;
+            ref_input[ni] = static_cast<double>(input[ni]);
+          }
+
+          fwd_txfm_func(input, output, cos_bit, range_bit);
+          reference_hybrid_1d(ref_input, ref_output, txfm_size, txfm_type);
+
+          for (int ni = 0; ni < txfm_size; ++ni) {
+            EXPECT_LE(
+                abs(output[ni] - static_cast<int32_t>(round(ref_output[ni]))),
+                max_error);
+          }
+        }
+      }
+    }
+
+    delete[] input;
+    delete[] output;
+    delete[] ref_input;
+    delete[] ref_output;
+  }
+}
+}  // namespace
diff --git a/test/vp10_fwd_txfm2d_test.cc b/test/vp10_fwd_txfm2d_test.cc
new file mode 100644
index 0000000..953ae11
--- /dev/null
+++ b/test/vp10_fwd_txfm2d_test.cc
@@ -0,0 +1,181 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/vp10_txfm_test.h"
+#include "vp10/common/vp10_txfm.h"
+#include "./vp10_rtcd.h"
+
+using libvpx_test::ACMRandom;
+using libvpx_test::input_base;
+using libvpx_test::bd;
+using libvpx_test::compute_avg_abs_error;
+using libvpx_test::Fwd_Txfm2d_Func;
+using libvpx_test::TYPE_TXFM;
+
+namespace {
+#if CONFIG_VP9_HIGHBITDEPTH
+// tx_type_, tx_size_, max_error_, max_avg_error_
+typedef std::tr1::tuple<TX_TYPE, TX_SIZE, double, double> VP10FwdTxfm2dParam;
+
+class VP10FwdTxfm2d : public ::testing::TestWithParam<VP10FwdTxfm2dParam> {
+ public:
+  virtual void SetUp() {
+    tx_type_ = GET_PARAM(0);
+    tx_size_ = GET_PARAM(1);
+    max_error_ = GET_PARAM(2);
+    max_avg_error_ = GET_PARAM(3);
+    count_ = 500;
+    TXFM_2D_FLIP_CFG fwd_txfm_flip_cfg =
+        vp10_get_fwd_txfm_cfg(tx_type_, tx_size_);
+    const TXFM_2D_CFG *fwd_txfm_cfg = fwd_txfm_flip_cfg.cfg;
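+    // The per-stage shifts scale the integer output by 2^amplify_bit
+    // relative to the double-precision reference, so errors are normalized
+    // by amplify_factor_ before comparison.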
+    int amplify_bit = fwd_txfm_cfg->shift[0] + fwd_txfm_cfg->shift[1] +
+                      fwd_txfm_cfg->shift[2];
+    ud_flip_ = fwd_txfm_flip_cfg.ud_flip;
+    lr_flip_ = fwd_txfm_flip_cfg.lr_flip;
+    amplify_factor_ =
+        amplify_bit >= 0 ? (1 << amplify_bit) : (1.0 / (1 << -amplify_bit));
+
+    fwd_txfm_ = libvpx_test::fwd_txfm_func_ls[tx_size_];
+    txfm1d_size_ = libvpx_test::get_txfm1d_size(tx_size_);
+    txfm2d_size_ = txfm1d_size_ * txfm1d_size_;
+    get_txfm1d_type(tx_type_, &type0_, &type1_);
+    input_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(int16_t) * txfm2d_size_));
+    output_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * txfm2d_size_));
+    ref_input_ = reinterpret_cast<double *>(
+        vpx_memalign(16, sizeof(double) * txfm2d_size_));
+    ref_output_ = reinterpret_cast<double *>(
+        vpx_memalign(16, sizeof(double) * txfm2d_size_));
+  }
+
+  void RunFwdAccuracyCheck() {
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    double avg_abs_error = 0;
+    for (int ci = 0; ci < count_; ci++) {
+      for (int ni = 0; ni < txfm2d_size_; ++ni) {
+        input_[ni] = rnd.Rand16() % input_base;
+        ref_input_[ni] = static_cast<double>(input_[ni]);
+        output_[ni] = 0;
+        ref_output_[ni] = 0;
+      }
+
+      fwd_txfm_(input_, output_, txfm1d_size_, tx_type_, bd);
+
+      if (lr_flip_ && ud_flip_)
+        libvpx_test::fliplrud(ref_input_, txfm1d_size_, txfm1d_size_);
+      else if (lr_flip_)
+        libvpx_test::fliplr(ref_input_, txfm1d_size_, txfm1d_size_);
+      else if (ud_flip_)
+        libvpx_test::flipud(ref_input_, txfm1d_size_, txfm1d_size_);
+
+      reference_hybrid_2d(ref_input_, ref_output_, txfm1d_size_,
+                          type0_, type1_);
+
+      for (int ni = 0; ni < txfm2d_size_; ++ni) {
+        ref_output_[ni] = round(ref_output_[ni] * amplify_factor_);
+        EXPECT_GE(max_error_,
+                  fabs(output_[ni] - ref_output_[ni]) / amplify_factor_);
+      }
+      avg_abs_error += compute_avg_abs_error<int32_t, double>(
+          output_, ref_output_, txfm2d_size_);
+    }
+
+    avg_abs_error /= amplify_factor_;
+    avg_abs_error /= count_;
+    // max_avg_error_ comes from the observed upper bound of avg_abs_error:
+    // printf("type0: %d type1: %d txfm_size: %d accuracy_avg_abs_error: %f\n",
+    //        type0_, type1_, txfm1d_size_, avg_abs_error);
+    EXPECT_GE(max_avg_error_, avg_abs_error);
+  }
+
+  virtual void TearDown() {
+    vpx_free(input_);
+    vpx_free(output_);
+    vpx_free(ref_input_);
+    vpx_free(ref_output_);
+  }
+
+ private:
+  double max_error_;
+  double max_avg_error_;
+  int count_;
+  double amplify_factor_;
+  TX_TYPE tx_type_;
+  TX_SIZE tx_size_;
+  int txfm1d_size_;
+  int txfm2d_size_;
+  Fwd_Txfm2d_Func fwd_txfm_;
+  TYPE_TXFM type0_;
+  TYPE_TXFM type1_;
+  int16_t* input_;
+  int32_t* output_;
+  double* ref_input_;
+  double* ref_output_;
+  int ud_flip_;  // flip upside down
+  int lr_flip_;  // flip left to right
+};
+
+TEST_P(VP10FwdTxfm2d, RunFwdAccuracyCheck) {
+  RunFwdAccuracyCheck();
+}
+const VP10FwdTxfm2dParam vp10_fwd_txfm2d_param_c[] = {
+#if CONFIG_EXT_TX
+  VP10FwdTxfm2dParam(FLIPADST_DCT,  TX_4X4, 2, 0.2),
+  VP10FwdTxfm2dParam(DCT_FLIPADST,  TX_4X4, 2, 0.2),
+  VP10FwdTxfm2dParam(FLIPADST_FLIPADST, TX_4X4, 2, 0.2),
+  VP10FwdTxfm2dParam(ADST_FLIPADST, TX_4X4, 2, 0.2),
+  VP10FwdTxfm2dParam(FLIPADST_ADST, TX_4X4, 2, 0.2),
+  VP10FwdTxfm2dParam(FLIPADST_DCT,  TX_8X8, 5, 0.6),
+  VP10FwdTxfm2dParam(DCT_FLIPADST,  TX_8X8, 5, 0.6),
+  VP10FwdTxfm2dParam(FLIPADST_FLIPADST, TX_8X8, 5, 0.6),
+  VP10FwdTxfm2dParam(ADST_FLIPADST, TX_8X8, 5, 0.6),
+  VP10FwdTxfm2dParam(FLIPADST_ADST, TX_8X8, 5, 0.6),
+  VP10FwdTxfm2dParam(FLIPADST_DCT,  TX_16X16, 11, 1.5),
+  VP10FwdTxfm2dParam(DCT_FLIPADST,  TX_16X16, 11, 1.5),
+  VP10FwdTxfm2dParam(FLIPADST_FLIPADST, TX_16X16, 11, 1.5),
+  VP10FwdTxfm2dParam(ADST_FLIPADST, TX_16X16, 11, 1.5),
+  VP10FwdTxfm2dParam(FLIPADST_ADST, TX_16X16, 11, 1.5),
+  VP10FwdTxfm2dParam(FLIPADST_DCT,  TX_32X32, 70, 7),
+  VP10FwdTxfm2dParam(DCT_FLIPADST,  TX_32X32, 70, 7),
+  VP10FwdTxfm2dParam(FLIPADST_FLIPADST, TX_32X32, 70, 7),
+  VP10FwdTxfm2dParam(ADST_FLIPADST, TX_32X32, 70, 7),
+  VP10FwdTxfm2dParam(FLIPADST_ADST, TX_32X32, 70, 7),
+#endif
+  VP10FwdTxfm2dParam(DCT_DCT,   TX_4X4, 2, 0.2),
+  VP10FwdTxfm2dParam(ADST_DCT,  TX_4X4, 2, 0.2),
+  VP10FwdTxfm2dParam(DCT_ADST,  TX_4X4, 2, 0.2),
+  VP10FwdTxfm2dParam(ADST_ADST, TX_4X4, 2, 0.2),
+  VP10FwdTxfm2dParam(DCT_DCT,   TX_8X8, 5, 0.6),
+  VP10FwdTxfm2dParam(ADST_DCT,  TX_8X8, 5, 0.6),
+  VP10FwdTxfm2dParam(DCT_ADST,  TX_8X8, 5, 0.6),
+  VP10FwdTxfm2dParam(ADST_ADST, TX_8X8, 5, 0.6),
+  VP10FwdTxfm2dParam(DCT_DCT,   TX_16X16, 11, 1.5),
+  VP10FwdTxfm2dParam(ADST_DCT,  TX_16X16, 11, 1.5),
+  VP10FwdTxfm2dParam(DCT_ADST,  TX_16X16, 11, 1.5),
+  VP10FwdTxfm2dParam(ADST_ADST, TX_16X16, 11, 1.5),
+  VP10FwdTxfm2dParam(DCT_DCT,   TX_32X32, 70, 7),
+  VP10FwdTxfm2dParam(ADST_DCT,  TX_32X32, 70, 7),
+  VP10FwdTxfm2dParam(DCT_ADST,  TX_32X32, 70, 7),
+  VP10FwdTxfm2dParam(ADST_ADST, TX_32X32, 70, 7)
+};
+
+INSTANTIATE_TEST_CASE_P(
+    C, VP10FwdTxfm2d,
+    ::testing::ValuesIn(vp10_fwd_txfm2d_param_c));
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}  // namespace
diff --git a/test/vp10_highbd_iht_test.cc b/test/vp10_highbd_iht_test.cc
new file mode 100644
index 0000000..caab04c
--- /dev/null
+++ b/test/vp10_highbd_iht_test.cc
@@ -0,0 +1,222 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vp10_rtcd.h"
+#include "test/acm_random.h"
+#include "test/clear_system_state.h"
+#include "test/register_state_check.h"
+#include "test/util.h"
+#include "vp10/common/enums.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+namespace {
+
+using std::tr1::tuple;
+using libvpx_test::ACMRandom;
+
+typedef void (*HbdHtFunc)(const int16_t *input, int32_t *output, int stride,
+                          int tx_type, int bd);
+
+typedef void (*IHbdHtFunc)(const int32_t *coeff, uint16_t *output, int stride,
+                           int tx_type, int bd);
+
+// Test parameter argument list:
+//   <transform reference function,
+//    optimized inverse transform function,
+//    inverse transform reference function,
+//    num_coeffs,
+//    tx_type,
+//    bit_depth>
+typedef tuple<HbdHtFunc, IHbdHtFunc, IHbdHtFunc, int, int, int> IHbdHtParam;
+
+class VP10HighbdInvHTNxN : public ::testing::TestWithParam<IHbdHtParam> {
+ public:
+  virtual ~VP10HighbdInvHTNxN() {}
+
+  virtual void SetUp() {
+    txfm_ref_ = GET_PARAM(0);
+    inv_txfm_ = GET_PARAM(1);
+    inv_txfm_ref_ = GET_PARAM(2);
+    num_coeffs_ = GET_PARAM(3);
+    tx_type_ = GET_PARAM(4);
+    bit_depth_ = GET_PARAM(5);
+
+    input_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(input_[0]) * num_coeffs_));
+
+    // Note:
+    // Inverse transform input buffer is 32-byte aligned
+    // Refer to <root>/vp10/encoder/context_tree.c, function,
+    // void alloc_mode_context().
+    coeffs_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(32, sizeof(coeffs_[0]) * num_coeffs_));
+    output_ = reinterpret_cast<uint16_t *>(
+        vpx_memalign(32, sizeof(output_[0]) * num_coeffs_));
+    output_ref_ = reinterpret_cast<uint16_t *>(
+        vpx_memalign(32, sizeof(output_ref_[0]) * num_coeffs_));
+  }
+
+  virtual void TearDown() {
+    vpx_free(input_);
+    vpx_free(coeffs_);
+    vpx_free(output_);
+    vpx_free(output_ref_);
+    libvpx_test::ClearSystemState();
+  }
+
+ protected:
+  void RunBitexactCheck();
+
+ private:
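+  // All blocks under test are square, so the stride equals the block width,
+  // i.e. sqrt(num_coeffs_); unsupported sizes return 0.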
+  int GetStride() const {
+    if (16 == num_coeffs_) {
+      return 4;
+    } else if (64 == num_coeffs_) {
+      return 8;
+    } else if (256 == num_coeffs_) {
+      return 16;
+    } else {
+      return 0;
+    }
+  }
+
+  HbdHtFunc txfm_ref_;
+  IHbdHtFunc inv_txfm_;
+  IHbdHtFunc inv_txfm_ref_;
+  int num_coeffs_;
+  int tx_type_;
+  int bit_depth_;
+
+  int16_t *input_;
+  int32_t *coeffs_;
+  uint16_t *output_;
+  uint16_t *output_ref_;
+};
+
+void VP10HighbdInvHTNxN::RunBitexactCheck() {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  const int stride = GetStride();
+  const int num_tests = 20000;
+  const uint16_t mask = (1 << bit_depth_) - 1;
+
+  for (int i = 0; i < num_tests; ++i) {
+    for (int j = 0; j < num_coeffs_; ++j) {
+      input_[j] = (rnd.Rand16() & mask) - (rnd.Rand16() & mask);
+      output_ref_[j] = rnd.Rand16() & mask;
+      output_[j] = output_ref_[j];
+    }
+
+    txfm_ref_(input_, coeffs_, stride, tx_type_, bit_depth_);
+    inv_txfm_ref_(coeffs_, output_ref_, stride, tx_type_, bit_depth_);
+    ASM_REGISTER_STATE_CHECK(inv_txfm_(coeffs_, output_, stride, tx_type_,
+                                       bit_depth_));
+
+    for (int j = 0; j < num_coeffs_; ++j) {
+      EXPECT_EQ(output_ref_[j], output_[j])
+          << "Not bit-exact result at index: " << j
+          << " At test block: " << i;
+    }
+  }
+}
+
+TEST_P(VP10HighbdInvHTNxN, InvTransResultCheck) {
+  RunBitexactCheck();
+}
+
+using std::tr1::make_tuple;
+
+#if HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+#define PARAM_LIST_4X4 &vp10_fwd_txfm2d_4x4_c, \
+             &vp10_inv_txfm2d_add_4x4_sse4_1,  \
+             &vp10_inv_txfm2d_add_4x4_c, 16
+
+#define PARAM_LIST_8X8 &vp10_fwd_txfm2d_8x8_c, \
+             &vp10_inv_txfm2d_add_8x8_sse4_1,  \
+             &vp10_inv_txfm2d_add_8x8_c, 64
+
+#define PARAM_LIST_16X16 &vp10_fwd_txfm2d_16x16_c, \
+             &vp10_inv_txfm2d_add_16x16_sse4_1,    \
+             &vp10_inv_txfm2d_add_16x16_c, 256
+
+const IHbdHtParam kArrayIhtParam[] = {
+  // 16x16
+  make_tuple(PARAM_LIST_16X16, DCT_DCT, 10),
+  make_tuple(PARAM_LIST_16X16, DCT_DCT, 12),
+  make_tuple(PARAM_LIST_16X16, ADST_DCT, 10),
+  make_tuple(PARAM_LIST_16X16, ADST_DCT, 12),
+  make_tuple(PARAM_LIST_16X16, DCT_ADST, 10),
+  make_tuple(PARAM_LIST_16X16, DCT_ADST, 12),
+  make_tuple(PARAM_LIST_16X16, ADST_ADST, 10),
+  make_tuple(PARAM_LIST_16X16, ADST_ADST, 12),
+#if CONFIG_EXT_TX
+  make_tuple(PARAM_LIST_16X16, FLIPADST_DCT, 10),
+  make_tuple(PARAM_LIST_16X16, FLIPADST_DCT, 12),
+  make_tuple(PARAM_LIST_16X16, DCT_FLIPADST, 10),
+  make_tuple(PARAM_LIST_16X16, DCT_FLIPADST, 12),
+  make_tuple(PARAM_LIST_16X16, FLIPADST_FLIPADST, 10),
+  make_tuple(PARAM_LIST_16X16, FLIPADST_FLIPADST, 12),
+  make_tuple(PARAM_LIST_16X16, ADST_FLIPADST, 10),
+  make_tuple(PARAM_LIST_16X16, ADST_FLIPADST, 12),
+  make_tuple(PARAM_LIST_16X16, FLIPADST_ADST, 10),
+  make_tuple(PARAM_LIST_16X16, FLIPADST_ADST, 12),
+#endif
+  // 8x8
+  make_tuple(PARAM_LIST_8X8, DCT_DCT, 10),
+  make_tuple(PARAM_LIST_8X8, DCT_DCT, 12),
+  make_tuple(PARAM_LIST_8X8, ADST_DCT, 10),
+  make_tuple(PARAM_LIST_8X8, ADST_DCT, 12),
+  make_tuple(PARAM_LIST_8X8, DCT_ADST, 10),
+  make_tuple(PARAM_LIST_8X8, DCT_ADST, 12),
+  make_tuple(PARAM_LIST_8X8, ADST_ADST, 10),
+  make_tuple(PARAM_LIST_8X8, ADST_ADST, 12),
+#if CONFIG_EXT_TX
+  make_tuple(PARAM_LIST_8X8, FLIPADST_DCT, 10),
+  make_tuple(PARAM_LIST_8X8, FLIPADST_DCT, 12),
+  make_tuple(PARAM_LIST_8X8, DCT_FLIPADST, 10),
+  make_tuple(PARAM_LIST_8X8, DCT_FLIPADST, 12),
+  make_tuple(PARAM_LIST_8X8, FLIPADST_FLIPADST, 10),
+  make_tuple(PARAM_LIST_8X8, FLIPADST_FLIPADST, 12),
+  make_tuple(PARAM_LIST_8X8, ADST_FLIPADST, 10),
+  make_tuple(PARAM_LIST_8X8, ADST_FLIPADST, 12),
+  make_tuple(PARAM_LIST_8X8, FLIPADST_ADST, 10),
+  make_tuple(PARAM_LIST_8X8, FLIPADST_ADST, 12),
+#endif
+  // 4x4
+  make_tuple(PARAM_LIST_4X4, DCT_DCT, 10),
+  make_tuple(PARAM_LIST_4X4, DCT_DCT, 12),
+  make_tuple(PARAM_LIST_4X4, ADST_DCT, 10),
+  make_tuple(PARAM_LIST_4X4, ADST_DCT, 12),
+  make_tuple(PARAM_LIST_4X4, DCT_ADST, 10),
+  make_tuple(PARAM_LIST_4X4, DCT_ADST, 12),
+  make_tuple(PARAM_LIST_4X4, ADST_ADST, 10),
+  make_tuple(PARAM_LIST_4X4, ADST_ADST, 12),
+#if CONFIG_EXT_TX
+  make_tuple(PARAM_LIST_4X4, FLIPADST_DCT, 10),
+  make_tuple(PARAM_LIST_4X4, FLIPADST_DCT, 12),
+  make_tuple(PARAM_LIST_4X4, DCT_FLIPADST, 10),
+  make_tuple(PARAM_LIST_4X4, DCT_FLIPADST, 12),
+  make_tuple(PARAM_LIST_4X4, FLIPADST_FLIPADST, 10),
+  make_tuple(PARAM_LIST_4X4, FLIPADST_FLIPADST, 12),
+  make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 10),
+  make_tuple(PARAM_LIST_4X4, ADST_FLIPADST, 12),
+  make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 10),
+  make_tuple(PARAM_LIST_4X4, FLIPADST_ADST, 12),
+#endif
+};
+
+INSTANTIATE_TEST_CASE_P(
+    SSE4_1, VP10HighbdInvHTNxN,
+    ::testing::ValuesIn(kArrayIhtParam));
+#endif  // HAVE_SSE4_1 && CONFIG_VP9_HIGHBITDEPTH
+
+}  // namespace
diff --git a/test/vp10_inv_txfm1d_test.cc b/test/vp10_inv_txfm1d_test.cc
new file mode 100644
index 0000000..c024f2c
--- /dev/null
+++ b/test/vp10_inv_txfm1d_test.cc
@@ -0,0 +1,77 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "test/vp10_txfm_test.h"
+#include "vp10/common/vp10_fwd_txfm1d.h"
+#include "vp10/common/vp10_inv_txfm1d.h"
+
+using libvpx_test::ACMRandom;
+using libvpx_test::input_base;
+
+namespace {
+const int txfm_type_num = 2;
+const int txfm_size_num = 5;
+const int txfm_size_ls[5] = {4, 8, 16, 32, 64};
+
+const TxfmFunc fwd_txfm_func_ls[2][5] = {
+    {vp10_fdct4_new, vp10_fdct8_new, vp10_fdct16_new, vp10_fdct32_new,
+     vp10_fdct64_new},
+    {vp10_fadst4_new, vp10_fadst8_new, vp10_fadst16_new, vp10_fadst32_new,
+     NULL}};
+
+const TxfmFunc inv_txfm_func_ls[2][5] = {
+    {vp10_idct4_new, vp10_idct8_new, vp10_idct16_new, vp10_idct32_new,
+     vp10_idct64_new},
+    {vp10_iadst4_new, vp10_iadst8_new, vp10_iadst16_new, vp10_iadst32_new,
+     NULL}};
+
+// the maximum stage number of fwd/inv 1d dct/adst txfm is 12
+const int8_t cos_bit[12] = {14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14};
+const int8_t range_bit[12] = {32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32};
+
+TEST(vp10_inv_txfm1d, round_trip) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  for (int si = 0; si < txfm_size_num; ++si) {
+    int txfm_size = txfm_size_ls[si];
+    int32_t *input = new int32_t[txfm_size];
+    int32_t *output = new int32_t[txfm_size];
+    int32_t *round_trip_output = new int32_t[txfm_size];
+
+    for (int ti = 0; ti < txfm_type_num; ++ti) {
+      TxfmFunc fwd_txfm_func = fwd_txfm_func_ls[ti][si];
+      TxfmFunc inv_txfm_func = inv_txfm_func_ls[ti][si];
+      int max_error = 2;
+
+      if (fwd_txfm_func != NULL) {
+        const int count_test_block = 5000;
+        for (int ci = 0; ci < count_test_block; ++ci) {
+          for (int ni = 0; ni < txfm_size; ++ni) {
+            input[ni] = rnd.Rand16() % input_base - rnd.Rand16() % input_base;
+          }
+
+          fwd_txfm_func(input, output, cos_bit, range_bit);
+          inv_txfm_func(output, round_trip_output, cos_bit, range_bit);
+
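+          // A forward/inverse pass through these unnormalized transforms
+          // appears to scale the signal by 2^(get_max_bit(txfm_size) - 1);
+          // the round_shift below undoes that before comparing.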
+          for (int ni = 0; ni < txfm_size; ++ni) {
+            int node_err =
+                abs(input[ni] - round_shift(round_trip_output[ni],
+                                            get_max_bit(txfm_size) - 1));
+            EXPECT_LE(node_err, max_error);
+          }
+        }
+      }
+    }
+    delete[] input;
+    delete[] output;
+    delete[] round_trip_output;
+  }
+}
+
+}  // namespace
diff --git a/test/vp10_inv_txfm2d_test.cc b/test/vp10_inv_txfm2d_test.cc
new file mode 100644
index 0000000..d2ab04b
--- /dev/null
+++ b/test/vp10_inv_txfm2d_test.cc
@@ -0,0 +1,158 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "./vp10_rtcd.h"
+#include "test/acm_random.h"
+#include "test/util.h"
+#include "test/vp10_txfm_test.h"
+#include "vp10/common/vp10_inv_txfm2d_cfg.h"
+
+using libvpx_test::ACMRandom;
+using libvpx_test::input_base;
+using libvpx_test::bd;
+using libvpx_test::compute_avg_abs_error;
+using libvpx_test::Fwd_Txfm2d_Func;
+using libvpx_test::Inv_Txfm2d_Func;
+
+namespace {
+
+#if CONFIG_VP9_HIGHBITDEPTH
+// VP10InvTxfm2dParam argument list:
+// tx_type_, tx_size_, max_error_, max_avg_error_
+typedef std::tr1::tuple<TX_TYPE, TX_SIZE, double, double> VP10InvTxfm2dParam;
+
+class VP10InvTxfm2d : public ::testing::TestWithParam<VP10InvTxfm2dParam> {
+ public:
+  virtual void SetUp() {
+    tx_type_ = GET_PARAM(0);
+    tx_size_ = GET_PARAM(1);
+    max_error_ = GET_PARAM(2);
+    max_avg_error_ = GET_PARAM(3);
+    txfm1d_size_ = libvpx_test::get_txfm1d_size(tx_size_);
+    txfm2d_size_ = txfm1d_size_ * txfm1d_size_;
+    count_ = 500;
+
+    input_ = reinterpret_cast<int16_t *>(
+        vpx_memalign(16, sizeof(int16_t) * txfm2d_size_));
+    ref_input_ = reinterpret_cast<uint16_t *>(
+        vpx_memalign(16, sizeof(uint16_t) * txfm2d_size_));
+    output_ = reinterpret_cast<int32_t *>(
+        vpx_memalign(16, sizeof(int32_t) * txfm2d_size_));
+  }
+
+  void RunRoundtripCheck() {
+    const Fwd_Txfm2d_Func fwd_txfm_func =
+        libvpx_test::fwd_txfm_func_ls[tx_size_];
+    const Inv_Txfm2d_Func inv_txfm_func =
+        libvpx_test::inv_txfm_func_ls[tx_size_];
+    double avg_abs_error = 0;
+    ACMRandom rnd(ACMRandom::DeterministicSeed());
+    for (int ci = 0; ci < count_; ci++) {
+      for (int ni = 0; ni < txfm2d_size_; ++ni) {
+        if (ci == 0) {
+          int extreme_input = input_base - 1;
+          input_[ni] = extreme_input;  // extreme case
+          ref_input_[ni] = 0;
+        } else {
+          input_[ni] = rnd.Rand16() % input_base;
+          ref_input_[ni] = 0;
+        }
+      }
+
+      fwd_txfm_func(input_, output_, txfm1d_size_, tx_type_, bd);
+      inv_txfm_func(output_, ref_input_, txfm1d_size_, tx_type_, bd);
+
+      for (int ni = 0; ni < txfm2d_size_; ++ni) {
+        EXPECT_GE(max_error_, abs(input_[ni] - ref_input_[ni]));
+      }
+      avg_abs_error += compute_avg_abs_error<int16_t, uint16_t>(
+          input_, ref_input_, txfm2d_size_);
+    }
+
+    avg_abs_error /= count_;
+    // max_avg_error_ comes from the observed upper bound of avg_abs_error:
+    // printf("txfm1d_size: %d accuracy_avg_abs_error: %f\n",
+    //        txfm1d_size_, avg_abs_error);
+    EXPECT_GE(max_avg_error_, avg_abs_error);
+  }
+
+  virtual void TearDown() {
+    vpx_free(input_);
+    vpx_free(output_);
+    vpx_free(ref_input_);
+  }
+
+ private:
+  int count_;
+  double max_error_;
+  double max_avg_error_;
+  TX_TYPE tx_type_;
+  TX_SIZE tx_size_;
+  int txfm1d_size_;
+  int txfm2d_size_;
+  int16_t* input_;
+  uint16_t* ref_input_;
+  int32_t* output_;
+};
+
+TEST_P(VP10InvTxfm2d, RunRoundtripCheck) { RunRoundtripCheck(); }
+
+const VP10InvTxfm2dParam vp10_inv_txfm2d_param[] = {
+#if CONFIG_EXT_TX
+  VP10InvTxfm2dParam(FLIPADST_DCT, TX_4X4, 2, 0.002),
+  VP10InvTxfm2dParam(DCT_FLIPADST, TX_4X4, 2, 0.002),
+  VP10InvTxfm2dParam(FLIPADST_FLIPADST, TX_4X4, 2, 0.002),
+  VP10InvTxfm2dParam(ADST_FLIPADST, TX_4X4, 2, 0.002),
+  VP10InvTxfm2dParam(FLIPADST_ADST, TX_4X4, 2, 0.002),
+  VP10InvTxfm2dParam(FLIPADST_DCT, TX_8X8, 2, 0.02),
+  VP10InvTxfm2dParam(DCT_FLIPADST, TX_8X8, 2, 0.02),
+  VP10InvTxfm2dParam(FLIPADST_FLIPADST, TX_8X8, 2, 0.02),
+  VP10InvTxfm2dParam(ADST_FLIPADST, TX_8X8, 2, 0.02),
+  VP10InvTxfm2dParam(FLIPADST_ADST, TX_8X8, 2, 0.02),
+  VP10InvTxfm2dParam(FLIPADST_DCT, TX_16X16, 2, 0.04),
+  VP10InvTxfm2dParam(DCT_FLIPADST, TX_16X16, 2, 0.04),
+  VP10InvTxfm2dParam(FLIPADST_FLIPADST, TX_16X16, 11, 0.04),
+  VP10InvTxfm2dParam(ADST_FLIPADST, TX_16X16, 2, 0.04),
+  VP10InvTxfm2dParam(FLIPADST_ADST, TX_16X16, 2, 0.04),
+  VP10InvTxfm2dParam(FLIPADST_DCT, TX_32X32, 4, 0.4),
+  VP10InvTxfm2dParam(DCT_FLIPADST, TX_32X32, 4, 0.4),
+  VP10InvTxfm2dParam(FLIPADST_FLIPADST, TX_32X32, 4, 0.4),
+  VP10InvTxfm2dParam(ADST_FLIPADST, TX_32X32, 4, 0.4),
+  VP10InvTxfm2dParam(FLIPADST_ADST, TX_32X32, 4, 0.4),
+#endif
+  VP10InvTxfm2dParam(DCT_DCT, TX_4X4, 2, 0.002),
+  VP10InvTxfm2dParam(ADST_DCT, TX_4X4, 2, 0.002),
+  VP10InvTxfm2dParam(DCT_ADST, TX_4X4, 2, 0.002),
+  VP10InvTxfm2dParam(ADST_ADST, TX_4X4, 2, 0.002),
+  VP10InvTxfm2dParam(DCT_DCT, TX_8X8, 2, 0.02),
+  VP10InvTxfm2dParam(ADST_DCT, TX_8X8, 2, 0.02),
+  VP10InvTxfm2dParam(DCT_ADST, TX_8X8, 2, 0.02),
+  VP10InvTxfm2dParam(ADST_ADST, TX_8X8, 2, 0.02),
+  VP10InvTxfm2dParam(DCT_DCT, TX_16X16, 2, 0.04),
+  VP10InvTxfm2dParam(ADST_DCT, TX_16X16, 2, 0.04),
+  VP10InvTxfm2dParam(DCT_ADST, TX_16X16, 2, 0.04),
+  VP10InvTxfm2dParam(ADST_ADST, TX_16X16, 2, 0.04),
+  VP10InvTxfm2dParam(DCT_DCT, TX_32X32, 4, 0.4),
+  VP10InvTxfm2dParam(ADST_DCT, TX_32X32, 4, 0.4),
+  VP10InvTxfm2dParam(DCT_ADST, TX_32X32, 4, 0.4),
+  VP10InvTxfm2dParam(ADST_ADST, TX_32X32, 4, 0.4)
+};
+
+INSTANTIATE_TEST_CASE_P(
+    C, VP10InvTxfm2d,
+    ::testing::ValuesIn(vp10_inv_txfm2d_param));
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+}  // namespace
diff --git a/test/vp10_inv_txfm_test.cc b/test/vp10_inv_txfm_test.cc
index c49081e..e37b906 100644
--- a/test/vp10_inv_txfm_test.cc
+++ b/test/vp10_inv_txfm_test.cc
@@ -28,7 +28,6 @@
 using libvpx_test::ACMRandom;
 
 namespace {
-const double PI = 3.141592653589793238462643383279502884;
 const double kInvSqrt2 = 0.707106781186547524400844362104;
 
 void reference_idct_1d(const double *in, double *out, int size) {
@@ -203,7 +202,7 @@
       // quantization with maximum allowed step sizes
       test_coef_block1[0] = (output_ref_block[0] / 1336) * 1336;
       for (int j = 1; j < last_nonzero_; ++j)
-        test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]]
+        test_coef_block1[get_scan(tx_size_, DCT_DCT, 0)->scan[j]]
                          = (output_ref_block[j] / 1828) * 1828;
     }
 
@@ -265,7 +264,7 @@
         max_energy_leftover = 0;
         coef = 0;
       }
-      test_coef_block1[vp10_default_scan_orders[tx_size_].scan[j]] = coef;
+      test_coef_block1[get_scan(tx_size_, DCT_DCT, 0)->scan[j]] = coef;
     }
 
     memcpy(test_coef_block2, test_coef_block1,
diff --git a/test/vp10_txfm_test.cc b/test/vp10_txfm_test.cc
new file mode 100644
index 0000000..6b36126
--- /dev/null
+++ b/test/vp10_txfm_test.cc
@@ -0,0 +1,166 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include "test/vp10_txfm_test.h"
+
+namespace libvpx_test {
+
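+// TX_SIZE enumerates square transform sizes in doubling order starting at
+// TX_4X4 = 0, so the 1d size is 4 << tx_size.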
+int get_txfm1d_size(TX_SIZE tx_size) {
+  return 1 << (tx_size + 2);
+}
+
+void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM* type0,
+                     TYPE_TXFM* type1) {
+  switch (txfm2d_type) {
+    case DCT_DCT:
+      *type0 = TYPE_DCT;
+      *type1 = TYPE_DCT;
+      break;
+    case ADST_DCT:
+      *type0 = TYPE_ADST;
+      *type1 = TYPE_DCT;
+      break;
+    case DCT_ADST:
+      *type0 = TYPE_DCT;
+      *type1 = TYPE_ADST;
+      break;
+    case ADST_ADST:
+      *type0 = TYPE_ADST;
+      *type1 = TYPE_ADST;
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      *type0 = TYPE_ADST;
+      *type1 = TYPE_DCT;
+      break;
+    case DCT_FLIPADST:
+      *type0 = TYPE_DCT;
+      *type1 = TYPE_ADST;
+      break;
+    case FLIPADST_FLIPADST:
+      *type0 = TYPE_ADST;
+      *type1 = TYPE_ADST;
+      break;
+    case ADST_FLIPADST:
+      *type0 = TYPE_ADST;
+      *type1 = TYPE_ADST;
+      break;
+    case FLIPADST_ADST:
+      *type0 = TYPE_ADST;
+      *type1 = TYPE_ADST;
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      *type0 = TYPE_DCT;
+      *type1 = TYPE_DCT;
+      assert(0);
+      break;
+  }
+}
+
+const double invSqrt2 = 1 / sqrt(2.0);
+
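+// Reference 1d DCT-II (no global normalization; only the DC term carries a
+// 1/sqrt(2) weight):
+//   out[k] = w(k) * sum_n in[n] * cos(PI * (2n + 1) * k / (2 * size)),
+//   with w(0) = 1/sqrt(2) and w(k > 0) = 1.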
+void reference_dct_1d(const double* in, double* out, int size) {
+  for (int k = 0; k < size; ++k) {
+    out[k] = 0;
+    for (int n = 0; n < size; ++n) {
+      out[k] += in[n] * cos(M_PI * (2 * n + 1) * k / (2 * size));
+    }
+    if (k == 0) out[k] = out[k] * invSqrt2;
+  }
+}
+
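+// Reference 1d ADST:
+//   out[k] = sum_n in[n] * sin(PI * (2n + 1) * (2k + 1) / (4 * size))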
+void reference_adst_1d(const double* in, double* out, int size) {
+  for (int k = 0; k < size; ++k) {
+    out[k] = 0;
+    for (int n = 0; n < size; ++n) {
+      out[k] += in[n] * sin(M_PI * (2 * n + 1) * (2 * k + 1) / (4 * size));
+    }
+  }
+}
+
+void reference_hybrid_1d(double* in, double* out, int size, int type) {
+  if (type == TYPE_DCT)
+    reference_dct_1d(in, out, size);
+  else
+    reference_adst_1d(in, out, size);
+}
+
+void reference_hybrid_2d(double* in, double* out, int size,
+                         int type0, int type1) {
+  double* tempOut = new double[size * size];
+
+  // transpose: in -> tempOut
+  for (int r = 0; r < size; r++) {
+    for (int c = 0; c < size; c++) {
+      tempOut[r * size + c] = in[c * size + r];
+    }
+  }
+
+  // apply the type0 1d transform to each row of tempOut,
+  // i.e. to each column of the original input: tempOut -> out
+  for (int r = 0; r < size; r++) {
+    reference_hybrid_1d(tempOut + r * size, out + r * size, size, type0);
+  }
+
+  // transpose back: out -> tempOut
+  for (int r = 0; r < size; r++) {
+    for (int c = 0; c < size; c++) {
+      tempOut[r * size + c] = out[c * size + r];
+    }
+  }
+
+  // apply the type1 1d transform to each row: tempOut -> out
+  for (int r = 0; r < size; r++) {
+    reference_hybrid_1d(tempOut + r * size, out + r * size, size, type1);
+  }
+  delete[] tempOut;
+}
+
+template<typename Type>
+void fliplr(Type *dest, int stride, int length) {
+  int i, j;
+  for (i = 0; i < length; ++i) {
+    for (j = 0; j < length / 2; ++j) {
+      const Type tmp = dest[i * stride + j];
+      dest[i * stride + j] = dest[i * stride + length - 1 - j];
+      dest[i * stride + length - 1 - j] = tmp;
+    }
+  }
+}
+
+template<typename Type>
+void flipud(Type *dest, int stride, int length) {
+  int i, j;
+  for (j = 0; j < length; ++j) {
+    for (i = 0; i < length / 2; ++i) {
+      const Type tmp = dest[i * stride + j];
+      dest[i * stride + j] = dest[(length - 1 - i) * stride + j];
+      dest[(length - 1 - i) * stride + j] = tmp;
+    }
+  }
+}
+
+template<typename Type>
+void fliplrud(Type *dest, int stride, int length) {
+  int i, j;
+  for (i = 0; i < length / 2; ++i) {
+    for (j = 0; j < length; ++j) {
+      const Type tmp = dest[i * stride + j];
+      dest[i * stride + j] = dest[(length - 1 - i) * stride + length - 1 - j];
+      dest[(length - 1 - i) * stride + length - 1 - j] = tmp;
+    }
+  }
+}
+
+template void fliplr<double>(double *dest, int stride, int length);
+template void flipud<double>(double *dest, int stride, int length);
+template void fliplrud<double>(double *dest, int stride, int length);
+
+}  // namespace libvpx_test
diff --git a/test/vp10_txfm_test.h b/test/vp10_txfm_test.h
new file mode 100644
index 0000000..fb9e12e
--- /dev/null
+++ b/test/vp10_txfm_test.h
@@ -0,0 +1,90 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_TXFM_TEST_H_
+#define VP10_TXFM_TEST_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#ifdef _MSC_VER
+#define _USE_MATH_DEFINES
+#endif
+#include <math.h>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/acm_random.h"
+#include "vp10/common/enums.h"
+#include "vp10/common/vp10_txfm.h"
+#include "./vp10_rtcd.h"
+
+namespace libvpx_test {
+typedef enum {
+  TYPE_DCT = 0,
+  TYPE_ADST,
+  TYPE_IDCT,
+  TYPE_IADST,
+  TYPE_LAST
+} TYPE_TXFM;
+
+int get_txfm1d_size(TX_SIZE tx_size);
+
+void get_txfm1d_type(TX_TYPE txfm2d_type, TYPE_TXFM* type0,
+                     TYPE_TXFM* type1);
+
+void reference_dct_1d(const double* in, double* out, int size);
+
+void reference_adst_1d(const double* in, double* out, int size);
+
+void reference_hybrid_1d(double* in, double* out, int size, int type);
+
+void reference_hybrid_2d(double* in, double* out, int size,
+                         int type0, int type1);
+template <typename Type1, typename Type2>
+static double compute_avg_abs_error(const Type1* a, const Type2* b,
+                                    const int size) {
+  double error = 0;
+  for (int i = 0; i < size; i++) {
+    error += fabs(static_cast<double>(a[i]) - static_cast<double>(b[i]));
+  }
+  error = error / size;
+  return error;
+}
+
+template<typename Type>
+void fliplr(Type *dest, int stride, int length);
+
+template<typename Type>
+void flipud(Type *dest, int stride, int length);
+
+template<typename Type>
+void fliplrud(Type *dest, int stride, int length);
+
+typedef void (*TxfmFunc)(const int32_t* in, int32_t* out, const int8_t* cos_bit,
+                         const int8_t* range_bit);
+
+typedef void (*Fwd_Txfm2d_Func)(const int16_t*, int32_t*, int, int, int);
+typedef void (*Inv_Txfm2d_Func)(const int32_t*, uint16_t*, int, int, int);
+
+static const int bd = 10;
+static const int input_base = (1 << bd);
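+// The float-reference transform tests draw their inputs from
+// [0, input_base) at this fixed 10-bit depth.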
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static const Fwd_Txfm2d_Func fwd_txfm_func_ls[TX_SIZES] = {
+    vp10_fwd_txfm2d_4x4_c, vp10_fwd_txfm2d_8x8_c, vp10_fwd_txfm2d_16x16_c,
+    vp10_fwd_txfm2d_32x32_c};
+
+static const Inv_Txfm2d_Func inv_txfm_func_ls[TX_SIZES] = {
+    vp10_inv_txfm2d_add_4x4_c, vp10_inv_txfm2d_add_8x8_c,
+    vp10_inv_txfm2d_add_16x16_c, vp10_inv_txfm2d_add_32x32_c};
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+}  // namespace libvpx_test
+#endif  // VP10_TXFM_TEST_H_
diff --git a/test/vp10_wedge_utils_test.cc b/test/vp10_wedge_utils_test.cc
new file mode 100644
index 0000000..9fa4849
--- /dev/null
+++ b/test/vp10_wedge_utils_test.cc
@@ -0,0 +1,409 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "./vpx_config.h"
+
+#include "./vpx_dsp_rtcd.h"
+#include "./vp10_rtcd.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include "vp10/common/enums.h"
+
+#include "test/acm_random.h"
+#include "test/function_equivalence_test.h"
+#include "test/register_state_check.h"
+
+#define WEDGE_WEIGHT_BITS 6
+#define MAX_MASK_VALUE  (1 << (WEDGE_WEIGHT_BITS))
+
+using libvpx_test::ACMRandom;
+using libvpx_test::FunctionEquivalenceTest;
+
+namespace {
+
+static const int16_t kInt13Max = (1 << 12) - 1;
+
+//////////////////////////////////////////////////////////////////////////////
+// vp10_wedge_sse_from_residuals - functionality
+//////////////////////////////////////////////////////////////////////////////
+
+class WedgeUtilsSSEFuncTest : public testing::Test {
+ protected:
+  WedgeUtilsSSEFuncTest() : rng_(ACMRandom::DeterministicSeed()) {}
+
+  static const int kIterations = 1000;
+
+  ACMRandom rng_;
+};
+
+static void equiv_blend_residuals(int16_t *r,
+                                  const int16_t *r0,
+                                  const int16_t *r1,
+                                  const uint8_t *m,
+                                  int N) {
+  for (int i = 0 ; i < N ; i++) {
+    const int32_t m0 = m[i];
+    const int32_t m1 = MAX_MASK_VALUE - m0;
+    const int16_t R = m0 * r0[i] + m1 * r1[i];
+    // Note that this rounding is designed to match the result
+    // you would get when actually blending the 2 predictors and computing
+    // the residuals.
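+    // Sketch, assuming the a64 blend computes
+    // p = (m0 * p0 + m1 * p1 + 32) >> 6: with r0 = s - p0 and r1 = s - p1,
+    // R = 64 * s - (m0 * p0 + m1 * p1), so s - p = (R + 31) >> 6, which is
+    // exactly ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS).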
+    r[i] = ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS);
+  }
+}
+
+static uint64_t equiv_sse_from_residuals(const int16_t *r0,
+                                         const int16_t *r1,
+                                         const uint8_t *m,
+                                         int N) {
+  uint64_t acc = 0;
+  for (int i = 0 ; i < N ; i++) {
+    const int32_t m0 = m[i];
+    const int32_t m1 = MAX_MASK_VALUE - m0;
+    const int16_t R = m0 * r0[i] + m1 * r1[i];
+    const int32_t r = ROUND_POWER_OF_TWO(R - 1, WEDGE_WEIGHT_BITS);
+    acc += r * r;
+  }
+  return acc;
+}
+
+TEST_F(WedgeUtilsSSEFuncTest, ResidualBlendingEquiv) {
+  DECLARE_ALIGNED(32, uint8_t, s[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint8_t, p0[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint8_t, p1[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint8_t, p[MAX_SB_SQUARE]);
+
+  DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, r_ref[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, r_tst[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
+      s[i] = rng_.Rand8();
+      m[i] = rng_(MAX_MASK_VALUE + 1);
+    }
+
+    const int w = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 3) + 3);
+    const int h = 1 << (rng_(MAX_SB_SIZE_LOG2 + 1 - 3) + 3);
+    const int N = w * h;
+
+    for (int j = 0 ; j < N ; j++) {
+      p0[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX);
+      p1[j] = clamp(s[j] + rng_(33) - 16, 0, UINT8_MAX);
+    }
+
+    vpx_blend_a64_mask(p, w, p0, w, p1, w, m, w, h, w, 0, 0);
+
+    vpx_subtract_block(h, w, r0, w, s, w, p0, w);
+    vpx_subtract_block(h, w, r1, w, s, w, p1, w);
+
+    vpx_subtract_block(h, w, r_ref, w, s, w, p, w);
+    equiv_blend_residuals(r_tst, r0, r1, m, N);
+
+    for (int i = 0 ; i < N ; ++i)
+      ASSERT_EQ(r_ref[i], r_tst[i]);
+
+    uint64_t ref_sse = vpx_sum_squares_i16(r_ref, N);
+    uint64_t tst_sse = equiv_sse_from_residuals(r0, r1, m, N);
+
+    ASSERT_EQ(ref_sse, tst_sse);
+  }
+}
+
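+// Reference SSE of the wedge-blended residuals: each mask-weighted residual
+// is squared at full precision and the accumulated sum is normalized once
+// at the end.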
+static uint64_t sse_from_residuals(const int16_t *r0,
+                                   const int16_t *r1,
+                                   const uint8_t *m,
+                                   int N) {
+  uint64_t acc = 0;
+  for (int i = 0 ; i < N ; i++) {
+    const int32_t m0 = m[i];
+    const int32_t m1 = MAX_MASK_VALUE - m0;
+    const int32_t r = m0 * r0[i] + m1 * r1[i];
+    acc += r * r;
+  }
+  return ROUND_POWER_OF_TWO(acc, 2 * WEDGE_WEIGHT_BITS);
+}
+
+TEST_F(WedgeUtilsSSEFuncTest, ResidualBlendingMethod) {
+  DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
+      r1[i] = rng_(2 * INT8_MAX - 2 * INT8_MIN + 1) + 2 * INT8_MIN;
+      d[i] = rng_(2 * INT8_MAX - 2 * INT8_MIN + 1) + 2 * INT8_MIN;
+      m[i] = rng_(MAX_MASK_VALUE + 1);
+    }
+
+    const int N = 64 * (rng_(MAX_SB_SQUARE/64) + 1);
+
+    for (int i = 0 ; i < N ; i++)
+      r0[i] = r1[i] + d[i];
+
+    const uint64_t ref_res = sse_from_residuals(r0, r1, m, N);
+    const uint64_t tst_res = vp10_wedge_sse_from_residuals(r1, d, m, N);
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// vp10_wedge_sse_from_residuals - optimizations
+//////////////////////////////////////////////////////////////////////////////
+
+typedef uint64_t (*FSSE)(const int16_t *r1,
+                         const int16_t *d,
+                         const uint8_t *m,
+                         int N);
+typedef libvpx_test::FuncParam<FSSE> TestFuncsFSSE;
+
+class WedgeUtilsSSEOptTest : public FunctionEquivalenceTest<FSSE> {
+ protected:
+  static const int kIterations = 10000;
+};
+
+TEST_P(WedgeUtilsSSEOptTest, RandomValues) {
+  DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
+      r1[i] = rng_(2 * kInt13Max + 1) - kInt13Max;
+      d[i] = rng_(2 * kInt13Max + 1) - kInt13Max;
+      m[i] = rng_(MAX_MASK_VALUE + 1);
+    }
+
+    const int N = 64 * (rng_(MAX_SB_SQUARE/64) + 1);
+
+    const uint64_t ref_res = params_.ref_func(r1, d, m, N);
+    uint64_t tst_res;
+    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+TEST_P(WedgeUtilsSSEOptTest, ExtremeValues) {
+  DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, d[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    if (rng_(2)) {
+      for (int i = 0 ; i < MAX_SB_SQUARE ; ++i)
+        r1[i] = kInt13Max;
+    } else {
+      for (int i = 0 ; i < MAX_SB_SQUARE ; ++i)
+        r1[i] = -kInt13Max;
+    }
+
+    if (rng_(2)) {
+      for (int i = 0 ; i < MAX_SB_SQUARE ; ++i)
+        d[i] = kInt13Max;
+    } else {
+      for (int i = 0 ; i < MAX_SB_SQUARE ; ++i)
+        d[i] = -kInt13Max;
+    }
+
+    for (int i = 0 ; i < MAX_SB_SQUARE ; ++i)
+      m[i] = MAX_MASK_VALUE;
+
+    const int N = 64 * (rng_(MAX_SB_SQUARE/64) + 1);
+
+    const uint64_t ref_res = params_.ref_func(r1, d, m, N);
+    uint64_t tst_res;
+    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(r1, d, m, N));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+#if HAVE_SSE2
+INSTANTIATE_TEST_CASE_P(
+    SSE2, WedgeUtilsSSEOptTest,
+    ::testing::Values(
+        TestFuncsFSSE(vp10_wedge_sse_from_residuals_c,
+                      vp10_wedge_sse_from_residuals_sse2)));
+
+#endif  // HAVE_SSE2
+
+//////////////////////////////////////////////////////////////////////////////
+// vp10_wedge_sign_from_residuals
+//////////////////////////////////////////////////////////////////////////////
+
+typedef int (*FSign)(const int16_t *ds,
+                     const uint8_t *m,
+                     int N,
+                     int64_t limit);
+typedef libvpx_test::FuncParam<FSign> TestFuncsFSign;
+
+class WedgeUtilsSignOptTest : public FunctionEquivalenceTest<FSign> {
+ protected:
+  static const int kIterations = 10000;
+  static const int kMaxSize = 8196;  // Size limited by SIMD implementation.
+};
+
+TEST_P(WedgeUtilsSignOptTest, RandomValues) {
+  DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
+      r0[i] = rng_(2 * kInt13Max + 1) - kInt13Max;
+      r1[i] = rng_(2 * kInt13Max + 1) - kInt13Max;
+      m[i] = rng_(MAX_MASK_VALUE + 1);
+    }
+
+    const int maxN = VPXMIN(kMaxSize, MAX_SB_SQUARE);
+    const int N = 64 * (rng_(maxN/64 - 1) + 1);
+
+    int64_t limit;
+    limit = (int64_t)vpx_sum_squares_i16(r0, N);
+    limit -= (int64_t)vpx_sum_squares_i16(r1, N);
+    limit *= (1 << WEDGE_WEIGHT_BITS) / 2;
+
+    for (int i = 0 ; i < N ; i++)
+      ds[i] = clamp(r0[i]*r0[i] - r1[i]*r1[i], INT16_MIN, INT16_MAX);
+
+    const int ref_res = params_.ref_func(ds, m, N, limit);
+    int tst_res;
+    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+TEST_P(WedgeUtilsSignOptTest, ExtremeValues) {
+  DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, uint8_t, m[MAX_SB_SQUARE]);
+
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    switch (rng_(4)) {
+    case 0:
+      for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
+        r0[i] = 0;
+        r1[i] = kInt13Max;
+      }
+      break;
+    case 1:
+      for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
+        r0[i] = kInt13Max;
+        r1[i] = 0;
+      }
+      break;
+    case 2:
+      for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
+        r0[i] = 0;
+        r1[i] = -kInt13Max;
+      }
+      break;
+    default:
+      for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
+        r0[i] = -kInt13Max;
+        r1[i] = 0;
+      }
+      break;
+    }
+
+    for (int i = 0 ; i < MAX_SB_SQUARE ; ++i)
+      m[i] = MAX_MASK_VALUE;
+
+    const int maxN = VPXMIN(kMaxSize, MAX_SB_SQUARE);
+    const int N = 64 * (rng_(maxN/64 - 1) + 1);
+
+    int64_t limit;
+    limit = (int64_t)vpx_sum_squares_i16(r0, N);
+    limit -= (int64_t)vpx_sum_squares_i16(r1, N);
+    limit *= (1 << WEDGE_WEIGHT_BITS) / 2;
+
+    for (int i = 0 ; i < N ; i++)
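+  // The inverse transforms under test add into the destination, so both
+  // output buffers start from the same random pixels.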
+      ds[i] = clamp(r0[i]*r0[i] - r1[i]*r1[i], INT16_MIN, INT16_MAX);
+
+    const int ref_res = params_.ref_func(ds, m, N, limit);
+    int tst_res;
+    ASM_REGISTER_STATE_CHECK(tst_res = params_.tst_func(ds, m, N, limit));
+
+    ASSERT_EQ(ref_res, tst_res);
+  }
+}
+
+#if HAVE_SSE2
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, WedgeUtilsSignOptTest,
+    ::testing::Values(
+      TestFuncsFSign(vp10_wedge_sign_from_residuals_c,
+                     vp10_wedge_sign_from_residuals_sse2)));
+
+#endif  // HAVE_SSE2
+
+//////////////////////////////////////////////////////////////////////////////
+// vp10_wedge_compute_delta_squares
+//////////////////////////////////////////////////////////////////////////////
+
+typedef void (*FDS)(int16_t *d,
+                    const int16_t *a,
+                    const int16_t *b,
+                    int N);
+typedef libvpx_test::FuncParam<FDS> TestFuncsFDS;
+
+class WedgeUtilsDeltaSquaresOptTest : public FunctionEquivalenceTest<FDS> {
+ protected:
+  static const int kIterations = 10000;
+};
+
+TEST_P(WedgeUtilsDeltaSquaresOptTest, RandomValues) {
+  DECLARE_ALIGNED(32, int16_t, a[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, b[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, d_ref[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, d_tst[MAX_SB_SQUARE]);
+
+  for (int iter = 0 ; iter < kIterations && !HasFatalFailure(); ++iter) {
+    for (int i = 0 ; i < MAX_SB_SQUARE ; ++i) {
+      a[i] = rng_.Rand16();
+      b[i] = rng_(2 * INT16_MAX + 1) - INT16_MAX;
+    }
+
+    const int N = 64 * (rng_(MAX_SB_SQUARE/64) + 1);
+
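+    // memset uses only the low byte of its value, so INT16_MAX fills every
+    // byte with 0xFF; both arrays start as -1 sentinels, and the full-array
+    // comparison below also verifies the entries beyond N.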
+    memset(&d_ref, INT16_MAX, sizeof(d_ref));
+    memset(&d_tst, INT16_MAX, sizeof(d_tst));
+
+    params_.ref_func(d_ref, a, b, N);
+    ASM_REGISTER_STATE_CHECK(params_.tst_func(d_tst, a, b, N));
+
+    for (int i = 0 ; i < MAX_SB_SQUARE ; ++i)
+      ASSERT_EQ(d_ref[i], d_tst[i]);
+  }
+}
+
+#if HAVE_SSE2
+
+INSTANTIATE_TEST_CASE_P(
+    SSE2, WedgeUtilsDeltaSquaresOptTest,
+    ::testing::Values(
+      TestFuncsFDS(vp10_wedge_compute_delta_squares_c,
+                   vp10_wedge_compute_delta_squares_sse2)));
+
+#endif  // HAVE_SSE2
+
+}  // namespace
diff --git a/test/vp9_arf_freq_test.cc b/test/vp9_arf_freq_test.cc
index 89200d4..6cf15de 100644
--- a/test/vp9_arf_freq_test.cc
+++ b/test/vp9_arf_freq_test.cc
@@ -78,19 +78,19 @@
     return !strcmp(dot, ".y4m");
 }
 
-class ArfFreqTest
+class ArfFreqTestLarge
     : public ::libvpx_test::EncoderTest,
       public ::libvpx_test::CodecTestWith3Params<TestVideoParam, \
                                                  TestEncodeParam, int> {
  protected:
-  ArfFreqTest()
+  ArfFreqTestLarge()
       : EncoderTest(GET_PARAM(0)),
         test_video_param_(GET_PARAM(1)),
         test_encode_param_(GET_PARAM(2)),
         min_arf_requested_(GET_PARAM(3)) {
   }
 
-  virtual ~ArfFreqTest() {}
+  virtual ~ArfFreqTestLarge() {}
 
   virtual void SetUp() {
     InitializeConfig();
@@ -190,7 +190,7 @@
   int run_of_visible_frames_;
 };
 
-TEST_P(ArfFreqTest, MinArfFreqTest) {
+TEST_P(ArfFreqTestLarge, MinArfFreqTest) {
   cfg_.rc_target_bitrate = kBitrate;
   cfg_.g_error_resilient = 0;
   cfg_.g_profile = test_video_param_.profile;
@@ -225,28 +225,32 @@
 }
 
 VP9_INSTANTIATE_TEST_CASE(
-    ArfFreqTest,
+    ArfFreqTestLarge,
     ::testing::ValuesIn(kTestVectors),
     ::testing::ValuesIn(kEncodeVectors),
     ::testing::ValuesIn(kMinArfVectors));
 
-#if CONFIG_VP9_HIGHBITDEPTH
-# if CONFIG_VP10_ENCODER
+#if CONFIG_VP9_HIGHBITDEPTH || CONFIG_BIDIR_PRED
+#if CONFIG_VP10_ENCODER
 // TODO(angiebird): 25-29 fail in high bitdepth mode.
+// TODO(zoeliu): This ArfFreqTestLarge does not work with BWDREF_FRAME, as
+// BWDREF_FRAME is also a non-show frame, and the minimum run between two
+// consecutive BWDREF_FRAMEs may vary between 1 and any arbitrary positive
+// number, as long as it does not exceed the gf_group interval.
 INSTANTIATE_TEST_CASE_P(
-    DISABLED_VP10, ArfFreqTest,
+    DISABLED_VP10, ArfFreqTestLarge,
     ::testing::Combine(
         ::testing::Values(static_cast<const libvpx_test::CodecFactory *>(
             &libvpx_test::kVP10)),
         ::testing::ValuesIn(kTestVectors),
         ::testing::ValuesIn(kEncodeVectors),
         ::testing::ValuesIn(kMinArfVectors)));
-# endif  // CONFIG_VP10_ENCODER
+#endif  // CONFIG_VP10_ENCODER
 #else
 VP10_INSTANTIATE_TEST_CASE(
-    ArfFreqTest,
+    ArfFreqTestLarge,
     ::testing::ValuesIn(kTestVectors),
     ::testing::ValuesIn(kEncodeVectors),
     ::testing::ValuesIn(kMinArfVectors));
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_VP9_HIGHBITDEPTH || CONFIG_BIDIR_PRED
 }  // namespace
diff --git a/test/vp9_ethread_test.cc b/test/vp9_ethread_test.cc
index f0b8cef..aa84a9e 100644
--- a/test/vp9_ethread_test.cc
+++ b/test/vp9_ethread_test.cc
@@ -25,13 +25,27 @@
   VPxEncoderThreadTest()
       : EncoderTest(GET_PARAM(0)),
         encoder_initialized_(false),
-        tiles_(2),
         encoding_mode_(GET_PARAM(1)),
         set_cpu_used_(GET_PARAM(2)) {
     init_flags_ = VPX_CODEC_USE_PSNR;
-    md5_.clear();
+    vpx_codec_dec_cfg_t cfg = vpx_codec_dec_cfg_t();
+    cfg.w = 1280;
+    cfg.h = 720;
+    decoder_ = codec_->CreateDecoder(cfg, 0);
+#if CONFIG_VP10 && CONFIG_EXT_TILE
+    if (decoder_->IsVP10()) {
+      decoder_->Control(VP10_SET_DECODE_TILE_ROW, -1);
+      decoder_->Control(VP10_SET_DECODE_TILE_COL, -1);
+    }
+#endif
+
+    size_enc_.clear();
+    md5_dec_.clear();
+    md5_enc_.clear();
   }
-  virtual ~VPxEncoderThreadTest() {}
+  virtual ~VPxEncoderThreadTest() {
+    delete decoder_;
+  }
 
   virtual void SetUp() {
     InitializeConfig();
@@ -58,8 +72,20 @@
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource * /*video*/,
                                   ::libvpx_test::Encoder *encoder) {
     if (!encoder_initialized_) {
-      // Encode 4 column tiles.
-      encoder->Control(VP9E_SET_TILE_COLUMNS, tiles_);
+#if CONFIG_VP10 && CONFIG_EXT_TILE
+      encoder->Control(VP9E_SET_TILE_COLUMNS, 1);
+      if (codec_ == &libvpx_test::kVP10) {
+        // TODO(geza): Start using multiple tile rows when the multi-threaded
+        // encoder can handle them
+        encoder->Control(VP9E_SET_TILE_ROWS, 32);
+      } else {
+        encoder->Control(VP9E_SET_TILE_ROWS, 0);
+      }
+#else
+      // Encode 4 tile columns (VP9E_SET_TILE_COLUMNS takes log2, so 2 -> 4).
+      encoder->Control(VP9E_SET_TILE_COLUMNS, 2);
+      encoder->Control(VP9E_SET_TILE_ROWS, 0);
+#endif  // CONFIG_VP10 && CONFIG_EXT_TILE
       encoder->Control(VP8E_SET_CPUUSED, set_cpu_used_);
       if (encoding_mode_ != ::libvpx_test::kRealTime) {
         encoder->Control(VP8E_SET_ENABLEAUTOALTREF, 1);
@@ -74,53 +100,83 @@
     }
   }
 
-  virtual void DecompressedFrameHook(const vpx_image_t &img,
-                                     vpx_codec_pts_t /*pts*/) {
-    ::libvpx_test::MD5 md5_res;
-    md5_res.Add(&img);
-    md5_.push_back(md5_res.Get());
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    size_enc_.push_back(pkt->data.frame.sz);
+
+    ::libvpx_test::MD5 md5_enc;
+    md5_enc.Add(reinterpret_cast<uint8_t*>(pkt->data.frame.buf),
+                pkt->data.frame.sz);
+    md5_enc_.push_back(md5_enc.Get());
+
+    const vpx_codec_err_t res = decoder_->DecodeFrame(
+        reinterpret_cast<uint8_t*>(pkt->data.frame.buf), pkt->data.frame.sz);
+    if (res != VPX_CODEC_OK) {
+      abort_ = true;
+      ASSERT_EQ(VPX_CODEC_OK, res);
+    }
+    const vpx_image_t *img = decoder_->GetDxData().Next();
+
+    if (img) {
+      ::libvpx_test::MD5 md5_res;
+      md5_res.Add(img);
+      md5_dec_.push_back(md5_res.Get());
+    }
   }
 
-  virtual bool HandleDecodeResult(const vpx_codec_err_t res,
-                                  const libvpx_test::VideoSource& /*video*/,
-                                  libvpx_test::Decoder * /*decoder*/) {
-    if (res != VPX_CODEC_OK) {
-      EXPECT_EQ(VPX_CODEC_OK, res);
-      return false;
-    }
+  void DoTest() {
+    ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 15, 18);
+    cfg_.rc_target_bitrate = 1000;
 
-    return true;
+    // Encode using single thread.
+    cfg_.g_threads = 1;
+    init_flags_ = VPX_CODEC_USE_PSNR;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    std::vector<size_t> single_thr_size_enc;
+    std::vector<std::string> single_thr_md5_enc;
+    std::vector<std::string> single_thr_md5_dec;
+    single_thr_size_enc = size_enc_;
+    single_thr_md5_enc = md5_enc_;
+    single_thr_md5_dec = md5_dec_;
+    size_enc_.clear();
+    md5_enc_.clear();
+    md5_dec_.clear();
+
+    // Encode using multiple threads.
+    cfg_.g_threads = 4;
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    std::vector<size_t> multi_thr_size_enc;
+    std::vector<std::string> multi_thr_md5_enc;
+    std::vector<std::string> multi_thr_md5_dec;
+    multi_thr_size_enc = size_enc_;
+    multi_thr_md5_enc = md5_enc_;
+    multi_thr_md5_dec = md5_dec_;
+    size_enc_.clear();
+    md5_enc_.clear();
+    md5_dec_.clear();
+
+    // Check that the vectors are equal.
+    ASSERT_EQ(single_thr_size_enc, multi_thr_size_enc);
+    ASSERT_EQ(single_thr_md5_enc, multi_thr_md5_enc);
+    ASSERT_EQ(single_thr_md5_dec, multi_thr_md5_dec);
   }
 
   bool encoder_initialized_;
-  int tiles_;
   ::libvpx_test::TestMode encoding_mode_;
   int set_cpu_used_;
-  std::vector<std::string> md5_;
+  ::libvpx_test::Decoder *decoder_;
+  std::vector<size_t> size_enc_;
+  std::vector<std::string> md5_enc_;
+  std::vector<std::string> md5_dec_;
 };
 
 TEST_P(VPxEncoderThreadTest, EncoderResultTest) {
-  std::vector<std::string> single_thr_md5, multi_thr_md5;
+  DoTest();
+}
 
-  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 15, 20);
+class VPxEncoderThreadTestLarge : public VPxEncoderThreadTest {};
 
-  cfg_.rc_target_bitrate = 1000;
-
-  // Encode using single thread.
-  cfg_.g_threads = 1;
-  init_flags_ = VPX_CODEC_USE_PSNR;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  single_thr_md5 = md5_;
-  md5_.clear();
-
-  // Encode using multiple threads.
-  cfg_.g_threads = 4;
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  multi_thr_md5 = md5_;
-  md5_.clear();
-
-  // Compare to check if two vectors are equal.
-  ASSERT_EQ(single_thr_md5, multi_thr_md5);
+TEST_P(VPxEncoderThreadTestLarge, EncoderResultTest) {
+  DoTest();
 }
 
 VP9_INSTANTIATE_TEST_CASE(
@@ -132,5 +188,10 @@
 VP10_INSTANTIATE_TEST_CASE(
     VPxEncoderThreadTest,
     ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood),
+    ::testing::Range(3, 9));
+
+VP10_INSTANTIATE_TEST_CASE(
+    VPxEncoderThreadTestLarge,
+    ::testing::Values(::libvpx_test::kTwoPassGood, ::libvpx_test::kOnePassGood),
     ::testing::Range(1, 3));
 }  // namespace
diff --git a/test/vp9_subtract_test.cc b/test/vp9_subtract_test.cc
deleted file mode 100644
index 3cad4d7..0000000
--- a/test/vp9_subtract_test.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- *  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include "third_party/googletest/src/include/gtest/gtest.h"
-
-#include "./vp9_rtcd.h"
-#include "./vpx_config.h"
-#include "./vpx_dsp_rtcd.h"
-#include "test/acm_random.h"
-#include "test/clear_system_state.h"
-#include "test/register_state_check.h"
-#include "vp9/common/vp9_blockd.h"
-#include "vpx_mem/vpx_mem.h"
-
-typedef void (*SubtractFunc)(int rows, int cols,
-                             int16_t *diff_ptr, ptrdiff_t diff_stride,
-                             const uint8_t *src_ptr, ptrdiff_t src_stride,
-                             const uint8_t *pred_ptr, ptrdiff_t pred_stride);
-
-namespace vp9 {
-
-class VP9SubtractBlockTest : public ::testing::TestWithParam<SubtractFunc> {
- public:
-  virtual void TearDown() {
-    libvpx_test::ClearSystemState();
-  }
-};
-
-using libvpx_test::ACMRandom;
-
-TEST_P(VP9SubtractBlockTest, SimpleSubtract) {
-  ACMRandom rnd(ACMRandom::DeterministicSeed());
-
-  // FIXME(rbultje) split in its own file
-  for (BLOCK_SIZE bsize = BLOCK_4X4; bsize < BLOCK_SIZES;
-       bsize = static_cast<BLOCK_SIZE>(static_cast<int>(bsize) + 1)) {
-    const int block_width = 4 * num_4x4_blocks_wide_lookup[bsize];
-    const int block_height = 4 * num_4x4_blocks_high_lookup[bsize];
-    int16_t *diff = reinterpret_cast<int16_t *>(
-        vpx_memalign(16, sizeof(*diff) * block_width * block_height * 2));
-    uint8_t *pred = reinterpret_cast<uint8_t *>(
-        vpx_memalign(16, block_width * block_height * 2));
-    uint8_t *src  = reinterpret_cast<uint8_t *>(
-        vpx_memalign(16, block_width * block_height * 2));
-
-    for (int n = 0; n < 100; n++) {
-      for (int r = 0; r < block_height; ++r) {
-        for (int c = 0; c < block_width * 2; ++c) {
-          src[r * block_width * 2 + c] = rnd.Rand8();
-          pred[r * block_width * 2 + c] = rnd.Rand8();
-        }
-      }
-
-      GetParam()(block_height, block_width, diff, block_width,
-                 src, block_width, pred, block_width);
-
-      for (int r = 0; r < block_height; ++r) {
-        for (int c = 0; c < block_width; ++c) {
-          EXPECT_EQ(diff[r * block_width + c],
-                    (src[r * block_width + c] -
-                     pred[r * block_width + c])) << "r = " << r
-                                                 << ", c = " << c
-                                                 << ", bs = " << bsize;
-        }
-      }
-
-      GetParam()(block_height, block_width, diff, block_width * 2,
-                 src, block_width * 2, pred, block_width * 2);
-
-      for (int r = 0; r < block_height; ++r) {
-        for (int c = 0; c < block_width; ++c) {
-          EXPECT_EQ(diff[r * block_width * 2 + c],
-                    (src[r * block_width * 2 + c] -
-                     pred[r * block_width * 2 + c])) << "r = " << r
-                                                     << ", c = " << c
-                                                     << ", bs = " << bsize;
-        }
-      }
-    }
-    vpx_free(diff);
-    vpx_free(pred);
-    vpx_free(src);
-  }
-}
-
-INSTANTIATE_TEST_CASE_P(C, VP9SubtractBlockTest,
-                        ::testing::Values(vpx_subtract_block_c));
-
-#if HAVE_SSE2 && CONFIG_USE_X86INC
-INSTANTIATE_TEST_CASE_P(SSE2, VP9SubtractBlockTest,
-                        ::testing::Values(vpx_subtract_block_sse2));
-#endif
-#if HAVE_NEON
-INSTANTIATE_TEST_CASE_P(NEON, VP9SubtractBlockTest,
-                        ::testing::Values(vpx_subtract_block_neon));
-#endif
-#if HAVE_MSA
-INSTANTIATE_TEST_CASE_P(MSA, VP9SubtractBlockTest,
-                        ::testing::Values(vpx_subtract_block_msa));
-#endif
-
-}  // namespace vp9
diff --git a/vp10/common/alloccommon.c b/vp10/common/alloccommon.c
index 9ca86e5..abdc72b 100644
--- a/vp10/common/alloccommon.c
+++ b/vp10/common/alloccommon.c
@@ -81,6 +81,12 @@
   }
 }
 
+#if CONFIG_LOOP_RESTORATION
+void vp10_free_restoration_buffers(VP10_COMMON *cm) {
+  vpx_free_frame_buffer(&cm->tmp_loop_buf);
+}
+#endif  // CONFIG_LOOP_RESTORATION
+
 void vp10_free_postproc_buffers(VP10_COMMON *cm) {
 #if CONFIG_VP9_POSTPROC
   vpx_free_frame_buffer(&cm->post_proc_buffer);
@@ -91,12 +97,19 @@
 }
 
 void vp10_free_context_buffers(VP10_COMMON *cm) {
+  int i;
   cm->free_mi(cm);
   free_seg_map(cm);
-  vpx_free(cm->above_context);
-  cm->above_context = NULL;
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    vpx_free(cm->above_context[i]);
+    cm->above_context[i] = NULL;
+  }
   vpx_free(cm->above_seg_context);
   cm->above_seg_context = NULL;
+#if CONFIG_VAR_TX
+  vpx_free(cm->above_txfm_context);
+  cm->above_txfm_context = NULL;
+#endif
 }
 
 int vp10_alloc_context_buffers(VP10_COMMON *cm, int width, int height) {
@@ -118,17 +131,33 @@
   }
 
   if (cm->above_context_alloc_cols < cm->mi_cols) {
-    vpx_free(cm->above_context);
-    cm->above_context = (ENTROPY_CONTEXT *)vpx_calloc(
-        2 * mi_cols_aligned_to_sb(cm->mi_cols) * MAX_MB_PLANE,
-        sizeof(*cm->above_context));
-    if (!cm->above_context) goto fail;
+    // TODO(geza.lore): These are bigger than they need to be.
+    // cm->tile_width would be enough but it complicates indexing a
+    // little elsewhere.
+    const int aligned_mi_cols =
+        ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+    int i;
+
+    for (i = 0; i < MAX_MB_PLANE; i++) {
+      vpx_free(cm->above_context[i]);
+      cm->above_context[i] = (ENTROPY_CONTEXT *)vpx_calloc(
+          2 * aligned_mi_cols, sizeof(*cm->above_context[0]));
+      if (!cm->above_context[i]) goto fail;
+    }
 
     vpx_free(cm->above_seg_context);
     cm->above_seg_context = (PARTITION_CONTEXT *)vpx_calloc(
-        mi_cols_aligned_to_sb(cm->mi_cols), sizeof(*cm->above_seg_context));
+        aligned_mi_cols, sizeof(*cm->above_seg_context));
     if (!cm->above_seg_context) goto fail;
-    cm->above_context_alloc_cols = cm->mi_cols;
+
+#if CONFIG_VAR_TX
+    vpx_free(cm->above_txfm_context);
+    cm->above_txfm_context = (TXFM_CONTEXT *)vpx_calloc(
+        aligned_mi_cols, sizeof(*cm->above_txfm_context));
+    if (!cm->above_txfm_context) goto fail;
+#endif
+
+    cm->above_context_alloc_cols = aligned_mi_cols;
   }
 
   return 0;
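
For context: ALIGN_POWER_OF_TWO(value, n) rounds value up to the next
multiple of 2^n, so the context rows above are sized to cover whole
superblocks. A minimal sketch of the arithmetic, assuming the macro's usual
definition elsewhere in the tree:

    /* Assumed definition: round value up to a multiple of 2^n. */
    #define ALIGN_POWER_OF_TWO(value, n) \
      (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

    /* E.g. with MAX_MIB_SIZE_LOG2 == 3 (a 64x64 superblock spans 8 mi units
     * of 8x8 pels), a 53-mi-wide frame allocates above-context storage for
     * ALIGN_POWER_OF_TWO(53, 3) == 56 mi columns. */
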
diff --git a/vp10/common/alloccommon.h b/vp10/common/alloccommon.h
index 5cfe660..f77833b 100644
--- a/vp10/common/alloccommon.h
+++ b/vp10/common/alloccommon.h
@@ -29,6 +29,9 @@
 
 void vp10_free_ref_frame_buffers(struct BufferPool *pool);
 void vp10_free_postproc_buffers(struct VP10Common *cm);
+#if CONFIG_LOOP_RESTORATION
+void vp10_free_restoration_buffers(struct VP10Common *cm);
+#endif  // CONFIG_LOOP_RESTORATION
 
 int vp10_alloc_state_buffers(struct VP10Common *cm, int width, int height);
 void vp10_free_state_buffers(struct VP10Common *cm);
diff --git a/vp10/common/ans.h b/vp10/common/ans.h
new file mode 100644
index 0000000..24d7c09
--- /dev/null
+++ b/vp10/common/ans.h
@@ -0,0 +1,422 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_ANS_H_
+#define VP10_COMMON_ANS_H_
+// An implementation of Asymmetric Numeral Systems
+// http://arxiv.org/abs/1311.2540v2
+
+#include <assert.h>
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/prob.h"
+#include "vpx_ports/mem_ops.h"
+
+#define ANS_DIVIDE_BY_MULTIPLY 1
+#if ANS_DIVIDE_BY_MULTIPLY
+#include "vp10/common/divide.h"
+#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
+  do { \
+    quotient = fastdiv(dividend, divisor); \
+    remainder = dividend - quotient * divisor; \
+  } while (0)
+#define ANS_DIV(dividend, divisor) \
+  fastdiv(dividend, divisor)
+#else
+#define ANS_DIVREM(quotient, remainder, dividend, divisor) \
+  do { \
+    quotient = dividend / divisor; \
+    remainder = dividend % divisor; \
+  } while (0)
+#define ANS_DIV(dividend, divisor) \
+    ((dividend) / (divisor))
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+struct AnsCoder {
+  uint8_t *buf;
+  int buf_offset;
+  uint32_t state;
+};
+
+struct AnsDecoder {
+  const uint8_t *buf;
+  int buf_offset;
+  uint32_t state;
+};
+
+typedef uint8_t AnsP8;
+#define ans_p8_precision 256u
+#define ans_p8_shift 8
+typedef uint16_t AnsP10;
+#define ans_p10_precision 1024u
+
+#define rans_precision ans_p10_precision
+
+#define l_base (ans_p10_precision * 4)  // l_base % precision must be 0
+#define io_base 256
+// Range I = { l_base, l_base + 1, ..., l_base * io_base - 1 }
+
+static INLINE void ans_write_init(struct AnsCoder *const ans,
+                                  uint8_t *const buf) {
+  ans->buf = buf;
+  ans->buf_offset = 0;
+  ans->state = l_base;
+}
+
+static INLINE int ans_write_end(struct AnsCoder *const ans) {
+  uint32_t state;
+  assert(ans->state >= l_base);
+  assert(ans->state < l_base * io_base);
+  state = ans->state - l_base;
+  if (state < (1 << 6)) {
+    ans->buf[ans->buf_offset] = (0x00 << 6) + state;
+    return ans->buf_offset + 1;
+  } else if (state < (1 << 14)) {
+    mem_put_le16(ans->buf + ans->buf_offset, (0x01 << 14) + state);
+    return ans->buf_offset + 2;
+  } else if (state < (1 << 22)) {
+    mem_put_le24(ans->buf + ans->buf_offset, (0x02 << 22) + state);
+    return ans->buf_offset + 3;
+  } else {
+    assert(0 && "State is too large to be serialized");
+    return ans->buf_offset;
+  }
+}
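
// Illustrative sketch of the serialization above: suppose encoding ends with
// ans->state == 4096 + 1000 (l_base == ans_p10_precision * 4 == 4096).
// state - l_base == 1000 fits in 14 bits, so the two-byte form is chosen:
//   mem_put_le16(buf + offset, (0x01 << 14) + 1000)  // bytes E8 43 (LE)
// ans_read_init() later sees buf[offset - 1] >> 6 == 0x01, masks the two
// bytes with 0x3FFF to recover 1000, and restores state = l_base + 1000.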
+
+// rABS with descending spread
+// p or p0 takes the place of l_s from the paper
+// ans_p8_precision is m
+static INLINE void rabs_desc_write(struct AnsCoder *ans, int val, AnsP8 p0) {
+  const AnsP8 p = ans_p8_precision - p0;
+  const unsigned l_s = val ? p : p0;
+  unsigned quot, rem;
+  if (ans->state >= l_base / ans_p8_precision * io_base * l_s) {
+    ans->buf[ans->buf_offset++] = ans->state % io_base;
+    ans->state /= io_base;
+  }
+  ANS_DIVREM(quot, rem, ans->state, l_s);
+  ans->state = quot * ans_p8_precision + rem + (val ? 0 : p);
+}
+
+#define ANS_IMPL1 0
+#define UNPREDICTABLE(x) x
+static INLINE int rabs_desc_read(struct AnsDecoder *ans, AnsP8 p0) {
+  int val;
+#if ANS_IMPL1
+  unsigned l_s;
+#else
+  unsigned quot, rem, x, xn;
+#endif
+  const AnsP8 p = ans_p8_precision - p0;
+  if (ans->state < l_base) {
+    ans->state = ans->state * io_base + ans->buf[--ans->buf_offset];
+  }
+#if ANS_IMPL1
+  val = ans->state % ans_p8_precision < p;
+  l_s = val ? p : p0;
+  ans->state = (ans->state / ans_p8_precision) * l_s +
+               ans->state % ans_p8_precision - (!val * p);
+#else
+  x = ans->state;
+  quot = x / ans_p8_precision;
+  rem = x % ans_p8_precision;
+  xn = quot * p;
+  val = rem < p;
+  if (UNPREDICTABLE(val)) {
+    ans->state = xn + rem;
+  } else {
+    // ans->state = quot * p0 + rem - p;
+    ans->state = x - xn - p;
+  }
+#endif
+  return val;
+}
+
+// rABS with ascending spread
+// p or p0 takes the place of l_s from the paper
+// ans_p8_precision is m
+static INLINE void rabs_asc_write(struct AnsCoder *ans, int val, AnsP8 p0) {
+  const AnsP8 p = ans_p8_precision - p0;
+  const unsigned l_s = val ? p : p0;
+  unsigned quot, rem;
+  if (ans->state >= l_base / ans_p8_precision * io_base * l_s) {
+    ans->buf[ans->buf_offset++] = ans->state % io_base;
+    ans->state /= io_base;
+  }
+  ANS_DIVREM(quot, rem, ans->state, l_s);
+  ans->state = quot * ans_p8_precision + rem + (val ? p0 : 0);
+}
+
+static INLINE int rabs_asc_read(struct AnsDecoder *ans, AnsP8 p0) {
+  int val;
+#if ANS_IMPL1
+  unsigned l_s;
+#else
+  unsigned quot, rem, x, xn;
+#endif
+  const AnsP8 p = ans_p8_precision - p0;
+  if (ans->state < l_base) {
+    ans->state = ans->state * io_base + ans->buf[--ans->buf_offset];
+  }
+#if ANS_IMPL1
+  val = ans->state % ans_p8_precision < p;
+  l_s = val ? p : p0;
+  ans->state = (ans->state / ans_p8_precision) * l_s +
+               ans->state % ans_p8_precision - (!val * p);
+#else
+  x = ans->state;
+  quot = x / ans_p8_precision;
+  rem = x % ans_p8_precision;
+  xn = quot * p;
+  val = rem >= p0;
+  if (UNPREDICTABLE(val)) {
+    ans->state = xn + rem - p0;
+  } else {
+    // ans->state = quot * p0 + rem - p0;
+    ans->state = x - xn;
+  }
+#endif
+  return val;
+}
+
+#define rabs_read rabs_desc_read
+#define rabs_write rabs_desc_write
+
+// uABS with normalization
+static INLINE void uabs_write(struct AnsCoder *ans, int val, AnsP8 p0) {
+  AnsP8 p = ans_p8_precision - p0;
+  const unsigned l_s = val ? p : p0;
+  while (ans->state >= l_base / ans_p8_precision * io_base * l_s) {
+    ans->buf[ans->buf_offset++] = ans->state % io_base;
+    ans->state /= io_base;
+  }
+  if (!val)
+    ans->state = ANS_DIV(ans->state * ans_p8_precision, p0);
+  else
+    ans->state = ANS_DIV((ans->state + 1) * ans_p8_precision + p - 1, p) - 1;
+}
+
+static INLINE int uabs_read(struct AnsDecoder *ans, AnsP8 p0) {
+  AnsP8 p = ans_p8_precision - p0;
+  int s;
+  // unsigned int xp1;
+  unsigned xp, sp;
+  unsigned state = ans->state;
+  while (state < l_base && ans->buf_offset > 0) {
+    state = state * io_base + ans->buf[--ans->buf_offset];
+  }
+  sp = state * p;
+  // xp1 = (sp + p) / ans_p8_precision;
+  xp = sp / ans_p8_precision;
+  // s = xp1 - xp;
+  s = (sp & 0xFF) >= p0;
+  if (UNPREDICTABLE(s))
+    ans->state = xp;
+  else
+    ans->state = state - xp;
+  return s;
+}
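
// Illustrative sketch: a minimal uABS round trip (assumes the init/end
// helpers declared later in this header). ANS is last-in first-out, so the
// decoder returns symbols in the reverse of the order they were written.
static INLINE int uabs_round_trip_example(void) {
  uint8_t buf[64];
  const int bits[3] = { 1, 0, 1 };
  const AnsP8 p0 = 160;  // probability of a zero, in 1/256 units
  struct AnsCoder enc;
  struct AnsDecoder dec;
  int sz, i, ok = 1;
  ans_write_init(&enc, buf);
  for (i = 0; i < 3; ++i) uabs_write(&enc, bits[i], p0);
  sz = ans_write_end(&enc);
  if (ans_read_init(&dec, buf, sz)) return 0;
  for (i = 2; i >= 0; --i)  // read back in reverse order
    ok &= (uabs_read(&dec, p0) == bits[i]);
  return ok && ans_read_end(&dec);
}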
+
+static INLINE int uabs_read_bit(struct AnsDecoder *ans) {
+  int s;
+  unsigned state = ans->state;
+  while (state < l_base && ans->buf_offset > 0) {
+    state = state * io_base + ans->buf[--ans->buf_offset];
+  }
+  s = (int)(state & 1);
+  ans->state = state >> 1;
+  return s;
+}
+
+static INLINE int uabs_read_literal(struct AnsDecoder *ans, int bits) {
+  int literal = 0, bit;
+  assert(bits < 31);
+
+  // TODO(aconverse): Investigate ways to read/write literals faster,
+  // e.g. 8-bit chunks.
+  for (bit = bits - 1; bit >= 0; bit--)
+    literal |= uabs_read_bit(ans) << bit;
+
+  return literal;
+}
+
+// TODO(aconverse): Replace trees with tokensets.
+static INLINE int uabs_read_tree(struct AnsDecoder *ans,
+                                 const vpx_tree_index *tree,
+                                 const AnsP8 *probs) {
+  vpx_tree_index i = 0;
+
+  while ((i = tree[i + uabs_read(ans, probs[i >> 1])]) > 0)
+    continue;
+
+  return -i;
+}
+
+struct rans_sym {
+  AnsP10 prob;
+  AnsP10 cum_prob;  // exclusive: total probability of preceding symbols
+};
+
+struct rans_dec_sym {
+  uint8_t val;
+  AnsP10 prob;
+  AnsP10 cum_prob;  // exclusive: total probability of preceding symbols
+};
+
+// This is now just a boring cdf. It starts with an explicit zero.
+// TODO(aconverse): Remove starting zero.
+typedef uint16_t rans_dec_lut[16];
+
+static INLINE void rans_build_cdf_from_pdf(const AnsP10 token_probs[],
+                                           rans_dec_lut cdf_tab) {
+  int i;
+  cdf_tab[0] = 0;
+  for (i = 1; cdf_tab[i - 1] < rans_precision; ++i) {
+    cdf_tab[i] = cdf_tab[i - 1] + token_probs[i - 1];
+  }
+  assert(cdf_tab[i - 1] == rans_precision);
+}
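
// Illustrative sketch: a 4-symbol pdf summing to rans_precision (1024)
// becomes a cdf with a leading explicit zero:
//   const AnsP10 pdf[4] = { 512, 256, 128, 128 };
//   rans_dec_lut cdf;
//   rans_build_cdf_from_pdf(pdf, cdf);  // cdf = { 0, 512, 768, 896, 1024 }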
+
+static INLINE int ans_find_largest(const AnsP10 *const pdf_tab,
+                                   int num_syms) {
+  int largest_idx = -1;
+  int largest_p = -1;
+  int i;
+  for (i = 0; i < num_syms; ++i) {
+    int p = pdf_tab[i];
+    if (p > largest_p) {
+      largest_p = p;
+      largest_idx = i;
+    }
+  }
+  return largest_idx;
+}
+
+static INLINE void rans_merge_prob8_pdf(AnsP10 *const out_pdf,
+                                        const AnsP8 node_prob,
+                                        const AnsP10 *const src_pdf,
+                                        int in_syms) {
+  int i;
+  int adjustment = rans_precision;
+  const int round_fact = ans_p8_precision >> 1;
+  const AnsP8 p1 = ans_p8_precision - node_prob;
+  const int out_syms = in_syms + 1;
+  assert(src_pdf != out_pdf);
+
+  out_pdf[0] = node_prob << (10 - 8);
+  adjustment -= out_pdf[0];
+  for (i = 0; i < in_syms; ++i) {
+    int p = (p1 * src_pdf[i] + round_fact) >> ans_p8_shift;
+    p = VPXMIN(p, (int)rans_precision - in_syms);
+    p = VPXMAX(p, 1);
+    out_pdf[i + 1] = p;
+    adjustment -= p;
+  }
+
+  // Adjust probabilities so they sum to the total probability
+  if (adjustment > 0) {
+    i = ans_find_largest(out_pdf, out_syms);
+    out_pdf[i] += adjustment;
+  } else {
+    while (adjustment < 0) {
+      i = ans_find_largest(out_pdf, out_syms);
+      --out_pdf[i];
+      assert(out_pdf[i] > 0);
+      adjustment++;
+    }
+  }
+}
+
+// rANS with normalization
+// sym->prob takes the place of l_s from the paper
+// ans_p10_precision is m
+static INLINE void rans_write(struct AnsCoder *ans,
+                              const struct rans_sym *const sym) {
+  const AnsP10 p = sym->prob;
+  while (ans->state >= l_base / rans_precision * io_base * p) {
+    ans->buf[ans->buf_offset++] = ans->state % io_base;
+    ans->state /= io_base;
+  }
+  ans->state =
+      (ans->state / p) * rans_precision + ans->state % p + sym->cum_prob;
+}
+
+static INLINE void fetch_sym(struct rans_dec_sym *out, const rans_dec_lut cdf,
+                             AnsP10 rem) {
+  int i = 0;
+  // TODO(skal): if critical, could be a binary search.
+  // Or, better, an O(1) alias-table.
+  while (rem >= cdf[i]) {
+    ++i;
+  }
+  out->val = i - 1;
+  out->prob = (AnsP10)(cdf[i] - cdf[i - 1]);
+  out->cum_prob = (AnsP10)cdf[i - 1];
+}
+
+static INLINE int rans_read(struct AnsDecoder *ans,
+                            const rans_dec_lut tab) {
+  unsigned rem;
+  unsigned quo;
+  struct rans_dec_sym sym;
+  while (ans->state < l_base && ans->buf_offset > 0) {
+    ans->state = ans->state * io_base + ans->buf[--ans->buf_offset];
+  }
+  quo = ans->state / rans_precision;
+  rem = ans->state % rans_precision;
+  fetch_sym(&sym, tab, rem);
+  ans->state = quo * sym.prob + rem - sym.cum_prob;
+  return sym.val;
+}
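
// Illustrative sketch: with the cdf { 0, 512, 768, 896, 1024 } built above,
// a remainder of 700 satisfies cdf[1] <= 700 < cdf[2], so fetch_sym() yields
//   val = 1, prob = 768 - 512 = 256, cum_prob = 512,
// from which rans_read() renormalizes the decoder state.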
+
+static INLINE int ans_read_init(struct AnsDecoder *const ans,
+                                const uint8_t *const buf,
+                                int offset) {
+  unsigned x;
+  if (offset < 1) return 1;
+  ans->buf = buf;
+  x = buf[offset - 1] >> 6;
+  if (x == 0) {
+    ans->buf_offset = offset - 1;
+    ans->state = buf[offset - 1] & 0x3F;
+  } else if (x == 1) {
+    if (offset < 2) return 1;
+    ans->buf_offset = offset - 2;
+    ans->state = mem_get_le16(buf + offset - 2) & 0x3FFF;
+  } else if (x == 2) {
+    if (offset < 3) return 1;
+    ans->buf_offset = offset - 3;
+    ans->state = mem_get_le24(buf + offset - 3) & 0x3FFFFF;
+  } else {
+    // x == 3 implies this byte is a superframe marker
+    return 1;
+  }
+  ans->state += l_base;
+  if (ans->state >= l_base * io_base)
+    return 1;
+  return 0;
+}
+
+static INLINE int ans_read_end(struct AnsDecoder *const ans) {
+  return ans->state == l_base;
+}
+
+static INLINE int ans_reader_has_error(const struct AnsDecoder *const ans) {
+  return ans->state < l_base && ans->buf_offset == 0;
+}
+#undef ANS_DIVREM
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // VP10_COMMON_ANS_H_
diff --git a/vp10/common/blockd.c b/vp10/common/blockd.c
index b6f910f..5ca5c05 100644
--- a/vp10/common/blockd.c
+++ b/vp10/common/blockd.c
@@ -8,6 +8,10 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <math.h>
+
+#include "vpx_ports/system_state.h"
+
 #include "vp10/common/blockd.h"
 
 PREDICTION_MODE vp10_left_block_mode(const MODE_INFO *cur_mi,
@@ -134,3 +138,92 @@
     xd->plane[i].subsampling_y = i ? ss_y : 0;
   }
 }
+
+#if CONFIG_EXT_INTRA
+// If angle > 0 && angle < 90, dx = -((int)(256 / t)), dy = 1;
+// If angle > 90 && angle < 180, dx = (int)(256 / t), dy = (int)(256 * t);
+// If angle > 180 && angle < 270, dx = 1, dy = -((int)(256 * t));
+const int16_t dr_intra_derivative[270][2] = {
+    {     1,     1 }, { -14666,    1 }, { -7330,     1 }, { -4884,     1 },
+    { -3660,     1 }, { -2926,     1 }, { -2435,     1 }, { -2084,     1 },
+    { -1821,     1 }, { -1616,     1 }, { -1451,     1 }, { -1317,     1 },
+    { -1204,     1 }, { -1108,     1 }, { -1026,     1 }, {  -955,     1 },
+    {  -892,     1 }, {  -837,     1 }, {  -787,     1 }, {  -743,     1 },
+    {  -703,     1 }, {  -666,     1 }, {  -633,     1 }, {  -603,     1 },
+    {  -574,     1 }, {  -548,     1 }, {  -524,     1 }, {  -502,     1 },
+    {  -481,     1 }, {  -461,     1 }, {  -443,     1 }, {  -426,     1 },
+    {  -409,     1 }, {  -394,     1 }, {  -379,     1 }, {  -365,     1 },
+    {  -352,     1 }, {  -339,     1 }, {  -327,     1 }, {  -316,     1 },
+    {  -305,     1 }, {  -294,     1 }, {  -284,     1 }, {  -274,     1 },
+    {  -265,     1 }, {  -256,     1 }, {  -247,     1 }, {  -238,     1 },
+    {  -230,     1 }, {  -222,     1 }, {  -214,     1 }, {  -207,     1 },
+    {  -200,     1 }, {  -192,     1 }, {  -185,     1 }, {  -179,     1 },
+    {  -172,     1 }, {  -166,     1 }, {  -159,     1 }, {  -153,     1 },
+    {  -147,     1 }, {  -141,     1 }, {  -136,     1 }, {  -130,     1 },
+    {  -124,     1 }, {  -119,     1 }, {  -113,     1 }, {  -108,     1 },
+    {  -103,     1 }, {   -98,     1 }, {   -93,     1 }, {   -88,     1 },
+    {   -83,     1 }, {   -78,     1 }, {   -73,     1 }, {   -68,     1 },
+    {   -63,     1 }, {   -59,     1 }, {   -54,     1 }, {   -49,     1 },
+    {   -45,     1 }, {   -40,     1 }, {   -35,     1 }, {   -31,     1 },
+    {   -26,     1 }, {   -22,     1 }, {   -17,     1 }, {   -13,     1 },
+    {    -8,     1 }, {    -4,     1 }, {     1,     1 }, {     4, 14666 },
+    {     8,  7330 }, {    13,  4884 }, {    17,  3660 }, {    22,  2926 },
+    {    26,  2435 }, {    31,  2084 }, {    35,  1821 }, {    40,  1616 },
+    {    45,  1451 }, {    49,  1317 }, {    54,  1204 }, {    59,  1108 },
+    {    63,  1026 }, {    68,   955 }, {    73,   892 }, {    78,   837 },
+    {    83,   787 }, {    88,   743 }, {    93,   703 }, {    98,   666 },
+    {   103,   633 }, {   108,   603 }, {   113,   574 }, {   119,   548 },
+    {   124,   524 }, {   130,   502 }, {   136,   481 }, {   141,   461 },
+    {   147,   443 }, {   153,   426 }, {   159,   409 }, {   166,   394 },
+    {   172,   379 }, {   179,   365 }, {   185,   352 }, {   192,   339 },
+    {   200,   327 }, {   207,   316 }, {   214,   305 }, {   222,   294 },
+    {   230,   284 }, {   238,   274 }, {   247,   265 }, {   255,   256 },
+    {   265,   247 }, {   274,   238 }, {   284,   230 }, {   294,   222 },
+    {   305,   214 }, {   316,   207 }, {   327,   200 }, {   339,   192 },
+    {   352,   185 }, {   365,   179 }, {   379,   172 }, {   394,   166 },
+    {   409,   159 }, {   426,   153 }, {   443,   147 }, {   461,   141 },
+    {   481,   136 }, {   502,   130 }, {   524,   124 }, {   548,   119 },
+    {   574,   113 }, {   603,   108 }, {   633,   103 }, {   666,    98 },
+    {   703,    93 }, {   743,    88 }, {   787,    83 }, {   837,    78 },
+    {   892,    73 }, {   955,    68 }, {  1026,    63 }, {  1108,    59 },
+    {  1204,    54 }, {  1317,    49 }, {  1451,    45 }, {  1616,    40 },
+    {  1821,    35 }, {  2084,    31 }, {  2435,    26 }, {  2926,    22 },
+    {  3660,    17 }, {  4884,    13 }, {  7330,     8 }, { 14666,     4 },
+    {     1,     1 }, {     1,    -4 }, {     1,    -8 }, {     1,   -13 },
+    {     1,   -17 }, {     1,   -22 }, {     1,   -26 }, {     1,   -31 },
+    {     1,   -35 }, {     1,   -40 }, {     1,   -45 }, {     1,   -49 },
+    {     1,   -54 }, {     1,   -59 }, {     1,   -63 }, {     1,   -68 },
+    {     1,   -73 }, {     1,   -78 }, {     1,   -83 }, {     1,   -88 },
+    {     1,   -93 }, {     1,   -98 }, {     1,  -103 }, {     1,  -108 },
+    {     1,  -113 }, {     1,  -119 }, {     1,  -124 }, {     1,  -130 },
+    {     1,  -136 }, {     1,  -141 }, {     1,  -147 }, {     1,  -153 },
+    {     1,  -159 }, {     1,  -166 }, {     1,  -172 }, {     1,  -179 },
+    {     1,  -185 }, {     1,  -192 }, {     1,  -200 }, {     1,  -207 },
+    {     1,  -214 }, {     1,  -222 }, {     1,  -230 }, {     1,  -238 },
+    {     1,  -247 }, {     1,  -255 }, {     1,  -265 }, {     1,  -274 },
+    {     1,  -284 }, {     1,  -294 }, {     1,  -305 }, {     1,  -316 },
+    {     1,  -327 }, {     1,  -339 }, {     1,  -352 }, {     1,  -365 },
+    {     1,  -379 }, {     1,  -394 }, {     1,  -409 }, {     1,  -426 },
+    {     1,  -443 }, {     1,  -461 }, {     1,  -481 }, {     1,  -502 },
+    {     1,  -524 }, {     1,  -548 }, {     1,  -574 }, {     1,  -603 },
+    {     1,  -633 }, {     1,  -666 }, {     1,  -703 }, {     1,  -743 },
+    {     1,  -787 }, {     1,  -837 }, {     1,  -892 }, {     1,  -955 },
+    {     1, -1026 }, {     1, -1108 }, {     1, -1204 }, {     1, -1317 },
+    {     1, -1451 }, {     1, -1616 }, {     1, -1821 }, {     1, -2084 },
+    {     1, -2435 }, {     1, -2926 }, {     1, -3660 }, {     1, -4884 },
+    {     1, -7330 }, {     1, -14666 },
+};
+
+// Returns whether filter selection is needed for a given
+// intra prediction angle.
+int vp10_is_intra_filter_switchable(int angle) {
+  assert(angle > 0 && angle < 270);
+  if (angle % 45 == 0)
+    return 0;
+  if (angle > 90 && angle < 180) {
+    return 1;
+  } else {
+    return ((-(dr_intra_derivative[angle][angle > 180])) & 0xFF) > 0;
+  }
+}
+#endif  // CONFIG_EXT_INTRA
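
The dr_intra_derivative[] entries are fixed-point slopes derived from
tan(angle in degrees), in 1/256 units. A small sanity-check sketch (assuming
the PI constant from vp10/common/common.h is visible here):

    #include <math.h>

    // For 0 < angle < 90 the table stores dx = -(int)(256 / tan(angle)),
    // dy = 1.  E.g. tan(45 deg) == 1 gives dx == -256, matching the
    // dr_intra_derivative[45] == { -256, 1 } entry above.
    static int expected_first_zone_dx(int angle) {
      return -(int)(256 / tan(angle * PI / 180.0));
    }
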
diff --git a/vp10/common/blockd.h b/vp10/common/blockd.h
index fce1767..0f8b972 100644
--- a/vp10/common/blockd.h
+++ b/vp10/common/blockd.h
@@ -19,6 +19,7 @@
 #include "vpx_scale/yv12config.h"
 
 #include "vp10/common/common_data.h"
+#include "vp10/common/quant_common.h"
 #include "vp10/common/entropy.h"
 #include "vp10/common/entropymode.h"
 #include "vp10/common/mv.h"
@@ -38,10 +39,106 @@
   FRAME_TYPES,
 } FRAME_TYPE;
 
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+#define IsInterpolatingFilter(filter)  (vp10_is_interpolating_filter(filter))
+#else
+#define IsInterpolatingFilter(filter)  (1)
+#endif  // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+
 static INLINE int is_inter_mode(PREDICTION_MODE mode) {
+#if CONFIG_EXT_INTER
+  return mode >= NEARESTMV && mode <= NEW_NEWMV;
+#else
   return mode >= NEARESTMV && mode <= NEWMV;
+#endif  // CONFIG_EXT_INTER
 }
 
+#if CONFIG_EXT_INTER
+static INLINE int is_inter_singleref_mode(PREDICTION_MODE mode) {
+  return mode >= NEARESTMV && mode <= NEWFROMNEARMV;
+}
+
+static INLINE int is_inter_compound_mode(PREDICTION_MODE mode) {
+  return mode >= NEAREST_NEARESTMV && mode <= NEW_NEWMV;
+}
+
+static INLINE PREDICTION_MODE compound_ref0_mode(PREDICTION_MODE mode) {
+  static PREDICTION_MODE lut[MB_MODE_COUNT] = {
+    MB_MODE_COUNT,  // DC_PRED            0
+    MB_MODE_COUNT,  // V_PRED             1
+    MB_MODE_COUNT,  // H_PRED             2
+    MB_MODE_COUNT,  // D45_PRED           3
+    MB_MODE_COUNT,  // D135_PRED          4
+    MB_MODE_COUNT,  // D117_PRED          5
+    MB_MODE_COUNT,  // D153_PRED          6
+    MB_MODE_COUNT,  // D207_PRED          7
+    MB_MODE_COUNT,  // D63_PRED           8
+    MB_MODE_COUNT,  // TM_PRED            9
+    MB_MODE_COUNT,  // NEARESTMV         10
+    MB_MODE_COUNT,  // NEARMV            11
+    MB_MODE_COUNT,  // ZEROMV            12
+    MB_MODE_COUNT,  // NEWMV             13
+    MB_MODE_COUNT,  // NEWFROMNEARMV     14
+    NEARESTMV,      // NEAREST_NEARESTMV 15
+    NEARESTMV,      // NEAREST_NEARMV    16
+    NEARMV,         // NEAR_NEARESTMV    17
+    NEARMV,         // NEAR_NEARMV       18
+    NEARESTMV,      // NEAREST_NEWMV     19
+    NEWMV,          // NEW_NEARESTMV     20
+    NEARMV,         // NEAR_NEWMV        21
+    NEWMV,          // NEW_NEARMV        22
+    ZEROMV,         // ZERO_ZEROMV       23
+    NEWMV,          // NEW_NEWMV         24
+  };
+  assert(is_inter_compound_mode(mode));
+  return lut[mode];
+}
+
+static INLINE PREDICTION_MODE compound_ref1_mode(PREDICTION_MODE mode) {
+  static PREDICTION_MODE lut[MB_MODE_COUNT] = {
+    MB_MODE_COUNT,  // DC_PRED            0
+    MB_MODE_COUNT,  // V_PRED             1
+    MB_MODE_COUNT,  // H_PRED             2
+    MB_MODE_COUNT,  // D45_PRED           3
+    MB_MODE_COUNT,  // D135_PRED          4
+    MB_MODE_COUNT,  // D117_PRED          5
+    MB_MODE_COUNT,  // D153_PRED          6
+    MB_MODE_COUNT,  // D207_PRED          7
+    MB_MODE_COUNT,  // D63_PRED           8
+    MB_MODE_COUNT,  // TM_PRED            9
+    MB_MODE_COUNT,  // NEARESTMV         10
+    MB_MODE_COUNT,  // NEARMV            11
+    MB_MODE_COUNT,  // ZEROMV            12
+    MB_MODE_COUNT,  // NEWMV             13
+    MB_MODE_COUNT,  // NEWFROMNEARMV     14
+    NEARESTMV,      // NEAREST_NEARESTMV 15
+    NEARMV,         // NEAREST_NEARMV    16
+    NEARESTMV,      // NEAR_NEARESTMV    17
+    NEARMV,         // NEAR_NEARMV       18
+    NEWMV,          // NEAREST_NEWMV     19
+    NEARESTMV,      // NEW_NEARESTMV     20
+    NEWMV,          // NEAR_NEWMV        21
+    NEARMV,         // NEW_NEARMV        22
+    ZEROMV,         // ZERO_ZEROMV       23
+    NEWMV,          // NEW_NEWMV         24
+  };
+  assert(is_inter_compound_mode(mode));
+  return lut[mode];
+}
+
+static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
+  return (mode == NEWMV || mode == NEWFROMNEARMV ||
+          mode == NEW_NEWMV ||
+          mode == NEAREST_NEWMV || mode == NEW_NEARESTMV ||
+          mode == NEAR_NEWMV || mode == NEW_NEARMV);
+}
+#else
+
+static INLINE int have_newmv_in_inter_mode(PREDICTION_MODE mode) {
+  return (mode == NEWMV);
+}
+#endif  // CONFIG_EXT_INTER
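
// Illustrative sketch: the lookup tables above split a compound mode into
// its per-reference single modes, e.g.
//   compound_ref0_mode(NEAREST_NEWMV) == NEARESTMV  // first reference
//   compound_ref1_mode(NEAREST_NEWMV) == NEWMV      // second reference
// and have_newmv_in_inter_mode(NEAREST_NEWMV) is nonzero because the second
// reference codes a new motion vector.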
+
 /* For keyframes, intra block modes are predicted by the (already decoded)
    modes for the Y blocks to the left and above us; for interframes, there
    is a single probability table. */
@@ -49,43 +146,101 @@
 typedef struct {
   PREDICTION_MODE as_mode;
   int_mv as_mv[2];  // first, second inter predictor motion vectors
+#if CONFIG_REF_MV
+  int_mv pred_mv_s8[2];
+#endif
+#if CONFIG_EXT_INTER
+  int_mv ref_mv[2];
+#endif  // CONFIG_EXT_INTER
 } b_mode_info;
 
-// Note that the rate-distortion optimization loop, bit-stream writer, and
-// decoder implementation modules critically rely on the defined entry values
-// specified herein. They should be refactored concurrently.
-
-#define NONE           -1
-#define INTRA_FRAME     0
-#define LAST_FRAME      1
-#define GOLDEN_FRAME    2
-#define ALTREF_FRAME    3
-#define MAX_REF_FRAMES  4
 typedef int8_t MV_REFERENCE_FRAME;
 
+typedef struct {
+  // Number of base colors for Y (0) and UV (1)
+  uint8_t palette_size[2];
+  // Value of base colors for Y, U, and V
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint16_t palette_colors[3 * PALETTE_MAX_SIZE];
+#else
+  uint8_t palette_colors[3 * PALETTE_MAX_SIZE];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  // Only used by encoder to store the color index of the top left pixel.
+  // TODO(huisu): move this to encoder
+  uint8_t palette_first_color_idx[2];
+} PALETTE_MODE_INFO;
+
+#if CONFIG_EXT_INTRA
+typedef struct {
+  // 1: an ext intra mode is used; 0: otherwise.
+  uint8_t use_ext_intra_mode[PLANE_TYPES];
+  EXT_INTRA_MODE ext_intra_mode[PLANE_TYPES];
+} EXT_INTRA_MODE_INFO;
+#endif  // CONFIG_EXT_INTRA
+
 // This structure now relates to 8x8 block regions.
 typedef struct {
   // Common for both INTER and INTRA blocks
   BLOCK_SIZE sb_type;
   PREDICTION_MODE mode;
   TX_SIZE tx_size;
-  int8_t skip;
-#if CONFIG_MISC_FIXES
-  int8_t has_no_coeffs;
+#if CONFIG_VAR_TX
+  // TODO(jingning): This effectively assigns a separate entry to each
+  // 8x8 block, which takes much more space than needed.
+  TX_SIZE inter_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
 #endif
+  int8_t skip;
+  int8_t has_no_coeffs;
   int8_t segment_id;
+#if CONFIG_SUPERTX
+  // Minimum of all segment IDs under the current supertx block.
+  int8_t segment_id_supertx;
+#endif  // CONFIG_SUPERTX
   int8_t seg_id_predicted;  // valid only when temporal_update is enabled
 
   // Only for INTRA blocks
   PREDICTION_MODE uv_mode;
+  PALETTE_MODE_INFO palette_mode_info;
 
   // Only for INTER blocks
+#if CONFIG_DUAL_FILTER
+  INTERP_FILTER interp_filter[4];
+#else
   INTERP_FILTER interp_filter;
+#endif
   MV_REFERENCE_FRAME ref_frame[2];
   TX_TYPE tx_type;
 
-  // TODO(slavarnway): Delete and use bmi[3].as_mv[] instead.
+#if CONFIG_EXT_INTRA
+  EXT_INTRA_MODE_INFO ext_intra_mode_info;
+  int8_t angle_delta[2];
+  // TODO(huisu): This may be replaced by interp_filter.
+  INTRA_FILTER intra_filter;
+#endif  // CONFIG_EXT_INTRA
+
+#if CONFIG_EXT_INTER
+  INTERINTRA_MODE interintra_mode;
+  // TODO(debargha): Consolidate these flags
+  int use_wedge_interintra;
+  int interintra_wedge_index;
+  int interintra_wedge_sign;
+  int use_wedge_interinter;
+  int interinter_wedge_index;
+  int interinter_wedge_sign;
+#endif  // CONFIG_EXT_INTER
+  MOTION_VARIATION motion_variation;
   int_mv mv[2];
+  int_mv pred_mv[2];
+#if CONFIG_REF_MV
+  uint8_t ref_mv_idx;
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+  PARTITION_TYPE partition;
+#endif
+#if CONFIG_NEW_QUANT
+  int dq_off_index;
+  int send_dq_bit;
+#endif  // CONFIG_NEW_QUANT
 } MB_MODE_INFO;
 
 typedef struct MODE_INFO {
@@ -122,7 +277,7 @@
   int stride;
 };
 
-struct macroblockd_plane {
+typedef struct macroblockd_plane {
   tran_low_t *dqcoeff;
   PLANE_TYPE plane_type;
   int subsampling_x;
@@ -132,6 +287,10 @@
   ENTROPY_CONTEXT *above_context;
   ENTROPY_CONTEXT *left_context;
   int16_t seg_dequant[MAX_SEGMENTS][2];
+#if CONFIG_NEW_QUANT
+  dequant_val_type_nuq
+    seg_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES][COEF_BANDS];
+#endif
   uint8_t *color_index_map;
 
   // number of 4x4s in current block
@@ -141,7 +300,10 @@
 
   // encoder
   const int16_t *dequant;
-};
+#if CONFIG_NEW_QUANT
+  const dequant_val_type_nuq* dequant_val_nuq[QUANT_PROFILES];
+#endif  // CONFIG_NEW_QUANT
+} MACROBLOCKD_PLANE;
 
 #define BLOCK_OFFSET(x, i) ((x) + (i) * 16)
 
@@ -172,6 +334,8 @@
   int up_available;
   int left_available;
 
+  const vpx_prob (*partition_probs)[PARTITION_TYPES - 1];
+
   /* Distance of MB away from frame edges */
   int mb_to_left_edge;
   int mb_to_right_edge;
@@ -187,10 +351,30 @@
   const YV12_BUFFER_CONFIG *cur_buf;
 
   ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
-  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][16];
+  ENTROPY_CONTEXT left_context[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
 
   PARTITION_CONTEXT *above_seg_context;
-  PARTITION_CONTEXT left_seg_context[8];
+  PARTITION_CONTEXT left_seg_context[MAX_MIB_SIZE];
+
+#if CONFIG_VAR_TX
+  TXFM_CONTEXT *above_txfm_context;
+  TXFM_CONTEXT *left_txfm_context;
+  TXFM_CONTEXT left_txfm_context_buffer[MAX_MIB_SIZE];
+
+  TX_SIZE max_tx_size;
+#if CONFIG_SUPERTX
+  TX_SIZE supertx_size;
+#endif
+#endif
+
+  // Dimensions of the current block in units of 8x8 blocks.
+  uint8_t n8_w, n8_h;
+
+#if CONFIG_REF_MV
+  uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+  CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+  uint8_t is_sec_rect;
+#endif
 
 #if CONFIG_VP9_HIGHBITDEPTH
   /* Bit depth: 8, 10, 12 */
@@ -205,7 +389,10 @@
 
 static INLINE BLOCK_SIZE get_subsize(BLOCK_SIZE bsize,
                                      PARTITION_TYPE partition) {
-  return subsize_lookup[partition][bsize];
+  if (partition == PARTITION_INVALID)
+    return BLOCK_INVALID;
+  else
+    return subsize_lookup[partition][bsize];
 }
 
 static const TX_TYPE intra_mode_to_tx_type_context[INTRA_MODES] = {
@@ -221,17 +408,246 @@
   ADST_ADST,  // TM
 };
 
-static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type, const MACROBLOCKD *xd,
-                                  int block_idx) {
+#if CONFIG_SUPERTX
+static INLINE int supertx_enabled(const MB_MODE_INFO *mbmi) {
+  return (int)mbmi->tx_size >
+      VPXMIN(b_width_log2_lookup[mbmi->sb_type],
+             b_height_log2_lookup[mbmi->sb_type]);
+}
+#endif  // CONFIG_SUPERTX
+
+#if CONFIG_EXT_TX
+#define ALLOW_INTRA_EXT_TX          1
+// whether masked transforms are used for 32X32
+#define USE_MSKTX_FOR_32X32         0
+#define USE_REDUCED_TXSET_FOR_16X16 1
+
+static const int num_ext_tx_set_inter[EXT_TX_SETS_INTER] = {
+  1, 16, 12, 2
+};
+static const int num_ext_tx_set_intra[EXT_TX_SETS_INTRA] = {
+  1, 7, 5
+};
+
+#if EXT_TX_SIZES == 4
+static INLINE int get_ext_tx_set(TX_SIZE tx_size, BLOCK_SIZE bs,
+                                 int is_inter) {
+  if (tx_size > TX_32X32 || bs < BLOCK_8X8) return 0;
+#if USE_REDUCED_TXSET_FOR_16X16
+  if (tx_size == TX_32X32)
+    return is_inter ? 3 - USE_MSKTX_FOR_32X32 : 0;
+  return (tx_size == TX_16X16 ? 2 : 1);
+#else
+  if (tx_size == TX_32X32)
+    return is_inter ? 3 - 2 * USE_MSKTX_FOR_32X32 : 0;
+  return (tx_size == TX_16X16 && !is_inter ? 2 : 1);
+#endif  // USE_REDUCED_TXSET_FOR_16X16
+}
+
+static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA][TX_SIZES] = {
+  { 0, 0, 0, 0, },  // unused
+  { 1, 1, 0, 0, },
+  { 0, 0, 1, 0, },
+};
+
+static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER][TX_SIZES] = {
+  { 0, 0, 0, 0, },  // unused
+  { 1, 1, (!USE_REDUCED_TXSET_FOR_16X16), USE_MSKTX_FOR_32X32, },
+  { 0, 0, USE_REDUCED_TXSET_FOR_16X16, 0, },
+  { 0, 0, 0, (!USE_MSKTX_FOR_32X32), },
+};
+
+#else  // EXT_TX_SIZES == 4
+
+static INLINE int get_ext_tx_set(TX_SIZE tx_size, BLOCK_SIZE bs,
+                                 int is_inter) {
+  (void) is_inter;
+  if (tx_size > TX_32X32 || bs < BLOCK_8X8) return 0;
+  if (tx_size == TX_32X32) return 0;
+#if USE_REDUCED_TXSET_FOR_16X16
+  return (tx_size == TX_16X16 ? 2 : 1);
+#else
+  return (tx_size == TX_16X16 && !is_inter ? 2 : 1);
+#endif  // USE_REDUCED_TXSET_FOR_16X16
+}
+
+static const int use_intra_ext_tx_for_txsize[EXT_TX_SETS_INTRA][TX_SIZES] = {
+  { 0, 0, 0, 0, },  // unused
+  { 1, 1, 0, 0, },
+  { 0, 0, 1, 0, },
+};
+
+static const int use_inter_ext_tx_for_txsize[EXT_TX_SETS_INTER][TX_SIZES] = {
+  { 0, 0, 0, 0, },  // unused
+  { 1, 1, (!USE_REDUCED_TXSET_FOR_16X16), 0, },
+  { 0, 0, USE_REDUCED_TXSET_FOR_16X16, 0, },
+  { 0, 0, 0, 1, },
+};
+#endif  // EXT_TX_SIZES == 4
+
+// Transform types used in each intra set
+static const int ext_tx_used_intra[EXT_TX_SETS_INTRA][TX_TYPES] = {
+  {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+  {1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0},
+  {1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0},
+};
+
+// Transform types used in each inter set
+static const int ext_tx_used_inter[EXT_TX_SETS_INTER][TX_TYPES] = {
+  {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
+  {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1},
+  {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0},
+  {1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0},
+};
+
+// 1D transforms used in the inter set; this table must be updated whenever
+// ext_tx_used_inter changes.
+static const int ext_tx_used_inter_1D[EXT_TX_SETS_INTER][TX_TYPES_1D] = {
+  {1, 0, 0, 0},
+  {1, 1, 1, 1},
+  {1, 1, 1, 1},
+  {1, 0, 0, 1},
+};
+
+static INLINE int get_ext_tx_types(TX_SIZE tx_size, BLOCK_SIZE bs,
+                                   int is_inter) {
+  const int set = get_ext_tx_set(tx_size, bs, is_inter);
+  return is_inter ? num_ext_tx_set_inter[set] : num_ext_tx_set_intra[set];
+}
+#endif  // CONFIG_EXT_TX
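
// Illustrative sketch: for an inter 16x16 block with
// USE_REDUCED_TXSET_FOR_16X16 == 1, get_ext_tx_set(TX_16X16, BLOCK_16X16, 1)
// selects set 2, so get_ext_tx_types() reports num_ext_tx_set_inter[2] == 12
// candidate transform types (the third row of ext_tx_used_inter above).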
+
+#if CONFIG_EXT_INTRA
+#define ALLOW_FILTER_INTRA_MODES 1
+#define ANGLE_STEP 3
+#define MAX_ANGLE_DELTAS 3
+
+extern const int16_t dr_intra_derivative[270][2];
+
+static const uint8_t mode_to_angle_map[INTRA_MODES] = {
+    0, 90, 180, 45, 135, 111, 157, 203, 67, 0,
+};
+
+static const TX_TYPE filter_intra_mode_to_tx_type_lookup[FILTER_INTRA_MODES] = {
+  DCT_DCT,    // FILTER_DC
+  ADST_DCT,   // FILTER_V
+  DCT_ADST,   // FILTER_H
+  DCT_DCT,    // FILTER_D45
+  ADST_ADST,  // FILTER_D135
+  ADST_DCT,   // FILTER_D117
+  DCT_ADST,   // FILTER_D153
+  DCT_ADST,   // FILTER_D207
+  ADST_DCT,   // FILTER_D63
+  ADST_ADST,  // FILTER_TM
+};
+
+int vp10_is_intra_filter_switchable(int angle);
+#endif  // CONFIG_EXT_INTRA
+
+#if CONFIG_EXT_TILE
+#define FIXED_TX_TYPE 1
+#else
+#define FIXED_TX_TYPE 0
+#endif
+
+static INLINE TX_TYPE get_default_tx_type(PLANE_TYPE plane_type,
+                                          const MACROBLOCKD *xd,
+                                          int block_idx, TX_SIZE tx_size) {
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+
+  if (is_inter_block(mbmi) || plane_type != PLANE_TYPE_Y ||
+      xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32)
+    return DCT_DCT;
+
+  return intra_mode_to_tx_type_context[plane_type == PLANE_TYPE_Y ?
+      get_y_mode(xd->mi[0], block_idx) : mbmi->uv_mode];
+}
+
+static INLINE TX_TYPE get_tx_type(PLANE_TYPE plane_type,
+                                  const MACROBLOCKD *xd,
+                                  int block_idx, TX_SIZE tx_size) {
   const MODE_INFO *const mi = xd->mi[0];
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
 
+  if (FIXED_TX_TYPE)
+    return get_default_tx_type(plane_type, xd, block_idx, tx_size);
+
+#if CONFIG_EXT_INTRA
+  if (!is_inter_block(mbmi)) {
+    const int use_ext_intra_mode_info =
+        mbmi->ext_intra_mode_info.use_ext_intra_mode[plane_type];
+    const EXT_INTRA_MODE ext_intra_mode =
+        mbmi->ext_intra_mode_info.ext_intra_mode[plane_type];
+    const PREDICTION_MODE mode = (plane_type == PLANE_TYPE_Y) ?
+        get_y_mode(mi, block_idx) : mbmi->uv_mode;
+
+    if (xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32)
+      return DCT_DCT;
+
+#if CONFIG_EXT_TX
+#if ALLOW_INTRA_EXT_TX
+    if (mbmi->sb_type >= BLOCK_8X8 && plane_type == PLANE_TYPE_Y)
+      return mbmi->tx_type;
+#endif  // ALLOW_INTRA_EXT_TX
+#endif  // CONFIG_EXT_TX
+
+    if (use_ext_intra_mode_info)
+      return filter_intra_mode_to_tx_type_lookup[ext_intra_mode];
+
+    if (mode == DC_PRED) {
+      return DCT_DCT;
+    } else if (mode == TM_PRED) {
+      return ADST_ADST;
+    } else {
+      int angle = mode_to_angle_map[mode];
+      if (mbmi->sb_type >= BLOCK_8X8)
+        angle += mbmi->angle_delta[plane_type] * ANGLE_STEP;
+      assert(angle > 0 && angle < 270);
+      if (angle == 135)
+        return ADST_ADST;
+      else if (angle < 45 || angle > 225)
+        return DCT_DCT;
+      else if (angle < 135)
+        return ADST_DCT;
+      else
+        return DCT_ADST;
+    }
+  }
+#endif  // CONFIG_EXT_INTRA
+
+#if CONFIG_EXT_TX
+#if EXT_TX_SIZES == 4
+  if (xd->lossless[mbmi->segment_id] || tx_size > TX_32X32 ||
+      (tx_size >= TX_32X32 && !is_inter_block(mbmi)))
+#else
+  if (xd->lossless[mbmi->segment_id] || tx_size >= TX_32X32)
+#endif
+    return DCT_DCT;
+  if (mbmi->sb_type >= BLOCK_8X8) {
+    if (plane_type == PLANE_TYPE_Y) {
+#if !ALLOW_INTRA_EXT_TX
+      if (is_inter_block(mbmi))
+#endif  // ALLOW_INTRA_EXT_TX
+        return mbmi->tx_type;
+    }
+    if (is_inter_block(mbmi))
+      // UV Inter only
+      return (mbmi->tx_type == IDTX && tx_size == TX_32X32 ?
+              DCT_DCT : mbmi->tx_type);
+  }
+
+  // Sub8x8-Inter/Intra OR UV-Intra
+  if (is_inter_block(mbmi))  // Sub8x8-Inter
+    return DCT_DCT;
+  else  // Sub8x8 Intra OR UV-Intra
+    return intra_mode_to_tx_type_context[plane_type == PLANE_TYPE_Y ?
+        get_y_mode(mi, block_idx) : mbmi->uv_mode];
+#else
   (void) block_idx;
   if (plane_type != PLANE_TYPE_Y || xd->lossless[mbmi->segment_id] ||
-      mbmi->tx_size >= TX_32X32)
+      tx_size >= TX_32X32)
     return DCT_DCT;
-
   return mbmi->tx_type;
+#endif  // CONFIG_EXT_TX
 }
 
 void vp10_setup_block_planes(MACROBLOCKD *xd, int ss_x, int ss_y);
@@ -248,6 +664,11 @@
 
 static INLINE TX_SIZE get_uv_tx_size(const MB_MODE_INFO *mbmi,
                                      const struct macroblockd_plane *pd) {
+#if CONFIG_SUPERTX
+  if (supertx_enabled(mbmi))
+    return uvsupertx_size_lookup[mbmi->tx_size][pd->subsampling_x]
+                                               [pd->subsampling_y];
+#endif  // CONFIG_SUPERTX
   return get_uv_tx_size_impl(mbmi->tx_size, mbmi->sb_type, pd->subsampling_x,
                              pd->subsampling_y);
 }
@@ -279,7 +700,6 @@
     const MACROBLOCKD *const xd, BLOCK_SIZE bsize, int plane,
     foreach_transformed_block_visitor visit, void *arg);
 
-
 void vp10_foreach_transformed_block(
     const MACROBLOCKD* const xd, BLOCK_SIZE bsize,
     foreach_transformed_block_visitor visit, void *arg);
@@ -288,6 +708,57 @@
                       BLOCK_SIZE plane_bsize, TX_SIZE tx_size, int has_eob,
                       int aoff, int loff);
 
+#if CONFIG_EXT_INTER
+static INLINE int is_interintra_allowed_bsize(const BLOCK_SIZE bsize) {
+  // TODO(debargha): Should this be bsize < BLOCK_LARGEST?
+  return (bsize >= BLOCK_8X8) && (bsize < BLOCK_64X64);
+}
+
+static INLINE int is_interintra_allowed_mode(const PREDICTION_MODE mode) {
+  return (mode >= NEARESTMV) && (mode <= NEWMV);
+}
+
+static INLINE int is_interintra_allowed_ref(const MV_REFERENCE_FRAME rf[2]) {
+  return (rf[0] > INTRA_FRAME) && (rf[1] <= INTRA_FRAME);
+}
+
+static INLINE int is_interintra_allowed(const MB_MODE_INFO *mbmi) {
+  return is_interintra_allowed_bsize(mbmi->sb_type)
+          && is_interintra_allowed_mode(mbmi->mode)
+          && is_interintra_allowed_ref(mbmi->ref_frame);
+}
+
+static INLINE int is_interintra_allowed_bsize_group(const int group) {
+  int i;
+  for (i = 0; i < BLOCK_SIZES; i++) {
+    if (size_group_lookup[i] == group &&
+        is_interintra_allowed_bsize(i))
+      return 1;
+  }
+  return 0;
+}
+
+static INLINE int is_interintra_pred(const MB_MODE_INFO *mbmi) {
+  return (mbmi->ref_frame[1] == INTRA_FRAME) && is_interintra_allowed(mbmi);
+}
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+static INLINE int is_motvar_allowed(const MB_MODE_INFO *mbmi) {
+#if CONFIG_EXT_INTER
+  return (mbmi->sb_type >= BLOCK_8X8 && mbmi->ref_frame[1] != INTRA_FRAME);
+#else
+  return (mbmi->sb_type >= BLOCK_8X8);
+#endif  // CONFIG_EXT_INTER
+}
+
+#if CONFIG_OBMC
+static INLINE int is_neighbor_overlappable(const MB_MODE_INFO *mbmi) {
+  return (is_inter_block(mbmi));
+}
+#endif  // CONFIG_OBMC
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/common/common.h b/vp10/common/common.h
index 4abcbf6..ea5d1d6 100644
--- a/vp10/common/common.h
+++ b/vp10/common/common.h
@@ -25,6 +25,8 @@
 extern "C" {
 #endif
 
+#define PI 3.141592653589793238462643383279502884
+
 // Only need this for fixed-size arrays, for structs just assign.
 #define vp10_copy(dest, src) {            \
     assert(sizeof(dest) == sizeof(src)); \
@@ -33,12 +35,12 @@
 
 // Use this for variably-sized arrays.
 #define vp10_copy_array(dest, src, n) {       \
-    assert(sizeof(*dest) == sizeof(*src));   \
-    memcpy(dest, src, n * sizeof(*src)); \
+    assert(sizeof(*(dest)) == sizeof(*(src)));   \
+    memcpy(dest, src, n * sizeof(*(src))); \
   }
 
 #define vp10_zero(dest) memset(&(dest), 0, sizeof(dest))
-#define vp10_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest))
+#define vp10_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
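// Illustrative sketch of why the added parentheses matter: with a pointer
// expression as the argument,
//   vp10_zero_array(buf + 1, n)
// the old expansion sizeof(*dest) became sizeof(*buf + 1) -- the size of the
// integer-promoted sum (typically 4 for a uint8_t buffer) -- whereas
// sizeof(*(dest)) correctly yields sizeof(buf[0]) == 1.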
 
 static INLINE int get_unsigned_bits(unsigned int num_values) {
   return num_values > 0 ? get_msb(num_values) + 1 : 0;
@@ -67,7 +69,6 @@
 
 #define VP9_FRAME_MARKER 0x2
 
-
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/common/common_data.h b/vp10/common/common_data.h
index 334489c..44ebff2 100644
--- a/vp10/common/common_data.h
+++ b/vp10/common/common_data.h
@@ -19,100 +19,282 @@
 extern "C" {
 #endif
 
+#if CONFIG_EXT_PARTITION
+# define IF_EXT_PARTITION(...) __VA_ARGS__
+#else
+# define IF_EXT_PARTITION(...)
+#endif
+
 // Log 2 conversion lookup tables for block width and height
 static const uint8_t b_width_log2_lookup[BLOCK_SIZES] =
-  {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4};
+  {0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, IF_EXT_PARTITION(4, 5, 5)};
 static const uint8_t b_height_log2_lookup[BLOCK_SIZES] =
-  {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4};
-static const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES] =
-  {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16};
-static const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES] =
-  {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16};
+  {0, 1, 0, 1, 2, 1, 2, 3, 2, 3, 4, 3, 4, IF_EXT_PARTITION(5, 4, 5)};
 // Log 2 conversion lookup tables for modeinfo width and height
 static const uint8_t mi_width_log2_lookup[BLOCK_SIZES] =
-  {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3};
+  {0, 0, 0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, IF_EXT_PARTITION(3, 4, 4)};
 static const uint8_t mi_height_log2_lookup[BLOCK_SIZES] =
-  {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3};
+  {0, 0, 0, 0, 1, 0, 1, 2, 1, 2, 3, 2, 3, IF_EXT_PARTITION(4, 3, 4)};
+
+// Width/height lookup tables in units of various block sizes
+static const uint8_t num_4x4_blocks_wide_lookup[BLOCK_SIZES] =
+  {1, 1, 2, 2, 2, 4, 4, 4, 8, 8, 8, 16, 16, IF_EXT_PARTITION(16, 32, 32)};
+static const uint8_t num_4x4_blocks_high_lookup[BLOCK_SIZES] =
+  {1, 2, 1, 2, 4, 2, 4, 8, 4, 8, 16, 8, 16, IF_EXT_PARTITION(32, 16, 32)};
 static const uint8_t num_8x8_blocks_wide_lookup[BLOCK_SIZES] =
-  {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8};
+  {1, 1, 1, 1, 1, 2, 2, 2, 4, 4, 4, 8, 8, IF_EXT_PARTITION(8, 16, 16)};
 static const uint8_t num_8x8_blocks_high_lookup[BLOCK_SIZES] =
-  {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8};
+  {1, 1, 1, 1, 2, 1, 2, 4, 2, 4, 8, 4, 8, IF_EXT_PARTITION(16, 8, 16)};
+static const uint8_t num_16x16_blocks_wide_lookup[BLOCK_SIZES] =
+  {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4, IF_EXT_PARTITION(4, 8, 8)};
+static const uint8_t num_16x16_blocks_high_lookup[BLOCK_SIZES] =
+  {1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4, IF_EXT_PARTITION(8, 4, 8)};
 
 // VPXMIN(3, VPXMIN(b_width_log2(bsize), b_height_log2(bsize)))
 static const uint8_t size_group_lookup[BLOCK_SIZES] =
-  {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3};
+  {0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, IF_EXT_PARTITION(3, 3, 3)};
 
 static const uint8_t num_pels_log2_lookup[BLOCK_SIZES] =
-  {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12};
+  {4, 5, 5, 6, 7, 7, 8, 9, 9, 10, 11, 11, 12, IF_EXT_PARTITION(13, 13, 14)};
 
-static const PARTITION_TYPE partition_lookup[][BLOCK_SIZES] = {
-  {  // 4X4
-    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
-    PARTITION_NONE, PARTITION_INVALID, PARTITION_INVALID,
+static const PARTITION_TYPE
+  partition_lookup[MAX_SB_SIZE_LOG2 - 1][BLOCK_SIZES] = {
+  {     // 4X4 ->
+    //                                    4X4
+                                          PARTITION_NONE,
+    // 4X8,            8X4,               8X8
     PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    // 8X16,           16X8,              16X16
     PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    // 16X32,          32X16,             32X32
     PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
-    PARTITION_INVALID
-  }, {  // 8X8
-    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
-    PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE,
+    // 32X64,          64X32,             64X64
     PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#if CONFIG_EXT_PARTITION
     PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
-    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID
-  }, {  // 16X16
-    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
-    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
-    PARTITION_VERT, PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
+#endif  // CONFIG_EXT_PARTITION
+  }, {  // 8X8 ->
+    //                                    4X4
+                                          PARTITION_SPLIT,
+    // 4X8,            8X4,               8X8
+    PARTITION_VERT,    PARTITION_HORZ,    PARTITION_NONE,
+    // 8X16,           16X8,              16X16
     PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
-    PARTITION_INVALID, PARTITION_INVALID
-  }, {  // 32X32
-    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
-    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
-    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT,
-    PARTITION_HORZ, PARTITION_NONE, PARTITION_INVALID,
-    PARTITION_INVALID, PARTITION_INVALID
-  }, {  // 64X64
-    // 4X4, 4X8,8X4,8X8,8X16,16X8,16X16,16X32,32X16,32X32,32X64,64X32,64X64
-    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
-    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_SPLIT,
-    PARTITION_SPLIT, PARTITION_SPLIT, PARTITION_VERT, PARTITION_HORZ,
-    PARTITION_NONE
+    // 16X32,          32X16,             32X32
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    // 32X64,          64X32,             64X64
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#if CONFIG_EXT_PARTITION
+    // 64x128,         128x64,            128x128
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif  // CONFIG_EXT_PARTITION
+  }, {  // 16X16 ->
+    //                                    4X4
+                                          PARTITION_SPLIT,
+    // 4X8,            8X4,               8X8
+    PARTITION_SPLIT,   PARTITION_SPLIT,   PARTITION_SPLIT,
+    // 8X16,           16X8,              16X16
+    PARTITION_VERT,    PARTITION_HORZ,    PARTITION_NONE,
+    // 16X32,          32X16,             32X32
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+    // 32X64,          64X32,             64X64
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#if CONFIG_EXT_PARTITION
+    // 64x128,         128x64,            128x128
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif  // CONFIG_EXT_PARTITION
+  }, {  // 32X32 ->
+    //                                    4X4
+                                          PARTITION_SPLIT,
+    // 4X8,            8X4,               8X8
+    PARTITION_SPLIT,   PARTITION_SPLIT,   PARTITION_SPLIT,
+    // 8X16,           16X8,              16X16
+    PARTITION_SPLIT,   PARTITION_SPLIT,   PARTITION_SPLIT,
+    // 16X32,          32X16,             32X32
+    PARTITION_VERT,    PARTITION_HORZ,    PARTITION_NONE,
+    // 32X64,          64X32,             64X64
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#if CONFIG_EXT_PARTITION
+    // 64x128,         128x64,            128x128
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+#endif  // CONFIG_EXT_PARTITION
+  }, {  // 64X64 ->
+    //                                    4X4
+                                          PARTITION_SPLIT,
+    // 4X8,            8X4,               8X8
+    PARTITION_SPLIT,   PARTITION_SPLIT,   PARTITION_SPLIT,
+    // 8X16,           16X8,              16X16
+    PARTITION_SPLIT,   PARTITION_SPLIT,   PARTITION_SPLIT,
+    // 16X32,          32X16,             32X32
+    PARTITION_SPLIT,   PARTITION_SPLIT,   PARTITION_SPLIT,
+    // 32X64,          64X32,             64X64
+    PARTITION_VERT,    PARTITION_HORZ,    PARTITION_NONE,
+#if CONFIG_EXT_PARTITION
+    // 64x128,         128x64,            128x128
+    PARTITION_INVALID, PARTITION_INVALID, PARTITION_INVALID,
+  }, {  // 128X128 ->
+    //                                    4X4
+                                          PARTITION_SPLIT,
+    // 4X8,            8X4,               8X8
+    PARTITION_SPLIT,   PARTITION_SPLIT,   PARTITION_SPLIT,
+    // 8X16,           16X8,              16X16
+    PARTITION_SPLIT,   PARTITION_SPLIT,   PARTITION_SPLIT,
+    // 16X32,          32X16,             32X32
+    PARTITION_SPLIT,   PARTITION_SPLIT,   PARTITION_SPLIT,
+    // 32X64,          64X32,             64X64
+    PARTITION_SPLIT,   PARTITION_SPLIT,   PARTITION_SPLIT,
+    // 64x128,         128x64,            128x128
+    PARTITION_VERT,    PARTITION_HORZ,    PARTITION_NONE,
+#endif  // CONFIG_EXT_PARTITION
   }
 };
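+
+// Reading the table (commentary only): the first index is log2(side) - 2 of
+// the containing square (row 0 -> 4X4, row 1 -> 8X8, ...); the second is the
+// block size actually coded inside that square. For example:
+//   partition_lookup[2][BLOCK_16X8]  == PARTITION_HORZ   (16X16 as two 16X8)
+//   partition_lookup[2][BLOCK_8X8]   == PARTITION_SPLIT  (16X16 as four 8X8)
+//   partition_lookup[2][BLOCK_32X32] == PARTITION_INVALID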
 
-static const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] = {
+#if CONFIG_EXT_PARTITION_TYPES
+static const BLOCK_SIZE subsize_lookup[EXT_PARTITION_TYPES][BLOCK_SIZES] =
+#else
+static const BLOCK_SIZE subsize_lookup[PARTITION_TYPES][BLOCK_SIZES] =
+#endif  // CONFIG_EXT_PARTITION_TYPES
+{
   {     // PARTITION_NONE
-    BLOCK_4X4,   BLOCK_4X8,   BLOCK_8X4,
-    BLOCK_8X8,   BLOCK_8X16,  BLOCK_16X8,
-    BLOCK_16X16, BLOCK_16X32, BLOCK_32X16,
-    BLOCK_32X32, BLOCK_32X64, BLOCK_64X32,
-    BLOCK_64X64,
+    //                            4X4
+                                  BLOCK_4X4,
+    // 4X8,        8X4,           8X8
+    BLOCK_4X8,     BLOCK_8X4,     BLOCK_8X8,
+    // 8X16,       16X8,          16X16
+    BLOCK_8X16,    BLOCK_16X8,    BLOCK_16X16,
+    // 16X32,      32X16,         32X32
+    BLOCK_16X32,   BLOCK_32X16,   BLOCK_32X32,
+    // 32X64,      64X32,         64X64
+    BLOCK_32X64,   BLOCK_64X32,   BLOCK_64X64,
+#if CONFIG_EXT_PARTITION
+    // 64x128,     128x64,        128x128
+    BLOCK_64X128,  BLOCK_128X64,  BLOCK_128X128,
+#endif  // CONFIG_EXT_PARTITION
   }, {  // PARTITION_HORZ
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    BLOCK_8X4,     BLOCK_INVALID, BLOCK_INVALID,
-    BLOCK_16X8,    BLOCK_INVALID, BLOCK_INVALID,
-    BLOCK_32X16,   BLOCK_INVALID, BLOCK_INVALID,
-    BLOCK_64X32,
+    //                            4X4
+                                  BLOCK_INVALID,
+    // 4X8,        8X4,           8X8
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+    // 8X16,       16X8,          16X16
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+    // 16X32,      32X16,         32X32
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+    // 32X64,      64X32,         64X64
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+#if CONFIG_EXT_PARTITION
+    // 64x128,     128x64,        128x128
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+#endif  // CONFIG_EXT_PARTITION
   }, {  // PARTITION_VERT
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    BLOCK_4X8,     BLOCK_INVALID, BLOCK_INVALID,
-    BLOCK_8X16,    BLOCK_INVALID, BLOCK_INVALID,
-    BLOCK_16X32,   BLOCK_INVALID, BLOCK_INVALID,
-    BLOCK_32X64,
+    //                            4X4
+                                  BLOCK_INVALID,
+    // 4X8,        8X4,           8X8
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+    // 8X16,       16X8,          16X16
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+    // 16X32,      32X16,         32X32
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+    // 32X64,      64X32,         64X64
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+#if CONFIG_EXT_PARTITION
+    // 64x128,     128x64,        128x128
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+#endif  // CONFIG_EXT_PARTITION
   }, {  // PARTITION_SPLIT
-    BLOCK_INVALID, BLOCK_INVALID, BLOCK_INVALID,
-    BLOCK_4X4,     BLOCK_INVALID, BLOCK_INVALID,
-    BLOCK_8X8,     BLOCK_INVALID, BLOCK_INVALID,
-    BLOCK_16X16,   BLOCK_INVALID, BLOCK_INVALID,
-    BLOCK_32X32,
+    //                            4X4
+                                  BLOCK_INVALID,
+    // 4X8,        8X4,           8X8
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X4,
+    // 8X16,       16X8,          16X16
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X8,
+    // 16X32,      32X16,         32X32
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X16,
+    // 32X64,      64X32,         64X64
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X32,
+#if CONFIG_EXT_PARTITION
+    // 64x128,     128x64,        128x128
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X64,
+#endif  // CONFIG_EXT_PARTITION
+#if CONFIG_EXT_PARTITION_TYPES
+  }, {  // PARTITION_HORZ_A
+    //                            4X4
+                                  BLOCK_INVALID,
+    // 4X8,        8X4,           8X8
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+    // 8X16,       16X8,          16X16
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+    // 16X32,      32X16,         32X32
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+    // 32X64,      64X32,         64X64
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+#if CONFIG_EXT_PARTITION
+    // 64x128,     128x64,        128x128
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+#endif  // CONFIG_EXT_PARTITION
+  }, {  // PARTITION_HORZ_B
+    //                            4X4
+                                  BLOCK_INVALID,
+    // 4X8,        8X4,           8X8
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X4,
+    // 8X16,       16X8,          16X16
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X8,
+    // 16X32,      32X16,         32X32
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X16,
+    // 32X64,      64X32,         64X64
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X32,
+#if CONFIG_EXT_PARTITION
+    // 64x128,     128x64,        128x128
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_128X64,
+#endif  // CONFIG_EXT_PARTITION
+  }, {  // PARTITION_VERT_A
+    //                            4X4
+                                  BLOCK_INVALID,
+    // 4X8,        8X4,           8X8
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+    // 8X16,       16X8,          16X16
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+    // 16X32,      32X16,         32X32
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+    // 32X64,      64X32,         64X64
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+#if CONFIG_EXT_PARTITION
+    // 64x128,     128x64,        128x128
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+#endif  // CONFIG_EXT_PARTITION
+  }, {  // PARTITION_VERT_B
+    //                            4X4
+                                  BLOCK_INVALID,
+    // 4X8,        8X4,           8X8
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_4X8,
+    // 8X16,       16X8,          16X16
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_8X16,
+    // 16X32,      32X16,         32X32
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_16X32,
+    // 32X64,      64X32,         64X64
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_32X64,
+#if CONFIG_EXT_PARTITION
+    // 64x128,     128x64,        128x128
+    BLOCK_INVALID, BLOCK_INVALID, BLOCK_64X128,
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_EXT_PARTITION_TYPES
   }
 };
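+
+// Reading the table (commentary only): rows are partition types, columns the
+// parent block size, and each entry is the size of the resulting children:
+//   subsize_lookup[PARTITION_NONE][BLOCK_64X64]  == BLOCK_64X64
+//   subsize_lookup[PARTITION_HORZ][BLOCK_64X64]  == BLOCK_64X32
+//   subsize_lookup[PARTITION_SPLIT][BLOCK_64X64] == BLOCK_32X32
+// BLOCK_INVALID marks combinations a partition type cannot produce.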
 
 static const TX_SIZE max_txsize_lookup[BLOCK_SIZES] = {
-  TX_4X4,   TX_4X4,   TX_4X4,
-  TX_8X8,   TX_8X8,   TX_8X8,
-  TX_16X16, TX_16X16, TX_16X16,
-  TX_32X32, TX_32X32, TX_32X32, TX_32X32
+  //                   4X4
+                       TX_4X4,
+  // 4X8,    8X4,      8X8
+  TX_4X4,    TX_4X4,   TX_8X8,
+  // 8X16,   16X8,     16X16
+  TX_8X8,    TX_8X8,   TX_16X16,
+  // 16X32,  32X16,    32X32
+  TX_16X16,  TX_16X16, TX_32X32,
+  // 32X64,  64X32,    64X64
+  TX_32X32,  TX_32X32, TX_32X32,
+#if CONFIG_EXT_PARTITION
+  // 64x128, 128x64,   128x128
+  TX_32X32,  TX_32X32, TX_32X32,
+#endif  // CONFIG_EXT_PARTITION
 };
 
 static const BLOCK_SIZE txsize_to_bsize[TX_SIZES] = {
@@ -146,6 +328,11 @@
   {{BLOCK_32X64, BLOCK_32X32},   {BLOCK_INVALID, BLOCK_16X32}},
   {{BLOCK_64X32, BLOCK_INVALID}, {BLOCK_32X32,   BLOCK_32X16}},
   {{BLOCK_64X64, BLOCK_64X32},   {BLOCK_32X64,   BLOCK_32X32}},
+#if CONFIG_EXT_PARTITION
+  {{BLOCK_64X128, BLOCK_64X64},   {BLOCK_INVALID, BLOCK_32X64}},
+  {{BLOCK_128X64, BLOCK_INVALID}, {BLOCK_64X64,   BLOCK_64X32}},
+  {{BLOCK_128X128, BLOCK_128X64}, {BLOCK_64X128,  BLOCK_64X64}},
+#endif  // CONFIG_EXT_PARTITION
 };
 
 // Generates 4 bit field in which each bit set to 1 represents
@@ -155,6 +342,24 @@
   PARTITION_CONTEXT above;
   PARTITION_CONTEXT left;
 } partition_context_lookup[BLOCK_SIZES]= {
+#if CONFIG_EXT_PARTITION
+  {31, 31},  // 4X4   - {0b11111, 0b11111}
+  {31, 30},  // 4X8   - {0b11111, 0b11110}
+  {30, 31},  // 8X4   - {0b11110, 0b11111}
+  {30, 30},  // 8X8   - {0b11110, 0b11110}
+  {30, 28},  // 8X16  - {0b11110, 0b11100}
+  {28, 30},  // 16X8  - {0b11100, 0b11110}
+  {28, 28},  // 16X16 - {0b11100, 0b11100}
+  {28, 24},  // 16X32 - {0b11100, 0b11000}
+  {24, 28},  // 32X16 - {0b11000, 0b11100}
+  {24, 24},  // 32X32 - {0b11000, 0b11000}
+  {24, 16},  // 32X64 - {0b11000, 0b10000}
+  {16, 24},  // 64X32 - {0b10000, 0b11000}
+  {16, 16},  // 64X64 - {0b10000, 0b10000}
+  {16, 0 },  // 64X128 - {0b10000, 0b00000}
+  {0,  16},  // 128X64 - {0b00000, 0b10000}
+  {0,  0 },  // 128X128 - {0b00000, 0b00000}
+#else
   {15, 15},  // 4X4   - {0b1111, 0b1111}
   {15, 14},  // 4X8   - {0b1111, 0b1110}
   {14, 15},  // 8X4   - {0b1110, 0b1111}
@@ -168,8 +373,31 @@
   {8,  0 },  // 32X64 - {0b1000, 0b0000}
   {0,  8 },  // 64X32 - {0b0000, 0b1000}
   {0,  0 },  // 64X64 - {0b0000, 0b0000}
+#endif  // CONFIG_EXT_PARTITION
 };
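+
+// Sketch of the pattern above (commentary only): each entry is an all-ones
+// mask with the low bits cleared according to the block's extent, i.e.,
+// assuming the lookup tables earlier in this file,
+//   above == (1 << N) - num_4x4_blocks_wide_lookup[b]
+//   left  == (1 << N) - num_4x4_blocks_high_lookup[b]
+// where N == 5 with CONFIG_EXT_PARTITION and N == 4 otherwise, e.g.
+// 32X16: above == 16 - 8 == 0b1000, left == 16 - 4 == 0b1100.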
 
+#if CONFIG_SUPERTX
+static const TX_SIZE uvsupertx_size_lookup[TX_SIZES][2][2] = {
+  //  ss_x == 0 ss_x == 0   ss_x == 1 ss_x == 1
+  //  ss_y == 0 ss_y == 1   ss_y == 0 ss_y == 1
+  {{TX_4X4,   TX_4X4},   {TX_4X4,   TX_4X4}},
+  {{TX_8X8,   TX_4X4},   {TX_4X4,   TX_4X4}},
+  {{TX_16X16, TX_8X8},   {TX_8X8,   TX_8X8}},
+  {{TX_32X32, TX_16X16}, {TX_16X16, TX_16X16}},
+};
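+
+// Commentary: indexed as [tx_size][ss_x][ss_y], so with 4:2:0 subsampling
+// (ss_x == 1, ss_y == 1) a TX_32X32 supertx block keeps
+// uvsupertx_size_lookup[TX_32X32][1][1] == TX_16X16 for the chroma planes.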
+
+#if CONFIG_EXT_PARTITION_TYPES
+static const int partition_supertx_context_lookup[EXT_PARTITION_TYPES] = {
+  -1, 0, 0, 1, 0, 0, 0, 0
+};
+
+#else
+static const int partition_supertx_context_lookup[PARTITION_TYPES] = {
+  -1, 0, 0, 1
+};
+#endif  // CONFIG_EXT_PARTITION_TYPES
+#endif  // CONFIG_SUPERTX
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/common/divide.c b/vp10/common/divide.c
new file mode 100644
index 0000000..3f144d7
--- /dev/null
+++ b/vp10/common/divide.c
@@ -0,0 +1,93 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/common/divide.h"
+
+/* Constants for divide-by-multiply for small divisors, generated with:
+void init_fastdiv() {
+  int i;
+  for (i = 3; i < 256; ++i) {
+    const int s = 31 ^ __builtin_clz(2 * i + 1);
+    const unsigned long long base = (1ull << (sizeof(unsigned) * 8 + s)) - 1;
+    fastdiv_tab[i].mult = (base / i + 1) & 0xFFFFFFFF;
+    fastdiv_tab[i].shift = s;
+  }
+  for (i = 0; i < 8; ++i) {
+    fastdiv_tab[1 << i].mult = 0;
+    fastdiv_tab[1 << i].shift = i;
+  }
+}
+*/
+const struct fastdiv_elem vp10_fastdiv_tab[256] = {
+    {0, 0},           {0, 0},           {0, 1},           {1431655766, 2},
+    {0, 2},           {2576980378u, 3}, {1431655766, 3},  {613566757, 3},
+    {0, 3},           {3340530120u, 4}, {2576980378u, 4}, {1952257862, 4},
+    {1431655766, 4},  {991146300, 4},   {613566757, 4},   {286331154u, 4},
+    {0, 4},           {3789677026u, 5}, {3340530120u, 5}, {2938661835u, 5},
+    {2576980378u, 5}, {2249744775u, 5}, {1952257862, 5},  {1680639377, 5},
+    {1431655766, 5},  {1202590843, 5},  {991146300, 5},   {795364315, 5},
+    {613566757, 5},   {444306962, 5},   {286331154, 5},   {138547333, 5},
+    {0, 5},           {4034666248u, 6}, {3789677026u, 6}, {3558687189u, 6},
+    {3340530120u, 6}, {3134165325u, 6}, {2938661835u, 6}, {2753184165u, 6},
+    {2576980378u, 6}, {2409371898u, 6}, {2249744775u, 6}, {2097542168u, 6},
+    {1952257862, 6},  {1813430637, 6}, {1680639377, 6}, {1553498810, 6},
+    {1431655766, 6},  {1314785907, 6}, {1202590843, 6}, {1094795586, 6},
+    {991146300, 6},   {891408307, 6},  {795364315, 6},  {702812831, 6},
+    {613566757, 6},   {527452125, 6},  {444306962, 6},  {363980280, 6},
+    {286331154, 6},   {211227900, 6},  {138547333, 6},  {68174085, 6},
+    {0, 6},           {4162814457u, 7}, {4034666248u, 7}, {3910343360u, 7},
+    {3789677026u, 7}, {3672508268u, 7}, {3558687189u, 7}, {3448072337u, 7},
+    {3340530120u, 7}, {3235934265u, 7}, {3134165325u, 7}, {3035110223u, 7},
+    {2938661835u, 7}, {2844718599u, 7}, {2753184165u, 7}, {2663967058u, 7},
+    {2576980378u, 7}, {2492141518u, 7}, {2409371898u, 7}, {2328596727u, 7},
+    {2249744775u, 7}, {2172748162u, 7}, {2097542168, 7},   {2024065048, 7},
+    {1952257862, 7},  {1882064321, 7}, {1813430637, 7},   {1746305385, 7},
+    {1680639377, 7},  {1616385542, 7}, {1553498810, 7}, {1491936009, 7},
+    {1431655766, 7},  {1372618415, 7}, {1314785907, 7}, {1258121734, 7},
+    {1202590843, 7},  {1148159575, 7}, {1094795586, 7}, {1042467791, 7},
+    {991146300, 7},   {940802361, 7},  {891408307, 7},  {842937507, 7},
+    {795364315, 7},   {748664025, 7},  {702812831, 7},  {657787785, 7},
+    {613566757, 7},   {570128403, 7},  {527452125, 7},  {485518043, 7},
+    {444306962, 7},   {403800345, 7},  {363980280, 7},  {324829460, 7},
+    {286331154, 7},   {248469183, 7},  {211227900, 7},  {174592167, 7},
+    {138547333, 7},   {103079216, 7},  {68174085, 7},   {33818641, 7},
+    {0, 7},           {4228378656u, 8}, {4162814457u, 8}, {4098251237u, 8},
+    {4034666248u, 8}, {3972037425u, 8}, {3910343360u, 8}, {3849563281u, 8},
+    {3789677026u, 8}, {3730665024u, 8}, {3672508268u, 8}, {3615188300u, 8},
+    {3558687189u, 8}, {3502987511u, 8}, {3448072337u, 8}, {3393925206u, 8},
+    {3340530120u, 8}, {3287871517u, 8}, {3235934265u, 8}, {3184703642u, 8},
+    {3134165325u, 8}, {3084305374u, 8}, {3035110223u, 8}, {2986566663u, 8},
+    {2938661835u, 8}, {2891383213u, 8}, {2844718599u, 8}, {2798656110u, 8},
+    {2753184165u, 8}, {2708291480u, 8}, {2663967058u, 8}, {2620200175u, 8},
+    {2576980378u, 8}, {2534297473u, 8}, {2492141518u, 8}, {2450502814u, 8},
+    {2409371898u, 8}, {2368739540u, 8}, {2328596727u, 8}, {2288934667u, 8},
+    {2249744775u, 8}, {2211018668u, 8}, {2172748162u, 8}, {2134925265u, 8},
+    {2097542168, 8},  {2060591247, 8}, {2024065048, 8}, {1987956292, 8},
+    {1952257862, 8},  {1916962805, 8}, {1882064321, 8}, {1847555765, 8},
+    {1813430637, 8},  {1779682582, 8}, {1746305385, 8}, {1713292966, 8},
+    {1680639377, 8},  {1648338801, 8}, {1616385542, 8}, {1584774030, 8},
+    {1553498810, 8},  {1522554545, 8}, {1491936009, 8}, {1461638086, 8},
+    {1431655766, 8},  {1401984144, 8}, {1372618415, 8}, {1343553873, 8},
+    {1314785907, 8},  {1286310003, 8}, {1258121734, 8}, {1230216764, 8},
+    {1202590843, 8},  {1175239808, 8}, {1148159575, 8}, {1121346142, 8},
+    {1094795586, 8},  {1068504060, 8}, {1042467791, 8}, {1016683080, 8},
+    {991146300, 8},   {965853890, 8},  {940802361, 8},  {915988286, 8},
+    {891408307, 8},   {867059126, 8},  {842937507, 8},  {819040276, 8},
+    {795364315, 8},   {771906565, 8},  {748664025, 8},  {725633745, 8},
+    {702812831, 8},   {680198441, 8},  {657787785, 8},  {635578121, 8},
+    {613566757, 8},   {591751050, 8},  {570128403, 8},  {548696263, 8},
+    {527452125, 8},   {506393524, 8},  {485518043, 8},  {464823301, 8},
+    {444306962, 8},   {423966729, 8},  {403800345, 8},  {383805589, 8},
+    {363980280, 8},   {344322273, 8},  {324829460, 8},  {305499766, 8},
+    {286331154, 8},   {267321616, 8},  {248469183, 8},  {229771913, 8},
+    {211227900, 8},   {192835267, 8},  {174592167, 8},  {156496785, 8},
+    {138547333, 8},   {120742053, 8},  {103079216, 8},  {85557118, 8},
+    {68174085, 8},    {50928466, 8},   {33818641, 8},   {16843010, 8},
+};
diff --git a/vp10/common/divide.h b/vp10/common/divide.h
new file mode 100644
index 0000000..2f3c35c
--- /dev/null
+++ b/vp10/common/divide.h
@@ -0,0 +1,40 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_DIVIDE_H_
+#define VP10_COMMON_DIVIDE_H_
+// An implementation of the divide-by-multiply algorithm
+// https://gmplib.org/~tege/divcnst-pldi94.pdf
+
+#include <limits.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+struct fastdiv_elem {
+  unsigned mult;
+  unsigned shift;
+};
+
+extern const struct fastdiv_elem vp10_fastdiv_tab[256];
+
+static INLINE unsigned fastdiv(unsigned x, int y) {
+  unsigned t =
+      ((uint64_t)x * vp10_fastdiv_tab[y].mult) >> (sizeof(x) * CHAR_BIT);
+  return (t + x) >> vp10_fastdiv_tab[y].shift;
+}
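+
+/* Usage sketch (illustrative only): for 0 < y < 256, fastdiv(x, y) is meant
+ * to match x / y for any 32-bit x. Powers of two take the mult == 0 path and
+ * reduce to a plain shift:
+ *   assert(fastdiv(77u, 7) == 11u);   // 77 / 7
+ *   assert(fastdiv(96u, 8) == 12u);   // mult == 0, shift == 3: 96 >> 3
+ * y == 0 has no meaningful entry; vp10_fastdiv_tab[0] is a placeholder. */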
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // VP10_COMMON_DIVIDE_H_
diff --git a/vp10/common/entropy.c b/vp10/common/entropy.c
index 3da08a6..eea552c 100644
--- a/vp10/common/entropy.c
+++ b/vp10/common/entropy.c
@@ -133,7 +133,7 @@
   0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 5, 5
 };
 
-// Model obtained from a 2-sided zero-centerd distribuition derived
+// Model obtained from a 2-sided zero-centered distribution derived
 // from a Pareto distribution. The cdf of the distribution is:
 // cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
 //
@@ -405,6 +405,2051 @@
   {255, 246, 247, 255, 239, 255, 253, 255},
 };
 
+#if CONFIG_ANS
+// Model obtained from a 2-sided zero-centered distribution derived
+// from a Pareto distribution. The cdf of the distribution is:
+// cdf(x) = 0.5 + 0.5 * sgn(x) * [1 - {alpha/(alpha + |x|)} ^ beta]
+//
+// For a given beta and a given probability of the 1-node, the alpha
+// is first solved, and then the {alpha, beta} pair is used to generate
+// the probabilities for the rest of the nodes.
+//
+// beta = 8
+// Values for tokens ONE_TOKEN through CATEGORY6_TOKEN included here.
+// ZERO_TOKEN and EOB_TOKEN are coded as flags outside this coder.
+const AnsP10 vp10_pareto8_token_probs[COEFF_PROB_MODELS]
+                                     [ENTROPY_TOKENS - 2] = {
+{ 4, 4, 4, 4, 8, 15, 30, 57, 103, 795 },
+{ 8, 8, 8, 8, 15, 30, 57, 103, 168, 619 },
+{ 12, 12, 12, 12, 23, 43, 80, 138, 205, 487 },
+{ 16, 16, 15, 15, 30, 56, 101, 165, 225, 385 },
+{ 20, 20, 19, 19, 36, 68, 119, 186, 231, 306 },
+{ 24, 23, 23, 22, 43, 79, 135, 201, 230, 244 },
+{ 28, 27, 26, 26, 49, 89, 149, 211, 223, 196 },
+{ 32, 31, 30, 29, 55, 98, 160, 218, 212, 159 },
+{ 36, 35, 33, 32, 60, 107, 171, 221, 200, 129 },
+{ 40, 38, 37, 35, 66, 115, 179, 222, 187, 105 },
+{ 44, 42, 40, 38, 71, 122, 186, 221, 174, 86 },
+{ 48, 45, 43, 41, 76, 129, 192, 219, 160, 71 },
+{ 52, 49, 46, 44, 80, 136, 196, 215, 148, 58 },
+{ 56, 53, 49, 46, 85, 142, 200, 210, 135, 48 },
+{ 60, 56, 52, 49, 89, 147, 203, 204, 124, 40 },
+{ 64, 60, 55, 52, 93, 151, 205, 198, 113, 33 },
+{ 68, 63, 58, 54, 97, 156, 205, 192, 103, 28 },
+{ 72, 66, 61, 57, 100, 160, 206, 185, 94, 23 },
+{ 76, 70, 64, 59, 104, 163, 205, 178, 85, 20 },
+{ 80, 73, 67, 61, 107, 166, 205, 171, 77, 17 },
+{ 84, 76, 69, 63, 110, 169, 204, 164, 71, 14 },
+{ 88, 80, 72, 65, 113, 171, 202, 157, 64, 12 },
+{ 92, 83, 75, 67, 116, 173, 200, 150, 58, 10 },
+{ 96, 86, 77, 69, 118, 175, 198, 143, 53, 9 },
+{ 100, 89, 80, 71, 121, 176, 195, 137, 48, 7 },
+{ 104, 92, 82, 73, 123, 178, 192, 130, 44, 6 },
+{ 108, 96, 84, 75, 125, 178, 189, 124, 40, 5 },
+{ 112, 98, 87, 76, 127, 179, 186, 118, 36, 5 },
+{ 116, 101, 89, 78, 129, 179, 183, 112, 33, 4 },
+{ 120, 104, 91, 80, 131, 180, 179, 106, 30, 3 },
+{ 124, 107, 93, 81, 132, 180, 176, 101, 27, 3 },
+{ 128, 110, 95, 82, 134, 179, 172, 96, 25, 3 },
+{ 132, 113, 97, 84, 135, 179, 168, 91, 23, 2 },
+{ 136, 116, 99, 85, 136, 179, 164, 86, 21, 2 },
+{ 140, 119, 101, 86, 137, 178, 160, 82, 19, 2 },
+{ 144, 122, 103, 88, 138, 177, 157, 77, 17, 1 },
+{ 148, 124, 105, 89, 139, 176, 153, 73, 16, 1 },
+{ 152, 127, 107, 90, 140, 175, 149, 69, 14, 1 },
+{ 156, 130, 108, 91, 141, 173, 145, 66, 13, 1 },
+{ 160, 133, 110, 92, 141, 172, 141, 62, 12, 1 },
+{ 164, 135, 111, 93, 142, 171, 137, 59, 11, 1 },
+{ 168, 138, 113, 94, 142, 169, 133, 56, 10, 1 },
+{ 172, 140, 115, 94, 142, 168, 130, 53, 9, 1 },
+{ 176, 143, 116, 95, 143, 166, 126, 50, 8, 1 },
+{ 180, 145, 118, 96, 143, 164, 122, 47, 8, 1 },
+{ 184, 147, 119, 96, 143, 163, 119, 45, 7, 1 },
+{ 188, 150, 120, 97, 143, 161, 116, 42, 6, 1 },
+{ 192, 152, 121, 98, 143, 159, 112, 40, 6, 1 },
+{ 196, 155, 123, 98, 142, 157, 109, 38, 5, 1 },
+{ 200, 157, 124, 99, 142, 155, 105, 36, 5, 1 },
+{ 204, 159, 125, 99, 142, 153, 102, 34, 5, 1 },
+{ 208, 161, 126, 100, 142, 151, 99, 32, 4, 1 },
+{ 212, 164, 127, 100, 141, 149, 96, 30, 4, 1 },
+{ 216, 166, 129, 100, 141, 147, 93, 28, 3, 1 },
+{ 220, 168, 130, 101, 140, 144, 90, 27, 3, 1 },
+{ 224, 170, 131, 101, 140, 142, 87, 25, 3, 1 },
+{ 228, 172, 132, 101, 139, 140, 84, 24, 3, 1 },
+{ 232, 174, 132, 101, 139, 138, 81, 23, 3, 1 },
+{ 236, 176, 133, 101, 138, 136, 79, 22, 2, 1 },
+{ 240, 178, 134, 102, 137, 134, 76, 20, 2, 1 },
+{ 244, 180, 135, 102, 136, 131, 74, 19, 2, 1 },
+{ 248, 182, 135, 102, 136, 129, 71, 18, 2, 1 },
+{ 252, 184, 136, 101, 135, 127, 69, 17, 2, 1 },
+{ 256, 186, 137, 102, 134, 124, 66, 16, 2, 1 },
+{ 260, 188, 138, 102, 133, 122, 64, 15, 1, 1 },
+{ 264, 190, 138, 101, 132, 120, 62, 15, 1, 1 },
+{ 268, 191, 139, 101, 131, 118, 60, 14, 1, 1 },
+{ 272, 193, 139, 101, 130, 116, 58, 13, 1, 1 },
+{ 276, 195, 139, 101, 129, 114, 56, 12, 1, 1 },
+{ 280, 196, 140, 101, 128, 111, 54, 12, 1, 1 },
+{ 284, 198, 140, 101, 127, 109, 52, 11, 1, 1 },
+{ 288, 200, 141, 100, 126, 107, 50, 10, 1, 1 },
+{ 292, 201, 141, 100, 125, 105, 48, 10, 1, 1 },
+{ 296, 203, 141, 100, 123, 103, 47, 9, 1, 1 },
+{ 300, 204, 142, 99, 122, 101, 45, 9, 1, 1 },
+{ 304, 206, 142, 99, 121, 99, 43, 8, 1, 1 },
+{ 308, 207, 142, 99, 119, 97, 42, 8, 1, 1 },
+{ 312, 209, 142, 99, 118, 95, 40, 7, 1, 1 },
+{ 316, 210, 142, 98, 117, 93, 39, 7, 1, 1 },
+{ 320, 211, 142, 98, 116, 91, 37, 7, 1, 1 },
+{ 324, 213, 142, 97, 115, 89, 36, 6, 1, 1 },
+{ 328, 214, 142, 97, 113, 87, 35, 6, 1, 1 },
+{ 332, 215, 143, 96, 112, 85, 33, 6, 1, 1 },
+{ 336, 216, 143, 96, 111, 83, 32, 5, 1, 1 },
+{ 340, 218, 143, 95, 109, 81, 31, 5, 1, 1 },
+{ 344, 219, 142, 95, 108, 79, 30, 5, 1, 1 },
+{ 348, 220, 142, 94, 107, 78, 29, 4, 1, 1 },
+{ 352, 221, 142, 94, 105, 76, 28, 4, 1, 1 },
+{ 356, 222, 142, 93, 104, 74, 27, 4, 1, 1 },
+{ 360, 223, 142, 92, 103, 72, 26, 4, 1, 1 },
+{ 364, 224, 142, 92, 101, 70, 25, 4, 1, 1 },
+{ 368, 225, 142, 91, 100, 69, 24, 3, 1, 1 },
+{ 372, 226, 141, 91, 99, 67, 23, 3, 1, 1 },
+{ 376, 227, 141, 90, 97, 66, 22, 3, 1, 1 },
+{ 380, 228, 141, 89, 96, 64, 21, 3, 1, 1 },
+{ 384, 229, 140, 89, 95, 62, 20, 3, 1, 1 },
+{ 388, 229, 140, 88, 93, 61, 20, 3, 1, 1 },
+{ 392, 230, 140, 87, 92, 60, 19, 2, 1, 1 },
+{ 396, 231, 140, 86, 91, 58, 18, 2, 1, 1 },
+{ 400, 232, 139, 86, 89, 57, 17, 2, 1, 1 },
+{ 404, 232, 139, 85, 88, 55, 17, 2, 1, 1 },
+{ 408, 233, 138, 84, 87, 54, 16, 2, 1, 1 },
+{ 412, 234, 138, 84, 85, 52, 15, 2, 1, 1 },
+{ 416, 234, 137, 83, 84, 51, 15, 2, 1, 1 },
+{ 420, 235, 137, 82, 82, 50, 14, 2, 1, 1 },
+{ 424, 236, 136, 81, 81, 48, 14, 2, 1, 1 },
+{ 428, 236, 136, 81, 80, 47, 13, 1, 1, 1 },
+{ 432, 236, 135, 80, 79, 46, 13, 1, 1, 1 },
+{ 436, 237, 135, 79, 77, 45, 12, 1, 1, 1 },
+{ 440, 238, 134, 78, 76, 43, 12, 1, 1, 1 },
+{ 444, 238, 134, 77, 75, 42, 11, 1, 1, 1 },
+{ 448, 238, 133, 77, 73, 41, 11, 1, 1, 1 },
+{ 452, 239, 132, 76, 72, 40, 10, 1, 1, 1 },
+{ 456, 239, 131, 75, 71, 39, 10, 1, 1, 1 },
+{ 460, 239, 131, 74, 70, 38, 9, 1, 1, 1 },
+{ 464, 240, 130, 73, 68, 37, 9, 1, 1, 1 },
+{ 468, 240, 129, 72, 67, 36, 9, 1, 1, 1 },
+{ 472, 240, 128, 72, 66, 35, 8, 1, 1, 1 },
+{ 476, 240, 127, 71, 65, 34, 8, 1, 1, 1 },
+{ 480, 240, 127, 70, 63, 33, 8, 1, 1, 1 },
+{ 484, 241, 126, 69, 62, 32, 7, 1, 1, 1 },
+{ 488, 241, 125, 68, 61, 31, 7, 1, 1, 1 },
+{ 492, 241, 124, 67, 60, 30, 7, 1, 1, 1 },
+{ 496, 241, 124, 66, 59, 29, 6, 1, 1, 1 },
+{ 500, 240, 123, 66, 58, 28, 6, 1, 1, 1 },
+{ 504, 240, 122, 65, 57, 27, 6, 1, 1, 1 },
+{ 508, 240, 121, 64, 55, 27, 6, 1, 1, 1 },
+{ 512, 241, 120, 63, 54, 26, 5, 1, 1, 1 },
+{ 516, 241, 119, 62, 53, 25, 5, 1, 1, 1 },
+{ 520, 240, 118, 62, 52, 24, 5, 1, 1, 1 },
+{ 524, 240, 117, 60, 51, 24, 5, 1, 1, 1 },
+{ 528, 239, 116, 60, 50, 23, 5, 1, 1, 1 },
+{ 532, 239, 116, 59, 49, 22, 4, 1, 1, 1 },
+{ 536, 239, 115, 58, 48, 21, 4, 1, 1, 1 },
+{ 540, 239, 113, 57, 47, 21, 4, 1, 1, 1 },
+{ 544, 238, 113, 56, 46, 20, 4, 1, 1, 1 },
+{ 548, 238, 112, 55, 45, 19, 4, 1, 1, 1 },
+{ 552, 238, 110, 55, 44, 19, 3, 1, 1, 1 },
+{ 556, 237, 110, 54, 43, 18, 3, 1, 1, 1 },
+{ 560, 237, 108, 53, 42, 18, 3, 1, 1, 1 },
+{ 564, 236, 108, 52, 41, 17, 3, 1, 1, 1 },
+{ 568, 236, 106, 51, 40, 17, 3, 1, 1, 1 },
+{ 572, 235, 105, 51, 39, 16, 3, 1, 1, 1 },
+{ 576, 235, 104, 50, 38, 15, 3, 1, 1, 1 },
+{ 580, 234, 103, 49, 37, 15, 3, 1, 1, 1 },
+{ 584, 234, 102, 48, 37, 14, 2, 1, 1, 1 },
+{ 588, 233, 101, 47, 36, 14, 2, 1, 1, 1 },
+{ 592, 233, 100, 46, 35, 13, 2, 1, 1, 1 },
+{ 596, 231, 99, 46, 34, 13, 2, 1, 1, 1 },
+{ 600, 230, 98, 45, 33, 13, 2, 1, 1, 1 },
+{ 604, 230, 97, 44, 32, 12, 2, 1, 1, 1 },
+{ 608, 229, 96, 43, 31, 12, 2, 1, 1, 1 },
+{ 612, 228, 95, 42, 31, 11, 2, 1, 1, 1 },
+{ 616, 227, 93, 42, 30, 11, 2, 1, 1, 1 },
+{ 620, 227, 92, 41, 29, 10, 2, 1, 1, 1 },
+{ 624, 226, 92, 40, 28, 10, 1, 1, 1, 1 },
+{ 628, 225, 90, 39, 28, 10, 1, 1, 1, 1 },
+{ 632, 224, 89, 39, 27, 9, 1, 1, 1, 1 },
+{ 636, 223, 88, 38, 26, 9, 1, 1, 1, 1 },
+{ 640, 222, 87, 37, 25, 9, 1, 1, 1, 1 },
+{ 644, 221, 86, 36, 25, 8, 1, 1, 1, 1 },
+{ 648, 220, 84, 36, 24, 8, 1, 1, 1, 1 },
+{ 652, 219, 83, 35, 23, 8, 1, 1, 1, 1 },
+{ 656, 218, 82, 34, 23, 7, 1, 1, 1, 1 },
+{ 660, 217, 81, 33, 22, 7, 1, 1, 1, 1 },
+{ 664, 215, 80, 33, 21, 7, 1, 1, 1, 1 },
+{ 668, 214, 78, 32, 21, 7, 1, 1, 1, 1 },
+{ 672, 213, 78, 31, 20, 6, 1, 1, 1, 1 },
+{ 676, 211, 76, 31, 20, 6, 1, 1, 1, 1 },
+{ 680, 210, 75, 30, 19, 6, 1, 1, 1, 1 },
+{ 684, 209, 74, 29, 18, 6, 1, 1, 1, 1 },
+{ 688, 208, 73, 28, 18, 5, 1, 1, 1, 1 },
+{ 692, 206, 72, 28, 17, 5, 1, 1, 1, 1 },
+{ 696, 205, 70, 27, 17, 5, 1, 1, 1, 1 },
+{ 700, 203, 69, 27, 16, 5, 1, 1, 1, 1 },
+{ 704, 201, 68, 26, 16, 5, 1, 1, 1, 1 },
+{ 708, 201, 67, 25, 15, 4, 1, 1, 1, 1 },
+{ 712, 198, 66, 25, 15, 4, 1, 1, 1, 1 },
+{ 716, 197, 65, 24, 14, 4, 1, 1, 1, 1 },
+{ 720, 196, 63, 23, 14, 4, 1, 1, 1, 1 },
+{ 724, 194, 62, 23, 13, 4, 1, 1, 1, 1 },
+{ 728, 193, 61, 22, 13, 3, 1, 1, 1, 1 },
+{ 732, 191, 60, 22, 12, 3, 1, 1, 1, 1 },
+{ 736, 189, 59, 21, 12, 3, 1, 1, 1, 1 },
+{ 740, 188, 58, 20, 11, 3, 1, 1, 1, 1 },
+{ 744, 186, 56, 20, 11, 3, 1, 1, 1, 1 },
+{ 748, 184, 55, 19, 11, 3, 1, 1, 1, 1 },
+{ 752, 182, 54, 19, 10, 3, 1, 1, 1, 1 },
+{ 756, 181, 53, 18, 10, 2, 1, 1, 1, 1 },
+{ 760, 179, 52, 18, 9, 2, 1, 1, 1, 1 },
+{ 764, 177, 51, 17, 9, 2, 1, 1, 1, 1 },
+{ 768, 174, 50, 17, 9, 2, 1, 1, 1, 1 },
+{ 772, 173, 49, 16, 8, 2, 1, 1, 1, 1 },
+{ 776, 171, 47, 16, 8, 2, 1, 1, 1, 1 },
+{ 780, 169, 46, 15, 8, 2, 1, 1, 1, 1 },
+{ 784, 167, 45, 15, 7, 2, 1, 1, 1, 1 },
+{ 788, 165, 44, 14, 7, 2, 1, 1, 1, 1 },
+{ 792, 162, 43, 14, 7, 2, 1, 1, 1, 1 },
+{ 796, 161, 42, 13, 7, 1, 1, 1, 1, 1 },
+{ 800, 159, 41, 13, 6, 1, 1, 1, 1, 1 },
+{ 804, 157, 40, 12, 6, 1, 1, 1, 1, 1 },
+{ 808, 154, 39, 12, 6, 1, 1, 1, 1, 1 },
+{ 812, 153, 38, 11, 5, 1, 1, 1, 1, 1 },
+{ 816, 150, 37, 11, 5, 1, 1, 1, 1, 1 },
+{ 820, 148, 36, 10, 5, 1, 1, 1, 1, 1 },
+{ 824, 145, 35, 10, 5, 1, 1, 1, 1, 1 },
+{ 828, 143, 34, 10, 4, 1, 1, 1, 1, 1 },
+{ 832, 141, 33, 9, 4, 1, 1, 1, 1, 1 },
+{ 836, 138, 32, 9, 4, 1, 1, 1, 1, 1 },
+{ 840, 136, 30, 9, 4, 1, 1, 1, 1, 1 },
+{ 844, 133, 30, 8, 4, 1, 1, 1, 1, 1 },
+{ 848, 131, 29, 8, 3, 1, 1, 1, 1, 1 },
+{ 852, 129, 28, 7, 3, 1, 1, 1, 1, 1 },
+{ 856, 126, 27, 7, 3, 1, 1, 1, 1, 1 },
+{ 860, 123, 26, 7, 3, 1, 1, 1, 1, 1 },
+{ 864, 121, 25, 6, 3, 1, 1, 1, 1, 1 },
+{ 868, 118, 24, 6, 3, 1, 1, 1, 1, 1 },
+{ 872, 116, 23, 6, 2, 1, 1, 1, 1, 1 },
+{ 876, 113, 22, 6, 2, 1, 1, 1, 1, 1 },
+{ 880, 111, 21, 5, 2, 1, 1, 1, 1, 1 },
+{ 884, 108, 20, 5, 2, 1, 1, 1, 1, 1 },
+{ 888, 105, 19, 5, 2, 1, 1, 1, 1, 1 },
+{ 892, 102, 19, 4, 2, 1, 1, 1, 1, 1 },
+{ 896, 99, 18, 4, 2, 1, 1, 1, 1, 1 },
+{ 900, 97, 17, 4, 1, 1, 1, 1, 1, 1 },
+{ 904, 94, 16, 4, 1, 1, 1, 1, 1, 1 },
+{ 908, 92, 15, 3, 1, 1, 1, 1, 1, 1 },
+{ 912, 89, 14, 3, 1, 1, 1, 1, 1, 1 },
+{ 916, 85, 14, 3, 1, 1, 1, 1, 1, 1 },
+{ 920, 82, 13, 3, 1, 1, 1, 1, 1, 1 },
+{ 924, 79, 12, 3, 1, 1, 1, 1, 1, 1 },
+{ 928, 77, 11, 2, 1, 1, 1, 1, 1, 1 },
+{ 932, 73, 11, 2, 1, 1, 1, 1, 1, 1 },
+{ 936, 70, 10, 2, 1, 1, 1, 1, 1, 1 },
+{ 940, 67, 9, 2, 1, 1, 1, 1, 1, 1 },
+{ 944, 64, 8, 2, 1, 1, 1, 1, 1, 1 },
+{ 948, 60, 8, 2, 1, 1, 1, 1, 1, 1 },
+{ 952, 58, 7, 1, 1, 1, 1, 1, 1, 1 },
+{ 956, 54, 7, 1, 1, 1, 1, 1, 1, 1 },
+{ 960, 51, 6, 1, 1, 1, 1, 1, 1, 1 },
+{ 964, 48, 5, 1, 1, 1, 1, 1, 1, 1 },
+{ 968, 44, 5, 1, 1, 1, 1, 1, 1, 1 },
+{ 972, 41, 4, 1, 1, 1, 1, 1, 1, 1 },
+{ 976, 37, 4, 1, 1, 1, 1, 1, 1, 1 },
+{ 980, 34, 3, 1, 1, 1, 1, 1, 1, 1 },
+{ 984, 30, 3, 1, 1, 1, 1, 1, 1, 1 },
+{ 988, 27, 2, 1, 1, 1, 1, 1, 1, 1 },
+{ 992, 23, 2, 1, 1, 1, 1, 1, 1, 1 },
+{ 996, 19, 2, 1, 1, 1, 1, 1, 1, 1 },
+{ 1000, 16, 1, 1, 1, 1, 1, 1, 1, 1 },
+{ 1004, 12, 1, 1, 1, 1, 1, 1, 1, 1 },
+{ 1008, 8, 1, 1, 1, 1, 1, 1, 1, 1 },
+{ 1012, 4, 1, 1, 1, 1, 1, 1, 1, 1 },
+{ 1015, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+{ 1015, 1, 1, 1, 1, 1, 1, 1, 1, 1 },
+};
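+
+/* Generation sketch for the table above (commentary only, in the same spirit
+ * as the init_fastdiv() comment in divide.c). The token value ranges are
+ * assumed to be the usual VP9 extra-bit ranges, and alpha is assumed to have
+ * been solved numerically per row from the 1-node probability; a final
+ * renormalization making each row sum to exactly 1 << 10 is omitted:
+ *
+ * void fill_pareto8_token_row(double alpha, AnsP10 *row) {
+ *   // One-sided survival function: S(x) = (alpha / (alpha + x))^beta.
+ *   static const int lo[] = { 1, 2, 3, 4, 5, 7, 11, 19, 35, 67 };
+ *   static const int hi[] = { 1, 2, 3, 4, 6, 10, 18, 34, 66, 1 << 14 };
+ *   const double beta = 8;
+ *   double mass[10], total = 0;
+ *   int i;
+ *   for (i = 0; i < 10; ++i) {  // probability mass of each token's range
+ *     mass[i] = pow(alpha / (alpha + lo[i]), beta) -
+ *               pow(alpha / (alpha + hi[i] + 1), beta);
+ *     total += mass[i];
+ *   }
+ *   for (i = 0; i < 10; ++i)  // scale to 10 bits, keep every token codable
+ *     row[i] = VPXMAX(1, (int)(1024 * mass[i] / total + 0.5));
+ * }
+ */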
+#endif  // CONFIG_ANS
+
+#if CONFIG_ENTROPY
+const vp10_coeff_probs_model
+default_qctx_coef_probs[QCTX_BINS][TX_SIZES][PLANE_TYPES] = {
+    {  // Q_Index 0
+        {  // TX_SIZE 0
+            {  // Y plane
+                {  // Intra
+                    {  // band 0
+                        {182,  34, 137}, { 79,  39, 103}, { 10,  28,  51},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 45,  88, 147}, { 46,  80, 140}, { 25,  69, 119},
+                        { 12,  57,  96}, {  4,  41,  65}, {  1,  20,  31},
+                    },
+                    {  // band 2
+                        { 58, 124, 190}, { 39, 106, 178}, { 16,  86, 147},
+                        {  7,  69, 114}, {  3,  50,  80}, {  1,  25,  42},
+                    },
+                    {  // band 3
+                        { 90, 138, 215}, { 54, 116, 198}, { 18,  86, 155},
+                        {  5,  62, 112}, {  1,  38,  68}, {  1,  17,  30},
+                    },
+                    {  // band 4
+                        {126, 149, 231}, { 82, 114, 211}, { 21,  80, 157},
+                        {  6,  56, 105}, {  1,  36,  64}, {  1,  17,  31},
+                    },
+                    {  // band 5
+                        {171,  56, 236}, {140,  54, 219}, { 57,  45, 167},
+                        { 26,  36, 113}, { 11,  29,  72}, {  3,  18,  39},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {153, 122, 186}, {106, 109, 171}, { 36,  84, 128},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 27, 151, 201}, { 34, 131, 199}, { 23, 102, 161},
+                        { 10,  80, 120}, {  4,  52,  78}, {  1,  24,  37},
+                    },
+                    {  // band 2
+                        { 43, 158, 213}, { 35, 133, 203}, {  8,  92, 151},
+                        {  2,  64, 106}, {  1,  36,  60}, {  1,  13,  24},
+                    },
+                    {  // band 3
+                        { 68, 167, 223}, { 36, 135, 211}, {  9,  94, 157},
+                        {  2,  67, 112}, {  1,  40,  68}, {  1,  17,  31},
+                    },
+                    {  // band 4
+                        {131, 146, 237}, { 72, 119, 223}, { 17,  82, 164},
+                        {  4,  55, 107}, {  1,  34,  63}, {  1,  16,  29},
+                    },
+                    {  // band 5
+                        {184,  68, 244}, {153,  59, 232}, { 68,  51, 179},
+                        { 31,  40, 123}, { 13,  29,  77}, {  4,  17,  37},
+                    },
+                },
+            },
+            {  // UV plane
+                {  // Inter
+                    {  // band 0
+                        {203,  41, 203}, {127,  56, 174}, { 49,  56, 127},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {110, 121, 217}, {119, 113, 213}, { 64,  95, 185},
+                        { 30,  72, 144}, {  8,  42,  76}, {  2,  17,  25},
+                    },
+                    {  // band 2
+                        {127, 159, 229}, {115, 134, 223}, { 36, 100, 189},
+                        { 11,  75, 142}, {  3,  48,  83}, {  1,  19,  33},
+                    },
+                    {  // band 3
+                        {150, 172, 241}, { 90, 133, 231}, { 28, 102, 192},
+                        {  7,  81, 147}, {  1,  53,  91}, {  1,  25,  42},
+                    },
+                    {  // band 4
+                        {184, 144, 248}, {114, 117, 237}, { 37,  89, 192},
+                        { 10,  63, 130}, {  4,  42,  76}, {  1,  19,  38},
+                    },
+                    {  // band 5
+                        {207,  79, 250}, {179,  74, 241}, { 83,  67, 199},
+                        { 38,  51, 142}, { 17,  37,  97}, { 10,  14,  55},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {220,  82, 232}, {150,  93, 214}, { 66,  95, 177},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {116, 160, 227}, {136, 141, 227}, { 67, 114, 190},
+                        { 40,  94, 148}, { 21,  70, 107}, { 10,  43,  63},
+                    },
+                    {  // band 2
+                        {124, 173, 235}, {105, 147, 226}, { 27, 107, 184},
+                        { 10,  80, 142}, {  3,  50,  86}, {  1,  16,  32},
+                    },
+                    {  // band 3
+                        {149, 179, 243}, { 89, 147, 234}, { 29, 112, 193},
+                        {  9,  94, 157}, {  1,  64, 111}, {  1,  25,  43},
+                    },
+                    {  // band 4
+                        {187, 153, 248}, {127, 130, 241}, { 52,  99, 202},
+                        { 20,  79, 152}, {  4,  50,  93}, {  1,  19,  32},
+                    },
+                    {  // band 5
+                        {215,  82, 251}, {195,  80, 246}, { 93,  70, 204},
+                        { 39,  54, 147}, { 14,  33,  88}, {  6,  14,  39},
+                    },
+                },
+            },
+        },
+        {  // TX_SIZE 1
+            {  // Y plane
+                {  // Intra
+                    {  // band 0
+                        {116,  43, 131}, { 39,  41,  94}, {  4,  28,  47},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 28, 101, 141}, { 27,  95, 140}, { 18,  80, 121},
+                        { 10,  61,  95}, {  4,  39,  60}, {  1,  19,  26},
+                    },
+                    {  // band 2
+                        { 29, 150, 183}, { 19, 127, 175}, {  8,  98, 147},
+                        {  3,  76, 115}, {  1,  55,  84}, {  1,  29,  43},
+                    },
+                    {  // band 3
+                        { 26, 168, 202}, { 12, 138, 188}, {  2,  98, 149},
+                        {  1,  69, 110}, {  1,  40,  65}, {  1,  17,  25},
+                    },
+                    {  // band 4
+                        { 33, 188, 225}, { 12, 155, 207}, {  2, 101, 155},
+                        {  1,  65, 106}, {  1,  36,  60}, {  1,  18,  26},
+                    },
+                    {  // band 5
+                        { 79, 205, 242}, { 30, 168, 224}, {  5, 106, 164},
+                        {  1,  68, 110}, {  1,  39,  65}, {  1,  18,  28},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        { 96,  80, 201}, { 51,  88, 168}, { 14,  78, 116},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {  6, 167, 216}, { 32, 152, 211}, { 24, 121, 182},
+                        { 13,  98, 149}, { 12,  76, 108}, {  8,  48,  62},
+                    },
+                    {  // band 2
+                        { 17, 176, 225}, { 13, 147, 209}, {  3,  96, 155},
+                        {  1,  65, 108}, {  2,  43,  63}, {  2,  23,  25},
+                    },
+                    {  // band 3
+                        { 18, 183, 232}, { 10, 153, 214}, {  1,  96, 154},
+                        {  1,  63, 105}, {  1,  39,  59}, {  1,  21,  24},
+                    },
+                    {  // band 4
+                        { 23, 191, 239}, {  8, 159, 221}, {  1,  97, 158},
+                        {  1,  61, 105}, {  1,  37,  60}, {  1,  20,  26},
+                    },
+                    {  // band 5
+                        { 70, 201, 243}, { 29, 163, 228}, {  4, 102, 169},
+                        {  1,  67, 114}, {  1,  39,  66}, {  1,  17,  29},
+                    },
+                },
+            },
+            {  // UV plane
+                {  // Intra
+                    {  // band 0
+                        {181,  38, 192}, { 95,  47, 151}, { 29,  49, 102},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 72, 131, 202}, { 93, 120, 205}, { 50, 103, 179},
+                        { 24,  79, 143}, { 11,  47,  78}, {  7,  19,  25},
+                    },
+                    {  // band 2
+                        { 84, 176, 221}, { 56, 144, 214}, { 21, 108, 182},
+                        {  8,  83, 139}, {  3,  55,  90}, {  2,  27,  41},
+                    },
+                    {  // band 3
+                        { 84, 195, 234}, { 42, 156, 222}, { 10, 109, 180},
+                        {  4,  77, 133}, {  1,  48,  80}, {  1,  23,  35},
+                    },
+                    {  // band 4
+                        { 89, 210, 238}, { 35, 165, 221}, {  6, 106, 172},
+                        {  2,  70, 123}, {  1,  44,  74}, {  1,  21,  30},
+                    },
+                    {  // band 5
+                        {114, 221, 247}, { 49, 170, 234}, {  7, 113, 184},
+                        {  2,  77, 132}, {  1,  48,  79}, {  1,  25,  33},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {192,  66, 237}, {113,  84, 211}, { 35,  84, 154},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 81, 180, 234}, {127, 165, 229}, { 58, 137, 204},
+                        { 41, 114, 174}, { 44,  94, 136}, { 29,  66,  86},
+                    },
+                    {  // band 2
+                        { 82, 193, 240}, { 39, 162, 223}, {  8, 113, 179},
+                        {  3,  83, 136}, {  6,  62,  84}, {  5,  45,  45},
+                    },
+                    {  // band 3
+                        { 78, 203, 242}, { 31, 170, 227}, {  4, 115, 181},
+                        {  1,  82, 135}, {  2,  59,  82}, {  1,  45,  47},
+                    },
+                    {  // band 4
+                        { 76, 210, 239}, { 25, 170, 213}, {  2,  99, 152},
+                        {  1,  69, 115}, {  1,  49,  80}, {  1,  47,  57},
+                    },
+                    {  // band 5
+                        {103, 217, 250}, { 42, 180, 237}, {  3, 124, 191},
+                        {  1,  90, 150}, {  1,  69, 116}, {  1,  52,  46},
+                    },
+                },
+            },
+        },
+        {  // TX_SIZE 2
+            {  // Y plane
+                {  // Intra
+                    {  // band 0
+                        { 58,  38,  99}, {  9,  26,  51}, {  1,  14,  22},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 14,  78, 109}, { 16,  73, 105}, { 11,  62,  92},
+                        {  6,  47,  72}, {  2,  29,  45}, {  1,  12,  18},
+                    },
+                    {  // band 2
+                        { 17, 131, 148}, { 11, 112, 140}, {  5,  87, 118},
+                        {  2,  63,  90}, {  1,  42,  63}, {  1,  19,  31},
+                    },
+                    {  // band 3
+                        { 12, 151, 168}, {  6, 116, 152}, {  1,  76, 115},
+                        {  1,  50,  81}, {  1,  32,  52}, {  1,  14,  23},
+                    },
+                    {  // band 4
+                        { 10, 174, 191}, {  3, 130, 172}, {  1,  80, 126},
+                        {  1,  53,  88}, {  1,  32,  55}, {  1,  14,  24},
+                    },
+                    {  // band 5
+                        { 19, 219, 237}, {  3, 168, 211}, {  1,  90, 142},
+                        {  1,  53,  91}, {  1,  29,  51}, {  1,  12,  21},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        { 21,  46, 184}, { 10,  53, 130}, {  2,  49,  78},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {  3, 169, 198}, { 37, 165, 196}, { 26, 134, 176},
+                        { 11, 108, 149}, {  5,  81, 112}, {  3,  47,  64},
+                    },
+                    {  // band 2
+                        { 11, 183, 215}, {  8, 142, 192}, {  2,  91, 141},
+                        {  1,  62, 100}, {  1,  38,  62}, {  1,  17,  28},
+                    },
+                    {  // band 3
+                        { 12, 190, 223}, {  6, 149, 199}, {  1,  88, 139},
+                        {  1,  56,  93}, {  1,  31,  54}, {  1,  13,  21},
+                    },
+                    {  // band 4
+                        { 11, 197, 230}, {  3, 154, 204}, {  1,  83, 134},
+                        {  1,  50,  86}, {  1,  28,  49}, {  1,  12,  21},
+                    },
+                    {  // band 5
+                        { 17, 211, 240}, {  2, 167, 217}, {  1,  88, 143},
+                        {  1,  53,  91}, {  1,  30,  53}, {  1,  14,  24},
+                    },
+                },
+            },
+            {  // UV plane
+                {  // Intra
+                    {  // band 0
+                        {151,  30, 151}, { 50,  36, 105}, {  8,  34,  66},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 39, 111, 160}, { 62, 111, 165}, { 37,  99, 147},
+                        { 15,  77, 118}, {  3,  47,  73}, {  1,  17,  27},
+                    },
+                    {  // band 2
+                        { 48, 170, 190}, { 32, 135, 180}, { 11, 100, 149},
+                        {  4,  76, 116}, {  1,  51,  80}, {  1,  22,  36},
+                    },
+                    {  // band 3
+                        { 39, 191, 208}, { 18, 141, 191}, {  3,  96, 150},
+                        {  1,  66, 110}, {  1,  41,  69}, {  1,  17,  28},
+                    },
+                    {  // band 4
+                        { 32, 209, 219}, {  8, 152, 201}, {  1,  96, 153},
+                        {  1,  63, 106}, {  1,  38,  66}, {  1,  17,  29},
+                    },
+                    {  // band 5
+                        { 33, 230, 237}, {  5, 173, 214}, {  1, 100, 155},
+                        {  1,  62, 105}, {  1,  38,  66}, {  1,  18,  32},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {149,  38, 231}, { 59,  51, 186}, { 12,  54, 117},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 53, 179, 226}, {126, 176, 223}, { 58, 147, 202},
+                        { 28, 118, 174}, { 15,  94, 138}, { 14,  63,  87},
+                    },
+                    {  // band 2
+                        { 58, 196, 232}, { 26, 158, 213}, {  5, 106, 166},
+                        {  1,  75, 124}, {  1,  46,  79}, {  1,  23,  39},
+                    },
+                    {  // band 3
+                        { 46, 203, 235}, { 17, 162, 213}, {  2, 104, 165},
+                        {  1,  72, 120}, {  1,  44,  74}, {  1,  22,  33},
+                    },
+                    {  // band 4
+                        { 37, 213, 238}, {  8, 167, 216}, {  1, 104, 168},
+                        {  1,  68, 119}, {  1,  40,  67}, {  1,  17,  29},
+                    },
+                    {  // band 5
+                        { 30, 228, 239}, {  4, 181, 213}, {  1, 103, 153},
+                        {  1,  65, 110}, {  1,  43,  79}, {  1,  27,  56},
+                    },
+                },
+            },
+        },
+        {  // TX_SIZE 3
+            {  // Y plane
+                {  // Intra
+                    {  // band 0
+                        { 76,  25,  53}, {  9,  18,  32}, {  1,  12,  18},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 29,  55,  91}, { 19,  58,  95}, { 15,  57,  89},
+                        { 12,  49,  77}, {  3,  29,  44}, {  1,   8,  12},
+                    },
+                    {  // band 2
+                        { 32, 160, 148}, { 33, 143, 146}, { 19, 122, 132},
+                        {  6,  90, 102}, {  1,  58,  70}, {  1,  17,  24},
+                    },
+                    {  // band 3
+                        { 16, 181, 181}, {  6, 142, 165}, {  1,  90, 120},
+                        {  1,  50,  71}, {  1,  25,  38}, {  1,   9,  14},
+                    },
+                    {  // band 4
+                        { 13, 203, 203}, {  3, 154, 176}, {  1,  80, 108},
+                        {  1,  41,  61}, {  1,  24,  37}, {  1,  11,  17},
+                    },
+                    {  // band 5
+                        {  6, 234, 240}, {  1, 178, 204}, {  1,  80, 119},
+                        {  1,  45,  71}, {  1,  26,  42}, {  1,  12,  19},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        { 78,  20, 135}, { 25,  18, 101}, {  5,  19,  57},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {  7, 144, 183}, {117, 151, 195}, {109, 151, 187},
+                        { 39, 130, 168}, { 11, 100, 125}, {  4,  59,  64},
+                    },
+                    {  // band 2
+                        { 20, 184, 212}, { 12, 148, 191}, {  2,  98, 141},
+                        {  1,  65, 100}, {  1,  39,  61}, {  1,  14,  22},
+                    },
+                    {  // band 3
+                        { 15, 194, 222}, {  6, 153, 198}, {  1,  92, 138},
+                        {  1,  58,  91}, {  1,  32,  52}, {  1,  12,  18},
+                    },
+                    {  // band 4
+                        { 14, 206, 232}, {  3, 162, 206}, {  1,  89, 134},
+                        {  1,  52,  83}, {  1,  28,  46}, {  1,  11,  17},
+                    },
+                    {  // band 5
+                        {  6, 225, 241}, {  1, 175, 210}, {  1,  81, 125},
+                        {  1,  48,  78}, {  1,  28,  46}, {  1,  13,  21},
+                    },
+                },
+            },
+            {  // UV plane
+                {  // Intra
+                    {  // band 0
+                        {124,  23,  93}, { 31,  24,  63}, {  6,  24,  46},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 23,  86, 126}, { 45,  90, 145}, { 31,  91, 133},
+                        { 19,  80, 114}, {  7,  53,  72}, {  1,  20,  27},
+                    },
+                    {  // band 2
+                        { 51, 186, 189}, { 48, 159, 182}, { 33, 128, 156},
+                        { 15,  92, 124}, {  2,  62,  83}, {  1,  29,  43},
+                    },
+                    {  // band 3
+                        { 36, 198, 211}, { 15, 156, 187}, {  3,  97, 137},
+                        {  1,  61,  93}, {  1,  35,  57}, {  1,  15,  23},
+                    },
+                    {  // band 4
+                        { 34, 219, 223}, {  9, 162, 193}, {  1,  91, 136},
+                        {  1,  58,  92}, {  1,  35,  54}, {  1,  14,  23},
+                    },
+                    {  // band 5
+                        { 19, 243, 243}, {  3, 191, 208}, {  1,  91, 137},
+                        {  1,  56,  90}, {  1,  34,  55}, {  1,  16,  24},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {119,  20, 197}, { 19,  29, 156}, {  3,  30, 107},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 24, 192, 226}, {161, 193, 227}, { 97, 185, 222},
+                        { 31, 158, 204}, { 16, 122, 165}, { 17,  84, 112},
+                    },
+                    {  // band 2
+                        { 26, 202, 229}, { 11, 165, 210}, {  2, 103, 152},
+                        {  1,  68, 104}, {  1,  42,  70}, {  1,  16,  36},
+                    },
+                    {  // band 3
+                        { 24, 209, 237}, {  6, 169, 214}, {  1, 102, 154},
+                        {  1,  65, 107}, {  1,  45,  68}, {  1,  17,  24},
+                    },
+                    {  // band 4
+                        { 19, 219, 243}, {  4, 183, 226}, {  1, 115, 172},
+                        {  1,  73, 119}, {  1,  43,  77}, {  1,  15,  37},
+                    },
+                    {  // band 5
+                        { 11, 237, 241}, {  2, 190, 216}, {  1, 108, 146},
+                        {  1,  59,  94}, {  1,  40,  67}, {  1,  30,  53},
+                    },
+                },
+            },
+        },
+    },
+    {  // Q_Index 1
+        {  // TX_SIZE 0
+            {  // Y plane
+                {  // Intra
+                    {  // band 0
+                        {174,  30, 159}, { 76,  38, 115}, { 15,  33,  65},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 60,  80, 153}, { 72,  75, 147}, { 36,  68, 129},
+                        { 15,  59, 104}, {  4,  45,  74}, {  1,  28,  45},
+                    },
+                    {  // band 2
+                        { 70, 122, 186}, { 55, 104, 175}, { 21,  83, 144},
+                        {  8,  67, 112}, {  2,  51,  82}, {  1,  34,  57},
+                    },
+                    {  // band 3
+                        { 97, 144, 207}, { 52, 109, 195}, { 16,  77, 153},
+                        {  4,  58, 113}, {  1,  43,  77}, {  1,  27,  48},
+                    },
+                    {  // band 4
+                        {128, 148, 229}, { 76, 104, 210}, { 18,  77, 159},
+                        {  4,  65, 110}, {  1,  52,  82}, {  1,  31,  55},
+                    },
+                    {  // band 5
+                        {165,  51, 238}, {128,  50, 230}, { 57,  49, 185},
+                        { 28,  47, 130}, { 12,  44,  96}, {  3,  36,  60},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {169, 103, 203}, {117,  96, 176}, { 56,  81, 137},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 31, 150, 224}, { 49, 128, 212}, { 19,  92, 165},
+                        {  6,  67, 116}, {  2,  43,  71}, {  1,  21,  36},
+                    },
+                    {  // band 2
+                        { 58, 156, 230}, { 47, 130, 215}, {  7,  87, 158},
+                        {  2,  63, 114}, {  1,  39,  71}, {  1,  18,  36},
+                    },
+                    {  // band 3
+                        { 85, 176, 240}, { 43, 138, 226}, {  8,  93, 172},
+                        {  2,  70, 127}, {  1,  46,  81}, {  1,  26,  47},
+                    },
+                    {  // band 4
+                        {155, 144, 248}, { 93, 116, 235}, { 21,  83, 180},
+                        {  4,  59, 119}, {  1,  43,  80}, {  1,  25,  50},
+                    },
+                    {  // band 5
+                        {203,  61, 250}, {171,  57, 243}, { 71,  57, 199},
+                        { 31,  49, 144}, { 13,  42,  96}, {  7,  30,  52},
+                    },
+                },
+            },
+            {  // UV plane
+                {  // Intra
+                    {  // band 0
+                        {204,  44, 204}, {137,  57, 184}, { 72,  62, 152},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {145, 117, 236}, {151, 112, 231}, { 87,  95, 208},
+                        { 31,  77, 165}, {  5,  49,  98}, {  1,  24,  39},
+                    },
+                    {  // band 2
+                        {146, 152, 241}, {140, 132, 236}, { 41, 103, 209},
+                        { 10,  86, 165}, {  2,  55, 106}, {  1,  25,  58},
+                    },
+                    {  // band 3
+                        {154, 181, 249}, { 84, 143, 240}, { 23, 114, 210},
+                        {  6, 102, 182}, {  2,  71, 137}, {  1,  35,  90},
+                    },
+                    {  // band 4
+                        {184, 150, 251}, {115, 130, 244}, { 34, 105, 215},
+                        { 15,  89, 173}, {  1,  51, 141}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {211,  71, 253}, {193,  78, 249}, {106,  91, 232},
+                        { 61,  87, 198}, { 85, 153, 254}, {128, 128, 128},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {232, 104, 242}, {165, 114, 227}, { 96, 120, 206},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {137, 178, 250}, {146, 153, 245}, { 74, 108, 205},
+                        { 41,  81, 149}, { 24,  55, 104}, { 13,  36,  68},
+                    },
+                    {  // band 2
+                        {147, 185, 252}, {127, 161, 246}, { 30, 104, 208},
+                        { 11,  74, 154}, {  6,  54, 100}, {  2,  29,  63},
+                    },
+                    {  // band 3
+                        {163, 191, 254}, {101, 161, 249}, { 22, 114, 215},
+                        {  6,  89, 173}, {  1,  65, 120}, {  1,   1, 170},
+                    },
+                    {  // band 4
+                        {197, 160, 254}, {142, 141, 251}, { 39, 102, 218},
+                        { 10,  76, 158}, {  1,  56, 122}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {224,  76, 254}, {215,  84, 253}, {107,  85, 232},
+                        { 43,  71, 177}, {  1,   1, 254}, {128, 128, 128},
+                    },
+                },
+            },
+        },
+        {  // TX_SIZE 1
+            {  // Y plane
+                {  // Intra
+                    {  // band 0
+                        { 68,  37, 120}, { 21,  34,  82}, {  5,  26,  49},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 41,  89, 138}, { 56,  83, 132}, { 31,  73, 115},
+                        { 16,  62,  92}, {  5,  45,  62}, {  1,  24,  32},
+                    },
+                    {  // band 2
+                        { 48, 139, 165}, { 30, 114, 160}, { 13,  92, 132},
+                        {  6,  72, 103}, {  3,  49,  72}, {  1,  26,  41},
+                    },
+                    {  // band 3
+                        { 44, 162, 191}, { 20, 127, 175}, {  5,  90, 137},
+                        {  1,  62, 100}, {  1,  38,  63}, {  1,  20,  32},
+                    },
+                    {  // band 4
+                        { 51, 184, 213}, { 16, 137, 193}, {  2,  89, 143},
+                        {  1,  60, 102}, {  1,  39,  66}, {  1,  23,  37},
+                    },
+                    {  // band 5
+                        { 76, 200, 235}, { 27, 150, 216}, {  3,  99, 164},
+                        {  1,  70, 119}, {  1,  45,  77}, {  1,  22,  38},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        { 81, 112, 199}, { 49, 101, 164}, { 19,  80, 119},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 12, 181, 217}, { 48, 151, 212}, { 38, 118, 180},
+                        { 22,  95, 140}, { 11,  67,  92}, { 13,  46,  44},
+                    },
+                    {  // band 2
+                        { 29, 188, 226}, { 19, 147, 210}, {  5,  95, 154},
+                        {  4,  68, 106}, {  3,  44,  60}, {  1,  24,  27},
+                    },
+                    {  // band 3
+                        { 30, 195, 234}, { 15, 153, 216}, {  3,  95, 156},
+                        {  2,  66, 108}, {  2,  44,  62}, {  1,  24,  29},
+                    },
+                    {  // band 4
+                        { 36, 203, 243}, { 12, 162, 225}, {  2,  98, 163},
+                        {  2,  67, 113}, {  2,  45,  68}, {  1,  24,  34},
+                    },
+                    {  // band 5
+                        { 86, 207, 248}, { 35, 165, 236}, {  3, 107, 180},
+                        {  1,  73, 128}, {  1,  45,  78}, {  1,  20,  34},
+                    },
+                },
+            },
+            {  // UV plane
+                {  // Intra
+                    {  // band 0
+                        {188,  37, 205}, {118,  51, 172}, { 56,  57, 135},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {116, 135, 225}, {144, 123, 221}, { 72, 103, 197},
+                        { 35,  77, 153}, { 15,  47,  82}, {  6,  25,  34},
+                    },
+                    {  // band 2
+                        {128, 171, 233}, { 82, 142, 226}, { 31, 106, 191},
+                        { 16,  82, 146}, {  9,  59,  98}, {  4,  33,  54},
+                    },
+                    {  // band 3
+                        {126, 197, 241}, { 66, 155, 230}, { 18, 108, 190},
+                        {  7,  82, 148}, {  3,  58,  98}, {  1,  25,  50},
+                    },
+                    {  // band 4
+                        {117, 207, 244}, { 44, 163, 233}, {  9, 112, 191},
+                        {  5,  84, 148}, {  3,  61,  87}, {  1,  28,  38},
+                    },
+                    {  // band 5
+                        {112, 214, 249}, { 39, 174, 240}, {  6, 125, 205},
+                        {  4,  96, 163}, {  5,  66, 100}, {  1, 128, 254},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {227,  70, 234}, {145,  91, 213}, { 61, 100, 173},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {108, 198, 243}, {171, 172, 240}, {118, 130, 210},
+                        {104, 107, 165}, { 64,  85, 114}, { 55,  64,  60},
+                    },
+                    {  // band 2
+                        {110, 208, 247}, { 64, 175, 237}, { 24, 112, 187},
+                        { 24,  81, 133}, { 24,  63,  83}, { 21,  47,  53},
+                    },
+                    {  // band 3
+                        { 91, 218, 249}, { 46, 188, 238}, {  8, 113, 184},
+                        {  5,  83, 137}, {  6,  62,  95}, { 17,  44,  94},
+                    },
+                    {  // band 4
+                        { 84, 216, 248}, { 30, 187, 237}, {  2, 117, 188},
+                        {  1,  88, 141}, {  3,  63,  98}, {  1,   1,   1},
+                    },
+                    {  // band 5
+                        {116, 218, 252}, { 47, 186, 242}, {  2, 132, 204},
+                        {  1, 106, 175}, {  1,  88, 104}, {  1, 254, 128},
+                    },
+                },
+            },
+        },
+        {  // TX_SIZE 2
+            {  // Y plane
+                {  // Intra
+                    {  // band 0
+                        { 35,  41, 129}, { 12,  30,  70}, {  2,  19,  32},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 30,  77, 116}, { 39,  70, 110}, { 20,  58,  96},
+                        {  8,  47,  77}, {  2,  33,  52}, {  1,  17,  26},
+                    },
+                    {  // band 2
+                        { 31, 123, 146}, { 18, 103, 140}, {  7,  81, 119},
+                        {  2,  62,  95}, {  1,  44,  70}, {  1,  26,  42},
+                    },
+                    {  // band 3
+                        { 21, 149, 170}, {  9, 114, 158}, {  2,  80, 126},
+                        {  1,  57,  94}, {  1,  36,  61}, {  1,  18,  31},
+                    },
+                    {  // band 4
+                        { 20, 178, 199}, {  6, 134, 183}, {  1,  87, 139},
+                        {  1,  60, 100}, {  1,  37,  64}, {  1,  18,  31},
+                    },
+                    {  // band 5
+                        { 36, 218, 233}, {  6, 160, 207}, {  1,  92, 147},
+                        {  1,  59, 101}, {  1,  35,  62}, {  1,  18,  31},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        { 17,  62, 211}, { 14,  62, 153}, {  5,  50,  84},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 11, 180, 205}, { 87, 160, 205}, { 53, 128, 184},
+                        { 27, 106, 156}, { 13,  79, 115}, {  6,  46,  67},
+                    },
+                    {  // band 2
+                        { 32, 194, 220}, { 20, 145, 202}, {  4,  96, 152},
+                        {  1,  67, 111}, {  1,  42,  70}, {  1,  21,  37},
+                    },
+                    {  // band 3
+                        { 30, 204, 228}, { 14, 152, 207}, {  1,  92, 149},
+                        {  1,  61, 103}, {  1,  34,  59}, {  1,  16,  28},
+                    },
+                    {  // band 4
+                        { 27, 213, 235}, {  7, 159, 210}, {  1,  88, 143},
+                        {  1,  55,  94}, {  1,  31,  53}, {  1,  16,  27},
+                    },
+                    {  // band 5
+                        { 28, 223, 243}, {  4, 173, 217}, {  1,  91, 146},
+                        {  1,  58,  98}, {  1,  35,  60}, {  1,  19,  33},
+                    },
+                },
+            },
+            {  // UV plane
+                {  // Intra
+                    {  // band 0
+                        {172,  37, 202}, { 83,  51, 156}, { 24,  53, 110},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 76, 134, 206}, {110, 124, 200}, { 47, 106, 180},
+                        { 15,  82, 145}, {  3,  48,  83}, {  1,  19,  32},
+                    },
+                    {  // band 2
+                        { 80, 176, 220}, { 49, 145, 212}, { 17, 112, 180},
+                        {  7,  84, 140}, {  1,  53,  89}, {  1,  27,  43},
+                    },
+                    {  // band 3
+                        { 74, 201, 232}, { 38, 158, 221}, {  8, 112, 179},
+                        {  2,  79, 132}, {  1,  47,  82}, {  1,  26,  42},
+                    },
+                    {  // band 4
+                        { 73, 215, 239}, { 28, 169, 227}, {  3, 112, 176},
+                        {  1,  74, 126}, {  1,  48,  79}, {  1,  27,  44},
+                    },
+                    {  // band 5
+                        { 71, 233, 244}, { 18, 180, 230}, {  1, 114, 180},
+                        {  1,  80, 134}, {  1,  51,  85}, {  1,  26,  36},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {213,  34, 244}, {126,  57, 212}, { 46,  67, 151},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {120, 202, 245}, {198, 173, 241}, {119, 146, 224},
+                        { 76, 126, 195}, { 44, 102, 159}, { 40,  76, 115},
+                    },
+                    {  // band 2
+                        {120, 215, 248}, { 69, 171, 237}, { 23, 119, 194},
+                        { 10,  86, 147}, {  2,  56,  94}, {  1,  25,  44},
+                    },
+                    {  // band 3
+                        {102, 226, 250}, { 53, 183, 239}, {  9, 118, 188},
+                        {  2,  78, 131}, {  1,  48,  89}, {  1,  17,  36},
+                    },
+                    {  // band 4
+                        { 86, 235, 252}, { 34, 194, 240}, {  2, 109, 173},
+                        {  1,  68, 118}, {  1,  44,  79}, {  1,   1,  38},
+                    },
+                    {  // band 5
+                        { 59, 236, 243}, { 11, 189, 228}, {  1, 112, 187},
+                        {  1,  88, 145}, {  1,  55,  92}, {  1,   1, 128},
+                    },
+                },
+            },
+        },
+        {  // TX_SIZE 3
+            {  // Y plane
+                {  // Intra
+                    {  // band 0
+                        { 41,  40, 104}, { 12,  31,  64}, {  2,  16,  28},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 65,  58, 132}, { 50,  61, 130}, { 40,  57, 116},
+                        { 22,  46,  87}, {  2,  28,  44}, {  1,  11,  17},
+                    },
+                    {  // band 2
+                        { 55, 139, 135}, { 46, 122, 132}, { 21,  89, 110},
+                        {  6,  60,  78}, {  1,  38,  54}, {  1,  17,  27},
+                    },
+                    {  // band 3
+                        { 29, 167, 161}, { 10, 120, 141}, {  1,  69,  98},
+                        {  1,  42,  66}, {  1,  28,  44}, {  1,  15,  24},
+                    },
+                    {  // band 4
+                        { 19, 191, 180}, {  4, 125, 154}, {  1,  70, 107},
+                        {  1,  48,  77}, {  1,  33,  53}, {  1,  17,  28},
+                    },
+                    {  // band 5
+                        { 16, 238, 231}, {  2, 163, 198}, {  1,  85, 134},
+                        {  1,  54,  90}, {  1,  34,  57}, {  1,  17,  29},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        { 70,  15, 216}, { 40,  18, 164}, { 14,  17,  83},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 25, 150, 200}, {185, 154, 211}, {123, 137, 199},
+                        { 67, 119, 177}, { 31,  96, 137}, { 18,  63,  86},
+                    },
+                    {  // band 2
+                        { 57, 187, 223}, { 35, 148, 207}, {  7, 104, 159},
+                        {  2,  72, 113}, {  1,  44,  71}, {  1,  20,  34},
+                    },
+                    {  // band 3
+                        { 44, 203, 233}, { 18, 157, 212}, {  1,  98, 150},
+                        {  1,  61, 102}, {  1,  38,  62}, {  1,  19,  31},
+                    },
+                    {  // band 4
+                        { 41, 215, 238}, { 11, 166, 215}, {  1,  94, 146},
+                        {  1,  60, 101}, {  1,  37,  63}, {  1,  17,  28},
+                    },
+                    {  // band 5
+                        { 19, 236, 246}, {  3, 188, 223}, {  1,  95, 146},
+                        {  1,  58,  95}, {  1,  34,  56}, {  1,  17,  27},
+                    },
+                },
+            },
+            {  // UV plane
+                {  // Intra
+                    {  // band 0
+                        {146,  27, 156}, { 49,  32, 116}, { 10,  39,  77},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 47, 101, 172}, { 93, 100, 178}, { 58,  91, 165},
+                        { 26,  75, 134}, {  4,  49,  82}, {  2,  22,  33},
+                    },
+                    {  // band 2
+                        { 60, 158, 196}, { 44, 135, 186}, { 25, 106, 157},
+                        {  8,  81, 124}, {  2,  56,  86}, {  1,  28,  45},
+                    },
+                    {  // band 3
+                        { 44, 169, 212}, { 15, 138, 196}, {  2, 100, 157},
+                        {  1,  74, 119}, {  1,  49,  76}, {  1,  20,  34},
+                    },
+                    {  // band 4
+                        { 38, 199, 231}, { 11, 158, 214}, {  1, 111, 167},
+                        {  1,  76, 122}, {  1,  44,  76}, {  1,  17,  39},
+                    },
+                    {  // band 5
+                        { 40, 236, 246}, { 10, 187, 230}, {  1, 115, 175},
+                        {  1,  74, 122}, {  1,  42,  71}, {  1,  14,  59},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {161,  26, 237}, { 65,  46, 209}, { 21,  46, 161},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 87, 229, 245}, {206, 214, 244}, {148, 186, 236},
+                        { 89, 165, 221}, { 41, 132, 186}, { 37,  93, 141},
+                    },
+                    {  // band 2
+                        { 93, 231, 246}, { 47, 181, 231}, {  8, 117, 188},
+                        {  2,  84, 138}, {  1,  43,  87}, {  1,  27,  41},
+                    },
+                    {  // band 3
+                        { 80, 239, 250}, { 28, 190, 236}, {  1, 119, 183},
+                        {  1,  84, 135}, {  1,  81,  69}, {  1, 102,   1},
+                    },
+                    {  // band 4
+                        { 67, 245, 252}, { 22, 206, 242}, {  1, 130, 195},
+                        {  1,  77, 136}, {  1,  35,  88}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        { 43, 250, 228}, { 31, 185, 204}, {  6, 101, 183},
+                        {  1,  92, 151}, {  1,  84, 137}, {128, 128, 128},
+                    },
+                },
+            },
+        },
+    },
+    {  // Q_Index 2
+        {  // TX_SIZE 0
+            {  // Y plane
+                {  // Intra
+                    {  // band 0
+                        {181,  22, 175}, { 96,  37, 147}, { 35,  41, 105},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 80,  95, 197}, {111,  92, 193}, { 59,  87, 175},
+                        { 29,  79, 150}, { 10,  65, 118}, {  2,  47,  82},
+                    },
+                    {  // band 2
+                        { 90, 141, 216}, { 77, 120, 210}, { 23,  95, 184},
+                        { 11,  81, 151}, {  6,  75, 130}, {  2,  58, 113},
+                    },
+                    {  // band 3
+                        {122, 167, 231}, { 66, 119, 225}, { 26,  87, 189},
+                        {  7,  76, 151}, {  2,  63, 125}, {  1,  59,  77},
+                    },
+                    {  // band 4
+                        {162, 147, 244}, {110,  97, 236}, { 32,  88, 204},
+                        { 11,  89, 174}, {  5,  78, 151}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {205,  59, 251}, {176,  68, 248}, { 90,  71, 223},
+                        { 49,  72, 188}, { 17,  74, 203}, {128, 128, 128},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {188,  70, 207}, {140,  73, 189}, { 85,  73, 163},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 59, 144, 239}, { 79, 126, 237}, { 31, 102, 202},
+                        { 10,  81, 153}, {  3,  56, 102}, {  2,  33,  59},
+                    },
+                    {  // band 2
+                        {100, 152, 243}, { 80, 129, 236}, { 14,  94, 194},
+                        {  4,  72, 150}, {  1,  50, 103}, {  1,  35,  60},
+                    },
+                    {  // band 3
+                        {130, 183, 247}, { 70, 139, 242}, { 19, 100, 203},
+                        {  4,  83, 159}, {  1,  59, 119}, {  1,  44,  72},
+                    },
+                    {  // band 4
+                        {197, 138, 252}, {135, 107, 247}, { 31,  86, 210},
+                        {  7,  74, 160}, {  1,  53, 107}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {229,  54, 254}, {200,  51, 251}, { 83,  61, 226},
+                        { 33,  55, 177}, { 12,  74, 145}, {128, 128, 128},
+                    },
+                },
+            },
+            {  // UV plane
+                {  // Intra
+                    {  // band 0
+                        {229,  20, 235}, {183,  37, 221}, {127,  47, 198},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {188, 115, 251}, {208, 110, 250}, {101,  99, 235},
+                        { 38,  81, 197}, {  9,  56, 132}, {  9,  52,  63},
+                    },
+                    {  // band 2
+                        {189, 150, 252}, {186, 137, 251}, { 54, 107, 236},
+                        { 14,  90, 195}, {  1,  89, 104}, {128, 128, 128},
+                    },
+                    {  // band 3
+                        {209, 180, 254}, {142, 145, 253}, { 51, 130, 236},
+                        {  6, 128, 214}, {  1, 128, 254}, {128, 128, 128},
+                    },
+                    {  // band 4
+                        {231, 140, 254}, {194, 128, 254}, { 75, 119, 233},
+                        {128,  23, 230}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {244,  59, 254}, {239,  81, 254}, {128,  85, 254},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {246,  55, 247}, {197,  64, 235}, {141,  74, 218},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {178, 163, 254}, {192, 138, 252}, { 85, 103, 231},
+                        { 49,  81, 179}, { 32,  54, 133}, { 12,  26,  98},
+                    },
+                    {  // band 2
+                        {189, 173, 254}, {179, 150, 253}, { 60,  94, 237},
+                        { 34,  81, 198}, { 20,  53, 187}, {128, 128, 128},
+                    },
+                    {  // band 3
+                        {202, 191, 254}, {157, 160, 254}, { 57, 117, 240},
+                        { 28, 105, 211}, {  1, 128,   1}, {128, 128, 128},
+                    },
+                    {  // band 4
+                        {231, 146, 254}, {208, 133, 254}, { 66,  78, 233},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {246,  49, 254}, {246,  63, 254}, { 85, 142, 254},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                },
+            },
+        },
+        {  // TX_SIZE 1
+            {  // Y plane
+                {  // Intra
+                    {  // band 0
+                        { 45,  28, 124}, { 23,  35, 107}, { 10,  34,  78},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 53,  99, 177}, { 82,  96, 174}, { 46,  89, 158},
+                        { 21,  76, 133}, {  6,  56,  94}, {  1,  33,  54},
+                    },
+                    {  // band 2
+                        { 68, 147, 201}, { 42, 124, 195}, { 17,  98, 166},
+                        {  7,  75, 131}, {  2,  53,  93}, {  1,  33,  59},
+                    },
+                    {  // band 3
+                        { 65, 176, 217}, { 30, 137, 206}, {  6,  97, 167},
+                        {  2,  70, 128}, {  1,  47,  88}, {  1,  29,  46},
+                    },
+                    {  // band 4
+                        { 69, 195, 232}, { 24, 146, 218}, {  4, 100, 175},
+                        {  2,  72, 134}, {  1,  51,  93}, {  1,  29,  52},
+                    },
+                    {  // band 5
+                        { 96, 212, 246}, { 39, 158, 234}, {  6, 109, 192},
+                        {  2,  77, 144}, {  1,  50,  95}, {  1,  20,  45},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        { 71,  80, 213}, { 53,  73, 181}, { 25,  66, 141},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 35, 168, 231}, { 91, 150, 229}, { 49, 122, 202},
+                        { 22,  97, 162}, { 10,  68, 108}, {  9,  48,  57},
+                    },
+                    {  // band 2
+                        { 56, 178, 236}, { 32, 148, 225}, {  9,  99, 176},
+                        {  4,  69, 127}, {  2,  44,  78}, {  1,  25,  41},
+                    },
+                    {  // band 3
+                        { 57, 191, 242}, { 27, 155, 230}, {  5, 102, 180},
+                        {  2,  71, 133}, {  1,  44,  78}, {  1,  27,  41},
+                    },
+                    {  // band 4
+                        { 67, 201, 247}, { 24, 162, 237}, {  3, 106, 188},
+                        {  3,  74, 137}, {  1,  46,  85}, {  1,  34,  48},
+                    },
+                    {  // band 5
+                        {111, 210, 251}, { 47, 166, 244}, {  3, 113, 199},
+                        {  2,  77, 146}, {  1,  48,  93}, {  1,  38,  22},
+                    },
+                },
+            },
+            {  // UV plane
+                {  // Intra
+                    {  // band 0
+                        {206,  21, 221}, {150,  36, 195}, { 94,  44, 164},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {147, 128, 239}, {194, 122, 238}, { 95, 104, 220},
+                        { 39,  81, 183}, { 13,  53, 111}, {  3,  24,  49},
+                    },
+                    {  // band 2
+                        {164, 163, 244}, {106, 142, 239}, { 50, 112, 215},
+                        { 26,  90, 177}, { 12,  67, 130}, {  1,   1,  64},
+                    },
+                    {  // band 3
+                        {155, 193, 249}, { 88, 158, 244}, { 26, 124, 220},
+                        { 10,  98, 173}, {  1,  77, 126}, {128, 128, 128},
+                    },
+                    {  // band 4
+                        {141, 205, 252}, { 64, 174, 248}, { 17, 124, 221},
+                        { 12,  92, 176}, {  1,  29, 148}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {150, 217, 254}, { 74, 191, 252}, { 30, 144, 215},
+                        {  1, 106, 137}, {128,   1, 128}, {128, 128, 128},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {241,  37, 242}, {175,  48, 223}, { 99,  53, 189},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {153, 183, 248}, {212, 156, 247}, {134, 124, 221},
+                        { 88, 103, 184}, { 59,  86, 132}, { 29,  61,  67},
+                    },
+                    {  // band 2
+                        {162, 199, 250}, {106, 167, 247}, { 56, 110, 207},
+                        { 32,  85, 165}, { 16,  71, 130}, {  1,  93, 254},
+                    },
+                    {  // band 3
+                        {143, 213, 252}, { 86, 187, 250}, { 23, 124, 220},
+                        {  7,  95, 176}, {  1, 109, 102}, {128, 128, 128},
+                    },
+                    {  // band 4
+                        {130, 219, 254}, { 70, 201, 253}, { 15, 128, 215},
+                        {  1, 101, 201}, {  1,  64, 170}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {155, 219, 254}, {105, 207, 254}, { 28, 155, 229},
+                        {  1, 153, 191}, {128, 128, 128}, {128, 128, 128},
+                    },
+                },
+            },
+        },
+        {  // TX_SIZE 2
+            {  // Y plane
+                {  // Intra
+                    {  // band 0
+                        { 18,  26, 117}, { 10,  29,  82}, {  3,  25,  52},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 35,  88, 152}, { 62,  85, 150}, { 36,  77, 137},
+                        { 16,  66, 116}, {  4,  47,  81}, {  1,  26,  44},
+                    },
+                    {  // band 2
+                        { 55, 141, 182}, { 32, 119, 177}, { 12,  93, 154},
+                        {  4,  71, 123}, {  1,  51,  89}, {  1,  32,  56},
+                    },
+                    {  // band 3
+                        { 46, 171, 202}, { 21, 130, 191}, {  5,  91, 154},
+                        {  1,  64, 115}, {  1,  42,  77}, {  1,  25,  41},
+                    },
+                    {  // band 4
+                        { 43, 195, 219}, { 12, 142, 203}, {  1,  91, 156},
+                        {  1,  63, 115}, {  1,  41,  77}, {  1,  22,  43},
+                    },
+                    {  // band 5
+                        { 42, 221, 238}, {  8, 162, 219}, {  1,  98, 167},
+                        {  1,  67, 123}, {  1,  43,  83}, {  1,  25,  38},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        { 16,  51, 216}, { 20,  48, 168}, {  9,  44, 109},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 34, 164, 226}, {124, 148, 226}, { 72, 127, 207},
+                        { 36, 107, 175}, { 15,  81, 129}, {  6,  51,  79},
+                    },
+                    {  // band 2
+                        { 61, 182, 234}, { 35, 148, 220}, {  9, 101, 178},
+                        {  4,  71, 134}, {  1,  46,  90}, {  1,  24,  51},
+                    },
+                    {  // band 3
+                        { 54, 198, 239}, { 25, 156, 224}, {  3,  98, 173},
+                        {  1,  66, 124}, {  1,  41,  78}, {  1,  15,  37},
+                    },
+                    {  // band 4
+                        { 48, 209, 242}, { 12, 162, 226}, {  1,  96, 169},
+                        {  1,  63, 119}, {  1,  40,  78}, {  1,  18,  45},
+                    },
+                    {  // band 5
+                        { 44, 223, 247}, {  6, 173, 232}, {  1, 105, 178},
+                        {  1,  71, 131}, {  1,  44,  84}, {  1,  13,  46},
+                    },
+                },
+            },
+            {  // UV plane
+                {  // Intra
+                    {  // band 0
+                        {188,  26, 214}, {121,  42, 181}, { 66,  49, 149},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {136, 128, 233}, {172, 124, 230}, { 80, 106, 211},
+                        { 27,  81, 174}, {  6,  49,  98}, {  8,  28,  49},
+                    },
+                    {  // band 2
+                        {145, 166, 239}, { 92, 141, 229}, { 28, 108, 196},
+                        {  8,  87, 154}, {  1,  58, 105}, {  1,  27,  59},
+                    },
+                    {  // band 3
+                        {131, 193, 242}, { 66, 151, 231}, { 13, 112, 192},
+                        {  2,  81, 152}, {  1,  66, 121}, {  1,  23,  64},
+                    },
+                    {  // band 4
+                        {112, 211, 246}, { 41, 164, 235}, {  5, 117, 202},
+                        {  1,  83, 162}, {  1,  64, 111}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        { 96, 230, 250}, { 28, 185, 243}, {  2, 132, 204},
+                        {  1,  91, 166}, {  1,  85,  46}, {128, 128, 128},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {238,  23, 242}, {157,  29, 215}, { 73,  27, 162},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {165, 173, 250}, {222, 151, 247}, {152, 134, 235},
+                        {114, 120, 210}, { 86, 109, 176}, { 53,  88, 145},
+                    },
+                    {  // band 2
+                        {164, 194, 249}, {100, 158, 241}, { 35, 111, 212},
+                        { 17,  85, 167}, {  1,  52, 112}, {  1,  73,   1},
+                    },
+                    {  // band 3
+                        {151, 215, 252}, { 83, 172, 245}, { 16, 122, 208},
+                        {  6, 101, 165}, {  1,  74, 113}, {  1,   1,   1},
+                    },
+                    {  // band 4
+                        {138, 230, 253}, { 65, 184, 248}, {  8, 128, 212},
+                        {  1, 111, 182}, {128,   1,   1}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {123, 240, 253}, { 36, 201, 250}, {  3, 127, 211},
+                        {  1,  68, 204}, {128,   1,   1}, {128, 128, 128},
+                    },
+                },
+            },
+        },
+        {  // TX_SIZE 3
+            {  // Y plane
+                {  // Intra
+                    {  // band 0
+                        { 51,  21, 156}, { 30,  23,  86}, {  4,  18,  37},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 38,  77, 129}, { 79,  76, 129}, { 40,  66, 117},
+                        { 12,  54,  95}, {  1,  36,  60}, {  1,  17,  29},
+                    },
+                    {  // band 2
+                        { 44, 133, 149}, { 24, 107, 143}, {  8,  78, 121},
+                        {  3,  59,  97}, {  1,  42,  71}, {  1,  22,  37},
+                    },
+                    {  // band 3
+                        { 29, 160, 171}, {  9, 114, 158}, {  1,  76, 125},
+                        {  1,  54,  93}, {  1,  36,  63}, {  1,  20,  35},
+                    },
+                    {  // band 4
+                        { 22, 188, 205}, {  6, 132, 186}, {  1,  87, 144},
+                        {  1,  62, 107}, {  1,  41,  72}, {  1,  23,  41},
+                    },
+                    {  // band 5
+                        { 25, 233, 236}, {  5, 165, 214}, {  1,  96, 158},
+                        {  1,  63, 112}, {  1,  40,  73}, {  1,  23,  40},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        { 48,  20, 231}, { 37,  21, 179}, { 15,  18, 109},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 41, 154, 216}, {196, 142, 221}, {131, 125, 207},
+                        { 84, 111, 181}, { 45,  91, 142}, { 27,  62,  89},
+                    },
+                    {  // band 2
+                        { 72, 181, 230}, { 41, 147, 215}, { 10, 102, 173},
+                        {  3,  73, 132}, {  1,  47,  89}, {  1,  23,  50},
+                    },
+                    {  // band 3
+                        { 60, 201, 236}, { 23, 157, 219}, {  2,  99, 167},
+                        {  1,  69, 124}, {  1,  43,  80}, {  1,  22,  39},
+                    },
+                    {  // band 4
+                        { 53, 214, 242}, { 15, 165, 224}, {  1, 101, 173},
+                        {  1,  70, 131}, {  1,  44,  83}, {  1,  23,  49},
+                    },
+                    {  // band 5
+                        { 39, 239, 248}, {  7, 186, 233}, {  1, 108, 174},
+                        {  1,  70, 123}, {  1,  43,  77}, {  1,  16,  42},
+                    },
+                },
+            },
+            {  // UV plane
+                {  // Intra
+                    {  // band 0
+                        {161,  26, 204}, { 77,  40, 160}, { 26,  50, 117},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 80, 140, 218}, {136, 133, 215}, { 63, 117, 197},
+                        { 20,  93, 170}, {  7,  55, 102}, { 13,  32,  52},
+                    },
+                    {  // band 2
+                        { 86, 173, 231}, { 46, 150, 220}, { 18, 118, 190},
+                        {  8,  90, 150}, {  2,  60,  95}, {  1,  39,  41},
+                    },
+                    {  // band 3
+                        { 80, 183, 242}, { 37, 160, 231}, {  6, 120, 182},
+                        {  1,  86, 137}, {  1,  46,  78}, {  1,  15,  24},
+                    },
+                    {  // band 4
+                        { 88, 215, 247}, { 42, 179, 235}, {  4, 116, 182},
+                        {  2,  80, 133}, {  1,  46,  85}, {  1,  64,  43},
+                    },
+                    {  // band 5
+                        {100, 236, 250}, { 31, 186, 234}, {  1, 114, 181},
+                        {  1,  85, 135}, {  1,  78,  64}, {128, 128, 128},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {213,  13, 245}, {106,  16, 211}, { 32,  11, 156},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {140, 214, 247}, {241, 186, 243}, {177, 172, 235},
+                        {128, 156, 219}, {106, 130, 191}, { 99, 105, 152},
+                    },
+                    {  // band 2
+                        {125, 218, 248}, { 75, 167, 239}, { 29, 111, 212},
+                        {  6,  66, 152}, {  1,  42,  96}, {  1,  85, 128},
+                    },
+                    {  // band 3
+                        {120, 232, 252}, { 60, 189, 247}, {  8, 141, 200},
+                        {  1,  89, 134}, {  1,  32, 128}, {128, 128, 128},
+                    },
+                    {  // band 4
+                        {111, 238, 253}, { 56, 198, 245}, {  1, 123, 208},
+                        {  1,  93, 176}, {  1,   1,  73}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        { 98, 251, 249}, { 56, 189, 244}, { 17, 113, 220},
+                        {  1, 109, 179}, {128, 128, 128}, {128, 128, 128},
+                    },
+                },
+            },
+        },
+    },
+    {  // Q_Index 3
+        {  // TX_SIZE 0
+            {  // Y plane
+                {  // Intra
+                    {  // band 0
+                        {186,  16, 200}, {122,  31, 187}, { 78,  40, 161},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {146, 119, 245}, {182, 115, 244}, {130, 113, 238},
+                        { 88, 110, 225}, { 47, 103, 208}, {  5, 102, 188},
+                    },
+                    {  // band 2
+                        {164, 157, 248}, {155, 141, 250}, { 71, 116, 243},
+                        { 88, 129, 233}, { 50,  99, 228}, { 26, 148, 191},
+                    },
+                    {  // band 3
+                        {200, 158, 253}, {177, 118, 252}, { 99, 113, 245},
+                        { 77, 120, 210}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 4
+                        {231, 104, 254}, {209,  82, 254}, {143, 112, 252},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {250,  36, 254}, {243,  55, 254}, {223, 170, 254},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {207,  37, 226}, {164,  46, 218}, {122,  58, 201},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {149, 154, 253}, {170, 137, 253}, { 94, 123, 247},
+                        { 42, 113, 222}, { 16,  97, 174}, { 49,  98, 159},
+                    },
+                    {  // band 2
+                        {177, 162, 253}, {165, 142, 252}, { 51, 108, 243},
+                        { 18, 108, 213}, {  1,  98, 254}, {128, 128, 128},
+                    },
+                    {  // band 3
+                        {211, 152, 254}, {184, 116, 254}, { 70, 110, 244},
+                        {  8, 108, 237}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 4
+                        {236,  89, 254}, {210,  67, 254}, {112, 111, 248},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {246,  26, 254}, {233,  35, 254}, {128,   1, 254},
+                        {254, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                },
+            },
+            {  // UV plane
+                {  // Intra
+                    {  // band 0
+                        {247,   2, 247}, {226,   8, 242}, {191,  14, 235},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {231,  94, 254}, {248,  91, 254}, {186,  89, 252},
+                        {128,  92, 244}, { 79, 112, 254}, {128, 128, 128},
+                    },
+                    {  // band 2
+                        {228, 145, 253}, {240, 130, 254}, {223, 105, 254},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 3
+                        {245, 153, 253}, {240, 120, 254}, {128, 128, 128},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 4
+                        {254, 128, 254}, {204, 128, 254}, {128, 128, 128},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {253,   7, 249}, {224,   9, 244}, {182,  13, 231},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {234, 109, 254}, {242, 104, 254}, {160,  98, 254},
+                        {123,  85, 243}, { 82,  43, 217}, {128, 128, 128},
+                    },
+                    {  // band 2
+                        {243, 137, 254}, {240, 118, 254}, {136,  53, 254},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 3
+                        {251, 173, 254}, {229, 129, 250}, {128, 128, 128},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 4
+                        {254, 119, 254}, {254, 128, 128}, {128, 128, 128},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                },
+            },
+        },
+        {  // TX_SIZE 1
+            {  // Y plane
+                {  // Intra
+                    {  // band 0
+                        { 49,  26, 159}, { 36,  34, 150}, { 26,  38, 124},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 99, 122, 226}, {143, 119, 225}, { 90, 113, 213},
+                        { 46, 102, 193}, { 14,  84, 157}, {  3,  59, 107},
+                    },
+                    {  // band 2
+                        {109, 164, 237}, { 74, 142, 233}, { 29, 112, 216},
+                        { 14,  92, 184}, { 10,  80, 156}, {  1,  52, 137},
+                    },
+                    {  // band 3
+                        {110, 191, 245}, { 59, 156, 240}, { 18, 121, 220},
+                        {  8,  97, 184}, {  3,  84, 150}, {128, 128, 128},
+                    },
+                    {  // band 4
+                        {115, 203, 250}, { 59, 167, 246}, { 16, 130, 226},
+                        {  7,  97, 192}, {  1,  71,  99}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {149, 218, 253}, { 93, 171, 251}, { 28, 125, 233},
+                        { 28,  99, 192}, {128,  85,  85}, {128, 128, 128},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        { 97,  45, 229}, { 79,  52, 205}, { 46,  58, 171},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 99, 180, 249}, {156, 165, 249}, { 73, 141, 237},
+                        { 31, 116, 208}, { 13,  81, 153}, {  5,  42,  86},
+                    },
+                    {  // band 2
+                        {113, 188, 251}, { 68, 161, 244}, { 16, 108, 216},
+                        {  6,  81, 168}, {  2,  65, 118}, {128,   1,   1},
+                    },
+                    {  // band 3
+                        {117, 201, 252}, { 62, 171, 248}, { 12, 119, 221},
+                        {  5,  90, 182}, {  4,  66, 116}, {128, 128, 128},
+                    },
+                    {  // band 4
+                        {128, 207, 253}, { 70, 176, 251}, { 11, 126, 228},
+                        {  6,  89, 189}, {  1,  44, 148}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {162, 218, 254}, {107, 170, 253}, { 22, 131, 238},
+                        {  1,  77, 182}, {  1, 254, 128}, {128, 128, 128},
+                    },
+                },
+            },
+            {  // UV plane
+                {  // Intra
+                    {  // band 0
+                        {235,   5, 238}, {194,  14, 223}, {152,  22, 205},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {200, 121, 251}, {241, 115, 252}, {167, 108, 248},
+                        { 93,  93, 233}, { 36,  66, 189}, {128, 128, 128},
+                    },
+                    {  // band 2
+                        {220, 151, 253}, {176, 135, 252}, { 95, 124, 254},
+                        { 64, 105, 217}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 3
+                        {225, 189, 254}, {175, 155, 254}, {102, 119, 254},
+                        {  1,   1,   1}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 4
+                        {218, 195, 254}, {125, 157, 253}, {128, 128, 254},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {221, 197, 254}, { 85, 210, 254}, {128, 128, 128},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {250,   9, 246}, {204,  13, 234}, {144,  18, 211},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {213, 157, 253}, {243, 138, 253}, {170, 117, 250},
+                        {109,  91, 233}, { 66,  77, 163}, { 64,  85, 254},
+                    },
+                    {  // band 2
+                        {221, 169, 254}, {182, 141, 253}, {112, 120, 239},
+                        { 85, 165, 254}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 3
+                        {226, 192, 254}, {189, 174, 251}, {153, 128, 254},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 4
+                        {232, 192, 254}, {195, 187, 247}, {  1, 191, 254},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {247, 185, 254}, {254,  93, 254}, {128, 128, 128},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                },
+            },
+        },
+        {  // TX_SIZE 2
+            {  // Y plane
+                {  // Intra
+                    {  // band 0
+                        { 14,  30, 136}, { 15,  33, 120}, { 10,  33,  90},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 92, 109, 209}, {113, 108, 207}, { 77, 102, 193},
+                        { 39,  91, 171}, { 11,  70, 129}, {  2,  44,  77},
+                    },
+                    {  // band 2
+                        { 99, 158, 223}, { 66, 135, 217}, { 23, 109, 194},
+                        {  9,  85, 160}, {  3,  66, 124}, {  1,  51, 100},
+                    },
+                    {  // band 3
+                        { 89, 189, 234}, { 46, 149, 225}, { 10, 110, 194},
+                        {  2,  83, 156}, {  1,  57, 113}, {  1,  47,  73},
+                    },
+                    {  // band 4
+                        { 78, 206, 242}, { 28, 161, 232}, {  3, 114, 200},
+                        {  1,  86, 161}, {  1,  62, 118}, {  1,   1,   1},
+                    },
+                    {  // band 5
+                        { 72, 227, 250}, { 20, 182, 242}, {  3, 126, 210},
+                        {  2,  91, 166}, {  1,  64, 126}, {128, 128, 128},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        { 23,  42, 227}, { 41,  43, 195}, { 25,  45, 146},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {100, 172, 245}, {165, 158, 246}, { 88, 137, 234},
+                        { 44, 116, 203}, { 18,  85, 149}, {  7,  56,  92},
+                    },
+                    {  // band 2
+                        {117, 188, 247}, { 70, 155, 239}, { 18, 105, 204},
+                        {  7,  78, 158}, {  2,  50, 111}, {  1,  38,  77},
+                    },
+                    {  // band 3
+                        {104, 207, 250}, { 54, 166, 241}, {  6, 110, 199},
+                        {  1,  78, 155}, {  1,  45, 100}, {  1,   1,   1},
+                    },
+                    {  // band 4
+                        { 87, 216, 251}, { 30, 177, 243}, {  1, 114, 203},
+                        {  1,  85, 157}, {  1,  53, 108}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        { 80, 230, 253}, { 23, 193, 248}, {  1, 127, 215},
+                        {  1,  94, 170}, {  1,  71,  59}, {128, 128, 128},
+                    },
+                },
+            },
+            {  // UV plane
+                {  // Intra
+                    {  // band 0
+                        {222,   9, 234}, {161,  20, 210}, {113,  30, 185},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {195, 120, 248}, {231, 124, 247}, {148, 116, 238},
+                        { 64,  98, 207}, { 20,  70, 147}, { 87,  68, 100},
+                    },
+                    {  // band 2
+                        {186, 161, 250}, {124, 148, 245}, { 44, 123, 230},
+                        { 23, 107, 205}, {  1,  80, 131}, {128, 128, 128},
+                    },
+                    {  // band 3
+                        {172, 196, 252}, {110, 160, 248}, { 37, 134, 235},
+                        { 23, 125, 200}, {128, 254, 128}, {128, 128, 128},
+                    },
+                    {  // band 4
+                        {173, 209, 253}, {103, 175, 250}, {  1, 120, 240},
+                        {  1, 146, 254}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {184, 235, 254}, { 81, 186, 251}, {128, 109, 254},
+                        {128, 254, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {248,   8, 243}, {185,  11, 225}, {108,  11, 189},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {208, 158, 254}, {244, 147, 252}, {195, 132, 248},
+                        {161, 122, 224}, {129, 114, 188}, { 59, 119, 159},
+                    },
+                    {  // band 2
+                        {202, 182, 253}, {143, 161, 251}, { 73, 115, 247},
+                        {146, 175, 204}, {128,   1, 254}, {128, 128, 128},
+                    },
+                    {  // band 3
+                        {202, 204, 254}, {131, 174, 251}, { 18, 153, 207},
+                        {128, 254, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 4
+                        {192, 221, 254}, {114, 190, 254}, {128, 170, 254},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {166, 236, 254}, {119, 200, 254}, {128, 128, 128},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                },
+            },
+        },
+        {  // TX_SIZE 3
+            {  // Y plane
+                {  // Intra
+                    {  // band 0
+                        { 30,  32, 144}, { 21,  35,  96}, {  4,  27,  55},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 35, 107, 172}, { 61, 104, 170}, { 33,  94, 160},
+                        { 13,  80, 139}, {  2,  55,  97}, {  1,  28,  49},
+                    },
+                    {  // band 2
+                        { 51, 153, 195}, { 29, 129, 189}, {  9,  99, 163},
+                        {  3,  75, 129}, {  1,  49,  88}, {  1,  29,  50},
+                    },
+                    {  // band 3
+                        { 53, 164, 210}, { 21, 134, 201}, {  3,  97, 164},
+                        {  1,  69, 124}, {  1,  45,  82}, {  1,  31,  58},
+                    },
+                    {  // band 4
+                        { 47, 205, 234}, { 18, 158, 220}, {  2, 109, 177},
+                        {  1,  78, 137}, {  1,  53, 101}, {  1,  34,  70},
+                    },
+                    {  // band 5
+                        { 55, 233, 245}, { 16, 179, 233}, {  1, 116, 191},
+                        {  1,  79, 145}, {  1,  53, 101}, {  1,  37,  58},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        { 36,  33, 227}, { 39,  28, 190}, { 18,  27, 134},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        { 76, 156, 235}, {184, 147, 235}, {114, 130, 220},
+                        { 72, 112, 191}, { 42,  87, 144}, { 21,  65,  93},
+                    },
+                    {  // band 2
+                        { 96, 179, 240}, { 51, 149, 228}, { 12, 105, 191},
+                        {  6,  74, 148}, {  1,  47, 100}, {  1,  29,  53},
+                    },
+                    {  // band 3
+                        { 88, 191, 242}, { 35, 154, 231}, {  3, 106, 187},
+                        {  1,  74, 140}, {  1,  41,  84}, {  1,  25,  38},
+                    },
+                    {  // band 4
+                        { 77, 212, 249}, { 28, 171, 239}, {  2, 117, 199},
+                        {  1,  79, 151}, {  1,  45,  99}, {  1,   1,   1},
+                    },
+                    {  // band 5
+                        { 77, 236, 252}, { 27, 190, 246}, {  2, 120, 203},
+                        {  1,  78, 147}, {  1,  42,  72}, {128, 128, 128},
+                    },
+                },
+            },
+            {  // UV plane
+                {  // Intra
+                    {  // band 0
+                        {185,  11, 227}, {113,  30, 182}, { 57,  44, 144},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {151, 139, 244}, {212, 139, 241}, {124, 126, 231},
+                        { 59, 104, 213}, { 26,  73, 158}, { 20,  45,  95},
+                    },
+                    {  // band 2
+                        {155, 163, 247}, {108, 152, 239}, { 39, 124, 214},
+                        {  7, 109, 162}, { 29,  57, 128}, {128, 128, 128},
+                    },
+                    {  // band 3
+                        {158, 176, 250}, { 89, 164, 243}, { 11, 114, 196},
+                        {  1,  96, 141}, {  1,  81, 118}, {128,   1,   1},
+                    },
+                    {  // band 4
+                        {148, 212, 251}, { 59, 174, 240}, {  2, 130, 203},
+                        {  1,  70, 168}, {  1,  51, 106}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {104, 237, 252}, { 39, 190, 246}, {  1, 154, 220},
+                        {128, 102,   1}, {128, 128, 128}, {128, 128, 128},
+                    },
+                },
+                {  // Inter
+                    {  // band 0
+                        {236,   6, 242}, {111,   6, 206}, { 36,   5, 161},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 1
+                        {193, 193, 252}, {248, 182, 251}, {218, 150, 246},
+                        {182, 134, 244}, {151, 137, 227}, { 45, 102, 195},
+                    },
+                    {  // band 2
+                        {188, 202, 251}, {125, 165, 249}, { 64,  75, 218},
+                        {  1, 128, 254}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 3
+                        {178, 225, 254}, {107, 188, 231}, { 21, 135, 233},
+                        {128,   1, 254}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 4
+                        {164, 227, 253}, { 55, 193, 251}, {  1, 111, 225},
+                        {128, 128, 128}, {128, 128, 128}, {128, 128, 128},
+                    },
+                    {  // band 5
+                        {151, 243, 254}, { 50, 203, 254}, {128, 179, 254},
+                        {128,   1, 254}, {128, 128, 128}, {128, 128, 128},
+                    },
+                },
+            },
+        },
+    },
+};
+#else
 static const vp10_coeff_probs_model default_coef_probs_4x4[PLANE_TYPES] = {
   {  // Y plane
     {  // Intra
@@ -740,12 +2785,11 @@
     }
   }
 };
+#endif  // CONFIG_ENTROPY
 
 static void extend_to_full_distribution(vpx_prob *probs, vpx_prob p) {
-  // TODO(aconverse): model[PIVOT_NODE] should never be zero.
-  // https://code.google.com/p/webm/issues/detail?id=1089
-  memcpy(probs, vp10_pareto8_full[p == 0 ? 254 : p - 1],
-         MODEL_NODES * sizeof(vpx_prob));
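+  // vp10_pareto8_full is indexed by p - 1, so the pivot probability must be
+  // nonzero here; the old clamp of p == 0 to index 254 is no longer needed.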
+  assert(p != 0);
+  memcpy(probs, vp10_pareto8_full[p - 1], MODEL_NODES * sizeof(vpx_prob));
 }
 
 void vp10_model_to_full_probs(const vpx_prob *model, vpx_prob *full) {
@@ -754,28 +2798,64 @@
   extend_to_full_distribution(&full[UNCONSTRAINED_NODES], model[PIVOT_NODE]);
 }
 
+#if CONFIG_ANS
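+// Builds the rANS decoding CDF for one coefficient context: the 8-bit
+// zero-token probability (pdf_model[1]) is merged with the Pareto tail
+// selected by pdf_model[2] into a pdf over the non-EOB tokens, which is
+// then accumulated into a cdf.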
+void vp10_build_token_cdfs(const vpx_prob *pdf_model, rans_dec_lut cdf) {
+  AnsP10 pdf_tab[ENTROPY_TOKENS - 1];
+  assert(pdf_model[2] != 0);
+  // TODO(aconverse): Investigate making the precision of the zero and EOB tree
+  // nodes 10-bits.
+  rans_merge_prob8_pdf(pdf_tab, pdf_model[1],
+                       vp10_pareto8_token_probs[pdf_model[2] - 1],
+                       ENTROPY_TOKENS - 2);
+  rans_build_cdf_from_pdf(pdf_tab, cdf);
+}
+
+void vp10_coef_pareto_cdfs(FRAME_CONTEXT *fc) {
+  TX_SIZE t;
+  int i, j, k, l;
+  for (t = TX_4X4; t <= TX_32X32; ++t)
+    for (i = 0; i < PLANE_TYPES; ++i)
+      for (j = 0; j < REF_TYPES; ++j)
+        for (k = 0; k < COEF_BANDS; ++k)
+          for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l)
+            vp10_build_token_cdfs(fc->coef_probs[t][i][j][k][l],
+                                  fc->coef_cdfs[t][i][j][k][l]);
+}
+#endif  // CONFIG_ANS
+
 void vp10_default_coef_probs(VP10_COMMON *cm) {
+#if CONFIG_ENTROPY
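+  // Pick the default table matching the frame's quantizer: base_qindex
+  // (0..255) is rounded into one of QCTX_BINS bins, e.g. with
+  // QCTX_BIN_BITS == 2, base_qindex 97 maps to (97 + 32) >> 6 == 2.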
+  const int index =
+      VPXMIN(ROUND_POWER_OF_TWO(cm->base_qindex, 8 - QCTX_BIN_BITS),
+             QCTX_BINS - 1);
+  vp10_copy(cm->fc->coef_probs, default_qctx_coef_probs[index]);
+#else
   vp10_copy(cm->fc->coef_probs[TX_4X4], default_coef_probs_4x4);
   vp10_copy(cm->fc->coef_probs[TX_8X8], default_coef_probs_8x8);
   vp10_copy(cm->fc->coef_probs[TX_16X16], default_coef_probs_16x16);
   vp10_copy(cm->fc->coef_probs[TX_32X32], default_coef_probs_32x32);
+#endif  // CONFIG_ENTROPY
+#if CONFIG_ANS
+  vp10_coef_pareto_cdfs(cm->fc);
+#endif  // CONFIG_ANS
 }
 
-#define COEF_COUNT_SAT 24
-#define COEF_MAX_UPDATE_FACTOR 112
-#define COEF_COUNT_SAT_KEY 24
-#define COEF_MAX_UPDATE_FACTOR_KEY 112
-#define COEF_COUNT_SAT_AFTER_KEY 24
-#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
-
 static void adapt_coef_probs(VP10_COMMON *cm, TX_SIZE tx_size,
                              unsigned int count_sat,
                              unsigned int update_factor) {
   const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
   vp10_coeff_probs_model *const probs = cm->fc->coef_probs[tx_size];
+#if CONFIG_ENTROPY
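+  // For in-frame (partial) updates, adapt from the probabilities the frame
+  // started with rather than from the stored frame context.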
+  const vp10_coeff_probs_model *const pre_probs = cm->partial_prob_update ?
+      (const vp10_coeff_probs_model *)cm->starting_coef_probs[tx_size] :
+      pre_fc->coef_probs[tx_size];
+#else
   const vp10_coeff_probs_model *const pre_probs = pre_fc->coef_probs[tx_size];
-  vp10_coeff_count_model *counts = cm->counts.coef[tx_size];
-  unsigned int (*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
+#endif  // CONFIG_ENTROPY
+  const vp10_coeff_count_model *const counts =
+      (const vp10_coeff_count_model *)cm->counts.coef[tx_size];
+  const unsigned int (*eob_counts)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS] =
+      (const unsigned int (*)[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS])
       cm->counts.eob_branch[tx_size];
   int i, j, k, l, m;
 
@@ -793,9 +2873,9 @@
             { n1, n2 }
           };
           for (m = 0; m < UNCONSTRAINED_NODES; ++m)
-            probs[i][j][k][l][m] = merge_probs(pre_probs[i][j][k][l][m],
-                                               branch_ct[m],
-                                               count_sat, update_factor);
+            probs[i][j][k][l][m] = vp10_merge_probs(pre_probs[i][j][k][l][m],
+                                                    branch_ct[m],
+                                                    count_sat, update_factor);
         }
 }
 
@@ -803,16 +2883,41 @@
   TX_SIZE t;
   unsigned int count_sat, update_factor;
 
-  if (frame_is_intra_only(cm)) {
-    update_factor = COEF_MAX_UPDATE_FACTOR_KEY;
-    count_sat = COEF_COUNT_SAT_KEY;
-  } else if (cm->last_frame_type == KEY_FRAME) {
+#if CONFIG_ENTROPY
+  if (cm->last_frame_type == KEY_FRAME) {
+    update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY_BITS;  /* adapt quickly */
+    count_sat = COEF_COUNT_SAT_AFTER_KEY_BITS;
+  } else {
+    update_factor = COEF_MAX_UPDATE_FACTOR_BITS;
+    count_sat = COEF_COUNT_SAT_BITS;
+  }
+  if (cm->partial_prob_update == 1) {
+    update_factor = COEF_MAX_UPDATE_FACTOR_BITS;
+  }
+#else
+  if (cm->last_frame_type == KEY_FRAME) {
     update_factor = COEF_MAX_UPDATE_FACTOR_AFTER_KEY;  /* adapt quickly */
     count_sat = COEF_COUNT_SAT_AFTER_KEY;
   } else {
     update_factor = COEF_MAX_UPDATE_FACTOR;
     count_sat = COEF_COUNT_SAT;
   }
+#endif  // CONFIG_ENTROPY
   for (t = TX_4X4; t <= TX_32X32; t++)
     adapt_coef_probs(cm, t, count_sat, update_factor);
+#if CONFIG_ANS
+  vp10_coef_pareto_cdfs(cm->fc);
+#endif
 }
+
+#if CONFIG_ENTROPY
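+// mi_row/mi_col are accepted for future position-dependent updates but are
+// unused so far: when the frame context is refreshed backward-adaptively,
+// coefficient adaptation is simply re-run mid-frame.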
+void vp10_partial_adapt_probs(VP10_COMMON *cm, int mi_row, int mi_col) {
+  (void)mi_row;
+  (void)mi_col;
+
+  if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+    cm->partial_prob_update = 1;
+    vp10_adapt_coef_probs(cm);
+  }
+}
+#endif  // CONFIG_ENTROPY
diff --git a/vp10/common/entropy.h b/vp10/common/entropy.h
index 9a471c8..d0ca880 100644
--- a/vp10/common/entropy.h
+++ b/vp10/common/entropy.h
@@ -14,6 +14,9 @@
 #include "vpx/vpx_integer.h"
 #include "vpx_dsp/prob.h"
 
+#if CONFIG_ANS
+#include "vp10/common/ans.h"
+#endif  // CONFIG_ANS
 #include "vp10/common/common.h"
 #include "vp10/common/enums.h"
 
@@ -21,8 +24,14 @@
 extern "C" {
 #endif
 
-#define DIFF_UPDATE_PROB        252
-#define GROUP_DIFF_UPDATE_PROB  252
+#define DIFF_UPDATE_PROB       252
+#define GROUP_DIFF_UPDATE_PROB 252
+
+#if CONFIG_ENTROPY
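+// Quantizer-binned default coefficient probabilities: base_qindex is
+// bucketed into QCTX_BINS == (1 << QCTX_BIN_BITS) bins, one table per bin.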
+#define COEF_PROBS_BUFS 16
+#define QCTX_BIN_BITS 2
+#define QCTX_BINS (1 << QCTX_BIN_BITS)
+#endif  // CONFIG_ENTROPY
 
 // Coefficient token alphabet
 #define ZERO_TOKEN      0   // 0     Extra Bits 0+0
@@ -136,6 +145,9 @@
 struct VP10Common;
 void vp10_default_coef_probs(struct VP10Common *cm);
 void vp10_adapt_coef_probs(struct VP10Common *cm);
+#if CONFIG_ENTROPY
+void vp10_partial_adapt_probs(struct VP10Common *cm, int mi_row, int mi_col);
+#endif  // CONFIG_ENTROPY
 
 // This is the index in the scan order beyond which all coefficients for
 // 8x8 transform and above are in the top band.
@@ -163,6 +175,12 @@
 #define MODEL_NODES (ENTROPY_NODES - UNCONSTRAINED_NODES)
 extern const vpx_tree_index vp10_coef_con_tree[TREE_SIZE(ENTROPY_TOKENS)];
 extern const vpx_prob vp10_pareto8_full[COEFF_PROB_MODELS][MODEL_NODES];
+#if CONFIG_ANS
+extern const AnsP10
+    vp10_pareto8_token_probs[COEFF_PROB_MODELS][ENTROPY_TOKENS - 2];
+
+typedef rans_dec_lut coeff_cdf_model[REF_TYPES][COEF_BANDS][COEFF_CONTEXTS];
+#endif  // CONFIG_ANS
 
 typedef vpx_prob vp10_coeff_probs_model[REF_TYPES][COEF_BANDS]
                                       [COEFF_CONTEXTS][UNCONSTRAINED_NODES];
@@ -209,6 +227,55 @@
   return combine_entropy_contexts(above_ec, left_ec);
 }
 
+#if CONFIG_ANS
+struct frame_contexts;
+void vp10_coef_pareto_cdfs(struct frame_contexts *fc);
+#endif  // CONFIG_ANS
+
+#if CONFIG_ENTROPY
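+// Log2 variants of the adaptation constants below: counts saturate at
+// 1 << COEF_COUNT_SAT_BITS == 32 and the maximum update weight is
+// 1 << COEF_MAX_UPDATE_FACTOR_BITS == 128 out of 256.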
+#define COEF_COUNT_SAT_BITS                   5
+#define COEF_MAX_UPDATE_FACTOR_BITS           7
+#define COEF_COUNT_SAT_AFTER_KEY_BITS         5
+#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY_BITS 7
+#define MODE_MV_COUNT_SAT_BITS                5
+#define MODE_MV_MAX_UPDATE_FACTOR_BITS        7
+
+#else
+
+#define COEF_COUNT_SAT 24
+#define COEF_MAX_UPDATE_FACTOR 112
+#define COEF_COUNT_SAT_AFTER_KEY 24
+#define COEF_MAX_UPDATE_FACTOR_AFTER_KEY 128
+
+#endif  // CONFIG_ENTROPY
+
+static INLINE vpx_prob vp10_merge_probs(vpx_prob pre_prob,
+                                        const unsigned int ct[2],
+                                        unsigned int count_sat,
+                                        unsigned int max_update_factor) {
+#if CONFIG_ENTROPY
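+  // Shift-based merge: the new estimate receives weight factor / 256, where
+  // factor = min(count, 1 << count_sat) << (max_update_factor - count_sat),
+  // reaching 1 << max_update_factor once the count saturates.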
+  const vpx_prob prob = get_binary_prob(ct[0], ct[1]);
+  const unsigned int count =
+      VPXMIN(ct[0] + ct[1], (unsigned int)(1 << count_sat));
+  const unsigned int factor =
+      count << (max_update_factor - count_sat);
+  return weighted_prob(pre_prob, prob, factor);
+#else
+  return merge_probs(pre_prob, ct, count_sat, max_update_factor);
+#endif  // CONFIG_ENTROPY
+}
+
+static INLINE vpx_prob vp10_mode_mv_merge_probs(vpx_prob pre_prob,
+                                                const unsigned int ct[2]) {
+#if CONFIG_ENTROPY
+  return vp10_merge_probs(pre_prob, ct,
+                          MODE_MV_COUNT_SAT_BITS,
+                          MODE_MV_MAX_UPDATE_FACTOR_BITS);
+#else
+  return mode_mv_merge_probs(pre_prob, ct);
+#endif  // CONFIG_ENTROPY
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/common/entropymode.c b/vp10/common/entropymode.c
index 78f3650..8fb88b2 100644
--- a/vp10/common/entropymode.c
+++ b/vp10/common/entropymode.c
@@ -10,6 +10,7 @@
 
 #include "vpx_mem/vpx_mem.h"
 
+#include "vp10/common/reconinter.h"
 #include "vp10/common/onyxc_int.h"
 #include "vp10/common/seg_common.h"
 
@@ -127,21 +128,6 @@
   }
 };
 
-#if !CONFIG_MISC_FIXES
-const vpx_prob vp10_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1] = {
-  { 144,  11,  54, 157, 195, 130,  46,  58, 108 },  // y = dc
-  { 118,  15, 123, 148, 131, 101,  44,  93, 131 },  // y = v
-  { 113,  12,  23, 188, 226, 142,  26,  32, 125 },  // y = h
-  { 120,  11,  50, 123, 163, 135,  64,  77, 103 },  // y = d45
-  { 113,   9,  36, 155, 111, 157,  32,  44, 161 },  // y = d135
-  { 116,   9,  55, 176,  76,  96,  37,  61, 149 },  // y = d117
-  { 115,   9,  28, 141, 161, 167,  21,  25, 193 },  // y = d153
-  { 120,  12,  32, 145, 195, 142,  32,  38,  86 },  // y = d207
-  { 116,  12,  64, 120, 140, 125,  49, 115, 121 },  // y = d63
-  { 102,  19,  66, 162, 182, 122,  35,  59, 128 }   // y = tm
-};
-#endif
-
 static const vpx_prob default_if_y_probs[BLOCK_SIZE_GROUPS][INTRA_MODES - 1] = {
   {  65,  32,  18, 144, 162, 194,  41,  51,  98 },  // block_size < 8x8
   { 132,  68,  18, 165, 217, 196,  45,  40,  78 },  // block_size < 16x16
@@ -162,32 +148,38 @@
   { 101,  21, 107, 181, 192, 103,  19,  67, 125 }   // y = tm
 };
 
-#if !CONFIG_MISC_FIXES
-const vpx_prob vp10_kf_partition_probs[PARTITION_CONTEXTS]
-                                     [PARTITION_TYPES - 1] = {
+#if CONFIG_EXT_PARTITION_TYPES
+static const vpx_prob default_partition_probs[PARTITION_CONTEXTS]
+                                             [EXT_PARTITION_TYPES - 1] = {
   // 8x8 -> 4x4
-  { 158,  97,  94 },  // a/l both not split
-  {  93,  24,  99 },  // a split, l not split
-  {  85, 119,  44 },  // l split, a not split
-  {  62,  59,  67 },  // a/l both split
+  { 199, 122, 141, 128, 128, 128, 128 },  // a/l both not split
+  { 147,  63, 159, 128, 128, 128, 128 },  // a split, l not split
+  { 148, 133, 118, 128, 128, 128, 128 },  // l split, a not split
+  { 121, 104, 114, 128, 128, 128, 128 },  // a/l both split
   // 16x16 -> 8x8
-  { 149,  53,  53 },  // a/l both not split
-  {  94,  20,  48 },  // a split, l not split
-  {  83,  53,  24 },  // l split, a not split
-  {  52,  18,  18 },  // a/l both split
+  { 174,  73,  87, 128, 128, 128, 128 },  // a/l both not split
+  {  92,  41,  83, 128, 128, 128, 128 },  // a split, l not split
+  {  82,  99,  50, 128, 128, 128, 128 },  // l split, a not split
+  {  53,  39,  39, 128, 128, 128, 128 },  // a/l both split
   // 32x32 -> 16x16
-  { 150,  40,  39 },  // a/l both not split
-  {  78,  12,  26 },  // a split, l not split
-  {  67,  33,  11 },  // l split, a not split
-  {  24,   7,   5 },  // a/l both split
+  { 177,  58,  59, 128, 128, 128, 128 },  // a/l both not split
+  {  68,  26,  63, 128, 128, 128, 128 },  // a split, l not split
+  {  52,  79,  25, 128, 128, 128, 128 },  // l split, a not split
+  {  17,  14,  12, 128, 128, 128, 128 },  // a/l both split
   // 64x64 -> 32x32
-  { 174,  35,  49 },  // a/l both not split
-  {  68,  11,  27 },  // a split, l not split
-  {  57,  15,   9 },  // l split, a not split
-  {  12,   3,   3 },  // a/l both split
+  { 222,  34,  30, 128, 128, 128, 128 },  // a/l both not split
+  {  72,  16,  44, 128, 128, 128, 128 },  // a split, l not split
+  {  58,  32,  12, 128, 128, 128, 128 },  // l split, a not split
+  {  10,   7,   6, 128, 128, 128, 128 },  // a/l both split
+#if CONFIG_EXT_PARTITION
+  // 128x128 -> 64x64
+  { 222,  34,  30, 128, 128, 128, 128 },  // a/l both not split
+  {  72,  16,  44, 128, 128, 128, 128 },  // a split, l not split
+  {  58,  32,  12, 128, 128, 128, 128 },  // l split, a not split
+  {  10,   7,   6, 128, 128, 128, 128 },  // a/l both split
+#endif  // CONFIG_EXT_PARTITION
 };
-#endif
-
+#else
 static const vpx_prob default_partition_probs[PARTITION_CONTEXTS]
                                              [PARTITION_TYPES - 1] = {
   // 8x8 -> 4x4
@@ -210,10 +202,50 @@
   {  72,  16,  44 },  // a split, l not split
   {  58,  32,  12 },  // l split, a not split
   {  10,   7,   6 },  // a/l both split
+#if CONFIG_EXT_PARTITION
+  // 128x128 -> 64x64
+  { 222,  34,  30 },  // a/l both not split
+  {  72,  16,  44 },  // a split, l not split
+  {  58,  32,  12 },  // l split, a not split
+  {  10,   7,   6 },  // a/l both split
+#endif  // CONFIG_EXT_PARTITION
 };
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_REF_MV
+static const vpx_prob default_newmv_prob[NEWMV_MODE_CONTEXTS] = {
+    200, 180, 150, 150, 110, 70, 60,
+};
+
+static const vpx_prob default_zeromv_prob[ZEROMV_MODE_CONTEXTS] = {
+    192, 64,
+};
+
+static const vpx_prob default_refmv_prob[REFMV_MODE_CONTEXTS] = {
+    220, 220, 200, 200, 180, 128, 30, 220, 30,
+};
+
+static const vpx_prob default_drl_prob[DRL_MODE_CONTEXTS] = {
+    128, 160, 180, 128, 160
+};
+
+#if CONFIG_EXT_INTER
+static const vpx_prob default_new2mv_prob = 180;
+#endif  // CONFIG_EXT_INTER
+#endif  // CONFIG_REF_MV
 
 static const vpx_prob default_inter_mode_probs[INTER_MODE_CONTEXTS]
                                               [INTER_MODES - 1] = {
+#if CONFIG_EXT_INTER
+  // TODO(zoeliu): To adjust the initial default probs
+  {2,       173,   34,   173},  // 0 = both zero mv
+  {7,       145,   85,   145},  // 1 = one zero mv + one a predicted mv
+  {7,       166,   63,   166},  // 2 = two predicted mvs
+  {7,       94,    66,   128},  // 3 = one predicted/zero and one new mv
+  {8,       64,    46,   128},  // 4 = two new mvs
+  {17,      81,    31,   128},  // 5 = one intra neighbour + x
+  {25,      29,    30,    96},  // 6 = two intra neighbours
+#else
   {2,       173,   34},  // 0 = both zero mv
   {7,       145,   85},  // 1 = one zero mv + one a predicted mv
   {7,       166,   63},  // 2 = two predicted mvs
@@ -221,8 +253,101 @@
   {8,       64,    46},  // 4 = two new mvs
   {17,      81,    31},  // 5 = one intra neighbour + x
   {25,      29,    30},  // 6 = two intra neighbours
+#endif  // CONFIG_EXT_INTER
 };
 
+#if CONFIG_EXT_INTER
+static const vpx_prob default_inter_compound_mode_probs
+                      [INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES - 1] = {
+  { 2, 173,  68, 192, 64, 192, 128, 180, 180},   // 0 = both zero mv
+  { 7, 145, 160, 192, 64, 192, 128, 180, 180},   // 1 = 1 zero + 1 predicted
+  { 7, 166, 126, 192, 64, 192, 128, 180, 180},   // 2 = two predicted mvs
+  { 7,  94, 132, 192, 64, 192, 128, 180, 180},   // 3 = 1 pred/zero, 1 new
+  { 8,  64,  64, 192, 64, 192, 128, 180, 180},   // 4 = two new mvs
+  {17,  81,  52, 192, 64, 192, 128, 180, 180},   // 5 = one intra neighbour
+  {25,  29,  50, 192, 64, 192, 128, 180, 180},   // 6 = two intra neighbours
+};
+
+static const vpx_prob default_interintra_prob[BLOCK_SIZE_GROUPS] = {
+  208, 208, 208, 208,
+};
+
+static const vpx_prob
+    default_interintra_mode_prob[BLOCK_SIZE_GROUPS][INTERINTRA_MODES - 1] = {
+  {  65,  32,  18, 144, 162, 194,  41,  51,  98 },  // block_size < 8x8
+  { 132,  68,  18, 165, 217, 196,  45,  40,  78 },  // block_size < 16x16
+  { 173,  80,  19, 176, 240, 193,  64,  35,  46 },  // block_size < 32x32
+  { 221, 135,  38, 194, 248, 121,  96,  85,  29 }   // block_size >= 32x32
+};
+
+static const vpx_prob default_wedge_interintra_prob[BLOCK_SIZES] = {
+  208, 208, 208, 208, 208, 208, 216, 216, 216, 224, 224, 224, 240,
+#if CONFIG_EXT_PARTITION
+  208, 208, 208
+#endif  // CONFIG_EXT_PARTITION
+};
+
+static const vpx_prob default_wedge_interinter_prob[BLOCK_SIZES] = {
+  208, 208, 208, 208, 208, 208, 216, 216, 216, 224, 224, 224, 240,
+#if CONFIG_EXT_PARTITION
+  255, 255, 255
+#endif  // CONFIG_EXT_PARTITION
+};
+#endif  // CONFIG_EXT_INTER
+
+// Change this section appropriately once warped motion is supported
+#if CONFIG_OBMC && !CONFIG_WARPED_MOTION
+const vpx_tree_index vp10_motvar_tree[TREE_SIZE(MOTION_VARIATIONS)] = {
+  -SIMPLE_TRANSLATION, -OBMC_CAUSAL
+};
+static
+const vpx_prob default_motvar_prob[BLOCK_SIZES][MOTION_VARIATIONS - 1] = {
+  {255},
+  {255}, {255}, {151},
+  {153}, {144}, {178},
+  {165}, {160}, {207},
+  {195}, {168}, {244},
+#if CONFIG_EXT_PARTITION
+  {252}, {252}, {252},
+#endif  // CONFIG_EXT_PARTITION
+};
+
+#elif !CONFIG_OBMC && CONFIG_WARPED_MOTION
+
+const vpx_tree_index vp10_motvar_tree[TREE_SIZE(MOTION_VARIATIONS)] = {
+  -SIMPLE_TRANSLATION, -WARPED_CAUSAL
+};
+static
+const vpx_prob default_motvar_prob[BLOCK_SIZES][MOTION_VARIATIONS - 1] = {
+  {255},
+  {255}, {255}, {151},
+  {153}, {144}, {178},
+  {165}, {160}, {207},
+  {195}, {168}, {244},
+#if CONFIG_EXT_PARTITION
+  {252}, {252}, {252},
+#endif  // CONFIG_EXT_PARTITION
+};
+
+#elif CONFIG_OBMC && CONFIG_WARPED_MOTION
+
+const vpx_tree_index vp10_motvar_tree[TREE_SIZE(MOTION_VARIATIONS)] = {
+  -SIMPLE_TRANSLATION, 2,
+  -OBMC_CAUSAL, -WARPED_CAUSAL,
+};
+static
+const vpx_prob default_motvar_prob[BLOCK_SIZES][MOTION_VARIATIONS - 1] = {
+  {255, 200},
+  {255, 200}, {255, 200}, {151, 200},
+  {153, 200}, {144, 200}, {178, 200},
+  {165, 200}, {160, 200}, {207, 200},
+  {195, 200}, {168, 200}, {244, 200},
+#if CONFIG_EXT_PARTITION
+  {252, 200}, {252, 200}, {252, 200},
+#endif  // CONFIG_EXT_PARTITION
+};
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+
 /* Array indices are identical to previously-existing INTRAMODECONTEXTNODES. */
 const vpx_tree_index vp10_intra_mode_tree[TREE_SIZE(INTRA_MODES)] = {
   -DC_PRED, 2,                      /* 0 = DC_NODE */
@@ -233,21 +358,66 @@
   -D135_PRED, -D117_PRED,           /* 5 = D135_NODE */
   -D45_PRED, 14,                    /* 6 = D45_NODE */
   -D63_PRED, 16,                    /* 7 = D63_NODE */
-  -D153_PRED, -D207_PRED             /* 8 = D153_NODE */
+  -D153_PRED, -D207_PRED            /* 8 = D153_NODE */
 };
 
 const vpx_tree_index vp10_inter_mode_tree[TREE_SIZE(INTER_MODES)] = {
   -INTER_OFFSET(ZEROMV), 2,
   -INTER_OFFSET(NEARESTMV), 4,
+#if CONFIG_EXT_INTER
+  -INTER_OFFSET(NEARMV), 6,
+  -INTER_OFFSET(NEWMV), -INTER_OFFSET(NEWFROMNEARMV)
+#else
   -INTER_OFFSET(NEARMV), -INTER_OFFSET(NEWMV)
+#endif  // CONFIG_EXT_INTER
 };
 
+#if CONFIG_EXT_INTER
+const vpx_tree_index vp10_interintra_mode_tree[TREE_SIZE(INTERINTRA_MODES)] = {
+  -II_DC_PRED, 2,                   /* 0 = II_DC_NODE     */
+  -II_TM_PRED, 4,                   /* 1 = II_TM_NODE     */
+  -II_V_PRED, 6,                    /* 2 = II_V_NODE      */
+  8, 12,                            /* 3 = II_COM_NODE    */
+  -II_H_PRED, 10,                   /* 4 = II_H_NODE      */
+  -II_D135_PRED, -II_D117_PRED,     /* 5 = II_D135_NODE   */
+  -II_D45_PRED, 14,                 /* 6 = II_D45_NODE    */
+  -II_D63_PRED, 16,                 /* 7 = II_D63_NODE    */
+  -II_D153_PRED, -II_D207_PRED      /* 8 = II_D153_NODE   */
+};
+
+const vpx_tree_index vp10_inter_compound_mode_tree
+      [TREE_SIZE(INTER_COMPOUND_MODES)] = {
+  -INTER_COMPOUND_OFFSET(ZERO_ZEROMV), 2,
+  -INTER_COMPOUND_OFFSET(NEAREST_NEARESTMV), 4,
+  6, -INTER_COMPOUND_OFFSET(NEW_NEWMV),
+  8, 12,
+  -INTER_COMPOUND_OFFSET(NEAR_NEARMV), 10,
+  -INTER_COMPOUND_OFFSET(NEAREST_NEARMV),
+      -INTER_COMPOUND_OFFSET(NEAR_NEARESTMV),
+  14, 16,
+  -INTER_COMPOUND_OFFSET(NEAREST_NEWMV), -INTER_COMPOUND_OFFSET(NEW_NEARESTMV),
+  -INTER_COMPOUND_OFFSET(NEAR_NEWMV), -INTER_COMPOUND_OFFSET(NEW_NEARMV)
+};
+#endif  // CONFIG_EXT_INTER
+
 const vpx_tree_index vp10_partition_tree[TREE_SIZE(PARTITION_TYPES)] = {
   -PARTITION_NONE, 2,
   -PARTITION_HORZ, 4,
   -PARTITION_VERT, -PARTITION_SPLIT
 };
 
+#if CONFIG_EXT_PARTITION_TYPES
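+// Tree over the extended partition set: PARTITION_NONE keeps the shortest
+// code, and the four three-way A/B combinations hang off the HORZ and VERT
+// branches.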
+const vpx_tree_index vp10_ext_partition_tree[TREE_SIZE(EXT_PARTITION_TYPES)] = {
+  -PARTITION_NONE, 2,
+  6, 4,
+  8, -PARTITION_SPLIT,
+  -PARTITION_HORZ, 10,
+  -PARTITION_VERT, 12,
+  -PARTITION_HORZ_A, -PARTITION_HORZ_B,
+  -PARTITION_VERT_A, -PARTITION_VERT_B
+};
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
 static const vpx_prob default_intra_inter_p[INTRA_INTER_CONTEXTS] = {
   9, 102, 187, 225
 };
@@ -256,60 +426,595 @@
   239, 183, 119,  96,  41
 };
 
-static const vpx_prob default_comp_ref_p[REF_CONTEXTS] = {
-  50, 126, 123, 221, 226
-};
 
-static const vpx_prob default_single_ref_p[REF_CONTEXTS][2] = {
+#if CONFIG_EXT_REFS
+static const vpx_prob default_comp_ref_p[REF_CONTEXTS][FWD_REFS - 1] = {
+  // TODO(zoeliu): To adjust the initial prob values.
+  {  33,  16,  16 },
+  {  77,  74,  74 },
+  { 142, 142, 142 },
+  { 172, 170, 170 },
+  { 238, 247, 247 }
+};
+static const vpx_prob default_comp_bwdref_p[REF_CONTEXTS][BWD_REFS - 1] = {
+  { 16 }, { 74 }, { 142 }, { 170 }, { 247 }
+};
+#else
+static const vpx_prob default_comp_ref_p[REF_CONTEXTS][COMP_REFS - 1] = {
+  { 50 }, { 126 }, { 123 }, { 221 }, { 226 }
+};
+#endif  // CONFIG_EXT_REFS
+
+static const vpx_prob default_single_ref_p[REF_CONTEXTS][SINGLE_REFS - 1] = {
+#if CONFIG_EXT_REFS
+  {  33,  16,  16,  16,  16 },
+  {  77,  74,  74,  74,  74 },
+  { 142, 142, 142, 142, 142 },
+  { 172, 170, 170, 170, 170 },
+  { 238, 247, 247, 247, 247 }
+#else
   {  33,  16 },
   {  77,  74 },
   { 142, 142 },
   { 172, 170 },
   { 238, 247 }
+#endif  // CONFIG_EXT_REFS
 };
 
-static const struct tx_probs default_tx_probs = {
-  { { 3, 136, 37 },
-    { 5, 52,  13 } },
-
-  { { 20, 152 },
-    { 15, 101 } },
-
-  { { 100 },
-    { 66  } }
+const vpx_tree_index vp10_palette_size_tree[TREE_SIZE(PALETTE_SIZES)] = {
+    -TWO_COLORS, 2,
+    -THREE_COLORS, 4,
+    -FOUR_COLORS, 6,
+    -FIVE_COLORS, 8,
+    -SIX_COLORS, 10,
+    -SEVEN_COLORS, -EIGHT_COLORS,
 };
 
-void vp10_tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
-                                      unsigned int (*ct_32x32p)[2]) {
-  ct_32x32p[0][0] = tx_count_32x32p[TX_4X4];
-  ct_32x32p[0][1] = tx_count_32x32p[TX_8X8] +
-                    tx_count_32x32p[TX_16X16] +
-                    tx_count_32x32p[TX_32X32];
-  ct_32x32p[1][0] = tx_count_32x32p[TX_8X8];
-  ct_32x32p[1][1] = tx_count_32x32p[TX_16X16] +
-                    tx_count_32x32p[TX_32X32];
-  ct_32x32p[2][0] = tx_count_32x32p[TX_16X16];
-  ct_32x32p[2][1] = tx_count_32x32p[TX_32X32];
+// TODO(huisu): tune these probs
+const vpx_prob
+vp10_default_palette_y_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1] = {
+    {  96,  89, 100,  64,  77, 130},
+    {  22,  15,  44,  16,  34,  82},
+    {  30,  19,  57,  18,  38,  86},
+    {  94,  36, 104,  23,  43,  92},
+    { 116,  76, 107,  46,  65, 105},
+    { 112,  82,  94,  40,  70, 112},
+    { 147, 124, 123,  58,  69, 103},
+    { 180, 113, 136,  49,  45, 114},
+    { 107,  70,  87,  49, 154, 156},
+    {  98, 105, 142,  63,  64, 152},
+#if CONFIG_EXT_PARTITION
+    {  98, 105, 142,  63,  64, 152},
+    {  98, 105, 142,  63,  64, 152},
+    {  98, 105, 142,  63,  64, 152},
+#endif  // CONFIG_EXT_PARTITION
+};
+
+const vpx_prob
+vp10_default_palette_uv_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1] = {
+    { 160, 196, 228, 213, 175, 230},
+    {  87, 148, 208, 141, 166, 163},
+    {  72, 151, 204, 139, 155, 161},
+    {  78, 135, 171, 104, 120, 173},
+    {  59,  92, 131,  78,  92, 142},
+    {  75, 118, 149,  84,  90, 128},
+    {  89,  87,  92,  66,  66, 128},
+    {  67,  53,  54,  55,  66,  93},
+    { 120, 130,  83, 171,  75, 214},
+    {  72,  55,  66,  68,  79, 107},
+#if CONFIG_EXT_PARTITION
+    {  72,  55,  66,  68,  79, 107},
+    {  72,  55,  66,  68,  79, 107},
+    {  72,  55,  66,  68,  79, 107},
+#endif  // CONFIG_EXT_PARTITION
+};
+
+const vpx_prob
+vp10_default_palette_y_mode_prob[PALETTE_BLOCK_SIZES]
+                                [PALETTE_Y_MODE_CONTEXTS] = {
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+#if CONFIG_EXT_PARTITION
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+    { 240,  180,  100, },
+#endif  // CONFIG_EXT_PARTITION
+};
+
+const vpx_prob vp10_default_palette_uv_mode_prob[2] = {
+    253, 229
+};
+
+const vpx_tree_index
+vp10_palette_color_tree[PALETTE_MAX_SIZE - 1][TREE_SIZE(PALETTE_COLORS)] = {
+    {  // 2 colors
+        -PALETTE_COLOR_ONE, -PALETTE_COLOR_TWO,
+    },
+    {  // 3 colors
+        -PALETTE_COLOR_ONE, 2,
+        -PALETTE_COLOR_TWO, -PALETTE_COLOR_THREE,
+    },
+    {  // 4 colors
+        -PALETTE_COLOR_ONE, 2,
+        -PALETTE_COLOR_TWO, 4,
+        -PALETTE_COLOR_THREE, -PALETTE_COLOR_FOUR,
+    },
+    {  // 5 colors
+        -PALETTE_COLOR_ONE, 2,
+        -PALETTE_COLOR_TWO, 4,
+        -PALETTE_COLOR_THREE, 6,
+        -PALETTE_COLOR_FOUR, -PALETTE_COLOR_FIVE,
+    },
+    {  // 6 colors
+        -PALETTE_COLOR_ONE, 2,
+        -PALETTE_COLOR_TWO, 4,
+        -PALETTE_COLOR_THREE, 6,
+        -PALETTE_COLOR_FOUR, 8,
+        -PALETTE_COLOR_FIVE, -PALETTE_COLOR_SIX,
+    },
+    {  // 7 colors
+        -PALETTE_COLOR_ONE, 2,
+        -PALETTE_COLOR_TWO, 4,
+        -PALETTE_COLOR_THREE, 6,
+        -PALETTE_COLOR_FOUR, 8,
+        -PALETTE_COLOR_FIVE, 10,
+        -PALETTE_COLOR_SIX, -PALETTE_COLOR_SEVEN,
+    },
+    {  // 8 colors
+        -PALETTE_COLOR_ONE, 2,
+        -PALETTE_COLOR_TWO, 4,
+        -PALETTE_COLOR_THREE, 6,
+        -PALETTE_COLOR_FOUR, 8,
+        -PALETTE_COLOR_FIVE, 10,
+        -PALETTE_COLOR_SIX, 12,
+        -PALETTE_COLOR_SEVEN, -PALETTE_COLOR_EIGHT,
+    },
+};
+
+const vpx_prob vp10_default_palette_y_color_prob
+[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] = {
+    {  // 2 colors
+        { 230, 255, 128, 128, 128, 128, 128 },
+        { 214, 255, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 240, 255, 128, 128, 128, 128, 128 },
+        {  73, 255, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 130, 255, 128, 128, 128, 128, 128 },
+        { 227, 255, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 188, 255, 128, 128, 128, 128, 128 },
+        {  75, 255, 128, 128, 128, 128, 128 },
+        { 250, 255, 128, 128, 128, 128, 128 },
+        { 223, 255, 128, 128, 128, 128, 128 },
+        { 252, 255, 128, 128, 128, 128, 128 },
+    }, {  // 3 colors
+        { 229, 137, 255, 128, 128, 128, 128 },
+        { 197, 120, 255, 128, 128, 128, 128 },
+        { 107, 195, 255, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        {  27, 151, 255, 128, 128, 128, 128 },
+        { 230, 130, 255, 128, 128, 128, 128 },
+        {  37, 230, 255, 128, 128, 128, 128 },
+        {  67, 221, 255, 128, 128, 128, 128 },
+        { 124, 230, 255, 128, 128, 128, 128 },
+        { 195, 109, 255, 128, 128, 128, 128 },
+        {  99, 122, 255, 128, 128, 128, 128 },
+        { 205, 208, 255, 128, 128, 128, 128 },
+        {  40, 235, 255, 128, 128, 128, 128 },
+        { 251, 132, 255, 128, 128, 128, 128 },
+        { 237, 186, 255, 128, 128, 128, 128 },
+        { 253, 112, 255, 128, 128, 128, 128 },
+    }, {  // 4 colors
+        { 195,  87, 128, 255, 128, 128, 128 },
+        { 143, 100, 123, 255, 128, 128, 128 },
+        {  94, 124, 119, 255, 128, 128, 128 },
+        {  77,  91, 130, 255, 128, 128, 128 },
+        {  39, 114, 178, 255, 128, 128, 128 },
+        { 222,  94, 125, 255, 128, 128, 128 },
+        {  44, 203, 132, 255, 128, 128, 128 },
+        {  68, 175, 122, 255, 128, 128, 128 },
+        { 110, 187, 124, 255, 128, 128, 128 },
+        { 152,  91, 128, 255, 128, 128, 128 },
+        {  70, 109, 181, 255, 128, 128, 128 },
+        { 133, 113, 164, 255, 128, 128, 128 },
+        {  47, 205, 133, 255, 128, 128, 128 },
+        { 247,  94, 136, 255, 128, 128, 128 },
+        { 205, 122, 146, 255, 128, 128, 128 },
+        { 251, 100, 141, 255, 128, 128, 128 },
+    }, {  // 5 colors
+        { 195,  65,  84, 125, 255, 128, 128 },
+        { 150,  76,  84, 121, 255, 128, 128 },
+        {  94, 110,  81, 117, 255, 128, 128 },
+        {  79,  85,  91, 139, 255, 128, 128 },
+        {  26, 102, 139, 127, 255, 128, 128 },
+        { 220,  73,  91, 119, 255, 128, 128 },
+        {  38, 203,  86, 127, 255, 128, 128 },
+        {  61, 186,  72, 124, 255, 128, 128 },
+        { 132, 199,  84, 128, 255, 128, 128 },
+        { 172,  52,  62, 120, 255, 128, 128 },
+        { 102,  89, 121, 122, 255, 128, 128 },
+        { 182,  48,  69, 186, 255, 128, 128 },
+        {  36, 206,  87, 126, 255, 128, 128 },
+        { 249,  55,  67, 122, 255, 128, 128 },
+        { 218,  88,  75, 122, 255, 128, 128 },
+        { 253,  64,  80, 119, 255, 128, 128 },
+    }, {  // 6 colors
+        { 182,  54,  64,  75, 118, 255, 128 },
+        { 126,  67,  70,  76, 116, 255, 128 },
+        {  79,  92,  67,  85, 120, 255, 128 },
+        {  63,  61,  81, 118, 132, 255, 128 },
+        {  21,  80, 105,  83, 119, 255, 128 },
+        { 215,  72,  74,  74, 111, 255, 128 },
+        {  50, 176,  63,  79, 120, 255, 128 },
+        {  72, 148,  66,  77, 120, 255, 128 },
+        { 105, 177,  57,  78, 130, 255, 128 },
+        { 150,  66,  66,  80, 127, 255, 128 },
+        {  81,  76, 109,  85, 116, 255, 128 },
+        { 113,  81,  62,  96, 148, 255, 128 },
+        {  54, 179,  69,  82, 121, 255, 128 },
+        { 244,  47,  48,  67, 118, 255, 128 },
+        { 198,  83,  53,  65, 121, 255, 128 },
+        { 250,  42,  51,  69, 110, 255, 128 },
+    }, {  // 7 colors
+        { 182,  45,  54,  62,  74, 113, 255 },
+        { 124,  63,  57,  62,  77, 114, 255 },
+        {  77,  80,  56,  66,  76, 117, 255 },
+        {  63,  57,  69,  98,  85, 131, 255 },
+        {  19,  81,  98,  63,  80, 116, 255 },
+        { 215,  56,  60,  63,  68, 105, 255 },
+        {  50, 174,  50,  60,  79, 118, 255 },
+        {  68, 151,  50,  58,  73, 117, 255 },
+        { 104, 182,  53,  57,  79, 127, 255 },
+        { 156,  50,  51,  63,  77, 111, 255 },
+        {  88,  67,  97,  59,  82, 120, 255 },
+        { 114,  81,  46,  65, 103, 132, 255 },
+        {  55, 166,  57,  66,  82, 120, 255 },
+        { 245,  34,  38,  43,  63, 114, 255 },
+        { 203,  68,  45,  47,  60, 118, 255 },
+        { 250,  35,  37,  47,  66, 110, 255 },
+    }, {  // 8 colors
+        { 180,  43,  46,  50,  56,  69, 109 },
+        { 116,  53,  51,  49,  57,  73, 115 },
+        {  79,  70,  49,  50,  59,  74, 117 },
+        {  60,  54,  57,  70,  62,  83, 129 },
+        {  20,  73,  85,  52,  66,  81, 119 },
+        { 213,  56,  52,  49,  53,  62, 104 },
+        {  48, 161,  41,  45,  56,  77, 116 },
+        {  68, 139,  40,  47,  54,  71, 116 },
+        { 123, 166,  42,  43,  52,  76, 130 },
+        { 153,  44,  44,  47,  54,  79, 129 },
+        {  87,  64,  83,  49,  60,  75, 127 },
+        { 131,  68,  43,  48,  73,  96, 130 },
+        {  55, 152,  45,  51,  64,  77, 113 },
+        { 243,  30,  28,  33,  41,  65, 114 },
+        { 202,  56,  35,  36,  42,  63, 123 },
+        { 249,  31,  29,  32,  45,  68, 111 },
+    }
+};
+
+const vpx_prob vp10_default_palette_uv_color_prob
+[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] = {
+    {  // 2 colors
+        { 228, 255, 128, 128, 128, 128, 128 },
+        { 195, 255, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 228, 255, 128, 128, 128, 128, 128 },
+        {  71, 255, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 129, 255, 128, 128, 128, 128, 128 },
+        { 206, 255, 128, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        { 136, 255, 128, 128, 128, 128, 128 },
+        {  98, 255, 128, 128, 128, 128, 128 },
+        { 236, 255, 128, 128, 128, 128, 128 },
+        { 222, 255, 128, 128, 128, 128, 128 },
+        { 249, 255, 128, 128, 128, 128, 128 },
+    }, {  // 3 colors
+        { 198, 136, 255, 128, 128, 128, 128 },
+        { 178, 105, 255, 128, 128, 128, 128 },
+        { 100, 206, 255, 128, 128, 128, 128 },
+        { 128, 128, 128, 128, 128, 128, 128 },
+        {  12, 136, 255, 128, 128, 128, 128 },
+        { 219, 134, 255, 128, 128, 128, 128 },
+        {  50, 198, 255, 128, 128, 128, 128 },
+        {  61, 231, 255, 128, 128, 128, 128 },
+        { 110, 209, 255, 128, 128, 128, 128 },
+        { 173, 106, 255, 128, 128, 128, 128 },
+        { 145, 166, 255, 128, 128, 128, 128 },
+        { 156, 175, 255, 128, 128, 128, 128 },
+        {  69, 183, 255, 128, 128, 128, 128 },
+        { 241, 163, 255, 128, 128, 128, 128 },
+        { 224, 160, 255, 128, 128, 128, 128 },
+        { 246, 154, 255, 128, 128, 128, 128 },
+    }, {  // 4 colors
+        { 173,  88, 143, 255, 128, 128, 128 },
+        { 146,  81, 127, 255, 128, 128, 128 },
+        {  84, 134, 102, 255, 128, 128, 128 },
+        {  69, 138, 140, 255, 128, 128, 128 },
+        {  31, 103, 200, 255, 128, 128, 128 },
+        { 217, 101, 139, 255, 128, 128, 128 },
+        {  51, 174, 121, 255, 128, 128, 128 },
+        {  64, 177, 109, 255, 128, 128, 128 },
+        {  96, 179, 145, 255, 128, 128, 128 },
+        { 164,  77, 114, 255, 128, 128, 128 },
+        {  87,  94, 156, 255, 128, 128, 128 },
+        { 105,  57, 173, 255, 128, 128, 128 },
+        {  63, 158, 137, 255, 128, 128, 128 },
+        { 236, 102, 156, 255, 128, 128, 128 },
+        { 197, 115, 153, 255, 128, 128, 128 },
+        { 245, 106, 154, 255, 128, 128, 128 },
+    }, {  // 5 colors
+        { 179,  64,  97, 129, 255, 128, 128 },
+        { 137,  56,  88, 125, 255, 128, 128 },
+        {  82, 107,  61, 118, 255, 128, 128 },
+        {  59, 113,  86, 115, 255, 128, 128 },
+        {  23,  88, 118, 130, 255, 128, 128 },
+        { 213,  66,  90, 125, 255, 128, 128 },
+        {  37, 181, 103, 121, 255, 128, 128 },
+        {  47, 188,  61, 131, 255, 128, 128 },
+        { 104, 185, 103, 144, 255, 128, 128 },
+        { 163,  39,  76, 112, 255, 128, 128 },
+        {  94,  74, 131, 126, 255, 128, 128 },
+        { 142,  42, 103, 163, 255, 128, 128 },
+        {  53, 162,  99, 149, 255, 128, 128 },
+        { 239,  54,  84, 108, 255, 128, 128 },
+        { 203,  84, 110, 147, 255, 128, 128 },
+        { 248,  70, 105, 151, 255, 128, 128 },
+    }, {  // 6 colors
+        { 189,  50,  67,  90, 130, 255, 128 },
+        { 114,  50,  55,  90, 123, 255, 128 },
+        {  66,  76,  54,  82, 128, 255, 128 },
+        {  43,  69,  69,  80, 129, 255, 128 },
+        {  22,  59,  87,  88, 141, 255, 128 },
+        { 203,  49,  68,  87, 122, 255, 128 },
+        {  43, 157,  74, 104, 146, 255, 128 },
+        {  54, 138,  51,  95, 138, 255, 128 },
+        {  82, 171,  58, 102, 146, 255, 128 },
+        { 129,  38,  59,  64, 168, 255, 128 },
+        {  56,  67, 119,  92, 112, 255, 128 },
+        {  96,  62,  53, 132,  82, 255, 128 },
+        {  60, 147,  77, 108, 145, 255, 128 },
+        { 238,  76,  73,  93, 148, 255, 128 },
+        { 189,  86,  73, 103, 157, 255, 128 },
+        { 246,  62,  75,  83, 167, 255, 128 },
+    }, {  // 7 colors
+        { 179,  42,  51,  73,  99, 134, 255 },
+        { 119,  52,  52,  61,  64, 114, 255 },
+        {  53,  77,  35,  65,  71, 131, 255 },
+        {  38,  70,  51,  68,  89, 144, 255 },
+        {  23,  65, 128,  73,  97, 131, 255 },
+        { 210,  47,  52,  63,  81, 143, 255 },
+        {  42, 159,  57,  68,  98, 143, 255 },
+        {  49, 153,  45,  82,  93, 143, 255 },
+        {  81, 169,  52,  72, 113, 151, 255 },
+        { 136,  46,  35,  56,  75,  96, 255 },
+        {  57,  84, 109,  47, 107, 131, 255 },
+        { 128,  78,  57,  36, 128,  85, 255 },
+        {  54, 149,  68,  77,  94, 153, 255 },
+        { 243,  58,  50,  71,  81, 167, 255 },
+        { 189,  92,  64,  70, 121, 173, 255 },
+        { 248,  35,  38,  51,  82, 201, 255 },
+    }, {  // 8 colors
+        { 201,  40,  36,  42,  64,  92, 123 },
+        { 116,  43,  33,  43,  73, 102, 128 },
+        {  46,  77,  37,  69,  62,  78, 150 },
+        {  40,  65,  52,  50,  76,  89, 133 },
+        {  28,  48,  91,  17,  64,  77, 133 },
+        { 218,  43,  43,  37,  56,  72, 163 },
+        {  41, 155,  44,  83,  82, 129, 180 },
+        {  44, 141,  29,  55,  64,  89, 147 },
+        {  92, 166,  48,  45,  59, 126, 179 },
+        { 169,  35,  49,  41,  36,  99, 139 },
+        {  55,  77,  77,  56,  60,  75, 156 },
+        { 155,  81,  51,  64,  57, 182, 255 },
+        {  60, 134,  49,  49,  93, 128, 174 },
+        { 244,  98,  51,  46,  22,  73, 238 },
+        { 189,  70,  40,  87,  93,  79, 201 },
+        { 248,  54,  49,  40,  29,  42, 227 },
+    }
+};
+
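+// Each entry is the base-11 hash s0*11^3 + s1*11^2 + s2*11 + s3 of a sorted
+// neighbor-score vector, e.g. (3, 2, 0, 0) -> 3*1331 + 2*121 == 4235.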
+static const int palette_color_context_lookup[PALETTE_COLOR_CONTEXTS] = {
+    // (3, 0, 0, 0), (3, 2, 0, 0), (3, 3, 2, 0), (3, 3, 2, 2),
+    3993,  4235,  4378,  4380,
+    // (4, 3, 3, 0), (5, 0, 0, 0), (5, 3, 0, 0), (5, 3, 2, 0),
+    5720,  6655,  7018,  7040,
+    // (5, 5, 0, 0), (6, 2, 0, 0), (6, 2, 2, 0), (6, 4, 0, 0),
+    7260,  8228,  8250,  8470,
+    // (7, 3, 0, 0), (8, 0, 0, 0), (8, 2, 0, 0), (10, 0, 0, 0)
+    9680, 10648, 10890, 13310
+};
+
+const vpx_tree_index vp10_tx_size_tree[TX_SIZES - 1][TREE_SIZE(TX_SIZES)] = {
+    {  // Max tx_size is 8X8
+        -TX_4X4, -TX_8X8,
+    },
+    {  // Max tx_size is 16X16
+        -TX_4X4, 2,
+        -TX_8X8, -TX_16X16,
+    },
+    {  // Max tx_size is 32X32
+        -TX_4X4, 2,
+        -TX_8X8, 4,
+        -TX_16X16, -TX_32X32,
+    },
+};
+
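+// Node probabilities for vp10_tx_size_tree above, one table per maximum
+// transform size; this layout replaces the old struct tx_probs.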
+static const vpx_prob
+default_tx_size_prob[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES - 1] = {
+    {  // Max tx_size is 8X8
+        { 100, }, { 66, },
+    },
+    {  // Max tx_size is 16X16
+        { 20, 152, }, { 15, 101, },
+    },
+    {  // Max tx_size is 32X32
+        { 3, 136, 37 }, { 5, 52,  13 },
+    },
+};
+
+int vp10_get_palette_color_context(const uint8_t *color_map, int cols,
+                                   int r, int c, int n, int *color_order) {
+  int i, j, max, max_idx, temp;
+  int scores[PALETTE_MAX_SIZE + 10];
+  int weights[4] = {3, 2, 3, 2};
+  int color_ctx = 0;
+  int color_neighbors[4];
+
+  assert(n <= PALETTE_MAX_SIZE);
+
+  if (c - 1 >= 0)
+    color_neighbors[0] = color_map[r * cols + c - 1];
+  else
+    color_neighbors[0] = -1;
+  if (c - 1 >= 0 && r - 1 >= 0)
+    color_neighbors[1] = color_map[(r - 1) * cols + c - 1];
+  else
+    color_neighbors[1] = -1;
+  if (r - 1 >= 0)
+    color_neighbors[2] = color_map[(r - 1) * cols + c];
+  else
+    color_neighbors[2] = -1;
+  if (r - 1 >= 0 && c + 1 <= cols - 1)
+    color_neighbors[3] = color_map[(r - 1) * cols + c + 1];
+  else
+    color_neighbors[3] = -1;
+
+  for (i = 0; i < PALETTE_MAX_SIZE; ++i)
+    color_order[i] = i;
+  memset(scores, 0, PALETTE_MAX_SIZE * sizeof(scores[0]));
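+  // Score each palette color by a weighted count of matching neighbors:
+  // left and above get weight 3, the two diagonals weight 2.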
+  for (i = 0; i < 4; ++i) {
+    if (color_neighbors[i] >= 0)
+      scores[color_neighbors[i]] += weights[i];
+  }
+
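+  // Partial selection sort: move the four largest scores (and their colors)
+  // to the front so the hash below sees them in canonical descending order.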
+  for (i = 0; i < 4; ++i) {
+    max = scores[i];
+    max_idx = i;
+    j = i + 1;
+    while (j < n) {
+      if (scores[j] > max) {
+        max = scores[j];
+        max_idx = j;
+      }
+      ++j;
+    }
+
+    if (max_idx != i) {
+      temp = scores[i];
+      scores[i] = scores[max_idx];
+      scores[max_idx] = temp;
+
+      temp = color_order[i];
+      color_order[i] = color_order[max_idx];
+      color_order[max_idx] = temp;
+    }
+  }
+
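+  // Hash the top four scores in base 11 and map the result to a context
+  // index; hashes not in the lookup table fall back to context 0.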
+  for (i = 0; i < 4; ++i)
+    color_ctx = color_ctx * 11 + scores[i];
+
+  for (i = 0; i < PALETTE_COLOR_CONTEXTS; ++i)
+    if (color_ctx == palette_color_context_lookup[i]) {
+      color_ctx = i;
+      break;
+    }
+
+  if (color_ctx >= PALETTE_COLOR_CONTEXTS)
+    color_ctx = 0;
+
+  return color_ctx;
 }
 
-void vp10_tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
-                                      unsigned int (*ct_16x16p)[2]) {
-  ct_16x16p[0][0] = tx_count_16x16p[TX_4X4];
-  ct_16x16p[0][1] = tx_count_16x16p[TX_8X8] + tx_count_16x16p[TX_16X16];
-  ct_16x16p[1][0] = tx_count_16x16p[TX_8X8];
-  ct_16x16p[1][1] = tx_count_16x16p[TX_16X16];
-}
-
-void vp10_tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
-                                    unsigned int (*ct_8x8p)[2]) {
-  ct_8x8p[0][0] = tx_count_8x8p[TX_4X4];
-  ct_8x8p[0][1] = tx_count_8x8p[TX_8X8];
-}
+#if CONFIG_VAR_TX
+static const vpx_prob default_txfm_partition_probs[TXFM_PARTITION_CONTEXTS] = {
+    192, 128, 64, 192, 128, 64, 192, 128, 64,
+};
+#endif
 
 static const vpx_prob default_skip_probs[SKIP_CONTEXTS] = {
   192, 128, 64
 };
 
+#if CONFIG_EXT_INTERP
+static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+                                                    [SWITCHABLE_FILTERS - 1] = {
+#if CONFIG_DUAL_FILTER
+  { 235, 192, 128, 128},
+  { 36, 243, 208, 128},
+  { 34, 16, 128, 128},
+  { 36, 243, 48, 128},
+  { 34, 16, 128, 128},
+  { 149, 160, 128, 128},
+
+  { 235, 192, 128, 128},
+  { 36, 243, 208, 128},
+  { 34, 16, 128, 128},
+  { 36, 243, 48, 128},
+  { 34, 16, 128, 128},
+  { 149, 160, 128, 128},
+
+  { 235, 192, 128, 128},
+  { 36, 243, 208, 128},
+  { 34, 16, 128, 128},
+  { 36, 243, 48, 128},
+  { 34, 16, 128, 128},
+  { 149, 160, 128, 128},
+
+  { 235, 192, 128, 128},
+  { 36, 243, 208, 128},
+  { 34, 16, 128, 128},
+  { 36, 243, 48, 128},
+  { 34, 16, 128, 128},
+  { 149, 160, 128, 128},
+#else
+  { 235, 192, 128, 128},
+  { 36, 243, 208, 128},
+  { 34, 16, 128, 128},
+  { 36, 243, 48, 128},
+  { 34, 16, 128, 128},
+  { 149, 160, 128, 128},
+#endif  // CONFIG_DUAL_FILTER
+};
+#else  // CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
+                                                    [SWITCHABLE_FILTERS - 1] = {
+  { 235, 162, },
+  { 36, 255, },
+  { 34, 3, },
+  { 149, 144, },
+
+  { 235, 162, },
+  { 36, 255, },
+  { 34, 3, },
+  { 10, 3, },
+
+  { 235, 162, },
+  { 36, 255, },
+  { 34, 3, },
+  { 149, 144, },
+
+  { 235, 162, },
+  { 36, 255, },
+  { 34, 3, },
+  { 10, 3, },
+};
+#else
 static const vpx_prob default_switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
                                                     [SWITCHABLE_FILTERS - 1] = {
   { 235, 162, },
@@ -317,14 +1022,214 @@
   { 34, 3, },
   { 149, 144, },
 };
-
-#if CONFIG_MISC_FIXES
-// FIXME(someone) need real defaults here
-static const struct segmentation_probs default_seg_probs = {
-  { 128, 128, 128, 128, 128, 128, 128 },
-  { 128, 128, 128 },
-};
 #endif
+#endif  // CONFIG_EXT_INTERP
+
+#if CONFIG_EXT_TX
+const vpx_tree_index vp10_ext_tx_inter_tree[EXT_TX_SETS_INTER]
+                                           [TREE_SIZE(TX_TYPES)] = {
+  {  // TODO(yaowu): remove unused entry 0.
+    0
+  }, {
+    -IDTX, 2,
+    4, 14,
+    6, 8,
+    -V_DCT, -H_DCT,
+    10, 12,
+    -V_ADST, -H_ADST,
+    -V_FLIPADST, -H_FLIPADST,
+    -DCT_DCT, 16,
+    18, 24,
+    20, 22,
+    -ADST_DCT, -DCT_ADST,
+    -FLIPADST_DCT, -DCT_FLIPADST,
+    26, 28,
+    -ADST_ADST, -FLIPADST_FLIPADST,
+    -ADST_FLIPADST, -FLIPADST_ADST
+  }, {
+    -IDTX, 2,
+    4, 6,
+    -V_DCT, -H_DCT,
+    -DCT_DCT, 8,
+    10, 16,
+    12, 14,
+    -ADST_DCT, -DCT_ADST,
+    -FLIPADST_DCT, -DCT_FLIPADST,
+    18, 20,
+    -ADST_ADST, -FLIPADST_FLIPADST,
+    -ADST_FLIPADST, -FLIPADST_ADST
+  }, {
+    -IDTX, -DCT_DCT,
+  }
+};
+
+const vpx_tree_index vp10_ext_tx_intra_tree[EXT_TX_SETS_INTRA]
+                                           [TREE_SIZE(TX_TYPES)] = {
+  {  // TODO(yaowu): remove unused entry 0.
+    0
+  }, {
+    -IDTX, 2,
+    -DCT_DCT, 4,
+    6, 8,
+    -V_DCT, -H_DCT,
+    -ADST_ADST, 10,
+    -ADST_DCT, -DCT_ADST,
+  }, {
+    -IDTX, 2,
+    -DCT_DCT, 4,
+    -ADST_ADST, 6,
+    -ADST_DCT, -DCT_ADST,
+  }
+};
+
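+// Rows are dimensioned [TX_TYPES - 1]; sets whose trees have fewer internal
+// nodes use shorter initializers, leaving the trailing entries zero.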
+static const vpx_prob
+default_inter_ext_tx_prob[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES - 1] = {
+  {  // TODO(yaowu): remove unused entry 0.
+    { 0 },
+    { 0 },
+    { 0 },
+#if EXT_TX_SIZES == 4
+    { 0 },
+#endif
+  }, {
+    { 10, 24, 30, 128, 128, 128, 128, 112, 160, 128, 128, 128, 128, 128, 128},
+    { 10, 24, 30, 128, 128, 128, 128, 112, 160, 128, 128, 128, 128, 128, 128},
+    { 10, 24, 30, 128, 128, 128, 128, 112, 160, 128, 128, 128, 128, 128, 128},
+#if EXT_TX_SIZES == 4
+    { 10, 24, 30, 128, 128, 128, 128, 112, 160, 128, 128, 128, 128, 128, 128},
+#endif
+  }, {
+    { 10, 30, 128, 112, 160, 128, 128, 128, 128, 128, 128 },
+    { 10, 30, 128, 112, 160, 128, 128, 128, 128, 128, 128 },
+    { 10, 30, 128, 112, 160, 128, 128, 128, 128, 128, 128 },
+#if EXT_TX_SIZES == 4
+    { 10, 30, 128, 112, 160, 128, 128, 128, 128, 128, 128 },
+#endif
+  }, {
+    { 12, },
+    { 12, },
+    { 12, },
+#if EXT_TX_SIZES == 4
+    { 12, },
+#endif
+  }
+};
+
+static const vpx_prob
+default_intra_ext_tx_prob[EXT_TX_SETS_INTRA][EXT_TX_SIZES]
+                         [INTRA_MODES][TX_TYPES - 1] = {
+  {  // TODO(yaowu): remove unused entry 0.
+    {
+      { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 },
+    }, {
+      { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 },
+    }, {
+      { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 },
+#if EXT_TX_SIZES == 4
+    }, {
+      { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 }, { 0 },
+#endif
+    },
+  }, {
+    {
+      {   8, 224,  32, 128, 64, 128, },
+      {  10,  32,  32, 128, 16, 192, },
+      {  10,  32,  32, 128, 16,  64, },
+      {   9, 200,  32, 128, 64, 128, },
+      {   8,   8,  32, 128, 224, 128, },
+      {  10,  32,  32, 128, 16, 192, },
+      {  10,  32,  32, 128, 16,  64, },
+      {  10,  23,  32, 128, 80, 176, },
+      {  10,  23,  32, 128, 80, 176, },
+      {  10,  32,  32, 128, 16,  64, },
+    }, {
+      {   8, 224, 32, 128,  64, 128, },
+      {  10,  32, 32, 128,  16, 192, },
+      {  10,  32, 32, 128,  16,  64, },
+      {   9, 200, 32, 128,  64, 128, },
+      {   8,   8, 32, 128, 224, 128, },
+      {  10,  32, 32, 128,  16, 192, },
+      {  10,  32, 32, 128,  16,  64, },
+      {  10,  23, 32, 128,  80, 176, },
+      {  10,  23, 32, 128,  80, 176, },
+      {  10,  32, 32, 128,  16,  64, },
+    }, {
+      {   8, 224, 32, 128,  64, 128, },
+      {  10,  32, 32, 128,  16, 192, },
+      {  10,  32, 32, 128,  16,  64, },
+      {   9, 200, 32, 128,  64, 128, },
+      {   8,   8, 32, 128, 224, 128, },
+      {  10,  32, 32, 128,  16, 192, },
+      {  10,  32, 32, 128,  16,  64, },
+      {  10,  23, 32, 128,  80, 176, },
+      {  10,  23, 32, 128,  80, 176, },
+      {  10,  32, 32, 128,  16,  64, },
+#if EXT_TX_SIZES == 4
+    }, {
+      {   8, 224, 32, 128,  64, 128, },
+      {  10,  32, 32, 128,  16, 192, },
+      {  10,  32, 32, 128,  16,  64, },
+      {   9, 200, 32, 128,  64, 128, },
+      {   8,   8, 32, 128, 224, 128, },
+      {  10,  32, 32, 128,  16, 192, },
+      {  10,  32, 32, 128,  16,  64, },
+      {  10,  23, 32, 128,  80, 176, },
+      {  10,  23, 32, 128,  80, 176, },
+      {  10,  32, 32, 128,  16,  64, },
+#endif
+    },
+  }, {
+    {
+      {   8, 224,  64, 128, },
+      {  10,  32,  16, 192, },
+      {  10,  32,  16,  64, },
+      {   9, 200,  64, 128, },
+      {   8,   8, 224, 128, },
+      {  10,  32,  16, 192, },
+      {  10,  32,  16,  64, },
+      {  10,  23,  80, 176, },
+      {  10,  23,  80, 176, },
+      {  10,  32,  16,  64, },
+    }, {
+      {   8, 224,  64, 128, },
+      {  10,  32,  16, 192, },
+      {  10,  32,  16,  64, },
+      {   9, 200,  64, 128, },
+      {   8,   8, 224, 128, },
+      {  10,  32,  16, 192, },
+      {  10,  32,  16,  64, },
+      {  10,  23,  80, 176, },
+      {  10,  23,  80, 176, },
+      {  10,  32,  16,  64, },
+    }, {
+      {   8, 224,  64, 128, },
+      {  10,  32,  16, 192, },
+      {  10,  32,  16,  64, },
+      {   9, 200,  64, 128, },
+      {   8,   8, 224, 128, },
+      {  10,  32,  16, 192, },
+      {  10,  32,  16,  64, },
+      {  10,  23,  80, 176, },
+      {  10,  23,  80, 176, },
+      {  10,  32,  16,  64, },
+#if EXT_TX_SIZES == 4
+    }, {
+      {   8, 224,  64, 128, },
+      {  10,  32,  16, 192, },
+      {  10,  32,  16,  64, },
+      {   9, 200,  64, 128, },
+      {   8,   8, 224, 128, },
+      {  10,  32,  16, 192, },
+      {  10,  32,  16,  64, },
+      {  10,  23,  80, 176, },
+      {  10,  23,  80, 176, },
+      {  10,  32,  16,  64, },
+#endif
+    },
+  },
+};
+
+#else
 
 const vpx_tree_index vp10_ext_tx_tree[TREE_SIZE(TX_TYPES)] = {
   -DCT_DCT, 2,
@@ -345,6 +1250,39 @@
   {176, 85, 128},
   {192, 85, 128},
 };
+#endif  // CONFIG_EXT_TX
+
+#if CONFIG_EXT_INTRA
+static const vpx_prob
+default_intra_filter_probs[INTRA_FILTERS + 1][INTRA_FILTERS - 1] = {
+  { 98,  63,  60,  },
+  { 98,  82,  80,  },
+  { 94,  65, 103,  },
+  { 49,  25,  24,  },
+  { 72,  38,  50,  },
+};
+static const vpx_prob default_ext_intra_probs[2] = {230, 230};
+
+const vpx_tree_index vp10_intra_filter_tree[TREE_SIZE(INTRA_FILTERS)] = {
+  -INTRA_FILTER_LINEAR, 2,
+  -INTRA_FILTER_8TAP, 4,
+  -INTRA_FILTER_8TAP_SHARP, -INTRA_FILTER_8TAP_SMOOTH,
+};
+#endif  // CONFIG_EXT_INTRA
+
+#if CONFIG_SUPERTX
+static const vpx_prob default_supertx_prob[PARTITION_SUPERTX_CONTEXTS]
+                                          [TX_SIZES] = {
+  { 1, 160, 160, 170 },
+  { 1, 200, 200, 210 },
+};
+#endif  // CONFIG_SUPERTX
+
+// FIXME(someone) need real defaults here
+static const struct segmentation_probs default_seg_probs = {
+  { 128, 128, 128, 128, 128, 128, 128 },
+  { 128, 128, 128 },
+};
 
 static void init_mode_probs(FRAME_CONTEXT *fc) {
   vp10_copy(fc->uv_mode_prob, default_uv_probs);
@@ -354,23 +1292,63 @@
   vp10_copy(fc->intra_inter_prob, default_intra_inter_p);
   vp10_copy(fc->comp_inter_prob, default_comp_inter_p);
   vp10_copy(fc->comp_ref_prob, default_comp_ref_p);
+#if CONFIG_EXT_REFS
+  vp10_copy(fc->comp_bwdref_prob, default_comp_bwdref_p);
+#endif  // CONFIG_EXT_REFS
   vp10_copy(fc->single_ref_prob, default_single_ref_p);
-  fc->tx_probs = default_tx_probs;
+  vp10_copy(fc->tx_size_probs, default_tx_size_prob);
+#if CONFIG_VAR_TX
+  vp10_copy(fc->txfm_partition_prob, default_txfm_partition_probs);
+#endif
   vp10_copy(fc->skip_probs, default_skip_probs);
+#if CONFIG_REF_MV
+  vp10_copy(fc->newmv_prob, default_newmv_prob);
+  vp10_copy(fc->zeromv_prob, default_zeromv_prob);
+  vp10_copy(fc->refmv_prob, default_refmv_prob);
+  vp10_copy(fc->drl_prob, default_drl_prob);
+#if CONFIG_EXT_INTER
+  fc->new2mv_prob = default_new2mv_prob;
+#endif  // CONFIG_EXT_INTER
+#endif  // CONFIG_REF_MV
   vp10_copy(fc->inter_mode_probs, default_inter_mode_probs);
-#if CONFIG_MISC_FIXES
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+  vp10_copy(fc->motvar_prob, default_motvar_prob);
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+#if CONFIG_EXT_INTER
+  vp10_copy(fc->inter_compound_mode_probs, default_inter_compound_mode_probs);
+  vp10_copy(fc->interintra_prob, default_interintra_prob);
+  vp10_copy(fc->interintra_mode_prob, default_interintra_mode_prob);
+  vp10_copy(fc->wedge_interintra_prob, default_wedge_interintra_prob);
+  vp10_copy(fc->wedge_interinter_prob, default_wedge_interinter_prob);
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_SUPERTX
+  vp10_copy(fc->supertx_prob, default_supertx_prob);
+#endif  // CONFIG_SUPERTX
   vp10_copy(fc->seg.tree_probs, default_seg_probs.tree_probs);
   vp10_copy(fc->seg.pred_probs, default_seg_probs.pred_probs);
-#endif
-  vp10_copy(fc->intra_ext_tx_prob, default_intra_ext_tx_prob);
+#if CONFIG_EXT_INTRA
+  vp10_copy(fc->ext_intra_probs, default_ext_intra_probs);
+  vp10_copy(fc->intra_filter_probs, default_intra_filter_probs);
+#endif  // CONFIG_EXT_INTRA
   vp10_copy(fc->inter_ext_tx_prob, default_inter_ext_tx_prob);
+  vp10_copy(fc->intra_ext_tx_prob, default_intra_ext_tx_prob);
 }
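[Aside: vp10_copy is not defined in this patch; assuming it mirrors vp9's
vp9_copy in common.h, it is a size-checked memcpy macro, a minimal sketch:]

#define vp10_copy_sketch(dest, src) {        \
    assert(sizeof(dest) == sizeof(src));     \
    memcpy(dest, src, sizeof(src));          \
  }

[The sizeof check is what forces every default_* table above to match its
FRAME_CONTEXT field exactly in dimensions and element type.]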
 
+#if CONFIG_EXT_INTERP
 const vpx_tree_index vp10_switchable_interp_tree
-                         [TREE_SIZE(SWITCHABLE_FILTERS)] = {
-  -EIGHTTAP, 2,
-  -EIGHTTAP_SMOOTH, -EIGHTTAP_SHARP
+[TREE_SIZE(SWITCHABLE_FILTERS)] = {
+  -EIGHTTAP_REGULAR, 2,
+  4, 6,
+  -EIGHTTAP_SMOOTH, -EIGHTTAP_SMOOTH2,
+  -MULTITAP_SHARP, -MULTITAP_SHARP2,
 };
+#else
+const vpx_tree_index vp10_switchable_interp_tree
+[TREE_SIZE(SWITCHABLE_FILTERS)] = {
+  -EIGHTTAP_REGULAR, 2,
+  -EIGHTTAP_SMOOTH, -MULTITAP_SHARP
+};
+#endif  // CONFIG_EXT_INTERP
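[Aside: spelling out the CONFIG_EXT_INTERP tree above, the five filters get
the code words EIGHTTAP_REGULAR = 0, EIGHTTAP_SMOOTH = 100,
EIGHTTAP_SMOOTH2 = 101, MULTITAP_SHARP = 110, MULTITAP_SHARP2 = 111, each
bit coded with its own adapted probability, so the presumably most common
filter stays a single bit while the 3-filter tree in the #else branch keeps
the VP9 shape.]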
 
 void vp10_adapt_inter_frame_probs(VP10_COMMON *cm) {
   int i, j;
@@ -379,37 +1357,105 @@
   const FRAME_COUNTS *counts = &cm->counts;
 
   for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
-    fc->intra_inter_prob[i] = mode_mv_merge_probs(pre_fc->intra_inter_prob[i],
-                                                  counts->intra_inter[i]);
+    fc->intra_inter_prob[i] = vp10_mode_mv_merge_probs(
+        pre_fc->intra_inter_prob[i], counts->intra_inter[i]);
   for (i = 0; i < COMP_INTER_CONTEXTS; i++)
-    fc->comp_inter_prob[i] = mode_mv_merge_probs(pre_fc->comp_inter_prob[i],
-                                                 counts->comp_inter[i]);
+    fc->comp_inter_prob[i] = vp10_mode_mv_merge_probs(
+        pre_fc->comp_inter_prob[i], counts->comp_inter[i]);
+
+#if CONFIG_EXT_REFS
   for (i = 0; i < REF_CONTEXTS; i++)
-    fc->comp_ref_prob[i] = mode_mv_merge_probs(pre_fc->comp_ref_prob[i],
-                                               counts->comp_ref[i]);
+    for (j = 0; j < (FWD_REFS - 1); j++)
+      fc->comp_ref_prob[i][j] = mode_mv_merge_probs(
+          pre_fc->comp_ref_prob[i][j], counts->comp_ref[i][j]);
   for (i = 0; i < REF_CONTEXTS; i++)
-    for (j = 0; j < 2; j++)
-      fc->single_ref_prob[i][j] = mode_mv_merge_probs(
+    for (j = 0; j < (BWD_REFS - 1); j++)
+      fc->comp_bwdref_prob[i][j] = mode_mv_merge_probs(
+          pre_fc->comp_bwdref_prob[i][j], counts->comp_bwdref[i][j]);
+#else
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < (COMP_REFS - 1); j++)
+      fc->comp_ref_prob[i][j] = mode_mv_merge_probs(
+          pre_fc->comp_ref_prob[i][j], counts->comp_ref[i][j]);
+#endif  // CONFIG_EXT_REFS
+
+  for (i = 0; i < REF_CONTEXTS; i++)
+    for (j = 0; j < (SINGLE_REFS - 1); j++)
+      fc->single_ref_prob[i][j] = vp10_mode_mv_merge_probs(
           pre_fc->single_ref_prob[i][j], counts->single_ref[i][j]);
 
+#if CONFIG_REF_MV
+  for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
+    fc->newmv_prob[i] = vp10_mode_mv_merge_probs(pre_fc->newmv_prob[i],
+                                                 counts->newmv_mode[i]);
+  for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
+    fc->zeromv_prob[i] = vp10_mode_mv_merge_probs(pre_fc->zeromv_prob[i],
+                                                  counts->zeromv_mode[i]);
+  for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
+    fc->refmv_prob[i] = vp10_mode_mv_merge_probs(pre_fc->refmv_prob[i],
+                                                 counts->refmv_mode[i]);
+
+  for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
+    fc->drl_prob[i] = vp10_mode_mv_merge_probs(pre_fc->drl_prob[i],
+                                               counts->drl_mode[i]);
+#if CONFIG_EXT_INTER
+  fc->new2mv_prob = vp10_mode_mv_merge_probs(pre_fc->new2mv_prob,
+                                             counts->new2mv_mode);
+#endif  // CONFIG_EXT_INTER
+#else
   for (i = 0; i < INTER_MODE_CONTEXTS; i++)
     vpx_tree_merge_probs(vp10_inter_mode_tree, pre_fc->inter_mode_probs[i],
-                counts->inter_mode[i], fc->inter_mode_probs[i]);
+                         counts->inter_mode[i], fc->inter_mode_probs[i]);
+#endif
+
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+  for (i = BLOCK_8X8; i < BLOCK_SIZES; ++i)
+    vpx_tree_merge_probs(vp10_motvar_tree, pre_fc->motvar_prob[i],
+                         counts->motvar[i], fc->motvar_prob[i]);
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+
+#if CONFIG_SUPERTX
+  for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
+    int j;
+    for (j = 1; j < TX_SIZES; ++j) {
+      fc->supertx_prob[i][j] = vp10_mode_mv_merge_probs(
+          pre_fc->supertx_prob[i][j], counts->supertx[i][j]);
+    }
+  }
+#endif  // CONFIG_SUPERTX
+
+#if CONFIG_EXT_INTER
+  for (i = 0; i < INTER_MODE_CONTEXTS; i++)
+    vpx_tree_merge_probs(vp10_inter_compound_mode_tree,
+                         pre_fc->inter_compound_mode_probs[i],
+                         counts->inter_compound_mode[i],
+                         fc->inter_compound_mode_probs[i]);
+  for (i = 0; i < BLOCK_SIZE_GROUPS; ++i) {
+    if (is_interintra_allowed_bsize_group(i))
+      fc->interintra_prob[i] = vp10_mode_mv_merge_probs(
+          pre_fc->interintra_prob[i], counts->interintra[i]);
+  }
+  for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+    vpx_tree_merge_probs(
+        vp10_interintra_mode_tree, pre_fc->interintra_mode_prob[i],
+        counts->interintra_mode[i], fc->interintra_mode_prob[i]);
+  }
+  for (i = 0; i < BLOCK_SIZES; ++i) {
+    if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i))
+      fc->wedge_interintra_prob[i] = vp10_mode_mv_merge_probs(
+          pre_fc->wedge_interintra_prob[i], counts->wedge_interintra[i]);
+  }
+  for (i = 0; i < BLOCK_SIZES; ++i) {
+    if (is_interinter_wedge_used(i))
+      fc->wedge_interinter_prob[i] = vp10_mode_mv_merge_probs(
+          pre_fc->wedge_interinter_prob[i], counts->wedge_interinter[i]);
+  }
+#endif  // CONFIG_EXT_INTER
 
   for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
     vpx_tree_merge_probs(vp10_intra_mode_tree, pre_fc->y_mode_prob[i],
                 counts->y_mode[i], fc->y_mode_prob[i]);
 
-#if !CONFIG_MISC_FIXES
-  for (i = 0; i < INTRA_MODES; ++i)
-    vpx_tree_merge_probs(vp10_intra_mode_tree, pre_fc->uv_mode_prob[i],
-                         counts->uv_mode[i], fc->uv_mode_prob[i]);
-
-  for (i = 0; i < PARTITION_CONTEXTS; i++)
-    vpx_tree_merge_probs(vp10_partition_tree, pre_fc->partition_prob[i],
-                         counts->partition[i], fc->partition_prob[i]);
-#endif
-
   if (cm->interp_filter == SWITCHABLE) {
     for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
       vpx_tree_merge_probs(vp10_switchable_interp_tree,
@@ -420,41 +1466,57 @@
 }
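[Aside: a hedged sketch of the adaptation primitive used throughout this
function. vp10_mode_mv_merge_probs is expected to behave like merge_probs()
in vpx_dsp/prob.h: pull the previous frame's probability toward the
probability implied by this frame's branch counts, with a weight that
saturates as counts grow:]

static vpx_prob merge_probs_sketch(vpx_prob pre_prob, const unsigned int ct[2],
                                   unsigned int count_sat,
                                   unsigned int max_update_factor) {
  /* Probability implied by the observed 0/1 branch counts. */
  const vpx_prob prob = get_binary_prob(ct[0], ct[1]);
  /* Saturating weight: more observations => a larger step toward 'prob'.
     VPXMIN is from vpx_dsp/vpx_dsp_common.h. */
  const unsigned int count = VPXMIN(ct[0] + ct[1], count_sat);
  const unsigned int factor = max_update_factor * count / count_sat;
  return weighted_prob(pre_prob, prob, factor);
}

[For the mode/MV variant the saturation constant is small (VP9 uses
MODE_MV_COUNT_SAT = 20), so a handful of observations already moves the
probability most of the way.]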
 
 void vp10_adapt_intra_frame_probs(VP10_COMMON *cm) {
-  int i;
+  int i, j;
   FRAME_CONTEXT *fc = cm->fc;
   const FRAME_CONTEXT *pre_fc = &cm->frame_contexts[cm->frame_context_idx];
   const FRAME_COUNTS *counts = &cm->counts;
 
   if (cm->tx_mode == TX_MODE_SELECT) {
-    int j;
-    unsigned int branch_ct_8x8p[TX_SIZES - 3][2];
-    unsigned int branch_ct_16x16p[TX_SIZES - 2][2];
-    unsigned int branch_ct_32x32p[TX_SIZES - 1][2];
-
-    for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
-      vp10_tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], branch_ct_8x8p);
-      for (j = 0; j < TX_SIZES - 3; ++j)
-        fc->tx_probs.p8x8[i][j] = mode_mv_merge_probs(
-            pre_fc->tx_probs.p8x8[i][j], branch_ct_8x8p[j]);
-
-      vp10_tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], branch_ct_16x16p);
-      for (j = 0; j < TX_SIZES - 2; ++j)
-        fc->tx_probs.p16x16[i][j] = mode_mv_merge_probs(
-            pre_fc->tx_probs.p16x16[i][j], branch_ct_16x16p[j]);
-
-      vp10_tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], branch_ct_32x32p);
-      for (j = 0; j < TX_SIZES - 1; ++j)
-        fc->tx_probs.p32x32[i][j] = mode_mv_merge_probs(
-            pre_fc->tx_probs.p32x32[i][j], branch_ct_32x32p[j]);
+    for (i = 0; i < TX_SIZES - 1; ++i) {
+      for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+        vpx_tree_merge_probs(vp10_tx_size_tree[i],
+                             pre_fc->tx_size_probs[i][j],
+                             counts->tx_size[i][j],
+                             fc->tx_size_probs[i][j]);
     }
   }
 
+#if CONFIG_VAR_TX
+  if (cm->tx_mode == TX_MODE_SELECT)
+    for (i = 0; i < TXFM_PARTITION_CONTEXTS; ++i)
+      fc->txfm_partition_prob[i] =
+          vp10_mode_mv_merge_probs(pre_fc->txfm_partition_prob[i],
+                              counts->txfm_partition[i]);
+#endif
+
   for (i = 0; i < SKIP_CONTEXTS; ++i)
-    fc->skip_probs[i] = mode_mv_merge_probs(
+    fc->skip_probs[i] = vp10_mode_mv_merge_probs(
         pre_fc->skip_probs[i], counts->skip[i]);
 
+#if CONFIG_EXT_TX
   for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
-    int j;
+    int s;
+    for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+      if (use_inter_ext_tx_for_txsize[s][i]) {
+        vpx_tree_merge_probs(vp10_ext_tx_inter_tree[s],
+                             pre_fc->inter_ext_tx_prob[s][i],
+                             counts->inter_ext_tx[s][i],
+                             fc->inter_ext_tx_prob[s][i]);
+      }
+    }
+    for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+      if (use_intra_ext_tx_for_txsize[s][i]) {
+        int j;
+        for (j = 0; j < INTRA_MODES; ++j)
+          vpx_tree_merge_probs(vp10_ext_tx_intra_tree[s],
+                               pre_fc->intra_ext_tx_prob[s][i][j],
+                               counts->intra_ext_tx[s][i][j],
+                               fc->intra_ext_tx_prob[s][i][j]);
+      }
+    }
+  }
+#else
+  for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
     for (j = 0; j < TX_TYPES; ++j)
       vpx_tree_merge_probs(vp10_ext_tx_tree,
                            pre_fc->intra_ext_tx_prob[i][j],
@@ -467,12 +1529,12 @@
                          counts->inter_ext_tx[i],
                          fc->inter_ext_tx_prob[i]);
   }
+#endif  // CONFIG_EXT_TX
 
-#if CONFIG_MISC_FIXES
   if (cm->seg.temporal_update) {
     for (i = 0; i < PREDICTION_PROBS; i++)
-      fc->seg.pred_probs[i] = mode_mv_merge_probs(pre_fc->seg.pred_probs[i],
-                                                  counts->seg.pred[i]);
+      fc->seg.pred_probs[i] = vp10_mode_mv_merge_probs(
+          pre_fc->seg.pred_probs[i], counts->seg.pred[i]);
 
     vpx_tree_merge_probs(vp10_segment_tree, pre_fc->seg.tree_probs,
                          counts->seg.tree_mispred, fc->seg.tree_probs);
@@ -485,10 +1547,28 @@
     vpx_tree_merge_probs(vp10_intra_mode_tree, pre_fc->uv_mode_prob[i],
                          counts->uv_mode[i], fc->uv_mode_prob[i]);
 
+#if CONFIG_EXT_PARTITION_TYPES
+  vpx_tree_merge_probs(vp10_partition_tree, pre_fc->partition_prob[0],
+                       counts->partition[0], fc->partition_prob[0]);
+  for (i = 1; i < PARTITION_CONTEXTS; i++)
+    vpx_tree_merge_probs(vp10_ext_partition_tree, pre_fc->partition_prob[i],
+                         counts->partition[i], fc->partition_prob[i]);
+#else
   for (i = 0; i < PARTITION_CONTEXTS; i++)
     vpx_tree_merge_probs(vp10_partition_tree, pre_fc->partition_prob[i],
                          counts->partition[i], fc->partition_prob[i]);
-#endif
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_EXT_INTRA
+  for (i = 0; i < PLANE_TYPES; ++i) {
+    fc->ext_intra_probs[i] = vp10_mode_mv_merge_probs(
+              pre_fc->ext_intra_probs[i], counts->ext_intra[i]);
+  }
+
+  for (i = 0; i < INTRA_FILTERS + 1; ++i)
+    vpx_tree_merge_probs(vp10_intra_filter_tree, pre_fc->intra_filter_probs[i],
+                         counts->intra_filter[i], fc->intra_filter_probs[i]);
+#endif  // CONFIG_EXT_INTRA
 }
 
 static void set_default_lf_deltas(struct loopfilter *lf) {
@@ -497,6 +1577,11 @@
 
   lf->ref_deltas[INTRA_FRAME] = 1;
   lf->ref_deltas[LAST_FRAME] = 0;
+#if CONFIG_EXT_REFS
+  lf->ref_deltas[LAST2_FRAME] = lf->ref_deltas[LAST_FRAME];
+  lf->ref_deltas[LAST3_FRAME] = lf->ref_deltas[LAST_FRAME];
+  lf->ref_deltas[BWDREF_FRAME] = lf->ref_deltas[LAST_FRAME];
+#endif  // CONFIG_EXT_REFS
   lf->ref_deltas[GOLDEN_FRAME] = -1;
   lf->ref_deltas[ALTREF_FRAME] = -1;
 
@@ -526,6 +1611,9 @@
 
   // To force update of the sharpness
   lf->last_sharpness_level = -1;
+#if CONFIG_LOOP_RESTORATION
+  cm->rst_info.restoration_level = -1;
+#endif  // CONFIG_LOOP_RESTORATION
 
   vp10_default_coef_probs(cm);
   init_mode_probs(cm->fc);
diff --git a/vp10/common/entropymode.h b/vp10/common/entropymode.h
index 611d3ad..71e79d9 100644
--- a/vp10/common/entropymode.h
+++ b/vp10/common/entropymode.h
@@ -26,22 +26,18 @@
 #define TX_SIZE_CONTEXTS 2
 
 #define INTER_OFFSET(mode) ((mode) - NEARESTMV)
+#if CONFIG_EXT_INTER
+#define INTER_COMPOUND_OFFSET(mode) ((mode) - NEAREST_NEARESTMV)
+#endif  // CONFIG_EXT_INTER
+
+#define PALETTE_COLOR_CONTEXTS 16
+#define PALETTE_MAX_SIZE 8
+#define PALETTE_BLOCK_SIZES (BLOCK_LARGEST - BLOCK_8X8 + 1)
+#define PALETTE_Y_MODE_CONTEXTS 3
+#define PALETTE_MAX_BLOCK_SIZE (64 * 64)
 
 struct VP10Common;
 
-struct tx_probs {
-  vpx_prob p32x32[TX_SIZE_CONTEXTS][TX_SIZES - 1];
-  vpx_prob p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 2];
-  vpx_prob p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 3];
-};
-
-struct tx_counts {
-  unsigned int p32x32[TX_SIZE_CONTEXTS][TX_SIZES];
-  unsigned int p16x16[TX_SIZE_CONTEXTS][TX_SIZES - 1];
-  unsigned int p8x8[TX_SIZE_CONTEXTS][TX_SIZES - 2];
-  unsigned int tx_totals[TX_SIZES];
-};
-
 struct seg_counts {
   unsigned int tree_total[MAX_SEGMENTS];
   unsigned int tree_mispred[MAX_SEGMENTS];
@@ -51,81 +47,210 @@
 typedef struct frame_contexts {
   vpx_prob y_mode_prob[BLOCK_SIZE_GROUPS][INTRA_MODES - 1];
   vpx_prob uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
+#if CONFIG_EXT_PARTITION_TYPES
+  vpx_prob partition_prob[PARTITION_CONTEXTS][EXT_PARTITION_TYPES - 1];
+#else
   vpx_prob partition_prob[PARTITION_CONTEXTS][PARTITION_TYPES - 1];
+#endif
   vp10_coeff_probs_model coef_probs[TX_SIZES][PLANE_TYPES];
+#if CONFIG_ANS
+  coeff_cdf_model coef_cdfs[TX_SIZES][PLANE_TYPES];
+#endif
   vpx_prob switchable_interp_prob[SWITCHABLE_FILTER_CONTEXTS]
                                  [SWITCHABLE_FILTERS - 1];
+
+#if CONFIG_REF_MV
+  vpx_prob newmv_prob[NEWMV_MODE_CONTEXTS];
+  vpx_prob zeromv_prob[ZEROMV_MODE_CONTEXTS];
+  vpx_prob refmv_prob[REFMV_MODE_CONTEXTS];
+  vpx_prob drl_prob[DRL_MODE_CONTEXTS];
+
+#if CONFIG_EXT_INTER
+  vpx_prob new2mv_prob;
+#endif  // CONFIG_EXT_INTER
+#endif  // CONFIG_REF_MV
+
   vpx_prob inter_mode_probs[INTER_MODE_CONTEXTS][INTER_MODES - 1];
+#if CONFIG_EXT_INTER
+  vpx_prob inter_compound_mode_probs[INTER_MODE_CONTEXTS]
+                                    [INTER_COMPOUND_MODES - 1];
+  vpx_prob interintra_prob[BLOCK_SIZE_GROUPS];
+  vpx_prob interintra_mode_prob[BLOCK_SIZE_GROUPS][INTERINTRA_MODES - 1];
+  vpx_prob wedge_interintra_prob[BLOCK_SIZES];
+  vpx_prob wedge_interinter_prob[BLOCK_SIZES];
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+  vpx_prob motvar_prob[BLOCK_SIZES][MOTION_VARIATIONS - 1];
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
   vpx_prob intra_inter_prob[INTRA_INTER_CONTEXTS];
   vpx_prob comp_inter_prob[COMP_INTER_CONTEXTS];
-  vpx_prob single_ref_prob[REF_CONTEXTS][2];
-  vpx_prob comp_ref_prob[REF_CONTEXTS];
-  struct tx_probs tx_probs;
-  vpx_prob skip_probs[SKIP_CONTEXTS];
-  nmv_context nmvc;
-#if CONFIG_MISC_FIXES
-  struct segmentation_probs seg;
+  vpx_prob single_ref_prob[REF_CONTEXTS][SINGLE_REFS-1];
+#if CONFIG_EXT_REFS
+  vpx_prob comp_ref_prob[REF_CONTEXTS][FWD_REFS-1];
+  vpx_prob comp_bwdref_prob[REF_CONTEXTS][BWD_REFS-1];
+#else
+  vpx_prob comp_ref_prob[REF_CONTEXTS][COMP_REFS-1];
+#endif  // CONFIG_EXT_REFS
+  vpx_prob tx_size_probs[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES - 1];
+#if CONFIG_VAR_TX
+  vpx_prob txfm_partition_prob[TXFM_PARTITION_CONTEXTS];
 #endif
+  vpx_prob skip_probs[SKIP_CONTEXTS];
+#if CONFIG_REF_MV
+  nmv_context nmvc[NMV_CONTEXTS];
+#else
+  nmv_context nmvc;
+#endif
+  int initialized;
+#if CONFIG_EXT_TX
+  vpx_prob inter_ext_tx_prob[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES - 1];
+  vpx_prob intra_ext_tx_prob[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+                            [TX_TYPES - 1];
+#else
   vpx_prob intra_ext_tx_prob[EXT_TX_SIZES][TX_TYPES][TX_TYPES - 1];
   vpx_prob inter_ext_tx_prob[EXT_TX_SIZES][TX_TYPES - 1];
-  int initialized;
+#endif  // CONFIG_EXT_TX
+#if CONFIG_SUPERTX
+  vpx_prob supertx_prob[PARTITION_SUPERTX_CONTEXTS][TX_SIZES];
+#endif  // CONFIG_SUPERTX
+  struct segmentation_probs seg;
+#if CONFIG_EXT_INTRA
+  vpx_prob ext_intra_probs[PLANE_TYPES];
+  vpx_prob intra_filter_probs[INTRA_FILTERS + 1][INTRA_FILTERS - 1];
+#endif  // CONFIG_EXT_INTRA
 } FRAME_CONTEXT;
 
 typedef struct FRAME_COUNTS {
+  // Note: This structure should only contain 'unsigned int' fields, or
+  // aggregates built solely from 'unsigned int' fields/elements.
   unsigned int kf_y_mode[INTRA_MODES][INTRA_MODES][INTRA_MODES];
   unsigned int y_mode[BLOCK_SIZE_GROUPS][INTRA_MODES];
   unsigned int uv_mode[INTRA_MODES][INTRA_MODES];
+#if CONFIG_EXT_PARTITION_TYPES
+  unsigned int partition[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+#else
   unsigned int partition[PARTITION_CONTEXTS][PARTITION_TYPES];
+#endif
   vp10_coeff_count_model coef[TX_SIZES][PLANE_TYPES];
   unsigned int eob_branch[TX_SIZES][PLANE_TYPES][REF_TYPES]
                          [COEF_BANDS][COEFF_CONTEXTS];
   unsigned int switchable_interp[SWITCHABLE_FILTER_CONTEXTS]
                                 [SWITCHABLE_FILTERS];
+#if CONFIG_REF_MV
+  unsigned int newmv_mode[NEWMV_MODE_CONTEXTS][2];
+  unsigned int zeromv_mode[ZEROMV_MODE_CONTEXTS][2];
+  unsigned int refmv_mode[REFMV_MODE_CONTEXTS][2];
+  unsigned int drl_mode[DRL_MODE_CONTEXTS][2];
+#if CONFIG_EXT_INTER
+  unsigned int new2mv_mode[2];
+#endif  // CONFIG_EXT_INTER
+#endif
+
   unsigned int inter_mode[INTER_MODE_CONTEXTS][INTER_MODES];
+#if CONFIG_EXT_INTER
+  unsigned int inter_compound_mode[INTER_MODE_CONTEXTS][INTER_COMPOUND_MODES];
+  unsigned int interintra[BLOCK_SIZE_GROUPS][2];
+  unsigned int interintra_mode[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+  unsigned int wedge_interintra[BLOCK_SIZES][2];
+  unsigned int wedge_interinter[BLOCK_SIZES][2];
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+  unsigned int motvar[BLOCK_SIZES][MOTION_VARIATIONS];
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
   unsigned int intra_inter[INTRA_INTER_CONTEXTS][2];
   unsigned int comp_inter[COMP_INTER_CONTEXTS][2];
-  unsigned int single_ref[REF_CONTEXTS][2][2];
-  unsigned int comp_ref[REF_CONTEXTS][2];
-  struct tx_counts tx;
-  unsigned int skip[SKIP_CONTEXTS][2];
-  nmv_context_counts mv;
-#if CONFIG_MISC_FIXES
-  struct seg_counts seg;
+  unsigned int single_ref[REF_CONTEXTS][SINGLE_REFS-1][2];
+#if CONFIG_EXT_REFS
+  unsigned int comp_ref[REF_CONTEXTS][FWD_REFS-1][2];
+  unsigned int comp_bwdref[REF_CONTEXTS][BWD_REFS-1][2];
+#else
+  unsigned int comp_ref[REF_CONTEXTS][COMP_REFS-1][2];
+#endif  // CONFIG_EXT_REFS
+  unsigned int tx_size_totals[TX_SIZES];
+  unsigned int tx_size[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
+#if CONFIG_VAR_TX
+  unsigned int txfm_partition[TXFM_PARTITION_CONTEXTS][2];
 #endif
+  unsigned int skip[SKIP_CONTEXTS][2];
+#if CONFIG_REF_MV
+  nmv_context_counts mv[NMV_CONTEXTS];
+#else
+  nmv_context_counts mv;
+#endif
+#if CONFIG_EXT_TX
+  unsigned int inter_ext_tx[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+  unsigned int intra_ext_tx[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+                           [TX_TYPES];
+#else
   unsigned int intra_ext_tx[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
   unsigned int inter_ext_tx[EXT_TX_SIZES][TX_TYPES];
+#endif  // CONFIG_EXT_TX
+#if CONFIG_SUPERTX
+  unsigned int supertx[PARTITION_SUPERTX_CONTEXTS][TX_SIZES][2];
+  unsigned int supertx_size[TX_SIZES];
+#endif  // CONFIG_SUPERTX
+  struct seg_counts seg;
+#if CONFIG_EXT_INTRA
+  unsigned int ext_intra[PLANE_TYPES][2];
+  unsigned int intra_filter[INTRA_FILTERS + 1][INTRA_FILTERS];
+#endif  // CONFIG_EXT_INTRA
 } FRAME_COUNTS;
 
 extern const vpx_prob vp10_kf_y_mode_prob[INTRA_MODES][INTRA_MODES]
                                         [INTRA_MODES - 1];
-#if !CONFIG_MISC_FIXES
-extern const vpx_prob vp10_kf_uv_mode_prob[INTRA_MODES][INTRA_MODES - 1];
-extern const vpx_prob vp10_kf_partition_probs[PARTITION_CONTEXTS]
-                                            [PARTITION_TYPES - 1];
-#endif
+extern const vpx_prob
+vp10_default_palette_y_mode_prob[PALETTE_BLOCK_SIZES][PALETTE_Y_MODE_CONTEXTS];
+extern const vpx_prob vp10_default_palette_uv_mode_prob[2];
+extern const vpx_prob
+vp10_default_palette_y_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1];
+extern const vpx_prob
+vp10_default_palette_uv_size_prob[PALETTE_BLOCK_SIZES][PALETTE_SIZES - 1];
+extern const vpx_prob vp10_default_palette_y_color_prob
+[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1];
+extern const vpx_prob vp10_default_palette_uv_color_prob
+[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1];
 
 extern const vpx_tree_index vp10_intra_mode_tree[TREE_SIZE(INTRA_MODES)];
 extern const vpx_tree_index vp10_inter_mode_tree[TREE_SIZE(INTER_MODES)];
+#if CONFIG_EXT_INTER
+extern const vpx_tree_index vp10_interintra_mode_tree
+                            [TREE_SIZE(INTERINTRA_MODES)];
+extern const vpx_tree_index vp10_inter_compound_mode_tree
+                            [TREE_SIZE(INTER_COMPOUND_MODES)];
+#endif  // CONFIG_EXT_INTER
 extern const vpx_tree_index vp10_partition_tree[TREE_SIZE(PARTITION_TYPES)];
+#if CONFIG_EXT_PARTITION_TYPES
+extern const vpx_tree_index vp10_ext_partition_tree
+                                [TREE_SIZE(EXT_PARTITION_TYPES)];
+#endif
 extern const vpx_tree_index vp10_switchable_interp_tree
                                 [TREE_SIZE(SWITCHABLE_FILTERS)];
-
+extern const vpx_tree_index vp10_palette_size_tree[TREE_SIZE(PALETTE_SIZES)];
+extern const vpx_tree_index
+vp10_palette_color_tree[PALETTE_MAX_SIZE - 1][TREE_SIZE(PALETTE_COLORS)];
+extern const vpx_tree_index
+vp10_tx_size_tree[TX_SIZES - 1][TREE_SIZE(TX_SIZES)];
+#if CONFIG_EXT_INTRA
+extern const vpx_tree_index vp10_intra_filter_tree[TREE_SIZE(INTRA_FILTERS)];
+#endif  // CONFIG_EXT_INTRA
+#if CONFIG_EXT_TX
+extern const vpx_tree_index
+    vp10_ext_tx_inter_tree[EXT_TX_SETS_INTER][TREE_SIZE(TX_TYPES)];
+extern const vpx_tree_index
+    vp10_ext_tx_intra_tree[EXT_TX_SETS_INTRA][TREE_SIZE(TX_TYPES)];
+#else
+extern const vpx_tree_index
+    vp10_ext_tx_tree[TREE_SIZE(TX_TYPES)];
+#endif  // CONFIG_EXT_TX
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+extern const vpx_tree_index vp10_motvar_tree[TREE_SIZE(MOTION_VARIATIONS)];
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
 
 void vp10_setup_past_independence(struct VP10Common *cm);
 
 void vp10_adapt_intra_frame_probs(struct VP10Common *cm);
 void vp10_adapt_inter_frame_probs(struct VP10Common *cm);
 
-void vp10_tx_counts_to_branch_counts_32x32(const unsigned int *tx_count_32x32p,
-                                      unsigned int (*ct_32x32p)[2]);
-void vp10_tx_counts_to_branch_counts_16x16(const unsigned int *tx_count_16x16p,
-                                      unsigned int (*ct_16x16p)[2]);
-void vp10_tx_counts_to_branch_counts_8x8(const unsigned int *tx_count_8x8p,
-                                    unsigned int (*ct_8x8p)[2]);
-
-extern const vpx_tree_index
-    vp10_ext_tx_tree[TREE_SIZE(TX_TYPES)];
-
 static INLINE int vp10_ceil_log2(int n) {
   int i = 1, p = 2;
   while (p < n) {
@@ -135,6 +260,9 @@
   return i;
 }
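[Aside: vp10_ceil_log2(n) returns ceil(log2(n)) for n >= 2, e.g.
vp10_ceil_log2(8) == 3 and vp10_ceil_log2(9) == 4; note it returns 1 rather
than 0 for n == 1 since i starts at 1 -- the palette-size callers appear to
only pass n >= 2.]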
 
+int vp10_get_palette_color_context(const uint8_t *color_map, int cols,
+                                   int r, int c, int n, int *color_order);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/common/entropymv.c b/vp10/common/entropymv.c
index a9946ee..87c6421 100644
--- a/vp10/common/entropymv.c
+++ b/vp10/common/entropymv.c
@@ -44,7 +44,12 @@
 };
 
 static const nmv_context default_nmv_context = {
+#if CONFIG_REF_MV
+  {1, 64, 96},
+  128,
+#else
   {32, 64, 96},
+#endif
   {
     { // Vertical component
       128,                                                  // sign
@@ -127,14 +132,11 @@
   return c;
 }
 
+// TODO(jingning): This currently-idle function is intentionally left as is
+// for experimental purposes.
 int vp10_use_mv_hp(const MV *ref) {
-#if CONFIG_MISC_FIXES
   (void) ref;
   return 1;
-#else
-  return (abs(ref->row) >> 3) < COMPANDED_MVREF_THRESH &&
-         (abs(ref->col) >> 3) < COMPANDED_MVREF_THRESH;
-#endif
 }
 
 static void inc_mv_component(int v, nmv_component_counts *comp_counts,
@@ -155,37 +157,82 @@
   if (c == MV_CLASS_0) {
     comp_counts->class0[d] += incr;
     comp_counts->class0_fp[d][f] += incr;
-    comp_counts->class0_hp[e] += usehp * incr;
+    if (usehp)
+      comp_counts->class0_hp[e] += incr;
   } else {
     int i;
     int b = c + CLASS0_BITS - 1;  // number of bits
     for (i = 0; i < b; ++i)
       comp_counts->bits[i][((d >> i) & 1)] += incr;
     comp_counts->fp[f] += incr;
-    comp_counts->hp[e] += usehp * incr;
+    if (usehp)
+      comp_counts->hp[e] += incr;
   }
 }
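[Aside: the (d, f, e) arguments fed in above come from the VP9-style MV
component decomposition: a sign bit, a magnitude class c bucketing the
bit-length of |mv| - 1, an integer offset d within the class (coded bit by
bit outside class 0), a 2-bit fraction f in quarter-pel steps, and a final
eighth-pel bit e that is only counted when usehp is set -- which is the
behavioral change in this hunk: the hp counters are now skipped entirely
rather than incremented by zero.]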
 
 void vp10_inc_mv(const MV *mv, nmv_context_counts *counts, const int usehp) {
   if (counts != NULL) {
     const MV_JOINT_TYPE j = vp10_get_mv_joint(mv);
+
+#if CONFIG_REF_MV
+    ++counts->zero_rmv[j == MV_JOINT_ZERO];
+    if (j == MV_JOINT_ZERO)
+      return;
+#endif
     ++counts->joints[j];
 
-    if (mv_joint_vertical(j)) {
-      inc_mv_component(mv->row, &counts->comps[0], 1,
-                       !CONFIG_MISC_FIXES || usehp);
-    }
+    if (mv_joint_vertical(j))
+      inc_mv_component(mv->row, &counts->comps[0], 1, usehp);
 
-    if (mv_joint_horizontal(j)) {
-      inc_mv_component(mv->col, &counts->comps[1], 1,
-                       !CONFIG_MISC_FIXES || usehp);
-    }
+    if (mv_joint_horizontal(j))
+      inc_mv_component(mv->col, &counts->comps[1], 1, usehp);
   }
 }
 
 void vp10_adapt_mv_probs(VP10_COMMON *cm, int allow_hp) {
   int i, j;
+#if CONFIG_REF_MV
+  int idx;
+  for (idx = 0; idx < NMV_CONTEXTS; ++idx) {
+    nmv_context *fc = &cm->fc->nmvc[idx];
+    const nmv_context *pre_fc =
+        &cm->frame_contexts[cm->frame_context_idx].nmvc[idx];
+    const nmv_context_counts *counts = &cm->counts.mv[idx];
 
+    vpx_tree_merge_probs(vp10_mv_joint_tree, pre_fc->joints, counts->joints,
+                         fc->joints);
+#if CONFIG_REF_MV
+    fc->zero_rmv = vp10_mode_mv_merge_probs(pre_fc->zero_rmv, counts->zero_rmv);
+#endif
+
+    for (i = 0; i < 2; ++i) {
+      nmv_component *comp = &fc->comps[i];
+      const nmv_component *pre_comp = &pre_fc->comps[i];
+      const nmv_component_counts *c = &counts->comps[i];
+
+      comp->sign = vp10_mode_mv_merge_probs(pre_comp->sign, c->sign);
+      vpx_tree_merge_probs(vp10_mv_class_tree, pre_comp->classes, c->classes,
+                           comp->classes);
+      vpx_tree_merge_probs(vp10_mv_class0_tree, pre_comp->class0, c->class0,
+                           comp->class0);
+
+      for (j = 0; j < MV_OFFSET_BITS; ++j)
+        comp->bits[j] = vp10_mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
+
+      for (j = 0; j < CLASS0_SIZE; ++j)
+        vpx_tree_merge_probs(vp10_mv_fp_tree, pre_comp->class0_fp[j],
+                             c->class0_fp[j], comp->class0_fp[j]);
+
+      vpx_tree_merge_probs(vp10_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
+
+      if (allow_hp) {
+        comp->class0_hp = vp10_mode_mv_merge_probs(pre_comp->class0_hp,
+                                                   c->class0_hp);
+        comp->hp = vp10_mode_mv_merge_probs(pre_comp->hp, c->hp);
+      }
+    }
+  }
+#else
   nmv_context *fc = &cm->fc->nmvc;
   const nmv_context *pre_fc = &cm->frame_contexts[cm->frame_context_idx].nmvc;
   const nmv_context_counts *counts = &cm->counts.mv;
@@ -198,14 +245,14 @@
     const nmv_component *pre_comp = &pre_fc->comps[i];
     const nmv_component_counts *c = &counts->comps[i];
 
-    comp->sign = mode_mv_merge_probs(pre_comp->sign, c->sign);
+    comp->sign = vp10_mode_mv_merge_probs(pre_comp->sign, c->sign);
     vpx_tree_merge_probs(vp10_mv_class_tree, pre_comp->classes, c->classes,
                          comp->classes);
     vpx_tree_merge_probs(vp10_mv_class0_tree, pre_comp->class0, c->class0,
                          comp->class0);
 
     for (j = 0; j < MV_OFFSET_BITS; ++j)
-      comp->bits[j] = mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
+      comp->bits[j] = vp10_mode_mv_merge_probs(pre_comp->bits[j], c->bits[j]);
 
     for (j = 0; j < CLASS0_SIZE; ++j)
       vpx_tree_merge_probs(vp10_mv_fp_tree, pre_comp->class0_fp[j],
@@ -214,12 +261,20 @@
     vpx_tree_merge_probs(vp10_mv_fp_tree, pre_comp->fp, c->fp, comp->fp);
 
     if (allow_hp) {
-      comp->class0_hp = mode_mv_merge_probs(pre_comp->class0_hp, c->class0_hp);
-      comp->hp = mode_mv_merge_probs(pre_comp->hp, c->hp);
+      comp->class0_hp = vp10_mode_mv_merge_probs(
+          pre_comp->class0_hp, c->class0_hp);
+      comp->hp = vp10_mode_mv_merge_probs(pre_comp->hp, c->hp);
     }
   }
+#endif
 }
 
 void vp10_init_mv_probs(VP10_COMMON *cm) {
+#if CONFIG_REF_MV
+  int i;
+  for (i = 0; i < NMV_CONTEXTS; ++i)
+    cm->fc->nmvc[i] = default_nmv_context;
+#else
   cm->fc->nmvc = default_nmv_context;
+#endif
 }
diff --git a/vp10/common/entropymv.h b/vp10/common/entropymv.h
index d1eb95c..f8ade34 100644
--- a/vp10/common/entropymv.h
+++ b/vp10/common/entropymv.h
@@ -95,6 +95,9 @@
 
 typedef struct {
   vpx_prob joints[MV_JOINTS - 1];
+#if CONFIG_REF_MV
+  vpx_prob zero_rmv;
+#endif
   nmv_component comps[2];
 } nmv_context;
 
@@ -121,6 +124,9 @@
 
 typedef struct {
   unsigned int joints[MV_JOINTS];
+#if CONFIG_REF_MV
+  unsigned int zero_rmv[2];
+#endif
   nmv_component_counts comps[2];
 } nmv_context_counts;
 
diff --git a/vp10/common/enums.h b/vp10/common/enums.h
index 18c7d16..d1ce121 100644
--- a/vp10/common/enums.h
+++ b/vp10/common/enums.h
@@ -18,13 +18,43 @@
 extern "C" {
 #endif
 
-#define MI_SIZE_LOG2 3
-#define MI_BLOCK_SIZE_LOG2 (6 - MI_SIZE_LOG2)  // 64 = 2^6
+#undef MAX_SB_SIZE
 
-#define MI_SIZE (1 << MI_SIZE_LOG2)  // pixels per mi-unit
-#define MI_BLOCK_SIZE (1 << MI_BLOCK_SIZE_LOG2)  // mi-units per max block
+// Max superblock size
+#if CONFIG_EXT_PARTITION
+# define MAX_SB_SIZE_LOG2 7
+#else
+# define MAX_SB_SIZE_LOG2 6
+#endif  // CONFIG_EXT_PARTITION
+#define MAX_SB_SIZE   (1 << MAX_SB_SIZE_LOG2)
+#define MAX_SB_SQUARE (MAX_SB_SIZE * MAX_SB_SIZE)
 
-#define MI_MASK (MI_BLOCK_SIZE - 1)
+// Min superblock size
+#define MIN_SB_SIZE_LOG2 6
+
+// Pixels per Mode Info (MI) unit
+#define MI_SIZE_LOG2  3
+#define MI_SIZE       (1 << MI_SIZE_LOG2)
+
+// MI-units per max superblock (MI Block - MIB)
+#define MAX_MIB_SIZE_LOG2 (MAX_SB_SIZE_LOG2 - MI_SIZE_LOG2)
+#define MAX_MIB_SIZE      (1 << MAX_MIB_SIZE_LOG2)
+
+// MI-units per min superblock
+#define MIN_MIB_SIZE_LOG2 (MIN_SB_SIZE_LOG2 - MI_SIZE_LOG2)
+
+// Mask to extract MI offset within max MIB
+#define MAX_MIB_MASK    (MAX_MIB_SIZE - 1)
+#define MAX_MIB_MASK_2  (MAX_MIB_SIZE * 2 - 1)
+
+// Maximum number of tile rows and tile columns
+#if CONFIG_EXT_TILE
+# define  MAX_TILE_ROWS 1024
+# define  MAX_TILE_COLS 1024
+#else
+# define  MAX_TILE_ROWS 4
+# define  MAX_TILE_COLS 64
+#endif  // CONFIG_EXT_TILE
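[Aside: worked values. With CONFIG_EXT_PARTITION: MAX_SB_SIZE = 1 << 7 =
128 pixels and MAX_MIB_SIZE = 1 << (7 - 3) = 16 MI-units per superblock
side; without it, the values stay at 64 pixels / 8 MI-units, matching the
old MI_BLOCK_SIZE macros this hunk removes.]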
 
 // Bitstream profiles indicated by 2-3 bits in the uncompressed header.
 // 00: Profile 0.  8-bit 4:2:0 only.
@@ -41,23 +71,46 @@
   MAX_PROFILES
 } BITSTREAM_PROFILE;
 
-#define BLOCK_4X4     0
-#define BLOCK_4X8     1
-#define BLOCK_8X4     2
-#define BLOCK_8X8     3
-#define BLOCK_8X16    4
-#define BLOCK_16X8    5
-#define BLOCK_16X16   6
-#define BLOCK_16X32   7
-#define BLOCK_32X16   8
-#define BLOCK_32X32   9
-#define BLOCK_32X64  10
-#define BLOCK_64X32  11
-#define BLOCK_64X64  12
-#define BLOCK_SIZES  13
+#define BLOCK_4X4       0
+#define BLOCK_4X8       1
+#define BLOCK_8X4       2
+#define BLOCK_8X8       3
+#define BLOCK_8X16      4
+#define BLOCK_16X8      5
+#define BLOCK_16X16     6
+#define BLOCK_16X32     7
+#define BLOCK_32X16     8
+#define BLOCK_32X32     9
+#define BLOCK_32X64    10
+#define BLOCK_64X32    11
+#define BLOCK_64X64    12
+#if !CONFIG_EXT_PARTITION
+# define BLOCK_SIZES   13
+#else
+# define BLOCK_64X128  13
+# define BLOCK_128X64  14
+# define BLOCK_128X128 15
+# define BLOCK_SIZES   16
+#endif  // !CONFIG_EXT_PARTITION
 #define BLOCK_INVALID BLOCK_SIZES
+#define BLOCK_LARGEST (BLOCK_SIZES - 1)
 typedef uint8_t BLOCK_SIZE;
 
+#if CONFIG_EXT_PARTITION_TYPES
+typedef enum PARTITION_TYPE {
+  PARTITION_NONE,
+  PARTITION_HORZ,
+  PARTITION_VERT,
+  PARTITION_SPLIT,
+  PARTITION_HORZ_A,  // HORZ split and the left partition is split again
+  PARTITION_HORZ_B,  // HORZ split and the right partition is split again
+  PARTITION_VERT_A,  // VERT split and the top partition is split again
+  PARTITION_VERT_B,  // VERT split and the bottom partition is split again
+  EXT_PARTITION_TYPES,
+  PARTITION_TYPES = PARTITION_SPLIT + 1,
+  PARTITION_INVALID = EXT_PARTITION_TYPES
+} PARTITION_TYPE;
+#else
 typedef enum PARTITION_TYPE {
   PARTITION_NONE,
   PARTITION_HORZ,
@@ -66,10 +119,15 @@
   PARTITION_TYPES,
   PARTITION_INVALID = PARTITION_TYPES
 } PARTITION_TYPE;
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
 typedef char PARTITION_CONTEXT;
 #define PARTITION_PLOFFSET   4  // number of probability models per block size
-#define PARTITION_CONTEXTS (4 * PARTITION_PLOFFSET)
+#if CONFIG_EXT_PARTITION
+# define PARTITION_CONTEXTS  (5 * PARTITION_PLOFFSET)
+#else
+# define PARTITION_CONTEXTS  (4 * PARTITION_PLOFFSET)
+#endif  // CONFIG_EXT_PARTITION
 
 // block transform size
 typedef uint8_t TX_SIZE;
@@ -79,6 +137,19 @@
 #define TX_32X32 ((TX_SIZE)3)   // 32x32 transform
 #define TX_SIZES ((TX_SIZE)4)
 
+#define MAX_TX_SIZE_LOG2  5
+#define MAX_TX_SIZE       (1 << MAX_TX_SIZE_LOG2)
+#define MIN_TX_SIZE_LOG2  2
+#define MIN_TX_SIZE       (1 << MIN_TX_SIZE_LOG2)
+#define MAX_TX_SQUARE     (MAX_TX_SIZE * MAX_TX_SIZE)
+
+// Number of maximum size transform blocks in the maximum size superblock
+#define MAX_TX_BLOCKS_IN_MAX_SB_LOG2 \
+  ((MAX_SB_SIZE_LOG2 - MAX_TX_SIZE_LOG2) * 2)
+#define MAX_TX_BLOCKS_IN_MAX_SB (1 << MAX_TX_BLOCKS_IN_MAX_SB_LOG2)
+
+#define MAX_NUM_TXB  (1 << (MAX_SB_SIZE_LOG2 - MIN_TX_SIZE_LOG2))
+
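[Aside: worked values. MAX_TX_SIZE = 32, MIN_TX_SIZE = 4; with
CONFIG_EXT_PARTITION (MAX_SB_SIZE_LOG2 == 7), MAX_TX_BLOCKS_IN_MAX_SB_LOG2 =
(7 - 5) * 2 = 4, i.e. 16 max-size transform blocks tile a 128x128
superblock, and MAX_NUM_TXB = 1 << (7 - 2) = 32.]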
 // frame transform mode
 typedef enum {
   ONLY_4X4            = 0,        // only 4x4 transform used
@@ -89,20 +160,59 @@
   TX_MODES            = 5,
 } TX_MODE;
 
+// 1D tx types
+typedef enum {
+  DCT_1D = 0,
+  ADST_1D = 1,
+  FLIPADST_1D = 2,
+  IDTX_1D = 3,
+  TX_TYPES_1D = 4,
+} TX_TYPE_1D;
+
 typedef enum {
   DCT_DCT   = 0,                      // DCT  in both horizontal and vertical
   ADST_DCT  = 1,                      // ADST in vertical, DCT in horizontal
   DCT_ADST  = 2,                      // DCT  in vertical, ADST in horizontal
   ADST_ADST = 3,                      // ADST in both directions
-  TX_TYPES = 4
+#if CONFIG_EXT_TX
+  FLIPADST_DCT = 4,
+  DCT_FLIPADST = 5,
+  FLIPADST_FLIPADST = 6,
+  ADST_FLIPADST = 7,
+  FLIPADST_ADST = 8,
+  IDTX = 9,
+  V_DCT = 10,
+  H_DCT = 11,
+  V_ADST = 12,
+  H_ADST = 13,
+  V_FLIPADST = 14,
+  H_FLIPADST = 15,
+#endif  // CONFIG_EXT_TX
+  TX_TYPES,
 } TX_TYPE;
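[Aside: each 2D type pairs one 1D kernel vertically with one horizontally:
ADST_DCT is ADST_1D down the columns and DCT_1D across the rows, V_DCT is
DCT_1D vertically with the identity horizontally, and IDTX is identity in
both directions. A hedged sketch of the vertical-kernel mapping:]

static const TX_TYPE_1D vtx_tab_sketch[TX_TYPES] = {
  DCT_1D,      ADST_1D, DCT_1D,      ADST_1D,  // DCT_DCT .. ADST_ADST
  FLIPADST_1D, DCT_1D,  FLIPADST_1D, ADST_1D,  // FLIPADST_DCT .. ADST_FLIPADST
  FLIPADST_1D, IDTX_1D, DCT_1D,      IDTX_1D,  // FLIPADST_ADST .. H_DCT
  ADST_1D,     IDTX_1D, FLIPADST_1D, IDTX_1D,  // V_ADST .. H_FLIPADST
};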
 
+#if CONFIG_EXT_TX
+#define EXT_TX_SIZES       4  // number of sizes that use extended transforms
+#define EXT_TX_SETS_INTER  4  // Sets of transform selections for INTER
+#define EXT_TX_SETS_INTRA  3  // Sets of transform selections for INTRA
+#else
 #define EXT_TX_SIZES       3  // number of sizes that use extended transforms
+#endif  // CONFIG_EXT_TX
 
 typedef enum {
   VP9_LAST_FLAG = 1 << 0,
+#if CONFIG_EXT_REFS
+  VP9_LAST2_FLAG = 1 << 1,
+  VP9_LAST3_FLAG = 1 << 2,
+  VP9_GOLD_FLAG = 1 << 3,
+  VP9_BWD_FLAG = 1 << 4,
+  VP9_ALT_FLAG = 1 << 5,
+  VP9_REFFRAME_ALL = (1 << 6) - 1
+#else
   VP9_GOLD_FLAG = 1 << 1,
   VP9_ALT_FLAG = 1 << 2,
+  VP9_REFFRAME_ALL = (1 << 3) - 1
+#endif  // CONFIG_EXT_REFS
 } VP9_REFFRAME;
 
 typedef enum {
@@ -111,6 +221,29 @@
   PLANE_TYPES
 } PLANE_TYPE;
 
+typedef enum {
+  TWO_COLORS,
+  THREE_COLORS,
+  FOUR_COLORS,
+  FIVE_COLORS,
+  SIX_COLORS,
+  SEVEN_COLORS,
+  EIGHT_COLORS,
+  PALETTE_SIZES
+} PALETTE_SIZE;
+
+typedef enum {
+  PALETTE_COLOR_ONE,
+  PALETTE_COLOR_TWO,
+  PALETTE_COLOR_THREE,
+  PALETTE_COLOR_FOUR,
+  PALETTE_COLOR_FIVE,
+  PALETTE_COLOR_SIX,
+  PALETTE_COLOR_SEVEN,
+  PALETTE_COLOR_EIGHT,
+  PALETTE_COLORS
+} PALETTE_COLOR;
+
 #define DC_PRED    0       // Average of above and left pixels
 #define V_PRED     1       // Vertical
 #define H_PRED     2       // Horizontal
@@ -125,23 +258,174 @@
 #define NEARMV    11
 #define ZEROMV    12
 #define NEWMV     13
-#define MB_MODE_COUNT 14
+#if CONFIG_EXT_INTER
+#define NEWFROMNEARMV     14
+#define NEAREST_NEARESTMV 15
+#define NEAREST_NEARMV    16
+#define NEAR_NEARESTMV    17
+#define NEAR_NEARMV       18
+#define NEAREST_NEWMV     19
+#define NEW_NEARESTMV     20
+#define NEAR_NEWMV        21
+#define NEW_NEARMV        22
+#define ZERO_ZEROMV       23
+#define NEW_NEWMV         24
+#define MB_MODE_COUNT     25
+#else
+#define MB_MODE_COUNT     14
+#endif  // CONFIG_EXT_INTER
 typedef uint8_t PREDICTION_MODE;
 
 #define INTRA_MODES (TM_PRED + 1)
 
+typedef enum {
+  SIMPLE_TRANSLATION = 0,
+#if CONFIG_OBMC
+  OBMC_CAUSAL,    // 2-sided OBMC
+#endif  // CONFIG_OBMC
+#if CONFIG_WARPED_MOTION
+  WARPED_CAUSAL,  // 2-sided WARPED
+#endif  // CONFIG_WARPED_MOTION
+  MOTION_VARIATIONS
+} MOTION_VARIATION;
+
+#if CONFIG_EXT_INTER
+typedef enum {
+  II_DC_PRED = 0,
+  II_V_PRED,
+  II_H_PRED,
+  II_D45_PRED,
+  II_D135_PRED,
+  II_D117_PRED,
+  II_D153_PRED,
+  II_D207_PRED,
+  II_D63_PRED,
+  II_TM_PRED,
+  INTERINTRA_MODES
+} INTERINTRA_MODE;
+
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_EXT_INTRA
+typedef enum {
+  FILTER_DC_PRED,
+  FILTER_V_PRED,
+  FILTER_H_PRED,
+  FILTER_D45_PRED,
+  FILTER_D135_PRED,
+  FILTER_D117_PRED,
+  FILTER_D153_PRED,
+  FILTER_D207_PRED,
+  FILTER_D63_PRED,
+  FILTER_TM_PRED,
+  EXT_INTRA_MODES,
+} EXT_INTRA_MODE;
+
+#define FILTER_INTRA_MODES (FILTER_TM_PRED + 1)
+#define DIRECTIONAL_MODES (INTRA_MODES - 2)
+#endif  // CONFIG_EXT_INTRA
+
+#if CONFIG_EXT_INTER
+#define INTER_MODES (1 + NEWFROMNEARMV - NEARESTMV)
+#else
 #define INTER_MODES (1 + NEWMV - NEARESTMV)
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_EXT_INTER
+#define INTER_COMPOUND_MODES (1 + NEW_NEWMV - NEAREST_NEARESTMV)
+#endif  // CONFIG_EXT_INTER
 
 #define SKIP_CONTEXTS 3
+
+#if CONFIG_REF_MV
+#define NMV_CONTEXTS 3
+
+#define NEWMV_MODE_CONTEXTS  7
+#define ZEROMV_MODE_CONTEXTS 2
+#define REFMV_MODE_CONTEXTS  9
+#define DRL_MODE_CONTEXTS    5
+
+#define ZEROMV_OFFSET 3
+#define REFMV_OFFSET  4
+
+#define NEWMV_CTX_MASK ((1 << ZEROMV_OFFSET) - 1)
+#define ZEROMV_CTX_MASK ((1 << (REFMV_OFFSET - ZEROMV_OFFSET)) - 1)
+#define REFMV_CTX_MASK ((1 << (8 - REFMV_OFFSET)) - 1)
+
+#define ALL_ZERO_FLAG_OFFSET   8
+#define SKIP_NEARESTMV_OFFSET  9
+#define SKIP_NEARMV_OFFSET    10
+#define SKIP_NEARESTMV_SUB8X8_OFFSET 11
+#endif
+
 #define INTER_MODE_CONTEXTS 7
 
 /* Segment Feature Masks */
 #define MAX_MV_REF_CANDIDATES 2
 
+#if CONFIG_REF_MV
+#define MAX_REF_MV_STACK_SIZE 16
+#if CONFIG_EXT_PARTITION
+#define REF_CAT_LEVEL 640
+#else
+#define REF_CAT_LEVEL 160
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_REF_MV
+
 #define INTRA_INTER_CONTEXTS 4
 #define COMP_INTER_CONTEXTS 5
 #define REF_CONTEXTS 5
 
+#if CONFIG_VAR_TX
+#define TXFM_PARTITION_CONTEXTS 9
+typedef TX_SIZE TXFM_CONTEXT;
+#endif
+
+#define NONE           -1
+#define INTRA_FRAME     0
+#define LAST_FRAME      1
+
+#if CONFIG_EXT_REFS
+
+#define LAST2_FRAME     2
+#define LAST3_FRAME     3
+#define GOLDEN_FRAME    4
+#define BWDREF_FRAME    5
+#define ALTREF_FRAME    6
+#define MAX_REF_FRAMES  7
+#define LAST_REF_FRAMES (LAST3_FRAME - LAST_FRAME + 1)
+
+#else
+
+#define GOLDEN_FRAME    2
+#define ALTREF_FRAME    3
+#define MAX_REF_FRAMES  4
+#endif  // CONFIG_EXT_REFS
+
+#define FWD_REFS (GOLDEN_FRAME - LAST_FRAME + 1)
+#define FWD_RF_OFFSET(ref) (ref - LAST_FRAME)
+#if CONFIG_EXT_REFS
+#define BWD_REFS (ALTREF_FRAME - BWDREF_FRAME + 1)
+#define BWD_RF_OFFSET(ref) (ref - BWDREF_FRAME)
+#else
+#define BWD_REFS 1
+#define BWD_RF_OFFSET(ref) (ref - ALTREF_FRAME)
+#endif
+
+#define SINGLE_REFS (FWD_REFS + BWD_REFS)
+#define COMP_REFS   (FWD_REFS * BWD_REFS)
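[Aside: worked values. With CONFIG_EXT_REFS: FWD_REFS = GOLDEN_FRAME -
LAST_FRAME + 1 = 4 (LAST, LAST2, LAST3, GOLDEN) and BWD_REFS = 2 (BWDREF,
ALTREF), giving SINGLE_REFS = 6 and COMP_REFS = 8; without it FWD_REFS = 2,
BWD_REFS = 1, SINGLE_REFS = 3 and COMP_REFS = 2, matching VP9.]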
+
+#if CONFIG_REF_MV
+#define MODE_CTX_REF_FRAMES (MAX_REF_FRAMES + COMP_REFS)
+#else
+#define MODE_CTX_REF_FRAMES MAX_REF_FRAMES
+#endif
+
+#if CONFIG_SUPERTX
+#define PARTITION_SUPERTX_CONTEXTS 2
+#define MAX_SUPERTX_BLOCK_SIZE BLOCK_32X32
+#endif  // CONFIG_SUPERTX
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/common/filter.c b/vp10/common/filter.c
index dda279f..8427237 100644
--- a/vp10/common/filter.c
+++ b/vp10/common/filter.c
@@ -32,9 +32,162 @@
   { 0, 0, 0,   8, 120, 0, 0, 0 }
 };
 
-// Lagrangian interpolation filter
+#if USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(16, static const int16_t,
+                sub_pel_filters_temporalfilter_12[SUBPEL_SHIFTS][12]) = {
+  // intfilt 0.8
+  {0,   0,   0,   0,   0, 128,   0,   0,   0,   0,   0, 0},
+  {0,   1,  -1,   3,  -7, 127,   8,  -4,   2,  -1,   0, 0},
+  {0,   1,  -3,   5, -12, 124,  18,  -8,   4,  -2,   1, 0},
+  {-1,   2,  -4,   8, -17, 120,  28, -11,   6,  -3,   1, -1},
+  {-1,   2,  -4,  10, -21, 114,  38, -15,   8,  -4,   2, -1},
+  {-1,   3,  -5,  11, -23, 107,  49, -18,   9,  -5,   2, -1},
+  {-1,   3,  -6,  12, -25,  99,  60, -21,  11,  -6,   3, -1},
+  {-1,   3,  -6,  12, -25,  90,  70, -23,  12,  -6,   3, -1},
+  {-1,   3,  -6,  12, -24,  80,  80, -24,  12,  -6,   3, -1},
+  {-1,   3,  -6,  12, -23,  70,  90, -25,  12,  -6,   3, -1},
+  {-1,   3,  -6,  11, -21,  60,  99, -25,  12,  -6,   3, -1},
+  {-1,   2,  -5,   9, -18,  49, 107, -23,  11,  -5,   3, -1},
+  {-1,   2,  -4,   8, -15,  38, 114, -21,  10,  -4,   2, -1},
+  {-1,   1,  -3,   6, -11,  28, 120, -17,   8,  -4,   2, -1},
+  {0,   1,  -2,   4,  -8,  18, 124, -12,   5,  -3,   1, 0},
+  {0,   0,  -1,   2,  -4,   8, 127,  -7,   3,  -1,   1, 0},
+};
+#endif  // USE_TEMPORALFILTER_12TAP
+
+#if CONFIG_EXT_INTERP
 DECLARE_ALIGNED(256, static const InterpKernel,
                 sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+  // intfilt 0.575
+  {0,   0,   0, 128,   0,   0,   0, 0},
+  {0,   1,  -5, 126,   8,  -3,   1, 0},
+  {-1,   3, -10, 123,  18,  -6,   2, -1},
+  {-1,   4, -14, 118,  27,  -9,   3, 0},
+  {-1,   5, -16, 112,  37, -12,   4, -1},
+  {-1,   5, -18, 105,  48, -14,   4, -1},
+  {-1,   6, -19,  97,  58, -17,   5, -1},
+  {-1,   6, -20,  88,  68, -18,   6, -1},
+  {-1,   6, -19,  78,  78, -19,   6, -1},
+  {-1,   6, -18,  68,  88, -20,   6, -1},
+  {-1,   5, -17,  58,  97, -19,   6, -1},
+  {-1,   4, -14,  48, 105, -18,   5, -1},
+  {-1,   4, -12,  37, 112, -16,   5, -1},
+  {0,   3,  -9,  27, 118, -14,   4, -1},
+  {-1,   2,  -6,  18, 123, -10,   3, -1},
+  {0,   1,  -3,   8, 126,  -5,   1, 0},
+};
+
+#if CONFIG_EXT_INTRA
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
+  // intfilt 0.8
+  {0,   0,   0, 128,   0,   0,   0, 0},
+  {-1,   2,  -6, 127,   9,  -4,   2, -1},
+  {-2,   5, -12, 124,  18,  -7,   4, -2},
+  {-2,   7, -16, 119,  28, -11,   5, -2},
+  {-3,   8, -19, 114,  38, -14,   7, -3},
+  {-3,   9, -22, 107,  49, -17,   8, -3},
+  {-4,  10, -23,  99,  60, -20,  10, -4},
+  {-4,  11, -23,  90,  70, -22,  10, -4},
+  {-4,  11, -23,  80,  80, -23,  11, -4},
+  {-4,  10, -22,  70,  90, -23,  11, -4},
+  {-4,  10, -20,  60,  99, -23,  10, -4},
+  {-3,   8, -17,  49, 107, -22,   9, -3},
+  {-3,   7, -14,  38, 114, -19,   8, -3},
+  {-2,   5, -11,  28, 119, -16,   7, -2},
+  {-2,   4,  -7,  18, 124, -12,   5, -2},
+  {-1,   2,  -4,   9, 127,  -6,   2, -1},
+};
+#endif  // CONFIG_EXT_INTRA
+
+DECLARE_ALIGNED(256, static const int16_t,
+                sub_pel_filters_10sharp[SUBPEL_SHIFTS][10]) = {
+  // intfilt 0.77
+  {0,   0,   0,   0, 128,   0,   0,   0,   0, 0},
+  {0,  -1,   3,  -6, 127,   8,  -4,   2,  -1, 0},
+  {1,  -2,   5, -12, 124,  18,  -7,   3,  -2, 0},
+  {1,  -3,   7, -17, 119,  28, -11,   5,  -2, 1},
+  {1,  -4,   8, -20, 114,  38, -14,   7,  -3, 1},
+  {1,  -4,   9, -22, 107,  49, -17,   8,  -4, 1},
+  {2,  -5,  10, -24,  99,  59, -20,   9,  -4, 2},
+  {2,  -5,  10, -24,  90,  70, -22,  10,  -5, 2},
+  {2,  -5,  10, -23,  80,  80, -23,  10,  -5, 2},
+  {2,  -5,  10, -22,  70,  90, -24,  10,  -5, 2},
+  {2,  -4,   9, -20,  59,  99, -24,  10,  -5, 2},
+  {1,  -4,   8, -17,  49, 107, -22,   9,  -4, 1},
+  {1,  -3,   7, -14,  38, 114, -20,   8,  -4, 1},
+  {1,  -2,   5, -11,  28, 119, -17,   7,  -3, 1},
+  {0,  -2,   3,  -7,  18, 124, -12,   5,  -2, 1},
+  {0,  -1,   2,  -4,   8, 127,  -6,   3,  -1, 0},
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8smooth2[SUBPEL_SHIFTS]) = {
+// freqmultiplier = 0.35
+  {0,  0,  0, 128,  0,  0,  0,  0},
+  {-1,  8, 31, 47, 34, 10,  0, -1},
+  {-1,  7, 29, 46, 36, 12,  0, -1},
+  {-1,  6, 28, 46, 37, 13,  0, -1},
+  {-1,  5, 26, 46, 38, 14,  1, -1},
+  {-1,  4, 25, 45, 39, 16,  1, -1},
+  {-1,  4, 23, 44, 41, 17,  1, -1},
+  {-1,  3, 21, 44, 42, 18,  2, -1},
+  {-1,  2, 20, 43, 43, 20,  2, -1},
+  {-1,  2, 18, 42, 44, 21,  3, -1},
+  {-1,  1, 17, 41, 44, 23,  4, -1},
+  {-1,  1, 16, 39, 45, 25,  4, -1},
+  {-1,  1, 14, 38, 46, 26,  5, -1},
+  {-1,  0, 13, 37, 46, 28,  6, -1},
+  {-1,  0, 12, 36, 46, 29,  7, -1},
+  {-1,  0, 10, 34, 47, 31,  8, -1},
+};
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
+// freqmultiplier = 0.75
+  {0,  0,  0, 128,  0,  0,  0,  0},
+  {2, -10,  19,  95,  31, -11,   2, 0},
+  {2,  -9,  14,  94,  37, -12,   2, 0},
+  {2,  -8,   9,  92,  43, -12,   1, 1},
+  {2,  -7,   5,  90,  49, -12,   1, 0},
+  {2,  -5,   1,  86,  55, -12,   0, 1},
+  {1,  -4,  -2,  82,  61, -11,   0, 1},
+  {1,  -3,  -5,  77,  67,  -9,  -1, 1},
+  {1,  -2,  -7,  72,  72,  -7,  -2, 1},
+  {1,  -1,  -9,  67,  77,  -5,  -3, 1},
+  {1,   0, -11,  61,  82,  -2,  -4, 1},
+  {1,   0, -12,  55,  86,   1,  -5, 2},
+  {0,   1, -12,  49,  90,   5,  -7, 2},
+  {1,   1, -12,  43,  92,   9,  -8, 2},
+  {0,   2, -12,  37,  94,  14,  -9, 2},
+  {0,   2, -11,  31,  95,  19, -10, 2},
+};
+
+DECLARE_ALIGNED(16, static const int16_t,
+                sub_pel_filters_12sharp[SUBPEL_SHIFTS][12]) = {
+  // intfilt 0.85
+  {0,   0,   0,   0,   0, 128,   0,   0,   0,   0,   0, 0},
+  {0,   1,  -2,   3,  -7, 127,   8,  -4,   2,  -1,   1, 0},
+  {-1,   2,  -3,   6, -13, 124,  18,  -8,   4,  -2,   2, -1},
+  {-1,   3,  -4,   8, -18, 120,  28, -12,   7,  -4,   2, -1},
+  {-1,   3,  -6,  10, -21, 115,  38, -15,   8,  -5,   3, -1},
+  {-2,   4,  -6,  12, -24, 108,  49, -18,  10,  -6,   3, -2},
+  {-2,   4,  -7,  13, -25, 100,  60, -21,  11,  -7,   4, -2},
+  {-2,   4,  -7,  13, -26,  91,  71, -24,  13,  -7,   4, -2},
+  {-2,   4,  -7,  13, -25,  81,  81, -25,  13,  -7,   4, -2},
+  {-2,   4,  -7,  13, -24,  71,  91, -26,  13,  -7,   4, -2},
+  {-2,   4,  -7,  11, -21,  60, 100, -25,  13,  -7,   4, -2},
+  {-2,   3,  -6,  10, -18,  49, 108, -24,  12,  -6,   4, -2},
+  {-1,   3,  -5,   8, -15,  38, 115, -21,  10,  -6,   3, -1},
+  {-1,   2,  -4,   7, -12,  28, 120, -18,   8,  -4,   3, -1},
+  {-1,   2,  -2,   4,  -8,  18, 124, -13,   6,  -3,   2, -1},
+  {0,   1,  -1,   2,  -4,   8, 127,  -7,   3,  -2,   1, 0},
+};
+#else  // CONFIG_EXT_INTERP
+
+DECLARE_ALIGNED(256, static const InterpKernel,
+                sub_pel_filters_8[SUBPEL_SHIFTS]) = {
+  // Lagrangian interpolation filter
   { 0,   0,   0, 128,   0,   0,   0,  0},
   { 0,   1,  -5, 126,   8,  -3,   1,  0},
   { -1,   3, -10, 122,  18,  -6,   2,  0},
@@ -53,9 +206,9 @@
   { 0,   1,  -3,   8, 126,  -5,   1,  0}
 };
 
-// DCT based filter
 DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_8s[SUBPEL_SHIFTS]) = {
+                sub_pel_filters_8sharp[SUBPEL_SHIFTS]) = {
+  // DCT based filter
   {0,   0,   0, 128,   0,   0,   0, 0},
   {-1,   3,  -7, 127,   8,  -3,   1, 0},
   {-2,   5, -13, 125,  17,  -6,   3, -1},
@@ -74,9 +227,9 @@
   {0,   1,  -3,   8, 127,  -7,   3, -1}
 };
 
-// freqmultiplier = 0.5
 DECLARE_ALIGNED(256, static const InterpKernel,
-                sub_pel_filters_8lp[SUBPEL_SHIFTS]) = {
+                sub_pel_filters_8smooth[SUBPEL_SHIFTS]) = {
+// freqmultiplier = 0.5
   { 0,  0,  0, 128,  0,  0,  0,  0},
   {-3, -1, 32,  64, 38,  1, -3,  0},
   {-2, -2, 29,  63, 41,  2, -3,  0},
@@ -94,11 +247,120 @@
   { 0, -3,  2,  41, 63, 29, -2, -2},
   { 0, -3,  1,  38, 64, 32, -1, -3}
 };
+#endif  // CONFIG_EXT_INTERP
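[Aside: every kernel row above is normalized to sum to 128 (unity DC gain
in Q7 fixed point). A minimal self-check sketch, assuming <assert.h>:]

static void check_kernel_gain_sketch(const int16_t *filter, int taps) {
  int phase, k;
  for (phase = 0; phase < SUBPEL_SHIFTS; ++phase) {
    int sum = 0;
    for (k = 0; k < taps; ++k)
      sum += filter[phase * taps + k];
    assert(sum == 128);  /* 1.0 in the codec's 7-bit filter precision */
  }
}

[e.g. check_kernel_gain_sketch((const int16_t *)sub_pel_filters_8, 8);]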
 
-
-const InterpKernel *vp10_filter_kernels[4] = {
-  sub_pel_filters_8,
-  sub_pel_filters_8lp,
-  sub_pel_filters_8s,
-  bilinear_filters
+#if CONFIG_EXT_INTRA
+const InterpKernel *vp10_intra_filter_kernels[INTRA_FILTERS] = {
+  bilinear_filters,         // INTRA_FILTER_LINEAR
+  sub_pel_filters_8,        // INTRA_FILTER_8TAP
+  sub_pel_filters_8sharp,   // INTRA_FILTER_8TAP_SHARP
+  sub_pel_filters_8smooth,  // INTRA_FILTER_8TAP_SMOOTH
 };
+#endif  // CONFIG_EXT_INTRA
+
+#if CONFIG_EXT_INTERP
+static const InterpFilterParams
+vp10_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
+  {(const int16_t*)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS},
+  {(const int16_t*)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS},
+  {(const int16_t*)sub_pel_filters_10sharp, 10, SUBPEL_SHIFTS},
+  {(const int16_t*)sub_pel_filters_8smooth2, SUBPEL_TAPS, SUBPEL_SHIFTS},
+  {(const int16_t*)sub_pel_filters_12sharp, 12, SUBPEL_SHIFTS},
+  {(const int16_t*)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS}
+};
+#else
+static const InterpFilterParams
+vp10_interp_filter_params_list[SWITCHABLE_FILTERS + 1] = {
+  {(const int16_t*)sub_pel_filters_8, SUBPEL_TAPS, SUBPEL_SHIFTS},
+  {(const int16_t*)sub_pel_filters_8smooth, SUBPEL_TAPS, SUBPEL_SHIFTS},
+  {(const int16_t*)sub_pel_filters_8sharp, SUBPEL_TAPS, SUBPEL_SHIFTS},
+  {(const int16_t*)bilinear_filters, SUBPEL_TAPS, SUBPEL_SHIFTS}
+};
+#endif  // CONFIG_EXT_INTERP
+
+#if USE_TEMPORALFILTER_12TAP
+static const InterpFilterParams vp10_interp_temporalfilter_12tap = {
+  (const int16_t*)sub_pel_filters_temporalfilter_12, 12, SUBPEL_SHIFTS
+};
+#endif  // USE_TEMPORALFILTER_12TAP
+
+InterpFilterParams vp10_get_interp_filter_params(
+    const INTERP_FILTER interp_filter) {
+#if USE_TEMPORALFILTER_12TAP
+  if (interp_filter == TEMPORALFILTER_12TAP)
+    return vp10_interp_temporalfilter_12tap;
+#endif  // USE_TEMPORALFILTER_12TAP
+  return vp10_interp_filter_params_list[interp_filter];
+}
+
+const int16_t *vp10_get_interp_filter_kernel(
+    const INTERP_FILTER interp_filter) {
+#if USE_TEMPORALFILTER_12TAP
+  if (interp_filter == TEMPORALFILTER_12TAP)
+    return vp10_interp_temporalfilter_12tap.filter_ptr;
+#endif  // USE_TEMPORALFILTER_12TAP
+  return (const int16_t*)
+      vp10_interp_filter_params_list[interp_filter].filter_ptr;
+}
+
+SubpelFilterCoeffs vp10_get_subpel_filter_signal_dir(
+    const InterpFilterParams p, int index) {
+#if CONFIG_EXT_INTERP && HAVE_SSSE3
+  if (p.filter_ptr == (const int16_t *)sub_pel_filters_12sharp) {
+    return &sub_pel_filters_12sharp_signal_dir[index][0];
+  }
+  if (p.filter_ptr == (const int16_t *)sub_pel_filters_10sharp) {
+    return &sub_pel_filters_10sharp_signal_dir[index][0];
+  }
+#endif
+#if USE_TEMPORALFILTER_12TAP && HAVE_SSSE3
+  if (p.filter_ptr == (const int16_t *)sub_pel_filters_temporalfilter_12) {
+    return &sub_pel_filters_temporalfilter_12_signal_dir[index][0];
+  }
+#endif
+  (void)p;
+  (void)index;
+  return NULL;
+}
+
+SubpelFilterCoeffs vp10_get_subpel_filter_ver_signal_dir(
+    const InterpFilterParams p, int index) {
+#if CONFIG_EXT_INTERP && HAVE_SSSE3
+  if (p.filter_ptr == (const int16_t *)sub_pel_filters_12sharp) {
+    return &sub_pel_filters_12sharp_ver_signal_dir[index][0];
+  }
+  if (p.filter_ptr == (const int16_t *)sub_pel_filters_10sharp) {
+    return &sub_pel_filters_10sharp_ver_signal_dir[index][0];
+  }
+#endif
+#if USE_TEMPORALFILTER_12TAP && HAVE_SSSE3
+  if (p.filter_ptr == (const int16_t *)sub_pel_filters_temporalfilter_12) {
+    return &sub_pel_filters_temporalfilter_12_ver_signal_dir[index][0];
+  }
+#endif
+  (void)p;
+  (void)index;
+  return NULL;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+HbdSubpelFilterCoeffs vp10_hbd_get_subpel_filter_ver_signal_dir(
+    const InterpFilterParams p, int index) {
+#if CONFIG_EXT_INTERP && HAVE_SSE4_1
+  if (p.filter_ptr == (const int16_t *)sub_pel_filters_12sharp) {
+    return &sub_pel_filters_12sharp_highbd_ver_signal_dir[index][0];
+  }
+  if (p.filter_ptr == (const int16_t *)sub_pel_filters_10sharp) {
+    return &sub_pel_filters_10sharp_highbd_ver_signal_dir[index][0];
+  }
+#endif
+#if USE_TEMPORALFILTER_12TAP && HAVE_SSE4_1
+  if (p.filter_ptr == (const int16_t *)sub_pel_filters_temporalfilter_12) {
+    return &sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[index][0];
+  }
+#endif
+  (void)p;
+  (void)index;
+  return NULL;
+}
+#endif
diff --git a/vp10/common/filter.h b/vp10/common/filter.h
index 826cd03..5ebf2a5 100644
--- a/vp10/common/filter.h
+++ b/vp10/common/filter.h
@@ -21,19 +21,111 @@
 extern "C" {
 #endif
 
-#define EIGHTTAP            0
+#define EIGHTTAP_REGULAR    0
 #define EIGHTTAP_SMOOTH     1
-#define EIGHTTAP_SHARP      2
+#define MULTITAP_SHARP      2
+
+#if CONFIG_EXT_INTERP
+#define EIGHTTAP_SMOOTH2    3
+#define MULTITAP_SHARP2     4
+
+#define MAX_SUBPEL_TAPS    12
+
+#define SUPPORT_NONINTERPOLATING_FILTERS 0  /* turn it on for experimentation */
+#define SWITCHABLE_FILTERS  5 /* Number of switchable filters */
+#else
 #define SWITCHABLE_FILTERS  3 /* Number of switchable filters */
-#define BILINEAR            3
+#endif  // CONFIG_EXT_INTERP
+
+#define USE_TEMPORALFILTER_12TAP 1
+#if USE_TEMPORALFILTER_12TAP
+#define TEMPORALFILTER_12TAP (SWITCHABLE_FILTERS + 1)
+#endif
+
 // The codec can operate in four possible inter prediction filter modes:
 // 8-tap, 8-tap-smooth, 8-tap-sharp, and switching between the three.
+
+#define BILINEAR            (SWITCHABLE_FILTERS)
+#define SWITCHABLE          (SWITCHABLE_FILTERS + 1)  /* the last one */
+#if CONFIG_DUAL_FILTER
+#define SWITCHABLE_FILTER_CONTEXTS ((SWITCHABLE_FILTERS + 1) * 4)
+#define INTER_FILTER_COMP_OFFSET   (SWITCHABLE_FILTERS + 1)
+#define INTER_FILTER_DIR_OFFSET    ((SWITCHABLE_FILTERS + 1) * 2)
+#else
 #define SWITCHABLE_FILTER_CONTEXTS (SWITCHABLE_FILTERS + 1)
-#define SWITCHABLE 4 /* should be the last one */
+#endif
 
 typedef uint8_t INTERP_FILTER;
 
-extern const InterpKernel *vp10_filter_kernels[4];
+#if CONFIG_EXT_INTRA
+typedef enum {
+  INTRA_FILTER_LINEAR,
+  INTRA_FILTER_8TAP,
+  INTRA_FILTER_8TAP_SHARP,
+  INTRA_FILTER_8TAP_SMOOTH,
+  INTRA_FILTERS,
+} INTRA_FILTER;
+
+extern const InterpKernel *vp10_intra_filter_kernels[INTRA_FILTERS];
+#endif  // CONFIG_EXT_INTRA
+
+typedef struct InterpFilterParams {
+  const int16_t* filter_ptr;
+  uint16_t taps;
+  uint16_t subpel_shifts;
+} InterpFilterParams;
+
+InterpFilterParams vp10_get_interp_filter_params(
+    const INTERP_FILTER interp_filter);
+
+const int16_t *vp10_get_interp_filter_kernel(
+    const INTERP_FILTER interp_filter);
+
+static INLINE const int16_t* vp10_get_interp_filter_subpel_kernel(
+    const InterpFilterParams filter_params, const int subpel) {
+  return filter_params.filter_ptr + filter_params.taps * subpel;
+}
+
+static INLINE int vp10_is_interpolating_filter(
+    const INTERP_FILTER interp_filter) {
+  const InterpFilterParams ip = vp10_get_interp_filter_params(interp_filter);
+  return (ip.filter_ptr[ip.taps / 2 - 1] == 128);
+}
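[Aside: a hedged usage sketch of this small query API; the filter choice is
illustrative:]

const InterpFilterParams p = vp10_get_interp_filter_params(EIGHTTAP_REGULAR);
/* Kernel row for subpel phase 3 (a 3/16-pel offset): p.taps coefficients. */
const int16_t *kernel = vp10_get_interp_filter_subpel_kernel(p, 3);

[vp10_is_interpolating_filter() inspects the phase-0 row: an interpolating
filter has a lone center tap of 128 there, so integer-pel positions pass
through unchanged. All kernels in this patch satisfy that; the
SUPPORT_NONINTERPOLATING_FILTERS flag exists for experimenting with ones
that do not.]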
+
+#if USE_TEMPORALFILTER_12TAP
+extern const int8_t sub_pel_filters_temporalfilter_12_signal_dir[15][2][16];
+extern const int8_t sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16];
+#if CONFIG_VP9_HIGHBITDEPTH
+extern const
+int16_t sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[15][6][8];
+#endif
+#endif
+
+#if CONFIG_EXT_INTERP
+extern const int8_t sub_pel_filters_12sharp_signal_dir[15][2][16];
+extern const int8_t sub_pel_filters_10sharp_signal_dir[15][2][16];
+extern const int8_t sub_pel_filters_12sharp_ver_signal_dir[15][6][16];
+extern const int8_t sub_pel_filters_10sharp_ver_signal_dir[15][6][16];
+#if CONFIG_VP9_HIGHBITDEPTH
+extern const int16_t sub_pel_filters_12sharp_highbd_ver_signal_dir[15][6][8];
+extern const int16_t sub_pel_filters_10sharp_highbd_ver_signal_dir[15][6][8];
+#endif
+#endif
+
+typedef const int8_t (*SubpelFilterCoeffs)[16];
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef const int16_t (*HbdSubpelFilterCoeffs)[8];
+#endif
+
+SubpelFilterCoeffs vp10_get_subpel_filter_signal_dir(
+    const InterpFilterParams p, int index);
+
+SubpelFilterCoeffs vp10_get_subpel_filter_ver_signal_dir(
+    const InterpFilterParams p, int index);
+#if CONFIG_VP9_HIGHBITDEPTH
+HbdSubpelFilterCoeffs vp10_hbd_get_subpel_filter_ver_signal_dir(
+    const InterpFilterParams p, int index);
+#endif
 
 #ifdef __cplusplus
 }  // extern "C"
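
The InterpFilterParams accessor above relies on the kernels being stored as a flat [subpel_shifts][taps] table, so the kernel for a given phase starts at filter_ptr + taps * subpel; the interpolating-filter test likewise assumes phase 0 of an interpolating filter is a unit impulse whose Q7 centre tap is 128. A standalone toy table demonstrating both, with made-up values:

    #include <stdint.h>
    #include <stdio.h>

    /* Toy 2-tap, 4-phase filter; real tables live in
     * vp10/common/filter.c. Phase 0 is the identity kernel {128, 0},
     * which is exactly what vp10_is_interpolating_filter() checks via
     * the centre tap. */
    #define TAPS 2
    #define PHASES 4
    static const int16_t toy_filter[PHASES][TAPS] = {
      { 128, 0 }, { 96, 32 }, { 64, 64 }, { 32, 96 },
    };

    int main(void) {
      const int16_t *base = &toy_filter[0][0];
      int subpel;
      for (subpel = 0; subpel < PHASES; ++subpel) {
        /* Same arithmetic as vp10_get_interp_filter_subpel_kernel(). */
        const int16_t *kernel = base + TAPS * subpel;
        printf("phase %d: %3d %3d\n", subpel, kernel[0], kernel[1]);
      }
      return 0;
    }
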
diff --git a/vp10/common/idct.c b/vp10/common/idct.c
index 5ee15c8..179b903 100644
--- a/vp10/common/idct.c
+++ b/vp10/common/idct.c
@@ -13,111 +13,728 @@
 #include "./vp10_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vp10/common/blockd.h"
+#include "vp10/common/enums.h"
 #include "vp10/common/idct.h"
+#include "vp10/common/vp10_inv_txfm2d_cfg.h"
 #include "vpx_dsp/inv_txfm.h"
 #include "vpx_ports/mem.h"
 
+int get_tx_scale(const MACROBLOCKD *const xd, const TX_TYPE tx_type,
+                 const TX_SIZE tx_size) {
+  (void) tx_type;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    return tx_size == TX_32X32;
+  }
+#else
+  (void)xd;
+#endif
+  return tx_size == TX_32X32;
+}
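
get_tx_scale() reports whether the coefficients carry one extra bit of scaling, which in this tree happens only for TX_32X32; MAX_TX_SCALE in idct.h bounds it at 1. A hedged sketch of a consumer that strips that bit before the inverse transform — illustrative only, not the codec's actual dequant path:

    #include <stdint.h>

    typedef int32_t coeff_t;  /* stand-in for tran_low_t */

    /* Illustrative: remove the extra scaling bit reported by
     * get_tx_scale() from each dequantized coefficient. */
    static void descale_coeffs(coeff_t *dqcoeff, int n, int tx_scale) {
      int i;
      for (i = 0; i < n; ++i)
        dqcoeff[i] >>= tx_scale;  /* tx_scale is 0 or 1 (<= MAX_TX_SCALE) */
    }
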
+
+#if CONFIG_EXT_TX
+static void iidtx4_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  for (i = 0; i < 4; ++i)
+    output[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+}
+
+static void iidtx8_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  for (i = 0; i < 8; ++i)
+    output[i] = input[i] * 2;
+}
+
+static void iidtx16_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  for (i = 0; i < 16; ++i)
+    output[i] = (tran_low_t)dct_const_round_shift(input[i] * 2 * Sqrt2);
+}
+
+static void iidtx32_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  for (i = 0; i < 32; ++i)
+    output[i] = input[i] * 4;
+}
+
+// For use in lieu of DST
+static void ihalfright32_c(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 16; ++i) {
+    output[i] = input[16 + i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)dct_const_round_shift(input[i] * Sqrt2);
+  }
+  idct16_c(inputhalf, output + 16);
+  // Note overall scaling factor is 4 times orthogonal
+}
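
The identity transforms above keep the same overall gain as their DCT/ADST counterparts by scaling with sqrt(2) per dimension at sizes 4 and 16. The sketch below checks that fixed-point arithmetic, assuming Sqrt2 is round(sqrt(2) * 2^14) = 23170 and dct_const_round_shift() rounds back down by DCT_CONST_BITS = 14, per vpx_dsp conventions:

    #include <stdint.h>
    #include <stdio.h>

    #define SQRT2_Q14 23170        /* assumed value of Sqrt2 */
    #define DCT_CONST_BITS 14

    /* Mirrors dct_const_round_shift(): add half, shift down by 14. */
    static int32_t round_shift_q14(int64_t v) {
      return (int32_t)((v + (1 << (DCT_CONST_BITS - 1))) >> DCT_CONST_BITS);
    }

    int main(void) {
      const int32_t in = 1000;
      /* iidtx4: out ~= in * sqrt(2)  (expect 1414) */
      printf("iidtx4:  %d\n", round_shift_q14((int64_t)in * SQRT2_Q14));
      /* iidtx16: out ~= in * 2 * sqrt(2)  (expect 2828) */
      printf("iidtx16: %d\n", round_shift_q14((int64_t)in * 2 * SQRT2_Q14));
      return 0;
    }
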
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_iidtx4_c(const tran_low_t *input, tran_low_t *output,
+                            int bd) {
+  int i;
+  for (i = 0; i < 4; ++i)
+    output[i] = (tran_low_t)highbd_dct_const_round_shift(input[i] * Sqrt2, bd);
+}
+
+static void highbd_iidtx8_c(const tran_low_t *input, tran_low_t *output,
+                            int bd) {
+  int i;
+  (void) bd;
+  for (i = 0; i < 8; ++i)
+    output[i] = input[i] * 2;
+}
+
+static void highbd_iidtx16_c(const tran_low_t *input, tran_low_t *output,
+                             int bd) {
+  int i;
+  for (i = 0; i < 16; ++i)
+    output[i] = (tran_low_t)highbd_dct_const_round_shift(
+        input[i] * 2 * Sqrt2, bd);
+}
+
+static void highbd_iidtx32_c(const tran_low_t *input, tran_low_t *output,
+                             int bd) {
+  int i;
+  (void) bd;
+  for (i = 0; i < 32; ++i)
+    output[i] = input[i] * 4;
+}
+
+static void highbd_ihalfright32_c(const tran_low_t *input, tran_low_t *output,
+                                  int bd) {
+  int i;
+  tran_low_t inputhalf[16];
+  for (i = 0; i < 16; ++i) {
+    output[i] = input[16 + i] * 4;
+  }
+  // Multiply input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)highbd_dct_const_round_shift(
+        input[i] * Sqrt2, bd);
+  }
+  vpx_highbd_idct16_c(inputhalf, output + 16, bd);
+  // Note overall scaling factor is 4 times orthogonal
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Inverse identity transform and add.
+static void inv_idtx_add_c(const tran_low_t *input, uint8_t *dest, int stride,
+                           int bs, int tx_type) {
+  int r, c;
+  const int shift = bs < 32 ? 3 : 2;
+  if (tx_type == IDTX) {
+    for (r = 0; r < bs; ++r) {
+      for (c = 0; c < bs; ++c)
+        dest[c] = clip_pixel_add(dest[c], input[c] >> shift);
+      dest += stride;
+      input += bs;
+    }
+  }
+}
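
inv_idtx_add_c receives residuals that are still scaled up from the transform pipeline, so it shifts them down — by 3 for sizes below 32, by 2 for 32x32 — and clips into pixel range while adding. A standalone sketch of that inner step, with an assumed clip_pixel_add and illustrative values:

    #include <stdint.h>
    #include <stdio.h>

    /* Assumed behaviour of clip_pixel_add(): saturating 8-bit add. */
    static uint8_t clip_pixel_add_sketch(uint8_t dst, int32_t trans) {
      const int32_t v = dst + trans;
      return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
    }

    int main(void) {
      const int bs = 8;                   /* 8x8 block */
      const int shift = bs < 32 ? 3 : 2;  /* same rule as inv_idtx_add_c */
      const uint8_t pixel = 200;
      const int32_t residual = 480;       /* pre-scaled; 480 >> 3 == 60 */
      /* 200 + 60 = 260, clipped to 255. */
      printf("%u\n", clip_pixel_add_sketch(pixel, residual >> shift));
      return 0;
    }
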
+
+#define FLIPUD_PTR(dest, stride, size) do {     \
+  (dest) = (dest) + ((size) - 1) * (stride);  \
+  (stride) = - (stride);                      \
+} while (0)
+
+static void maybe_flip_strides(uint8_t **dst, int *dstride,
+                               tran_low_t **src, int *sstride,
+                               int tx_type, int size) {
+  // Note that the transpose of src will be added to dst. In order to LR
+  // flip the addends (in dst coordinates), we UD flip the src. To UD flip
+  // the addends, we UD flip the dst.
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case IDTX:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+      break;
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+    case V_FLIPADST:
+      // flip UD
+      FLIPUD_PTR(*dst, *dstride, size);
+      break;
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case H_FLIPADST:
+      // flip LR
+      FLIPUD_PTR(*src, *sstride, size);
+      break;
+    case FLIPADST_FLIPADST:
+      // flip UD
+      FLIPUD_PTR(*dst, *dstride, size);
+      // flip LR
+      FLIPUD_PTR(*src, *sstride, size);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
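
FLIPUD_PTR is the whole flipping mechanism: repointing at the last row and negating the stride makes every later row-major access walk the buffer bottom-up, which combined with the implicit transpose yields the LR/UD variants described in the comment above. A standalone demonstration:

    #include <stdio.h>

    #define FLIPUD_PTR(dest, stride, size) do {   \
      (dest) = (dest) + ((size) - 1) * (stride);  \
      (stride) = -(stride);                       \
    } while (0)

    int main(void) {
      unsigned char buf[4][4] = { { 0 } };
      unsigned char *dst = &buf[0][0];
      int stride = 4, r, c;

      FLIPUD_PTR(dst, stride, 4);  /* dst -> last row, stride -> -4 */
      for (r = 0; r < 4; ++r)
        for (c = 0; c < 4; ++c)
          dst[r * stride + c] = (unsigned char)(10 * r + c);

      /* Source row 0 (values 0..3) now sits in the bottom row of buf. */
      for (r = 0; r < 4; ++r)
        printf("%2d %2d %2d %2d\n", buf[r][0], buf[r][1], buf[r][2],
               buf[r][3]);
      return 0;
    }
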
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void highbd_idst4_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step[4];
+  tran_high_t temp1, temp2;
+  (void) bd;
+  // stage 1
+  temp1 = (input[3] + input[1]) * cospi_16_64;
+  temp2 = (input[3] - input[1]) * cospi_16_64;
+  step[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 = input[2] * cospi_24_64 - input[0] * cospi_8_64;
+  temp2 = input[2] * cospi_8_64 + input[0] * cospi_24_64;
+  step[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  // stage 2
+  output[0] = WRAPLOW(step[0] + step[3], bd);
+  output[1] = WRAPLOW(-step[1] - step[2], bd);
+  output[2] = WRAPLOW(step[1] - step[2], bd);
+  output[3] = WRAPLOW(step[3] - step[0], bd);
+}
+
+void highbd_idst8_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  tran_low_t step1[8], step2[8];
+  tran_high_t temp1, temp2;
+  (void) bd;
+  // stage 1
+  step1[0] = input[7];
+  step1[2] = input[3];
+  step1[1] = input[5];
+  step1[3] = input[1];
+  temp1 = input[6] * cospi_28_64 - input[0] * cospi_4_64;
+  temp2 = input[6] * cospi_4_64 + input[0] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 = input[2] * cospi_12_64 - input[4] * cospi_20_64;
+  temp2 = input[2] * cospi_20_64 + input[4] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  // stage 2
+  temp1 = (step1[0] + step1[2]) * cospi_16_64;
+  temp2 = (step1[0] - step1[2]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 = step1[1] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[1] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+  // stage 3
+  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[7] = step2[7];
+
+  // stage 4
+  output[0] = WRAPLOW(step1[0] + step1[7], bd);
+  output[1] = WRAPLOW(-step1[1] - step1[6], bd);
+  output[2] = WRAPLOW(step1[2] + step1[5], bd);
+  output[3] = WRAPLOW(-step1[3] - step1[4], bd);
+  output[4] = WRAPLOW(step1[3] - step1[4], bd);
+  output[5] = WRAPLOW(-step1[2] + step1[5], bd);
+  output[6] = WRAPLOW(step1[1] - step1[6], bd);
+  output[7] = WRAPLOW(-step1[0] + step1[7], bd);
+}
+
+void highbd_idst16_c(const tran_low_t *input, tran_low_t *output, int bd) {
+  // vp9_highbd_igentx16(input, output, bd, Tx16);
+  tran_low_t step1[16], step2[16];
+  tran_high_t temp1, temp2;
+  (void) bd;
+
+  // stage 1
+  step1[0] = input[15];
+  step1[1] = input[7];
+  step1[2] = input[11];
+  step1[3] = input[3];
+  step1[4] = input[13];
+  step1[5] = input[5];
+  step1[6] = input[9];
+  step1[7] = input[1];
+  step1[8] = input[14];
+  step1[9] = input[6];
+  step1[10] = input[10];
+  step1[11] = input[2];
+  step1[12] = input[12];
+  step1[13] = input[4];
+  step1[14] = input[8];
+  step1[15] = input[0];
+
+  // stage 2
+  step2[0] = step1[0];
+  step2[1] = step1[1];
+  step2[2] = step1[2];
+  step2[3] = step1[3];
+  step2[4] = step1[4];
+  step2[5] = step1[5];
+  step2[6] = step1[6];
+  step2[7] = step1[7];
+
+  temp1 = step1[8] * cospi_30_64 - step1[15] * cospi_2_64;
+  temp2 = step1[8] * cospi_2_64 + step1[15] * cospi_30_64;
+  step2[8] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[15] = WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  temp1 = step1[9] * cospi_14_64 - step1[14] * cospi_18_64;
+  temp2 = step1[9] * cospi_18_64 + step1[14] * cospi_14_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  temp1 = step1[10] * cospi_22_64 - step1[13] * cospi_10_64;
+  temp2 = step1[10] * cospi_10_64 + step1[13] * cospi_22_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  temp1 = step1[11] * cospi_6_64 - step1[12] * cospi_26_64;
+  temp2 = step1[11] * cospi_26_64 + step1[12] * cospi_6_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  // stage 3
+  step1[0] = step2[0];
+  step1[1] = step2[1];
+  step1[2] = step2[2];
+  step1[3] = step2[3];
+
+  temp1 = step2[4] * cospi_28_64 - step2[7] * cospi_4_64;
+  temp2 = step2[4] * cospi_4_64 + step2[7] * cospi_28_64;
+  step1[4] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[7] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 = step2[5] * cospi_12_64 - step2[6] * cospi_20_64;
+  temp2 = step2[5] * cospi_20_64 + step2[6] * cospi_12_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+
+  step1[8] = WRAPLOW(step2[8] + step2[9], bd);
+  step1[9] = WRAPLOW(step2[8] - step2[9], bd);
+  step1[10] = WRAPLOW(-step2[10] + step2[11], bd);
+  step1[11] = WRAPLOW(step2[10] + step2[11], bd);
+  step1[12] = WRAPLOW(step2[12] + step2[13], bd);
+  step1[13] = WRAPLOW(step2[12] - step2[13], bd);
+  step1[14] = WRAPLOW(-step2[14] + step2[15], bd);
+  step1[15] = WRAPLOW(step2[14] + step2[15], bd);
+
+  // stage 4
+  temp1 = (step1[0] + step1[1]) * cospi_16_64;
+  temp2 = (step1[0] - step1[1]) * cospi_16_64;
+  step2[0] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[1] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 = step1[2] * cospi_24_64 - step1[3] * cospi_8_64;
+  temp2 = step1[2] * cospi_8_64 + step1[3] * cospi_24_64;
+  step2[2] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[3] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[4] = WRAPLOW(step1[4] + step1[5], bd);
+  step2[5] = WRAPLOW(step1[4] - step1[5], bd);
+  step2[6] = WRAPLOW(-step1[6] + step1[7], bd);
+  step2[7] = WRAPLOW(step1[6] + step1[7], bd);
+
+  step2[8] = step1[8];
+  step2[15] = step1[15];
+  temp1 = -step1[9] * cospi_8_64 + step1[14] * cospi_24_64;
+  temp2 = step1[9] * cospi_24_64 + step1[14] * cospi_8_64;
+  step2[9] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[14] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 = -step1[10] * cospi_24_64 - step1[13] * cospi_8_64;
+  temp2 = -step1[10] * cospi_8_64 + step1[13] * cospi_24_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[11] = step1[11];
+  step2[12] = step1[12];
+
+  // stage 5
+  step1[0] = WRAPLOW(step2[0] + step2[3], bd);
+  step1[1] = WRAPLOW(step2[1] + step2[2], bd);
+  step1[2] = WRAPLOW(step2[1] - step2[2], bd);
+  step1[3] = WRAPLOW(step2[0] - step2[3], bd);
+  step1[4] = step2[4];
+  temp1 = (step2[6] - step2[5]) * cospi_16_64;
+  temp2 = (step2[5] + step2[6]) * cospi_16_64;
+  step1[5] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step1[6] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step1[7] = step2[7];
+
+  step1[8] = WRAPLOW(step2[8] + step2[11], bd);
+  step1[9] = WRAPLOW(step2[9] + step2[10], bd);
+  step1[10] = WRAPLOW(step2[9] - step2[10], bd);
+  step1[11] = WRAPLOW(step2[8] - step2[11], bd);
+  step1[12] = WRAPLOW(-step2[12] + step2[15], bd);
+  step1[13] = WRAPLOW(-step2[13] + step2[14], bd);
+  step1[14] = WRAPLOW(step2[13] + step2[14], bd);
+  step1[15] = WRAPLOW(step2[12] + step2[15], bd);
+
+  // stage 6
+  step2[0] = WRAPLOW(step1[0] + step1[7], bd);
+  step2[1] = WRAPLOW(step1[1] + step1[6], bd);
+  step2[2] = WRAPLOW(step1[2] + step1[5], bd);
+  step2[3] = WRAPLOW(step1[3] + step1[4], bd);
+  step2[4] = WRAPLOW(step1[3] - step1[4], bd);
+  step2[5] = WRAPLOW(step1[2] - step1[5], bd);
+  step2[6] = WRAPLOW(step1[1] - step1[6], bd);
+  step2[7] = WRAPLOW(step1[0] - step1[7], bd);
+  step2[8] = step1[8];
+  step2[9] = step1[9];
+  temp1 = (-step1[10] + step1[13]) * cospi_16_64;
+  temp2 = (step1[10] + step1[13]) * cospi_16_64;
+  step2[10] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[13] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  temp1 = (-step1[11] + step1[12]) * cospi_16_64;
+  temp2 = (step1[11] + step1[12]) * cospi_16_64;
+  step2[11] = WRAPLOW(dct_const_round_shift(temp1), bd);
+  step2[12] = WRAPLOW(dct_const_round_shift(temp2), bd);
+  step2[14] = step1[14];
+  step2[15] = step1[15];
+
+  // stage 7
+  output[0] = WRAPLOW(step2[0] + step2[15], bd);
+  output[1] = WRAPLOW(-step2[1] - step2[14], bd);
+  output[2] = WRAPLOW(step2[2] + step2[13], bd);
+  output[3] = WRAPLOW(-step2[3] - step2[12], bd);
+  output[4] = WRAPLOW(step2[4] + step2[11], bd);
+  output[5] = WRAPLOW(-step2[5] - step2[10], bd);
+  output[6] = WRAPLOW(step2[6] + step2[9], bd);
+  output[7] = WRAPLOW(-step2[7] - step2[8], bd);
+  output[8] = WRAPLOW(step2[7] - step2[8], bd);
+  output[9] = WRAPLOW(-step2[6] + step2[9], bd);
+  output[10] = WRAPLOW(step2[5] - step2[10], bd);
+  output[11] = WRAPLOW(-step2[4] + step2[11], bd);
+  output[12] = WRAPLOW(step2[3] - step2[12], bd);
+  output[13] = WRAPLOW(-step2[2] + step2[13], bd);
+  output[14] = WRAPLOW(step2[1] - step2[14], bd);
+  output[15] = WRAPLOW(-step2[0] + step2[15], bd);
+}
+
+static void highbd_inv_idtx_add_c(const tran_low_t *input, uint8_t *dest8,
+                                  int stride, int bs, int tx_type, int bd) {
+  int r, c;
+  const int shift = bs < 32 ? 3 : 2;
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  if (tx_type == IDTX) {
+    for (r = 0; r < bs; ++r) {
+      for (c = 0; c < bs; ++c)
+        dest[c] = highbd_clip_pixel_add(dest[c], input[c] >> shift, bd);
+      dest += stride;
+      input += bs;
+    }
+  }
+}
+
+static void maybe_flip_strides16(uint16_t **dst, int *dstride,
+                                 tran_low_t **src, int *sstride,
+                                 int tx_type, int size) {
+  // Note that the transpose of src will be added to dst. In order to LR
+  // flip the addends (in dst coordinates), we UD flip the src. To UD flip
+  // the addends, we UD flip the dst.
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case IDTX:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+      break;
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+    case V_FLIPADST:
+      // flip UD
+      FLIPUD_PTR(*dst, *dstride, size);
+      break;
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case H_FLIPADST:
+      // flip LR
+      FLIPUD_PTR(*src, *sstride, size);
+      break;
+    case FLIPADST_FLIPADST:
+      // flip UD
+      FLIPUD_PTR(*dst, *dstride, size);
+      // flip LR
+      FLIPUD_PTR(*src, *sstride, size);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_EXT_TX
+
 void vp10_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride,
-                         int tx_type) {
-  const transform_2d IHT_4[] = {
-    { idct4_c, idct4_c  },  // DCT_DCT  = 0
-    { iadst4_c, idct4_c  },   // ADST_DCT = 1
-    { idct4_c, iadst4_c },    // DCT_ADST = 2
-    { iadst4_c, iadst4_c }      // ADST_ADST = 3
+                          int tx_type) {
+  static const transform_2d IHT_4[] = {
+    { idct4_c,  idct4_c  },  // DCT_DCT
+    { iadst4_c, idct4_c  },  // ADST_DCT
+    { idct4_c,  iadst4_c },  // DCT_ADST
+    { iadst4_c, iadst4_c },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { iadst4_c, idct4_c  },  // FLIPADST_DCT
+    { idct4_c,  iadst4_c },  // DCT_FLIPADST
+    { iadst4_c, iadst4_c },  // FLIPADST_FLIPADST
+    { iadst4_c, iadst4_c },  // ADST_FLIPADST
+    { iadst4_c, iadst4_c },  // FLIPADST_ADST
+    { iidtx4_c, iidtx4_c },  // IDTX
+    { idct4_c,  iidtx4_c },  // V_DCT
+    { iidtx4_c, idct4_c  },  // H_DCT
+    { iadst4_c, iidtx4_c },  // V_ADST
+    { iidtx4_c, iadst4_c },  // H_ADST
+    { iadst4_c, iidtx4_c },  // V_FLIPADST
+    { iidtx4_c, iadst4_c },  // H_FLIPADST
+#endif  // CONFIG_EXT_TX
   };
 
   int i, j;
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[4], temp_out[4];
+  tran_low_t tmp;
+  tran_low_t out[4][4];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 4;
 
   // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
-    IHT_4[tx_type].rows(input, outptr);
+    IHT_4[tx_type].rows(input, out[i]);
     input  += 4;
-    outptr += 4;
+  }
+
+  // transpose
+  for (i = 1; i < 4; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
   }
 
   // inverse transform column vectors
   for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j * 4 + i];
-    IHT_4[tx_type].cols(temp_in, temp_out);
+    IHT_4[tx_type].cols(out[i], out[i]);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 4);
+#endif
+
+  // Sum with the destination
+  for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 4));
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 4));
     }
   }
 }
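
The rewritten 4x4 path — and its 8x8/16x16/32x32 siblings below — replaces the per-column gather with: a 1-D transform over rows, an in-place transpose, then the same 1-D transform over rows again (which are now the original columns); the final summation loop reads outp[j * outstride + i] to undo the transpose. A toy standalone version of that flow, using a 2-point sum/diff transform as a stand-in for idct4/iadst4:

    #include <stdio.h>

    #define N 2

    /* Trivial 2-point sum/diff transform; operates in place. */
    static void toy_1d(int *v) {
      const int s = v[0] + v[1];
      const int d = v[0] - v[1];
      v[0] = s;
      v[1] = d;
    }

    int main(void) {
      int a[N][N] = { { 1, 2 }, { 3, 4 } };
      int i, j, tmp;

      for (i = 0; i < N; ++i) toy_1d(a[i]);   /* transform rows */

      for (i = 1; i < N; ++i)                 /* transpose in place */
        for (j = 0; j < i; ++j) {
          tmp = a[i][j]; a[i][j] = a[j][i]; a[j][i] = tmp;
        }

      for (i = 0; i < N; ++i) toy_1d(a[i]);   /* "columns" = rows now */

      /* a holds the separable 2-D transform, stored transposed; reading
       * a[j][i] (as the codec's summation loop does) undoes that. */
      for (i = 0; i < N; ++i)
        printf("%3d %3d\n", a[i][0], a[i][1]);
      return 0;
    }
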
 
-static const transform_2d IHT_8[] = {
-  { idct8_c,  idct8_c  },  // DCT_DCT  = 0
-  { iadst8_c, idct8_c  },  // ADST_DCT = 1
-  { idct8_c,  iadst8_c },  // DCT_ADST = 2
-  { iadst8_c, iadst8_c }   // ADST_ADST = 3
-};
-
 void vp10_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                          int tx_type) {
+  static const transform_2d IHT_8[] = {
+    { idct8_c,  idct8_c  },  // DCT_DCT
+    { iadst8_c, idct8_c  },  // ADST_DCT
+    { idct8_c,  iadst8_c },  // DCT_ADST
+    { iadst8_c, iadst8_c },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { iadst8_c, idct8_c  },  // FLIPADST_DCT
+    { idct8_c,  iadst8_c },  // DCT_FLIPADST
+    { iadst8_c, iadst8_c },  // FLIPADST_FLIPADST
+    { iadst8_c, iadst8_c },  // ADST_FLIPADST
+    { iadst8_c, iadst8_c },  // FLIPADST_ADST
+    { iidtx8_c, iidtx8_c },  // IDTX
+    { idct8_c,  iidtx8_c },  // V_DCT
+    { iidtx8_c, idct8_c  },  // H_DCT
+    { iadst8_c, iidtx8_c },  // V_ADST
+    { iidtx8_c, iadst8_c },  // H_ADST
+    { iadst8_c, iidtx8_c },  // V_FLIPADST
+    { iidtx8_c, iadst8_c },  // H_FLIPADST
+#endif  // CONFIG_EXT_TX
+  };
+
   int i, j;
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[8], temp_out[8];
-  const transform_2d ht = IHT_8[tx_type];
+  tran_low_t tmp;
+  tran_low_t out[8][8];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 8;
 
   // inverse transform row vectors
   for (i = 0; i < 8; ++i) {
-    ht.rows(input, outptr);
-    input += 8;
-    outptr += 8;
+    IHT_8[tx_type].rows(input, out[i]);
+    input  += 8;
+  }
+
+  // transpose
+  for (i = 1; i < 8; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
   }
 
   // inverse transform column vectors
   for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    ht.cols(temp_in, temp_out);
+    IHT_8[tx_type].cols(out[i], out[i]);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 8);
+#endif
+
+  // Sum with the destination
+  for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 5));
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 5));
     }
   }
 }
 
-static const transform_2d IHT_16[] = {
-  { idct16_c,  idct16_c  },  // DCT_DCT  = 0
-  { iadst16_c, idct16_c  },  // ADST_DCT = 1
-  { idct16_c,  iadst16_c },  // DCT_ADST = 2
-  { iadst16_c, iadst16_c }   // ADST_ADST = 3
-};
-
 void vp10_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest, int stride,
                             int tx_type) {
-  int i, j;
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[16], temp_out[16];
-  const transform_2d ht = IHT_16[tx_type];
+  static const transform_2d IHT_16[] = {
+    { idct16_c,  idct16_c  },  // DCT_DCT
+    { iadst16_c, idct16_c  },  // ADST_DCT
+    { idct16_c,  iadst16_c },  // DCT_ADST
+    { iadst16_c, iadst16_c },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { iadst16_c, idct16_c  },  // FLIPADST_DCT
+    { idct16_c,  iadst16_c },  // DCT_FLIPADST
+    { iadst16_c, iadst16_c },  // FLIPADST_FLIPADST
+    { iadst16_c, iadst16_c },  // ADST_FLIPADST
+    { iadst16_c, iadst16_c },  // FLIPADST_ADST
+    { iidtx16_c, iidtx16_c },  // IDTX
+    { idct16_c,  iidtx16_c },  // V_DCT
+    { iidtx16_c, idct16_c  },  // H_DCT
+    { iadst16_c, iidtx16_c },  // V_ADST
+    { iidtx16_c, iadst16_c },  // H_ADST
+    { iadst16_c, iidtx16_c },  // V_FLIPADST
+    { iidtx16_c, iadst16_c },  // H_FLIPADST
+#endif  // CONFIG_EXT_TX
+  };
 
-  // Rows
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[16][16];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 16;
+
+  // inverse transform row vectors
   for (i = 0; i < 16; ++i) {
-    ht.rows(input, outptr);
-    input += 16;
-    outptr += 16;
+    IHT_16[tx_type].rows(input, out[i]);
+    input  += 16;
   }
 
-  // Columns
+  // transpose
+  for (i = 1; i < 16; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
   for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j * 16 + i];
-    ht.cols(temp_in, temp_out);
+    IHT_16[tx_type].cols(out[i], out[i]);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 16);
+#endif
+
+  // Sum with the destination
+  for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = clip_pixel_add(dest[j * stride + i],
-                                            ROUND_POWER_OF_TWO(temp_out[j], 6));
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
     }
   }
 }
 
+#if CONFIG_EXT_TX
+void vp10_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest,
+                              int stride, int tx_type) {
+  static const transform_2d IHT_32[] = {
+    { idct32_c,  idct32_c  },                // DCT_DCT
+    { ihalfright32_c, idct32_c  },           // ADST_DCT
+    { idct32_c,  ihalfright32_c },           // DCT_ADST
+    { ihalfright32_c, ihalfright32_c },      // ADST_ADST
+    { ihalfright32_c, idct32_c  },           // FLIPADST_DCT
+    { idct32_c,  ihalfright32_c },           // DCT_FLIPADST
+    { ihalfright32_c, ihalfright32_c },      // FLIPADST_FLIPADST
+    { ihalfright32_c, ihalfright32_c },      // ADST_FLIPADST
+    { ihalfright32_c, ihalfright32_c },      // FLIPADST_ADST
+    { iidtx32_c, iidtx32_c },                // IDTX
+    { idct32_c,  iidtx32_c },                // V_DCT
+    { iidtx32_c, idct32_c  },                // H_DCT
+    { ihalfright32_c, iidtx32_c },           // V_ADST
+    { iidtx32_c, ihalfright32_c },           // H_ADST
+    { ihalfright32_c, iidtx32_c },           // V_FLIPADST
+    { iidtx32_c, ihalfright32_c },           // H_FLIPADST
+  };
+
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[32][32];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 32;
+
+  // inverse transform row vectors
+  for (i = 0; i < 32; ++i) {
+    IHT_32[tx_type].rows(input, out[i]);
+    input  += 32;
+  }
+
+  // transpose
+  for (i = 1; i < 32; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 32; ++i) {
+    IHT_32[tx_type].cols(out[i], out[i]);
+  }
+
+  maybe_flip_strides(&dest, &stride, &outp, &outstride, tx_type, 32);
+
+  // Sum with the destination
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = clip_pixel_add(dest[d], ROUND_POWER_OF_TWO(outp[s], 6));
+    }
+  }
+}
+#endif  // CONFIG_EXT_TX
+
 // idct
 void vp10_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                      int eob) {
@@ -183,20 +800,42 @@
   if (lossless) {
     assert(tx_type == DCT_DCT);
     vp10_iwht4x4_add(input, dest, stride, eob);
-  } else {
-    switch (tx_type) {
-      case DCT_DCT:
-        vp10_idct4x4_add(input, dest, stride, eob);
-        break;
-      case ADST_DCT:
-      case DCT_ADST:
-      case ADST_ADST:
-        vp10_iht4x4_16_add(input, dest, stride, tx_type);
-        break;
-      default:
-        assert(0);
-        break;
-    }
+    return;
+  }
+
+  switch (tx_type) {
+    case DCT_DCT:
+      vp10_idct4x4_add(input, dest, stride, eob);
+      break;
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_iht4x4_16_add(input, dest, stride, tx_type);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_iht4x4_16_add(input, dest, stride, tx_type);
+      break;
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      // Use C version since DST only exists in C code
+      vp10_iht4x4_16_add_c(input, dest, stride, tx_type);
+      break;
+    case IDTX:
+      inv_idtx_add_c(input, dest, stride, 4, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+      break;
   }
 }
 
@@ -211,6 +850,27 @@
     case ADST_ADST:
       vp10_iht8x8_64_add(input, dest, stride, tx_type);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_iht8x8_64_add(input, dest, stride, tx_type);
+      break;
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      // Use C version since DST only exists in C code
+      vp10_iht8x8_64_add_c(input, dest, stride, tx_type);
+      break;
+    case IDTX:
+      inv_idtx_add_c(input, dest, stride, 8, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -228,6 +888,27 @@
     case ADST_ADST:
       vp10_iht16x16_256_add(input, dest, stride, tx_type);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_iht16x16_256_add(input, dest, stride, tx_type);
+      break;
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      // Use C version since DST only exists in C code
+      vp10_iht16x16_256_add_c(input, dest, stride, tx_type);
+      break;
+    case IDTX:
+      inv_idtx_add_c(input, dest, stride, 16, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -240,11 +921,27 @@
     case DCT_DCT:
       vp10_idct32x32_add(input, dest, stride, eob);
       break;
+#if CONFIG_EXT_TX
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
-      assert(0);
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      vp10_iht32x32_1024_add_c(input, dest, stride, tx_type);
       break;
+    case IDTX:
+      inv_idtx_add_c(input, dest, stride, 32, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -254,108 +951,266 @@
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp10_highbd_iht4x4_16_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
-  const highbd_transform_2d IHT_4[] = {
-    { vpx_highbd_idct4_c, vpx_highbd_idct4_c  },    // DCT_DCT  = 0
-    { vpx_highbd_iadst4_c, vpx_highbd_idct4_c },    // ADST_DCT = 1
-    { vpx_highbd_idct4_c, vpx_highbd_iadst4_c },    // DCT_ADST = 2
-    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c }    // ADST_ADST = 3
+  static const highbd_transform_2d HIGH_IHT_4[] = {
+    { vpx_highbd_idct4_c,  vpx_highbd_idct4_c  },  // DCT_DCT
+    { vpx_highbd_iadst4_c, vpx_highbd_idct4_c  },  // ADST_DCT
+    { vpx_highbd_idct4_c,  vpx_highbd_iadst4_c },  // DCT_ADST
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { vpx_highbd_iadst4_c, vpx_highbd_idct4_c  },  // FLIPADST_DCT
+    { vpx_highbd_idct4_c,  vpx_highbd_iadst4_c },  // DCT_FLIPADST
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c },  // FLIPADST_FLIPADST
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c },  // ADST_FLIPADST
+    { vpx_highbd_iadst4_c, vpx_highbd_iadst4_c },  // FLIPADST_ADST
+    {     highbd_iidtx4_c,     highbd_iidtx4_c },  // IDTX
+    { vpx_highbd_idct4_c,      highbd_iidtx4_c },  // V_DCT
+    {     highbd_iidtx4_c, vpx_highbd_idct4_c  },  // H_DCT
+    { vpx_highbd_iadst4_c,     highbd_iidtx4_c },  // V_ADST
+    {     highbd_iidtx4_c, vpx_highbd_iadst4_c },  // H_ADST
+    { vpx_highbd_iadst4_c,     highbd_iidtx4_c },  // V_FLIPADST
+    {     highbd_iidtx4_c, vpx_highbd_iadst4_c },  // H_FLIPADST
+#endif  // CONFIG_EXT_TX
   };
+
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
   int i, j;
-  tran_low_t out[4 * 4];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[4], temp_out[4];
+  tran_low_t tmp;
+  tran_low_t out[4][4];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 4;
 
-  // Inverse transform row vectors.
+  // inverse transform row vectors
   for (i = 0; i < 4; ++i) {
-    IHT_4[tx_type].rows(input, outptr, bd);
+    HIGH_IHT_4[tx_type].rows(input, out[i], bd);
     input  += 4;
-    outptr += 4;
   }
 
-  // Inverse transform column vectors.
+  // transpose
+  for (i = 1; i < 4; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
   for (i = 0; i < 4; ++i) {
-    for (j = 0; j < 4; ++j)
-      temp_in[j] = out[j * 4 + i];
-    IHT_4[tx_type].cols(temp_in, temp_out, bd);
+    HIGH_IHT_4[tx_type].cols(out[i], out[i], bd);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 4);
+#endif
+
+  // Sum with the destination
+  for (i = 0; i < 4; ++i) {
     for (j = 0; j < 4; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 4), bd);
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 4), bd);
     }
   }
 }
 
-static const highbd_transform_2d HIGH_IHT_8[] = {
-  { vpx_highbd_idct8_c,  vpx_highbd_idct8_c  },  // DCT_DCT  = 0
-  { vpx_highbd_iadst8_c, vpx_highbd_idct8_c  },  // ADST_DCT = 1
-  { vpx_highbd_idct8_c,  vpx_highbd_iadst8_c },  // DCT_ADST = 2
-  { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c }   // ADST_ADST = 3
-};
-
 void vp10_highbd_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest8,
                                 int stride, int tx_type, int bd) {
-  int i, j;
-  tran_low_t out[8 * 8];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[8], temp_out[8];
-  const highbd_transform_2d ht = HIGH_IHT_8[tx_type];
+  static const highbd_transform_2d HIGH_IHT_8[] = {
+    { vpx_highbd_idct8_c,  vpx_highbd_idct8_c  },  // DCT_DCT
+    { vpx_highbd_iadst8_c, vpx_highbd_idct8_c  },  // ADST_DCT
+    { vpx_highbd_idct8_c,  vpx_highbd_iadst8_c },  // DCT_ADST
+    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { vpx_highbd_iadst8_c, vpx_highbd_idct8_c  },  // FLIPADST_DCT
+    { vpx_highbd_idct8_c,  vpx_highbd_iadst8_c },  // DCT_FLIPADST
+    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c },  // FLIPADST_FLIPADST
+    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c },  // ADST_FLIPADST
+    { vpx_highbd_iadst8_c, vpx_highbd_iadst8_c },  // FLIPADST_ADST
+    {     highbd_iidtx8_c,     highbd_iidtx8_c },  // IDTX
+    { vpx_highbd_idct8_c,      highbd_iidtx8_c },  // V_DCT
+    {     highbd_iidtx8_c, vpx_highbd_idct8_c  },  // H_DCT
+    { vpx_highbd_iadst8_c,     highbd_iidtx8_c },  // V_ADST
+    {     highbd_iidtx8_c, vpx_highbd_iadst8_c },  // H_ADST
+    { vpx_highbd_iadst8_c,     highbd_iidtx8_c },  // V_FLIPADST
+    {     highbd_iidtx8_c, vpx_highbd_iadst8_c },  // H_FLIPADST
+#endif  // CONFIG_EXT_TX
+  };
+
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  // Inverse transform row vectors.
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[8][8];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 8;
+
+  // inverse transform row vectors
   for (i = 0; i < 8; ++i) {
-    ht.rows(input, outptr, bd);
-    input += 8;
-    outptr += 8;
+    HIGH_IHT_8[tx_type].rows(input, out[i], bd);
+    input  += 8;
   }
 
-  // Inverse transform column vectors.
+  // transpose
+  for (i = 1; i < 8; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
   for (i = 0; i < 8; ++i) {
-    for (j = 0; j < 8; ++j)
-      temp_in[j] = out[j * 8 + i];
-    ht.cols(temp_in, temp_out, bd);
+    HIGH_IHT_8[tx_type].cols(out[i], out[i], bd);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 8);
+#endif
+
+  // Sum with the destination
+  for (i = 0; i < 8; ++i) {
     for (j = 0; j < 8; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 5), bd);
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 5), bd);
     }
   }
 }
 
-static const highbd_transform_2d HIGH_IHT_16[] = {
-  { vpx_highbd_idct16_c,  vpx_highbd_idct16_c  },  // DCT_DCT  = 0
-  { vpx_highbd_iadst16_c, vpx_highbd_idct16_c  },  // ADST_DCT = 1
-  { vpx_highbd_idct16_c,  vpx_highbd_iadst16_c },  // DCT_ADST = 2
-  { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c }   // ADST_ADST = 3
-};
-
 void vp10_highbd_iht16x16_256_add_c(const tran_low_t *input, uint8_t *dest8,
                                    int stride, int tx_type, int bd) {
-  int i, j;
-  tran_low_t out[16 * 16];
-  tran_low_t *outptr = out;
-  tran_low_t temp_in[16], temp_out[16];
-  const highbd_transform_2d ht = HIGH_IHT_16[tx_type];
+  static const highbd_transform_2d HIGH_IHT_16[] = {
+    { vpx_highbd_idct16_c,  vpx_highbd_idct16_c  },  // DCT_DCT
+    { vpx_highbd_iadst16_c, vpx_highbd_idct16_c  },  // ADST_DCT
+    { vpx_highbd_idct16_c,  vpx_highbd_iadst16_c },  // DCT_ADST
+    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c },  // ADST_ADST
+#if CONFIG_EXT_TX
+    { vpx_highbd_iadst16_c, vpx_highbd_idct16_c  },  // FLIPADST_DCT
+    { vpx_highbd_idct16_c,  vpx_highbd_iadst16_c },  // DCT_FLIPADST
+    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c },  // FLIPADST_FLIPADST
+    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c },  // ADST_FLIPADST
+    { vpx_highbd_iadst16_c, vpx_highbd_iadst16_c },  // FLIPADST_ADST
+    {     highbd_iidtx16_c,     highbd_iidtx16_c },  // IDTX
+    { vpx_highbd_idct16_c,      highbd_iidtx16_c },  // V_DCT
+    {     highbd_iidtx16_c, vpx_highbd_idct16_c  },  // H_DCT
+    { vpx_highbd_iadst16_c,     highbd_iidtx16_c },  // V_ADST
+    {     highbd_iidtx16_c, vpx_highbd_iadst16_c },  // H_ADST
+    { vpx_highbd_iadst16_c,     highbd_iidtx16_c },  // V_FLIPADST
+    {     highbd_iidtx16_c, vpx_highbd_iadst16_c },  // H_FLIPADST
+#endif  // CONFIG_EXT_TX
+  };
+
   uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
 
-  // Rows
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[16][16];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 16;
+
+  // inverse transform row vectors
   for (i = 0; i < 16; ++i) {
-    ht.rows(input, outptr, bd);
-    input += 16;
-    outptr += 16;
+    HIGH_IHT_16[tx_type].rows(input, out[i], bd);
+    input  += 16;
   }
 
-  // Columns
+  // transpose
+  for (i = 1; i < 16; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
   for (i = 0; i < 16; ++i) {
-    for (j = 0; j < 16; ++j)
-      temp_in[j] = out[j * 16 + i];
-    ht.cols(temp_in, temp_out, bd);
+    HIGH_IHT_16[tx_type].cols(out[i], out[i], bd);
+  }
+
+#if CONFIG_EXT_TX
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 16);
+#endif
+
+  // Sum with the destination
+  for (i = 0; i < 16; ++i) {
     for (j = 0; j < 16; ++j) {
-      dest[j * stride + i] = highbd_clip_pixel_add(
-          dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 6), bd);
     }
   }
 }
 
+#if CONFIG_EXT_TX
+void vp10_highbd_iht32x32_1024_add_c(const tran_low_t *input, uint8_t *dest8,
+                                     int stride, int tx_type, int bd) {
+  static const highbd_transform_2d HIGH_IHT_32[] = {
+    { vpx_highbd_idct32_c,    vpx_highbd_idct32_c    },  // DCT_DCT
+    { highbd_ihalfright32_c,  vpx_highbd_idct32_c    },  // ADST_DCT
+    { vpx_highbd_idct32_c,    highbd_ihalfright32_c  },  // DCT_ADST
+    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // ADST_ADST
+    { highbd_ihalfright32_c,  vpx_highbd_idct32_c    },  // FLIPADST_DCT
+    { vpx_highbd_idct32_c,    highbd_ihalfright32_c  },  // DCT_FLIPADST
+    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // FLIPADST_FLIPADST
+    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // ADST_FLIPADST
+    { highbd_ihalfright32_c,  highbd_ihalfright32_c  },  // FLIPADST_ADST
+    { highbd_iidtx32_c,       highbd_iidtx32_c       },  // IDTX
+    { vpx_highbd_idct32_c,    highbd_iidtx32_c       },  // V_DCT
+    { highbd_iidtx32_c,       vpx_highbd_idct32_c    },  // H_DCT
+    { highbd_ihalfright32_c,  highbd_iidtx32_c       },  // V_ADST
+    { highbd_iidtx32_c,       highbd_ihalfright32_c  },  // H_ADST
+    { highbd_ihalfright32_c,  highbd_iidtx32_c       },  // V_FLIPADST
+    { highbd_iidtx32_c,       highbd_ihalfright32_c  },  // H_FLIPADST
+  };
+
+  uint16_t *dest = CONVERT_TO_SHORTPTR(dest8);
+
+  int i, j;
+  tran_low_t tmp;
+  tran_low_t out[32][32];
+  tran_low_t *outp = &out[0][0];
+  int outstride = 32;
+
+  // inverse transform row vectors
+  for (i = 0; i < 32; ++i) {
+    HIGH_IHT_32[tx_type].rows(input, out[i], bd);
+    input  += 32;
+  }
+
+  // transpose
+  for (i = 1; i < 32; i++) {
+    for (j = 0; j < i; j++) {
+      tmp = out[i][j];
+      out[i][j] = out[j][i];
+      out[j][i] = tmp;
+    }
+  }
+
+  // inverse transform column vectors
+  for (i = 0; i < 32; ++i) {
+    HIGH_IHT_32[tx_type].cols(out[i], out[i], bd);
+  }
+
+  maybe_flip_strides16(&dest, &stride, &outp, &outstride, tx_type, 32);
+
+  // Sum with the destination
+  for (i = 0; i < 32; ++i) {
+    for (j = 0; j < 32; ++j) {
+      int d = i * stride + j;
+      int s = j * outstride + i;
+      dest[d] = highbd_clip_pixel_add(dest[d],
+                                      ROUND_POWER_OF_TWO(outp[s], 6), bd);
+    }
+  }
+}
+#endif  // CONFIG_EXT_TX
+
 // idct
 void vp10_highbd_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                             int eob, int bd) {
@@ -425,35 +1280,79 @@
   if (lossless) {
     assert(tx_type == DCT_DCT);
     vp10_highbd_iwht4x4_add(input, dest, stride, eob, bd);
-  } else {
-    switch (tx_type) {
-      case DCT_DCT:
-        vp10_highbd_idct4x4_add(input, dest, stride, eob, bd);
-        break;
-      case ADST_DCT:
-      case DCT_ADST:
-      case ADST_ADST:
-         vp10_highbd_iht4x4_16_add(input, dest, stride, tx_type, bd);
-         break;
-      default:
-        assert(0);
-        break;
-    }
+    return;
+  }
+
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_inv_txfm2d_add_4x4(input, CONVERT_TO_SHORTPTR(dest), stride,
+                              tx_type, bd);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_inv_txfm2d_add_4x4(input, CONVERT_TO_SHORTPTR(dest), stride,
+                              tx_type, bd);
+      break;
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      // Use C version since DST only exists in C code
+      vp10_highbd_iht4x4_16_add_c(input, dest, stride, tx_type, bd);
+      break;
+    case IDTX:
+      highbd_inv_idtx_add_c(input, dest, stride, 4, tx_type, bd);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+      break;
   }
 }
 
 void vp10_highbd_inv_txfm_add_8x8(const tran_low_t *input, uint8_t *dest,
                                   int stride, int eob, int bd,
                                   TX_TYPE tx_type) {
+  (void)eob;
   switch (tx_type) {
     case DCT_DCT:
-      vp10_highbd_idct8x8_add(input, dest, stride, eob, bd);
-      break;
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
-      vp10_highbd_iht8x8_64_add(input, dest, stride, tx_type, bd);
+      vp10_inv_txfm2d_add_8x8(input, CONVERT_TO_SHORTPTR(dest), stride,
+                              tx_type, bd);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_inv_txfm2d_add_8x8(input, CONVERT_TO_SHORTPTR(dest), stride,
+                              tx_type, bd);
+      break;
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      // Use C version since DST only exists in C code
+      vp10_highbd_iht8x8_64_add_c(input, dest, stride, tx_type, bd);
+      break;
+    case IDTX:
+      highbd_inv_idtx_add_c(input, dest, stride, 8, tx_type, bd);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -463,15 +1362,37 @@
 void vp10_highbd_inv_txfm_add_16x16(const tran_low_t *input, uint8_t *dest,
                                     int stride, int eob, int bd,
                                     TX_TYPE tx_type) {
+  (void)eob;
   switch (tx_type) {
     case DCT_DCT:
-      vp10_highbd_idct16x16_add(input, dest, stride, eob, bd);
-      break;
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
-      vp10_highbd_iht16x16_256_add(input, dest, stride, tx_type, bd);
+      vp10_inv_txfm2d_add_16x16(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                tx_type, bd);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_inv_txfm2d_add_16x16(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                tx_type, bd);
+      break;
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      // Use C version since DST only exists in C code
+      vp10_highbd_iht16x16_256_add_c(input, dest, stride, tx_type, bd);
+      break;
+    case IDTX:
+      highbd_inv_idtx_add_c(input, dest, stride, 16, tx_type, bd);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -481,18 +1402,99 @@
 void vp10_highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
                                     int stride, int eob, int bd,
                                     TX_TYPE tx_type) {
+  (void)eob;
   switch (tx_type) {
     case DCT_DCT:
-      vp10_highbd_idct32x32_add(input, dest, stride, eob, bd);
+      vp10_inv_txfm2d_add_32x32(input, CONVERT_TO_SHORTPTR(dest), stride,
+                                DCT_DCT, bd);
       break;
+#if CONFIG_EXT_TX
     case ADST_DCT:
     case DCT_ADST:
     case ADST_ADST:
-      assert(0);
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      vp10_highbd_iht32x32_1024_add_c(input, dest, stride, tx_type, bd);
       break;
+    case IDTX:
+      highbd_inv_idtx_add_c(input, dest, stride, 32, tx_type, bd);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
   }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
+                  INV_TXFM_PARAM *inv_txfm_param) {
+  const TX_TYPE tx_type = inv_txfm_param->tx_type;
+  const TX_SIZE tx_size = inv_txfm_param->tx_size;
+  const int eob = inv_txfm_param->eob;
+  const int lossless = inv_txfm_param->lossless;
+
+  switch (tx_size) {
+    case TX_32X32:
+      vp10_inv_txfm_add_32x32(input, dest, stride, eob, tx_type);
+      break;
+    case TX_16X16:
+      vp10_inv_txfm_add_16x16(input, dest, stride, eob, tx_type);
+      break;
+    case TX_8X8:
+      vp10_inv_txfm_add_8x8(input, dest, stride, eob, tx_type);
+      break;
+    case TX_4X4:
+      // this is like vp10_short_idct4x4 but has a special case around eob<=1
+      // which is significant (not just an optimization) for the lossless
+      // case.
+      vp10_inv_txfm_add_4x4(input, dest, stride, eob, tx_type,
+                            lossless);
+      break;
+    default:
+      assert(0 && "Invalid transform size");
+      break;
+  }
+}
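
inv_txfm_add bundles what used to be four separate call sites behind INV_TXFM_PARAM. A hedged sketch of a caller; the field values are illustrative and the high-bitdepth bd field is omitted for the 8-bit build:

    #include "vp10/common/idct.h"

    /* Illustrative call site: fill the parameter struct once, then let
     * inv_txfm_add() dispatch on tx_size. */
    static void reconstruct_block_sketch(const tran_low_t *dqcoeff,
                                         uint8_t *dst, int stride) {
      INV_TXFM_PARAM param;
      param.tx_type = DCT_DCT;  /* assumed transform type */
      param.tx_size = TX_8X8;
      param.eob = 10;           /* illustrative end-of-block */
      param.lossless = 0;
      inv_txfm_add(dqcoeff, dst, stride, &param);
    }
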
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
+                         INV_TXFM_PARAM *inv_txfm_param) {
+  const TX_TYPE tx_type = inv_txfm_param->tx_type;
+  const TX_SIZE tx_size = inv_txfm_param->tx_size;
+  const int eob = inv_txfm_param->eob;
+  const int bd = inv_txfm_param->bd;
+  const int lossless = inv_txfm_param->lossless;
+
+  switch (tx_size) {
+    case TX_32X32:
+      vp10_highbd_inv_txfm_add_32x32(input, dest, stride, eob, bd, tx_type);
+      break;
+    case TX_16X16:
+      vp10_highbd_inv_txfm_add_16x16(input, dest, stride, eob, bd, tx_type);
+      break;
+    case TX_8X8:
+      vp10_highbd_inv_txfm_add_8x8(input, dest, stride, eob, bd, tx_type);
+      break;
+    case TX_4X4:
+      // this is like vp10_short_idct4x4 but has a special case around eob<=1
+      // which is significant (not just an optimization) for the lossless
+      // case.
+      vp10_highbd_inv_txfm_add_4x4(input, dest, stride, eob, bd, tx_type,
+                                   lossless);
+      break;
+    default:
+      assert(0 && "Invalid transform size");
+      break;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp10/common/idct.h b/vp10/common/idct.h
index 0883398..5d52314 100644
--- a/vp10/common/idct.h
+++ b/vp10/common/idct.h
@@ -14,6 +14,7 @@
 #include <assert.h>
 
 #include "./vpx_config.h"
+#include "vp10/common/blockd.h"
 #include "vp10/common/common.h"
 #include "vp10/common/enums.h"
 #include "vpx_dsp/inv_txfm.h"
@@ -24,6 +25,16 @@
 extern "C" {
 #endif
 
+typedef struct INV_TXFM_PARAM {
+  TX_TYPE tx_type;
+  TX_SIZE tx_size;
+  int eob;
+  int lossless;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int bd;
+#endif
+} INV_TXFM_PARAM;
+
 typedef void (*transform_1d)(const tran_low_t*, tran_low_t*);
 
 typedef struct {
@@ -38,10 +49,20 @@
 } highbd_transform_2d;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+#define MAX_TX_SCALE 1
+int get_tx_scale(const MACROBLOCKD *const xd, const TX_TYPE tx_type,
+                 const TX_SIZE tx_size);
+
 void vp10_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                      int eob);
 void vp10_idct4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                      int eob);
+void vp10_idct8x8_add(const tran_low_t *input, uint8_t *dest, int stride,
+                      int eob);
+void vp10_idct16x16_add(const tran_low_t *input, uint8_t *dest, int stride,
+                        int eob);
+void vp10_idct32x32_add(const tran_low_t *input, uint8_t *dest, int stride,
+                        int eob);
 
 void vp10_inv_txfm_add_4x4(const tran_low_t *input, uint8_t *dest,
                            int stride, int eob, TX_TYPE tx_type, int lossless);
@@ -51,7 +72,8 @@
                              int stride, int eob, TX_TYPE tx_type);
 void vp10_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
                              int stride, int eob, TX_TYPE tx_type);
-
+void inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
+                  INV_TXFM_PARAM *inv_txfm_param);
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp10_highbd_iwht4x4_add(const tran_low_t *input, uint8_t *dest, int stride,
                             int eob, int bd);
@@ -74,6 +96,8 @@
 void vp10_highbd_inv_txfm_add_32x32(const tran_low_t *input, uint8_t *dest,
                                     int stride, int eob, int bd,
                                     TX_TYPE tx_type);
+void highbd_inv_txfm_add(const tran_low_t *input, uint8_t *dest, int stride,
+                         INV_TXFM_PARAM *inv_txfm_param);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/common/loopfilter.c b/vp10/common/loopfilter.c
index 8f4fc8c..55715d7 100644
--- a/vp10/common/loopfilter.c
+++ b/vp10/common/loopfilter.c
@@ -8,11 +8,14 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <math.h>
+
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "vp10/common/loopfilter.h"
 #include "vp10/common/onyxc_int.h"
 #include "vp10/common/reconinter.h"
+#include "vp10/common/restoration.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
@@ -207,6 +210,10 @@
 static const int mode_lf_lut[MB_MODE_COUNT] = {
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  // INTRA_MODES
   1, 1, 0, 1                     // INTER_MODES (ZEROMV == 0)
+#if CONFIG_EXT_INTER
+  , 1,                           // NEWFROMNEARMV mode
+  1, 1, 1, 1, 1, 1, 1, 1, 0, 1      // INTER_COMPOUND_MODES (ZERO_ZEROMV == 0)
+#endif  // CONFIG_EXT_INTER
 };
 
 static void update_sharpness(loop_filter_info_n *lfi, int sharpness_lvl) {
@@ -233,8 +240,16 @@
 
 static uint8_t get_filter_level(const loop_filter_info_n *lfi_n,
                                 const MB_MODE_INFO *mbmi) {
-  return lfi_n->lvl[mbmi->segment_id][mbmi->ref_frame[0]]
-                   [mode_lf_lut[mbmi->mode]];
+#if CONFIG_SUPERTX
+  const int segment_id = VPXMIN(mbmi->segment_id, mbmi->segment_id_supertx);
+  assert(IMPLIES(supertx_enabled(mbmi),
+                 mbmi->segment_id_supertx != MAX_SEGMENTS));
+  assert(IMPLIES(supertx_enabled(mbmi),
+                 mbmi->segment_id_supertx <= mbmi->segment_id));
+#else
+  const int segment_id = mbmi->segment_id;
+#endif  // CONFIG_SUPERTX
+  return lfi_n->lvl[segment_id][mbmi->ref_frame[0]][mode_lf_lut[mbmi->mode]];
 }
 
 void vp10_loop_filter_init(VP10_COMMON *cm) {
@@ -715,11 +730,7 @@
   uint64_t *const int_4x4_y = &lfm->int_4x4_y;
   uint16_t *const left_uv = &lfm->left_uv[tx_size_uv];
   uint16_t *const above_uv = &lfm->above_uv[tx_size_uv];
-#if CONFIG_MISC_FIXES
   uint16_t *const int_4x4_uv = &lfm->left_int_4x4_uv;
-#else
-  uint16_t *const int_4x4_uv = &lfm->int_4x4_uv;
-#endif
   int i;
 
   // If filter level is 0 we don't loop filter.
@@ -728,11 +739,11 @@
   } else {
     const int w = num_8x8_blocks_wide_lookup[block_size];
     const int h = num_8x8_blocks_high_lookup[block_size];
-    int index = shift_y;
-    for (i = 0; i < h; i++) {
-      memset(&lfm->lfl_y[index], filter_level, w);
-      index += 8;
-    }
+    const int row = (shift_y >> MAX_MIB_SIZE_LOG2);
+    const int col = shift_y - (row << MAX_MIB_SIZE_LOG2);
+
+    for (i = 0; i < h; i++)
+      memset(&lfm->lfl_y[row + i][col], filter_level, w);
   }
 
   // These set 1 in the current block size for the block size edges.
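
lfl_y is now a 2-D array indexed by [row][col] within the superblock, so the flat shift_y offset is decomposed with MAX_MIB_SIZE_LOG2. A standalone check of that decomposition, assuming MAX_MIB_SIZE_LOG2 == 3:

    #include <stdio.h>

    #define MAX_MIB_SIZE_LOG2 3  /* assumed: 8x8 grid of mi units */

    int main(void) {
      int shift_y;
      for (shift_y = 0; shift_y < 64; shift_y += 17) {
        const int row = shift_y >> MAX_MIB_SIZE_LOG2;
        const int col = shift_y - (row << MAX_MIB_SIZE_LOG2);
        printf("shift_y=%2d -> row=%d col=%d\n", shift_y, row, col);
      }
      return 0;
    }
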
@@ -754,13 +765,8 @@
 
   // If the block has no coefficients and is not intra we skip applying
   // the loop filter on block edges.
-#if CONFIG_MISC_FIXES
   if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi))
     return;
-#else
-  if (mbmi->skip && is_inter_block(mbmi))
-    return;
-#endif
 
   // Here we are adding a mask for the transform size. The transform
   // size mask is set to be correct for a 64x64 prediction block size. We
@@ -792,10 +798,18 @@
 // we only update u and v masks on the first block.
 static void build_y_mask(const loop_filter_info_n *const lfi_n,
                          const MODE_INFO *mi, const int shift_y,
+#if CONFIG_SUPERTX
+                         int supertx_enabled,
+#endif  // CONFIG_SUPERTX
                          LOOP_FILTER_MASK *lfm) {
   const MB_MODE_INFO *mbmi = &mi->mbmi;
-  const BLOCK_SIZE block_size = mbmi->sb_type;
   const TX_SIZE tx_size_y = mbmi->tx_size;
+#if CONFIG_SUPERTX
+  const BLOCK_SIZE block_size =
+      supertx_enabled ? (BLOCK_SIZE)(3 * tx_size_y) : mbmi->sb_type;
+#else
+  const BLOCK_SIZE block_size = mbmi->sb_type;
+#endif
   const int filter_level = get_filter_level(lfi_n, mbmi);
   uint64_t *const left_y = &lfm->left_y[tx_size_y];
   uint64_t *const above_y = &lfm->above_y[tx_size_y];
@@ -807,23 +821,18 @@
   } else {
     const int w = num_8x8_blocks_wide_lookup[block_size];
     const int h = num_8x8_blocks_high_lookup[block_size];
-    int index = shift_y;
-    for (i = 0; i < h; i++) {
-      memset(&lfm->lfl_y[index], filter_level, w);
-      index += 8;
-    }
+    const int row = (shift_y >> MAX_MIB_SIZE_LOG2);
+    const int col = shift_y - (row << MAX_MIB_SIZE_LOG2);
+
+    for (i = 0; i < h; i++)
+      memset(&lfm->lfl_y[row + i][col], filter_level, w);
   }
 
   *above_y |= above_prediction_mask[block_size] << shift_y;
   *left_y |= left_prediction_mask[block_size] << shift_y;
 
-#if CONFIG_MISC_FIXES
   if ((mbmi->skip || mbmi->has_no_coeffs) && is_inter_block(mbmi))
     return;
-#else
-  if (mbmi->skip && is_inter_block(mbmi))
-    return;
-#endif
 
   *above_y |= (size_mask[block_size] &
                above_64x64_txform_mask[tx_size_y]) << shift_y;
@@ -866,10 +875,11 @@
   const int shift_32_uv[] = {0, 2, 8, 10};
   const int shift_16_uv[] = {0, 1, 4, 5};
   int i;
-  const int max_rows = (mi_row + MI_BLOCK_SIZE > cm->mi_rows ?
-                        cm->mi_rows - mi_row : MI_BLOCK_SIZE);
-  const int max_cols = (mi_col + MI_BLOCK_SIZE > cm->mi_cols ?
-                        cm->mi_cols - mi_col : MI_BLOCK_SIZE);
+  const int max_rows = VPXMIN(cm->mi_rows - mi_row, MAX_MIB_SIZE);
+  const int max_cols = VPXMIN(cm->mi_cols - mi_col, MAX_MIB_SIZE);
+#if CONFIG_EXT_PARTITION
+  assert(0 && "Not yet updated");
+#endif  // CONFIG_EXT_PARTITION
 
   vp10_zero(*lfm);
   assert(mip[0] != NULL);
@@ -909,6 +919,10 @@
             break;
           case BLOCK_32X16:
             build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+#if CONFIG_SUPERTX
+            if (supertx_enabled(&mip[0]->mbmi))
+              break;
+#endif
             if (mi_32_row_offset + 2 >= max_rows)
               continue;
             mip2 = mip + mode_info_stride * 2;
@@ -916,12 +930,22 @@
             break;
           case BLOCK_16X32:
             build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+#if CONFIG_SUPERTX
+            if (supertx_enabled(&mip[0]->mbmi))
+              break;
+#endif
             if (mi_32_col_offset + 2 >= max_cols)
               continue;
             mip2 = mip + 2;
             build_masks(lfi_n, mip2[0], shift_y + 2, shift_uv + 1, lfm);
             break;
           default:
+#if CONFIG_SUPERTX
+            if (mip[0]->mbmi.tx_size == TX_32X32) {
+              build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+              break;
+            }
+#endif
             for (idx_16 = 0; idx_16 < 4; mip += offset_16[idx_16], ++idx_16) {
               const int shift_y = shift_32_y[idx_32] + shift_16_y[idx_16];
               const int shift_uv = shift_32_uv[idx_32] + shift_16_uv[idx_16];
@@ -938,23 +962,45 @@
                   build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
                   break;
                 case BLOCK_16X8:
+#if CONFIG_SUPERTX
+                  if (supertx_enabled(&mip[0]->mbmi))
+                    break;
+#endif
                   build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
                   if (mi_16_row_offset + 1 >= max_rows)
                     continue;
                   mip2 = mip + mode_info_stride;
-                  build_y_mask(lfi_n, mip2[0], shift_y+8, lfm);
+                  build_y_mask(lfi_n, mip2[0], shift_y+8,
+#if CONFIG_SUPERTX
+                               0,
+#endif
+                               lfm);
                   break;
                 case BLOCK_8X16:
+#if CONFIG_SUPERTX
+                  if (supertx_enabled(&mip[0]->mbmi))
+                    break;
+#endif
                   build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
                   if (mi_16_col_offset +1 >= max_cols)
                     continue;
                   mip2 = mip + 1;
-                  build_y_mask(lfi_n, mip2[0], shift_y+1, lfm);
+                  build_y_mask(lfi_n, mip2[0], shift_y+1,
+#if CONFIG_SUPERTX
+                               0,
+#endif
+                               lfm);
                   break;
                 default: {
                   const int shift_y = shift_32_y[idx_32] +
                                       shift_16_y[idx_16] +
                                       shift_8_y[0];
+#if CONFIG_SUPERTX
+                  if (mip[0]->mbmi.tx_size == TX_16X16) {
+                    build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
+                    break;
+                  }
+#endif
                   build_masks(lfi_n, mip[0], shift_y, shift_uv, lfm);
                   mip += offset[0];
                   for (idx_8 = 1; idx_8 < 4; mip += offset[idx_8], ++idx_8) {
@@ -969,7 +1015,11 @@
                     if (mi_8_col_offset >= max_cols ||
                         mi_8_row_offset >= max_rows)
                       continue;
-                    build_y_mask(lfi_n, mip[0], shift_y, lfm);
+                    build_y_mask(lfi_n, mip[0], shift_y,
+#if CONFIG_SUPERTX
+                                 supertx_enabled(&mip[0]->mbmi),
+#endif
+                                 lfm);
                   }
                   break;
                 }
@@ -1000,12 +1050,14 @@
   lfm->above_uv[TX_4X4] &= ~above_border_uv;
 
   // We do some special edge handling.
-  if (mi_row + MI_BLOCK_SIZE > cm->mi_rows) {
+  if (mi_row + MAX_MIB_SIZE > cm->mi_rows) {
     const uint64_t rows = cm->mi_rows - mi_row;
 
     // Each pixel inside the border gets a 1,
-    const uint64_t mask_y = (((uint64_t) 1 << (rows << 3)) - 1);
-    const uint16_t mask_uv = (((uint16_t) 1 << (((rows + 1) >> 1) << 2)) - 1);
+    const uint64_t mask_y =
+      (((uint64_t) 1 << (rows << MAX_MIB_SIZE_LOG2)) - 1);
+    const uint16_t mask_uv =
+      (((uint16_t) 1 << (((rows + 1) >> 1) << (MAX_MIB_SIZE_LOG2 - 1))) - 1);
 
     // Remove values completely outside our border.
     for (i = 0; i < TX_32X32; i++) {
@@ -1015,11 +1067,7 @@
       lfm->above_uv[i] &= mask_uv;
     }
     lfm->int_4x4_y &= mask_y;
-#if CONFIG_MISC_FIXES
     lfm->above_int_4x4_uv = lfm->left_int_4x4_uv & mask_uv;
-#else
-    lfm->int_4x4_uv &= mask_uv;
-#endif
 
     // We don't apply a wide loop filter on the last uv block row. If set
     // apply the shorter one instead.
@@ -1033,7 +1081,7 @@
     }
   }
 
-  if (mi_col + MI_BLOCK_SIZE > cm->mi_cols) {
+  if (mi_col + MAX_MIB_SIZE > cm->mi_cols) {
     const uint64_t columns = cm->mi_cols - mi_col;
 
     // Each pixel inside the border gets a 1, the multiply copies the border
@@ -1053,11 +1101,7 @@
       lfm->above_uv[i] &= mask_uv;
     }
     lfm->int_4x4_y &= mask_y;
-#if CONFIG_MISC_FIXES
     lfm->left_int_4x4_uv &= mask_uv_int;
-#else
-    lfm->int_4x4_uv &= mask_uv_int;
-#endif
 
     // We don't apply a wide loop filter on the last uv column. If set
     // apply the shorter one instead.
@@ -1087,11 +1131,7 @@
   assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_8X8]));
   assert(!(lfm->left_uv[TX_16X16] & lfm->left_uv[TX_4X4]));
   assert(!(lfm->left_uv[TX_8X8] & lfm->left_uv[TX_4X4]));
-#if CONFIG_MISC_FIXES
   assert(!(lfm->left_int_4x4_uv & lfm->left_uv[TX_16X16]));
-#else
-  assert(!(lfm->int_4x4_uv & lfm->left_uv[TX_16X16]));
-#endif
   assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_8X8]));
   assert(!(lfm->above_y[TX_16X16] & lfm->above_y[TX_4X4]));
   assert(!(lfm->above_y[TX_8X8] & lfm->above_y[TX_4X4]));
@@ -1099,11 +1139,7 @@
   assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_8X8]));
   assert(!(lfm->above_uv[TX_16X16] & lfm->above_uv[TX_4X4]));
   assert(!(lfm->above_uv[TX_8X8] & lfm->above_uv[TX_4X4]));
-#if CONFIG_MISC_FIXES
   assert(!(lfm->above_int_4x4_uv & lfm->above_uv[TX_16X16]));
-#else
-  assert(!(lfm->int_4x4_uv & lfm->above_uv[TX_16X16]));
-#endif
 }
 
 static void filter_selectively_vert(uint8_t *s, int pitch,
@@ -1179,75 +1215,129 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 void vp10_filter_block_plane_non420(VP10_COMMON *cm,
-                                   struct macroblockd_plane *plane,
-                                   MODE_INFO **mi_8x8,
-                                   int mi_row, int mi_col) {
+                                    struct macroblockd_plane *plane,
+                                    MODE_INFO **mib,
+                                    int mi_row, int mi_col) {
   const int ss_x = plane->subsampling_x;
   const int ss_y = plane->subsampling_y;
   const int row_step = 1 << ss_y;
   const int col_step = 1 << ss_x;
-  const int row_step_stride = cm->mi_stride * row_step;
   struct buf_2d *const dst = &plane->dst;
   uint8_t* const dst0 = dst->buf;
-  unsigned int mask_16x16[MI_BLOCK_SIZE] = {0};
-  unsigned int mask_8x8[MI_BLOCK_SIZE] = {0};
-  unsigned int mask_4x4[MI_BLOCK_SIZE] = {0};
-  unsigned int mask_4x4_int[MI_BLOCK_SIZE] = {0};
-  uint8_t lfl[MI_BLOCK_SIZE * MI_BLOCK_SIZE];
+  unsigned int mask_16x16[MAX_MIB_SIZE] = {0};
+  unsigned int mask_8x8[MAX_MIB_SIZE] = {0};
+  unsigned int mask_4x4[MAX_MIB_SIZE] = {0};
+  unsigned int mask_4x4_int[MAX_MIB_SIZE] = {0};
+  uint8_t lfl[MAX_MIB_SIZE][MAX_MIB_SIZE];
   int r, c;
 
-  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) {
     unsigned int mask_16x16_c = 0;
     unsigned int mask_8x8_c = 0;
     unsigned int mask_4x4_c = 0;
     unsigned int border_mask;
 
     // Determine the vertical edges that need filtering
-    for (c = 0; c < MI_BLOCK_SIZE && mi_col + c < cm->mi_cols; c += col_step) {
-      const MODE_INFO *mi = mi_8x8[c];
-      const BLOCK_SIZE sb_type = mi[0].mbmi.sb_type;
-      const int skip_this = mi[0].mbmi.skip && is_inter_block(&mi[0].mbmi);
+    for (c = 0; c < cm->mib_size && mi_col + c < cm->mi_cols; c += col_step) {
+      const MODE_INFO *mi = mib[c];
+      const MB_MODE_INFO *mbmi = &mi[0].mbmi;
+      const BLOCK_SIZE sb_type = mbmi->sb_type;
+      const int skip_this = mbmi->skip && is_inter_block(mbmi);
+      const int blk_row = r & (num_8x8_blocks_high_lookup[sb_type] - 1);
+      const int blk_col = c & (num_8x8_blocks_wide_lookup[sb_type] - 1);
+
       // left edge of current unit is block/partition edge -> no skip
       const int block_edge_left = (num_4x4_blocks_wide_lookup[sb_type] > 1) ?
-          !(c & (num_8x8_blocks_wide_lookup[sb_type] - 1)) : 1;
+          !blk_col : 1;
       const int skip_this_c = skip_this && !block_edge_left;
       // top edge of current unit is block/partition edge -> no skip
       const int block_edge_above = (num_4x4_blocks_high_lookup[sb_type] > 1) ?
-          !(r & (num_8x8_blocks_high_lookup[sb_type] - 1)) : 1;
+          !blk_row : 1;
       const int skip_this_r = skip_this && !block_edge_above;
+
+#if CONFIG_VAR_TX
+      TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
+          ? get_uv_tx_size(mbmi, plane) : mbmi->tx_size;
+#else
       const TX_SIZE tx_size = (plane->plane_type == PLANE_TYPE_UV)
-                            ? get_uv_tx_size(&mi[0].mbmi, plane)
-                            : mi[0].mbmi.tx_size;
+                            ? get_uv_tx_size(mbmi, plane)
+                            : mbmi->tx_size;
+#endif
+
       const int skip_border_4x4_c = ss_x && mi_col + c == cm->mi_cols - 1;
       const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
 
+      TX_SIZE tx_size_c = tx_size;
+      TX_SIZE tx_size_r = tx_size;
+
+      int tx_size_mask = 0;
       // Filter level can vary per MI
-      if (!(lfl[(r << 3) + (c >> ss_x)] =
-            get_filter_level(&cm->lf_info, &mi[0].mbmi)))
+      if (!(lfl[r][c >> ss_x] = get_filter_level(&cm->lf_info, mbmi)))
         continue;
 
+      if (tx_size == TX_32X32)
+        tx_size_mask = 3;
+      else if (tx_size == TX_16X16)
+        tx_size_mask = 1;
+      else
+        tx_size_mask = 0;
+
+#if CONFIG_VAR_TX
+      if (is_inter_block(mbmi) && !mbmi->skip)
+        tx_size = (plane->plane_type == PLANE_TYPE_UV) ?
+            get_uv_tx_size_impl(mbmi->inter_tx_size[blk_row][blk_col],
+                                sb_type, ss_x, ss_y) :
+            mbmi->inter_tx_size[blk_row][blk_col];
+
+      tx_size_r = VPXMIN(tx_size,
+                         cm->above_txfm_context[mi_col + c]);
+      tx_size_c = VPXMIN(tx_size,
+                         cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK]);
+
+      cm->above_txfm_context[mi_col + c] = tx_size;
+      cm->left_txfm_context[(mi_row + r) & MAX_MIB_MASK] = tx_size;
+#endif
+
       // Build masks based on the transform size of each block
-      if (tx_size == TX_32X32) {
-        if (!skip_this_c && ((c >> ss_x) & 3) == 0) {
+      // handle vertical mask
+      if (tx_size_c == TX_32X32) {
+        if (!skip_this_c && ((c >> ss_x) & tx_size_mask) == 0) {
           if (!skip_border_4x4_c)
             mask_16x16_c |= 1 << (c >> ss_x);
           else
             mask_8x8_c |= 1 << (c >> ss_x);
         }
-        if (!skip_this_r && ((r >> ss_y) & 3) == 0) {
+      } else if (tx_size_c == TX_16X16) {
+        if (!skip_this_c && ((c >> ss_x) & tx_size_mask) == 0) {
+          if (!skip_border_4x4_c)
+            mask_16x16_c |= 1 << (c >> ss_x);
+          else
+            mask_8x8_c |= 1 << (c >> ss_x);
+        }
+      } else {
+        // force 8x8 filtering on 32x32 boundaries
+        if (!skip_this_c && ((c >> ss_x) & tx_size_mask) == 0) {
+          if (tx_size_c == TX_8X8 || ((c >> ss_x) & 3) == 0)
+            mask_8x8_c |= 1 << (c >> ss_x);
+          else
+            mask_4x4_c |= 1 << (c >> ss_x);
+        }
+
+        if (!skip_this && tx_size_c < TX_8X8 && !skip_border_4x4_c &&
+            ((c >> ss_x) & tx_size_mask) == 0)
+          mask_4x4_int[r] |= 1 << (c >> ss_x);
+      }
+
+      // set horizontal mask
+      if (tx_size_r == TX_32X32) {
+        if (!skip_this_r && ((r >> ss_y) & tx_size_mask) == 0) {
           if (!skip_border_4x4_r)
             mask_16x16[r] |= 1 << (c >> ss_x);
           else
             mask_8x8[r] |= 1 << (c >> ss_x);
         }
-      } else if (tx_size == TX_16X16) {
-        if (!skip_this_c && ((c >> ss_x) & 1) == 0) {
-          if (!skip_border_4x4_c)
-            mask_16x16_c |= 1 << (c >> ss_x);
-          else
-            mask_8x8_c |= 1 << (c >> ss_x);
-        }
-        if (!skip_this_r && ((r >> ss_y) & 1) == 0) {
+      } else if (tx_size_r == TX_16X16) {
+        if (!skip_this_r && ((r >> ss_y) & tx_size_mask) == 0) {
           if (!skip_border_4x4_r)
             mask_16x16[r] |= 1 << (c >> ss_x);
           else
@@ -1255,21 +1345,15 @@
         }
       } else {
         // force 8x8 filtering on 32x32 boundaries
-        if (!skip_this_c) {
-          if (tx_size == TX_8X8 || ((c >> ss_x) & 3) == 0)
-            mask_8x8_c |= 1 << (c >> ss_x);
-          else
-            mask_4x4_c |= 1 << (c >> ss_x);
-        }
-
-        if (!skip_this_r) {
-          if (tx_size == TX_8X8 || ((r >> ss_y) & 3) == 0)
+        if (!skip_this_r && ((r >> ss_y) & tx_size_mask) == 0) {
+          if (tx_size_r == TX_8X8 || ((r >> ss_y) & 3) == 0)
             mask_8x8[r] |= 1 << (c >> ss_x);
           else
             mask_4x4[r] |= 1 << (c >> ss_x);
         }
 
-        if (!skip_this && tx_size < TX_8X8 && !skip_border_4x4_c)
+        if (!skip_this && tx_size_r < TX_8X8 && !skip_border_4x4_c &&
+            ((r >> ss_y) & tx_size_mask) == 0)
           mask_4x4_int[r] |= 1 << (c >> ss_x);
       }
     }
@@ -1278,21 +1362,22 @@
     border_mask = ~(mi_col == 0);
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cm->use_highbitdepth) {
-      highbd_filter_selectively_vert(CONVERT_TO_SHORTPTR(dst->buf),
-                                     dst->stride,
-                                     mask_16x16_c & border_mask,
-                                     mask_8x8_c & border_mask,
-                                     mask_4x4_c & border_mask,
-                                     mask_4x4_int[r],
-                                     &cm->lf_info, &lfl[r << 3],
-                                     (int)cm->bit_depth);
+      highbd_filter_selectively_vert(
+          CONVERT_TO_SHORTPTR(dst->buf),
+          dst->stride,
+          mask_16x16_c & border_mask,
+          mask_8x8_c & border_mask,
+          mask_4x4_c & border_mask,
+          mask_4x4_int[r],
+          &cm->lf_info, &lfl[r][0],
+          (int)cm->bit_depth);
     } else {
       filter_selectively_vert(dst->buf, dst->stride,
                               mask_16x16_c & border_mask,
                               mask_8x8_c & border_mask,
                               mask_4x4_c & border_mask,
                               mask_4x4_int[r],
-                              &cm->lf_info, &lfl[r << 3]);
+                              &cm->lf_info, &lfl[r][0]);
     }
 #else
     filter_selectively_vert(dst->buf, dst->stride,
@@ -1300,15 +1385,15 @@
                             mask_8x8_c & border_mask,
                             mask_4x4_c & border_mask,
                             mask_4x4_int[r],
-                            &cm->lf_info, &lfl[r << 3]);
+                            &cm->lf_info, &lfl[r][0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    dst->buf += 8 * dst->stride;
-    mi_8x8 += row_step_stride;
+    dst->buf += MI_SIZE * dst->stride;
+    mib += row_step * cm->mi_stride;
   }
 
   // Now do horizontal pass
   dst->buf = dst0;
-  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += row_step) {
+  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += row_step) {
     const int skip_border_4x4_r = ss_y && mi_row + r == cm->mi_rows - 1;
     const unsigned int mask_4x4_int_r = skip_border_4x4_r ? 0 : mask_4x4_int[r];
 
@@ -1327,21 +1412,22 @@
     }
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cm->use_highbitdepth) {
-      highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
-                                      dst->stride,
-                                      mask_16x16_r,
-                                      mask_8x8_r,
-                                      mask_4x4_r,
-                                      mask_4x4_int_r,
-                                      &cm->lf_info, &lfl[r << 3],
-                                      (int)cm->bit_depth);
+      highbd_filter_selectively_horiz(
+          CONVERT_TO_SHORTPTR(dst->buf),
+          dst->stride,
+          mask_16x16_r,
+          mask_8x8_r,
+          mask_4x4_r,
+          mask_4x4_int_r,
+          &cm->lf_info, &lfl[r][0],
+          (int)cm->bit_depth);
     } else {
       filter_selectively_horiz(dst->buf, dst->stride,
                                mask_16x16_r,
                                mask_8x8_r,
                                mask_4x4_r,
                                mask_4x4_int_r,
-                               &cm->lf_info, &lfl[r << 3]);
+                               &cm->lf_info, &lfl[r][0]);
     }
 #else
     filter_selectively_horiz(dst->buf, dst->stride,
@@ -1349,9 +1435,9 @@
                              mask_8x8_r,
                              mask_4x4_r,
                              mask_4x4_int_r,
-                             &cm->lf_info, &lfl[r << 3]);
+                             &cm->lf_info, &lfl[r][0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    dst->buf += 8 * dst->stride;
+    dst->buf += MI_SIZE * dst->stride;
   }
 }
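
On the CONFIG_VAR_TX logic in the function above: with variable transform sizes, a shared edge is filtered at the smaller of the two adjoining transform sizes, which is why tx_size_r and tx_size_c take a VPXMIN against the stored above/left context. A standalone illustration (enum values listed smallest first, mirroring the codec's TX_SIZE order):

    #include <assert.h>

    #define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))

    enum { TX_4X4, TX_8X8, TX_16X16, TX_32X32 };

    int main(void) {
      const int above_ctx = TX_8X8;  /* neighbour above used an 8x8 transform */
      const int this_tx = TX_32X32;  /* current block uses a 32x32 transform  */
      /* The shared horizontal edge is filtered at the smaller size: */
      assert(VPXMIN(this_tx, above_ctx) == TX_8X8);
      return 0;
    }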
 
@@ -1370,7 +1456,7 @@
   assert(plane->subsampling_x == 0 && plane->subsampling_y == 0);
 
   // Vertical pass: do 2 rows at one time
-  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) {
     unsigned int mask_16x16_l = mask_16x16 & 0xffff;
     unsigned int mask_8x8_l = mask_8x8 & 0xffff;
     unsigned int mask_4x4_l = mask_4x4 & 0xffff;
@@ -1382,22 +1468,24 @@
       highbd_filter_selectively_vert_row2(
           plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
           mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfm->lfl_y[r << 3], (int)cm->bit_depth);
+          &lfm->lfl_y[r][0], (int)cm->bit_depth);
     } else {
       filter_selectively_vert_row2(
           plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
-          mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
+          mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+          &lfm->lfl_y[r][0]);
     }
 #else
     filter_selectively_vert_row2(
         plane->subsampling_x, dst->buf, dst->stride, mask_16x16_l, mask_8x8_l,
-        mask_4x4_l, mask_4x4_int_l, &cm->lf_info, &lfm->lfl_y[r << 3]);
+        mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
+        &lfm->lfl_y[r][0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    dst->buf += 16 * dst->stride;
-    mask_16x16 >>= 16;
-    mask_8x8 >>= 16;
-    mask_4x4 >>= 16;
-    mask_4x4_int >>= 16;
+    dst->buf += 2 * MI_SIZE * dst->stride;
+    mask_16x16 >>= 2 * MI_SIZE;
+    mask_8x8 >>= 2 * MI_SIZE;
+    mask_4x4 >>= 2 * MI_SIZE;
+    mask_4x4_int >>= 2 * MI_SIZE;
   }
 
   // Horizontal pass
@@ -1407,7 +1495,7 @@
   mask_4x4 = lfm->above_y[TX_4X4];
   mask_4x4_int = lfm->int_4x4_y;
 
-  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r++) {
+  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r++) {
     unsigned int mask_16x16_r;
     unsigned int mask_8x8_r;
     unsigned int mask_4x4_r;
@@ -1426,24 +1514,25 @@
     if (cm->use_highbitdepth) {
       highbd_filter_selectively_horiz(
           CONVERT_TO_SHORTPTR(dst->buf), dst->stride, mask_16x16_r, mask_8x8_r,
-          mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info, &lfm->lfl_y[r << 3],
+          mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
+          &lfm->lfl_y[r][0],
           (int)cm->bit_depth);
     } else {
       filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
                                mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
-                               &lfm->lfl_y[r << 3]);
+                               &lfm->lfl_y[r][0]);
     }
 #else
     filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
                              mask_4x4_r, mask_4x4_int & 0xff, &cm->lf_info,
-                             &lfm->lfl_y[r << 3]);
+                             &lfm->lfl_y[r][0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    dst->buf += 8 * dst->stride;
-    mask_16x16 >>= 8;
-    mask_8x8 >>= 8;
-    mask_4x4 >>= 8;
-    mask_4x4_int >>= 8;
+    dst->buf += MI_SIZE * dst->stride;
+    mask_16x16 >>= MI_SIZE;
+    mask_8x8 >>= MI_SIZE;
+    mask_4x4 >>= MI_SIZE;
+    mask_4x4_int >>= MI_SIZE;
   }
 }
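
A note on the MI_SIZE-based shifts: the 64-bit y masks hold one bit per 8x8 block, so a mask row is MAX_MIB_SIZE == 8 bits, and MI_SIZE (pixels per MI) is also 8 when CONFIG_EXT_PARTITION is off, so shifting by MI_SIZE per MI row is numerically identical to the old literal shifts. A standalone check for the two-row vertical pass (the horizontal pass is the single-row analogue):

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
      const int MI_SIZE = 8;         /* pixels per MI, == bits per mask row */
      const uint64_t mask = 0x0101;  /* one bit set in mask rows 0 and 1    */
      /* The vertical pass consumes two MI rows per iteration, so shifting
         by 2 * MI_SIZE (the old literal 16) discards exactly those rows. */
      assert((mask >> (2 * MI_SIZE)) == 0);
      return 0;
    }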
 
@@ -1458,21 +1547,16 @@
   uint16_t mask_16x16 = lfm->left_uv[TX_16X16];
   uint16_t mask_8x8 = lfm->left_uv[TX_8X8];
   uint16_t mask_4x4 = lfm->left_uv[TX_4X4];
-#if CONFIG_MISC_FIXES
   uint16_t mask_4x4_int = lfm->left_int_4x4_uv;
-#else
-  uint16_t mask_4x4_int = lfm->int_4x4_uv;
-#endif
 
   assert(plane->subsampling_x == 1 && plane->subsampling_y == 1);
+  assert(plane->plane_type == PLANE_TYPE_UV);
 
   // Vertical pass: do 2 rows at one time
-  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 4) {
-    if (plane->plane_type == 1) {
-      for (c = 0; c < (MI_BLOCK_SIZE >> 1); c++) {
-        lfm->lfl_uv[(r << 1) + c] = lfm->lfl_y[(r << 3) + (c << 1)];
-        lfm->lfl_uv[((r + 2) << 1) + c] = lfm->lfl_y[((r + 2) << 3) + (c << 1)];
-      }
+  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 4) {
+    for (c = 0; c < (cm->mib_size >> 1); c++) {
+      lfm->lfl_uv[r >> 1][c] = lfm->lfl_y[r][c << 1];
+      lfm->lfl_uv[(r + 2) >> 1][c] = lfm->lfl_y[r + 2][c << 1];
     }
 
     {
@@ -1487,25 +1571,25 @@
         highbd_filter_selectively_vert_row2(
             plane->subsampling_x, CONVERT_TO_SHORTPTR(dst->buf), dst->stride,
             mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+            &lfm->lfl_uv[r >> 1][0], (int)cm->bit_depth);
       } else {
         filter_selectively_vert_row2(
             plane->subsampling_x, dst->buf, dst->stride,
             mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-            &lfm->lfl_uv[r << 1]);
+            &lfm->lfl_uv[r >> 1][0]);
       }
 #else
       filter_selectively_vert_row2(
           plane->subsampling_x, dst->buf, dst->stride,
           mask_16x16_l, mask_8x8_l, mask_4x4_l, mask_4x4_int_l, &cm->lf_info,
-          &lfm->lfl_uv[r << 1]);
+          &lfm->lfl_uv[r >> 1][0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-      dst->buf += 16 * dst->stride;
-      mask_16x16 >>= 8;
-      mask_8x8 >>= 8;
-      mask_4x4 >>= 8;
-      mask_4x4_int >>= 8;
+      dst->buf += 2 * MI_SIZE * dst->stride;
+      mask_16x16 >>= MI_SIZE;
+      mask_8x8 >>= MI_SIZE;
+      mask_4x4 >>= MI_SIZE;
+      mask_4x4_int >>= MI_SIZE;
     }
   }
 
@@ -1514,13 +1598,9 @@
   mask_16x16 = lfm->above_uv[TX_16X16];
   mask_8x8 = lfm->above_uv[TX_8X8];
   mask_4x4 = lfm->above_uv[TX_4X4];
-#if CONFIG_MISC_FIXES
   mask_4x4_int = lfm->above_int_4x4_uv;
-#else
-  mask_4x4_int = lfm->int_4x4_uv;
-#endif
 
-  for (r = 0; r < MI_BLOCK_SIZE && mi_row + r < cm->mi_rows; r += 2) {
+  for (r = 0; r < cm->mib_size && mi_row + r < cm->mi_rows; r += 2) {
     const int skip_border_4x4_r = mi_row + r == cm->mi_rows - 1;
     const unsigned int mask_4x4_int_r =
         skip_border_4x4_r ? 0 : (mask_4x4_int & 0xf);
@@ -1543,34 +1623,58 @@
       highbd_filter_selectively_horiz(CONVERT_TO_SHORTPTR(dst->buf),
                                       dst->stride, mask_16x16_r, mask_8x8_r,
                                       mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                                      &lfm->lfl_uv[r << 1], (int)cm->bit_depth);
+                                      &lfm->lfl_uv[r >> 1][0],
+                                      (int)cm->bit_depth);
     } else {
       filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
                                mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                               &lfm->lfl_uv[r << 1]);
+                               &lfm->lfl_uv[r >> 1][0]);
     }
 #else
     filter_selectively_horiz(dst->buf, dst->stride, mask_16x16_r, mask_8x8_r,
                              mask_4x4_r, mask_4x4_int_r, &cm->lf_info,
-                             &lfm->lfl_uv[r << 1]);
+                             &lfm->lfl_uv[r >> 1][0]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-    dst->buf += 8 * dst->stride;
-    mask_16x16 >>= 4;
-    mask_8x8 >>= 4;
-    mask_4x4 >>= 4;
-    mask_4x4_int >>= 4;
+    dst->buf += MI_SIZE * dst->stride;
+    mask_16x16 >>= MI_SIZE / 2;
+    mask_8x8 >>= MI_SIZE / 2;
+    mask_4x4 >>= MI_SIZE / 2;
+    mask_4x4_int >>= MI_SIZE / 2;
   }
 }
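
The chroma accounting behind the MI_SIZE / 2 shifts: each 16-bit uv mask packs MAX_MIB_SIZE / 2 == 4 chroma columns per row, the vertical pass covers 4 luma rows == 2 chroma rows per iteration, and the horizontal pass covers 2 luma rows == 1 chroma row. A standalone check:

    #include <assert.h>

    int main(void) {
      const int MI_SIZE = 8;
      const int uv_cols_per_row = 4;               /* MAX_MIB_SIZE / 2      */
      assert(2 * uv_cols_per_row == MI_SIZE);      /* vertical pass shift   */
      assert(1 * uv_cols_per_row == MI_SIZE / 2);  /* horizontal pass shift */
      return 0;
    }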
 
 void vp10_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
-                          VP10_COMMON *cm,
-                          struct macroblockd_plane planes[MAX_MB_PLANE],
-                          int start, int stop, int y_only) {
+                           VP10_COMMON *cm,
+                           struct macroblockd_plane planes[MAX_MB_PLANE],
+                           int start, int stop, int y_only) {
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
+  int mi_row, mi_col;
+
+# if CONFIG_VAR_TX
+  memset(cm->above_txfm_context, TX_SIZES, cm->mi_cols);
+# endif  // CONFIG_VAR_TX
+  for (mi_row = start; mi_row < stop; mi_row += cm->mib_size) {
+    MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+# if CONFIG_VAR_TX
+    memset(cm->left_txfm_context, TX_SIZES, MAX_MIB_SIZE);
+# endif  // CONFIG_VAR_TX
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += cm->mib_size) {
+      int plane;
+
+      vp10_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
+
+      for (plane = 0; plane < num_planes; ++plane)
+        vp10_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+                                       mi_row, mi_col);
+    }
+  }
+#else
+  const int num_planes = y_only ? 1 : MAX_MB_PLANE;
+  int mi_row, mi_col;
   enum lf_path path;
   LOOP_FILTER_MASK lfm;
-  int mi_row, mi_col;
 
   if (y_only)
     path = LF_PATH_444;
@@ -1581,17 +1685,15 @@
   else
     path = LF_PATH_SLOW;
 
-  for (mi_row = start; mi_row < stop; mi_row += MI_BLOCK_SIZE) {
+  for (mi_row = start; mi_row < stop; mi_row += MAX_MIB_SIZE) {
     MODE_INFO **mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
-
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
       int plane;
 
       vp10_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
       // TODO(JBB): Make setup_mask work for non 420.
-      vp10_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
-                     &lfm);
+      vp10_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride, &lfm);
 
       vp10_filter_block_plane_ss00(cm, &planes[0], mi_row, &lfm);
       for (plane = 1; plane < num_planes; ++plane) {
@@ -1610,6 +1712,7 @@
       }
     }
   }
+#endif  // CONFIG_VAR_TX || CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
 }
 
 void vp10_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
diff --git a/vp10/common/loopfilter.h b/vp10/common/loopfilter.h
index 8db705a..fc57d09 100644
--- a/vp10/common/loopfilter.h
+++ b/vp10/common/loopfilter.h
@@ -15,6 +15,7 @@
 #include "./vpx_config.h"
 
 #include "vp10/common/blockd.h"
+#include "vp10/common/restoration.h"
 #include "vp10/common/seg_common.h"
 
 #ifdef __cplusplus
@@ -43,7 +44,8 @@
   uint8_t mode_ref_delta_enabled;
   uint8_t mode_ref_delta_update;
 
-  // 0 = Intra, Last, GF, ARF
+  // 0 = Intra, Last, Last2+Last3(CONFIG_EXT_REFS),
+  // GF, BRF(CONFIG_EXT_REFS), ARF
   signed char ref_deltas[MAX_REF_FRAMES];
   signed char last_ref_deltas[MAX_REF_FRAMES];
 
@@ -80,14 +82,10 @@
   uint64_t int_4x4_y;
   uint16_t left_uv[TX_SIZES];
   uint16_t above_uv[TX_SIZES];
-#if CONFIG_MISC_FIXES
   uint16_t left_int_4x4_uv;
   uint16_t above_int_4x4_uv;
-#else
-  uint16_t int_4x4_uv;
-#endif
-  uint8_t lfl_y[64];
-  uint8_t lfl_uv[16];
+  uint8_t lfl_y[MAX_MIB_SIZE][MAX_MIB_SIZE];
+  uint8_t lfl_uv[MAX_MIB_SIZE / 2][MAX_MIB_SIZE / 2];
 } LOOP_FILTER_MASK;
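
lfl_y moves from a flat 64-entry array, indexed as (r << 3) + c, to a [MAX_MIB_SIZE][MAX_MIB_SIZE] layout. With MAX_MIB_SIZE == 8 (no CONFIG_EXT_PARTITION assumed) both address the same bytes; a standalone sanity check:

    #include <assert.h>
    #include <stdint.h>

    #define MAX_MIB_SIZE 8  /* assumed: CONFIG_EXT_PARTITION off */

    int main(void) {
      uint8_t lfl_y[MAX_MIB_SIZE][MAX_MIB_SIZE];
      const int r = 5, c = 2;
      /* The 2-D element sits at the same offset the old flat index used. */
      assert(&lfl_y[r][c] == &lfl_y[0][0] + ((r << 3) + c));
      return 0;
    }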
 
 /* assorted loopfilter functions which get used elsewhere */
@@ -125,16 +123,16 @@
 void vp10_loop_filter_frame_init(struct VP10Common *cm, int default_filt_lvl);
 
 void vp10_loop_filter_frame(YV12_BUFFER_CONFIG *frame,
-                           struct VP10Common *cm,
-                           struct macroblockd *mbd,
-                           int filter_level,
-                           int y_only, int partial_frame);
+                            struct VP10Common *cm,
+                            struct macroblockd *mbd,
+                            int filter_level,
+                            int y_only, int partial_frame);
 
 // Apply the loop filter to [start, stop) macro block rows in frame_buffer.
 void vp10_loop_filter_rows(YV12_BUFFER_CONFIG *frame_buffer,
-                          struct VP10Common *cm,
-                          struct macroblockd_plane planes[MAX_MB_PLANE],
-                          int start, int stop, int y_only);
+                           struct VP10Common *cm,
+                           struct macroblockd_plane planes[MAX_MB_PLANE],
+                           int start, int stop, int y_only);
 
 typedef struct LoopFilterWorkerData {
   YV12_BUFFER_CONFIG *frame_buffer;
diff --git a/vp10/common/mfqe.c b/vp10/common/mfqe.c
index c715ef7..52756bd 100644
--- a/vp10/common/mfqe.c
+++ b/vp10/common/mfqe.c
@@ -355,9 +355,15 @@
   const YV12_BUFFER_CONFIG *show = cm->frame_to_show;
   // Last decoded frame and will store the MFQE result.
   YV12_BUFFER_CONFIG *dest = &cm->post_proc_buffer;
+
+#if CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+  // TODO(any): Fix for ext partition types and 128 superblocks
+  assert(0);
+#endif  // CONFIG_EXT_PARTITION || CONFIG_EXT_PARTITION_TYPES
+
   // Loop through each super block.
-  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MI_BLOCK_SIZE) {
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
+  for (mi_row = 0; mi_row < cm->mi_rows; mi_row += MAX_MIB_SIZE) {
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MAX_MIB_SIZE) {
       MODE_INFO *mi;
       MODE_INFO *mi_local = cm->mi + (mi_row * cm->mi_stride + mi_col);
       // Motion Info in last frame.
diff --git a/vp10/common/mv.h b/vp10/common/mv.h
index b4971a5..8b9348b 100644
--- a/vp10/common/mv.h
+++ b/vp10/common/mv.h
@@ -11,9 +11,8 @@
 #ifndef VP10_COMMON_MV_H_
 #define VP10_COMMON_MV_H_
 
-#include "vpx/vpx_integer.h"
-
 #include "vp10/common/common.h"
+#include "vpx_dsp/vpx_filter.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -34,6 +33,15 @@
   int32_t col;
 } MV32;
 
+#if CONFIG_REF_MV
+typedef struct candidate_mv {
+  int_mv this_mv;
+  int_mv comp_mv;
+  int_mv pred_mv;
+  int weight;
+} CANDIDATE_MV;
+#endif
+
 static INLINE int is_zero_mv(const MV *mv) {
   return *((const uint32_t *)mv) == 0;
 }
@@ -48,6 +56,9 @@
   mv->row = clamp(mv->row, min_row, max_row);
 }
 
+static INLINE int mv_has_subpel(const MV *mv) {
+  return (mv->row & SUBPEL_MASK) || (mv->col & SUBPEL_MASK);
+}
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/common/mvref_common.c b/vp10/common/mvref_common.c
index 1ef80c2..eed1508 100644
--- a/vp10/common/mvref_common.c
+++ b/vp10/common/mvref_common.c
@@ -11,6 +11,536 @@
 
 #include "vp10/common/mvref_common.h"
 
+#if CONFIG_REF_MV
+
+static uint8_t add_ref_mv_candidate(const MODE_INFO *const candidate_mi,
+                                    const MB_MODE_INFO *const candidate,
+                                    const MV_REFERENCE_FRAME rf[2],
+                                    uint8_t *refmv_count,
+                                    CANDIDATE_MV *ref_mv_stack,
+                                    const int use_hp,
+                                    int len, int block, int col) {
+  const int weight = len;
+  int index = 0, ref;
+  int newmv_count = 0;
+
+  assert(2 * weight < REF_CAT_LEVEL);
+
+  if (rf[1] == NONE) {
+    // single reference frame
+    for (ref = 0; ref < 2; ++ref) {
+      if (candidate->ref_frame[ref] == rf[0]) {
+        int_mv this_refmv =
+            get_sub_block_mv(candidate_mi, ref, col, block);
+        lower_mv_precision(&this_refmv.as_mv, use_hp);
+
+        for (index = 0; index < *refmv_count; ++index)
+          if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int)
+            break;
+
+        if (index < *refmv_count)
+          ref_mv_stack[index].weight += 2 * weight;
+
+        // Add a new item to the list.
+        if (index == *refmv_count) {
+          ref_mv_stack[index].this_mv = this_refmv;
+          ref_mv_stack[index].pred_mv =
+              get_sub_block_pred_mv(candidate_mi, ref, col, block);
+          ref_mv_stack[index].weight = 2 * weight;
+          ++(*refmv_count);
+
+#if CONFIG_EXT_INTER
+          if (candidate->mode == NEWMV || candidate->mode == NEWFROMNEARMV)
+#else
+          if (candidate->mode == NEWMV)
+#endif  // CONFIG_EXT_INTER
+            ++newmv_count;
+        }
+
+        if (candidate_mi->mbmi.sb_type < BLOCK_8X8 && block >= 0) {
+          int alt_block = 3 - block;
+          this_refmv =
+              get_sub_block_mv(candidate_mi, ref, col, alt_block);
+          lower_mv_precision(&this_refmv.as_mv, use_hp);
+
+          for (index = 0; index < *refmv_count; ++index)
+            if (ref_mv_stack[index].this_mv.as_int == this_refmv.as_int)
+              break;
+
+          if (index < *refmv_count)
+            ref_mv_stack[index].weight += weight;
+
+          // Add a new item to the list.
+          if (index == *refmv_count) {
+            ref_mv_stack[index].this_mv = this_refmv;
+            ref_mv_stack[index].pred_mv =
+                get_sub_block_pred_mv(candidate_mi, ref, col, alt_block);
+            ref_mv_stack[index].weight = weight;
+            ++(*refmv_count);
+
+#if CONFIG_EXT_INTER
+            if (candidate->mode == NEWMV || candidate->mode == NEWFROMNEARMV)
+#else
+            if (candidate->mode == NEWMV)
+#endif  // CONFIG_EXT_INTER
+              ++newmv_count;
+          }
+        }
+      }
+    }
+  } else {
+    // compound reference frame
+    if (candidate->ref_frame[0] == rf[0] &&
+        candidate->ref_frame[1] == rf[1]) {
+      int_mv this_refmv[2];
+
+      for (ref = 0; ref < 2; ++ref) {
+        this_refmv[ref] = get_sub_block_mv(candidate_mi, ref, col, block);
+        lower_mv_precision(&this_refmv[ref].as_mv, use_hp);
+      }
+
+      for (index = 0; index < *refmv_count; ++index)
+        if ((ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int) &&
+            (ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int))
+          break;
+
+      if (index < *refmv_count)
+        ref_mv_stack[index].weight += 2 * weight;
+
+      // Add a new item to the list.
+      if (index == *refmv_count) {
+        ref_mv_stack[index].this_mv = this_refmv[0];
+        ref_mv_stack[index].comp_mv = this_refmv[1];
+        ref_mv_stack[index].weight = 2 * weight;
+        ++(*refmv_count);
+
+#if CONFIG_EXT_INTER
+        if (candidate->mode == NEW_NEWMV)
+#else
+        if (candidate->mode == NEWMV)
+#endif  // CONFIG_EXT_INTER
+          ++newmv_count;
+      }
+
+      if (candidate_mi->mbmi.sb_type < BLOCK_8X8 && block >= 0) {
+        int alt_block = 3 - block;
+        this_refmv[0] = get_sub_block_mv(candidate_mi, 0, col, alt_block);
+        this_refmv[1] = get_sub_block_mv(candidate_mi, 1, col, alt_block);
+
+        for (ref = 0; ref < 2; ++ref)
+          lower_mv_precision(&this_refmv[ref].as_mv, use_hp);
+
+        for (index = 0; index < *refmv_count; ++index)
+          if (ref_mv_stack[index].this_mv.as_int == this_refmv[0].as_int &&
+              ref_mv_stack[index].comp_mv.as_int == this_refmv[1].as_int)
+            break;
+
+        if (index < *refmv_count)
+          ref_mv_stack[index].weight += weight;
+
+        // Add a new item to the list.
+        if (index == *refmv_count) {
+          ref_mv_stack[index].this_mv = this_refmv[0];
+          ref_mv_stack[index].comp_mv = this_refmv[1];
+          ref_mv_stack[index].weight = weight;
+          ++(*refmv_count);
+
+#if CONFIG_EXT_INTER
+          if (candidate->mode == NEW_NEWMV)
+#else
+          if (candidate->mode == NEWMV)
+#endif  // CONFIG_EXT_INTER
+            ++newmv_count;
+        }
+      }
+    }
+  }
+  return newmv_count;
+}
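
add_ref_mv_candidate() applies a dedupe-or-append rule at every call site: a motion vector already on the stack only accumulates weight, a new one is appended. A stripped-down standalone sketch of that pattern (the types and values are hypothetical):

    #include <assert.h>

    typedef struct { int mv; int weight; } cand_t;

    static void push(cand_t *stack, int *count, int mv, int weight) {
      int i;
      for (i = 0; i < *count; ++i)
        if (stack[i].mv == mv) { stack[i].weight += weight; return; }
      stack[*count].mv = mv;      /* not seen before: append */
      stack[*count].weight = weight;
      ++(*count);
    }

    int main(void) {
      cand_t stack[4];
      int count = 0;
      push(stack, &count, 100, 4);  /* new candidate          */
      push(stack, &count, 100, 4);  /* duplicate: weight += 4 */
      push(stack, &count, 200, 2);  /* new candidate          */
      assert(count == 2 && stack[0].weight == 8 && stack[1].weight == 2);
      return 0;
    }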
+
+static uint8_t scan_row_mbmi(const VP10_COMMON *cm, const MACROBLOCKD *xd,
+                             const int mi_row, const int mi_col, int block,
+                             const MV_REFERENCE_FRAME rf[2],
+                             int row_offset,
+                             CANDIDATE_MV *ref_mv_stack,
+                             uint8_t *refmv_count) {
+  const TileInfo *const tile = &xd->tile;
+  int i;
+  uint8_t newmv_count = 0;
+
+  for (i = 0; i < xd->n8_w && *refmv_count < MAX_REF_MV_STACK_SIZE;) {
+    POSITION mi_pos;
+    mi_pos.row = row_offset;
+    mi_pos.col = i;
+
+    if (is_inside(tile, mi_col, mi_row, &mi_pos)) {
+      const MODE_INFO *const candidate_mi =
+          xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
+      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+      const int len = VPXMIN(xd->n8_w,
+                             num_8x8_blocks_wide_lookup[candidate->sb_type]);
+
+      newmv_count += add_ref_mv_candidate(candidate_mi, candidate, rf,
+                                          refmv_count, ref_mv_stack,
+                                          cm->allow_high_precision_mv,
+                                          len, block, mi_pos.col);
+      i += len;
+    } else {
+      ++i;
+    }
+  }
+
+  return newmv_count;
+}
+
+static uint8_t scan_col_mbmi(const VP10_COMMON *cm, const MACROBLOCKD *xd,
+                             const int mi_row, const int mi_col, int block,
+                             const MV_REFERENCE_FRAME rf[2],
+                             int col_offset,
+                             CANDIDATE_MV *ref_mv_stack,
+                             uint8_t *refmv_count) {
+  const TileInfo *const tile = &xd->tile;
+  int i;
+  uint8_t newmv_count = 0;
+
+  for (i = 0; i < xd->n8_h && *refmv_count < MAX_REF_MV_STACK_SIZE;) {
+    POSITION mi_pos;
+    mi_pos.row = i;
+    mi_pos.col = col_offset;
+
+    if (is_inside(tile, mi_col, mi_row, &mi_pos)) {
+      const MODE_INFO *const candidate_mi =
+          xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
+      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+      const int len = VPXMIN(xd->n8_h,
+                             num_8x8_blocks_high_lookup[candidate->sb_type]);
+
+      newmv_count += add_ref_mv_candidate(candidate_mi, candidate, rf,
+                                          refmv_count, ref_mv_stack,
+                                          cm->allow_high_precision_mv,
+                                          len, block, mi_pos.col);
+      i += len;
+    } else {
+      ++i;
+    }
+  }
+
+  return newmv_count;
+}
+
+static uint8_t scan_blk_mbmi(const VP10_COMMON *cm, const MACROBLOCKD *xd,
+                             const int mi_row, const int mi_col, int block,
+                             const MV_REFERENCE_FRAME rf[2],
+                             int row_offset, int col_offset,
+                             CANDIDATE_MV *ref_mv_stack,
+                             uint8_t *refmv_count) {
+  const TileInfo *const tile = &xd->tile;
+  POSITION mi_pos;
+  uint8_t newmv_count = 0;
+
+  mi_pos.row = row_offset;
+  mi_pos.col = col_offset;
+
+  if (is_inside(tile, mi_col, mi_row, &mi_pos) &&
+      *refmv_count < MAX_REF_MV_STACK_SIZE) {
+    const MODE_INFO *const candidate_mi =
+        xd->mi[mi_pos.row * xd->mi_stride + mi_pos.col];
+    const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+    const int len = 1;
+
+    newmv_count += add_ref_mv_candidate(candidate_mi, candidate, rf,
+                                        refmv_count, ref_mv_stack,
+                                        cm->allow_high_precision_mv,
+                                        len, block, mi_pos.col);
+  }  // Analyze a single 8x8 block's motion information.
+  return newmv_count;
+}
+
+static int has_top_right(const MACROBLOCKD *xd,
+                         int mi_row, int mi_col, int bs) {
+  // In a split partition, all blocks apart from the bottom right one have a
+  // top right
+  int has_tr = !((mi_row & bs) && (mi_col & bs));
+
+  // bs > 0 and bs is a power of 2
+  assert(bs > 0 && !(bs & (bs - 1)));
+
+  // For each 4x4 group of blocks, when the bottom right is decoded the
+  // blocks to the right have not yet been decoded, so the bottom right
+  // does not have a top right
+  while (bs < MAX_MIB_SIZE) {
+    if (mi_col & bs) {
+      if ((mi_col & (2 * bs)) && (mi_row & (2 * bs))) {
+        has_tr = 0;
+        break;
+      }
+    } else {
+      break;
+    }
+    bs <<= 1;
+  }
+
+  // The left-hand rectangle of a vertical pair always has a top right (as
+  // the block above will have been decoded)
+  if (xd->n8_w < xd->n8_h)
+    if (!xd->is_sec_rect)
+      has_tr = 1;
+
+  // The bottom rectangle of a horizontal pair never has a top right (as the
+  // block to its right won't have been decoded)
+  if (xd->n8_w > xd->n8_h)
+    if (xd->is_sec_rect)
+      has_tr = 0;
+
+#if CONFIG_EXT_PARTITION_TYPES
+  // The bottom left square of a Vertical A does not have a top right as it is
+  // decoded before the right hand rectangle of the partition
+  if (xd->mi[0]->mbmi.partition == PARTITION_VERT_A)
+    if ((mi_row & bs) && !(mi_col & bs))
+      has_tr = 0;
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
+  return has_tr;
+}
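
A standalone check of the base rule in has_top_right(): within a square split of size bs, only the bottom-right quadrant lacks a decoded top-right neighbour:

    #include <assert.h>

    /* Same expression as the has_tr initialiser above. */
    static int split_has_tr(int mi_row, int mi_col, int bs) {
      return !((mi_row & bs) && (mi_col & bs));
    }

    int main(void) {
      /* 2x2 split of 8x8 blocks (bs == 1): */
      assert(split_has_tr(0, 0, 1));   /* top-left     */
      assert(split_has_tr(0, 1, 1));   /* top-right    */
      assert(split_has_tr(1, 0, 1));   /* bottom-left  */
      assert(!split_has_tr(1, 1, 1));  /* bottom-right */
      return 0;
    }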
+
+static void handle_sec_rect_block(const MB_MODE_INFO * const candidate,
+                                  uint8_t refmv_count,
+                                  CANDIDATE_MV *ref_mv_stack,
+                                  MV_REFERENCE_FRAME ref_frame,
+                                  int16_t *mode_context) {
+  int rf, idx;
+
+  for (rf = 0; rf < 2; ++rf) {
+    if (candidate->ref_frame[rf] == ref_frame) {
+      const int list_range = VPXMIN(refmv_count, MAX_MV_REF_CANDIDATES);
+
+      const int_mv pred_mv = candidate->mv[rf];
+      for (idx = 0; idx < list_range; ++idx)
+        if (pred_mv.as_int == ref_mv_stack[idx].this_mv.as_int)
+          break;
+
+      if (idx < list_range) {
+        if (idx == 0)
+          mode_context[ref_frame] |= (1 << SKIP_NEARESTMV_OFFSET);
+        else if (idx == 1)
+          mode_context[ref_frame] |= (1 << SKIP_NEARMV_OFFSET);
+      }
+    }
+  }
+}
+
+static void setup_ref_mv_list(const VP10_COMMON *cm, const MACROBLOCKD *xd,
+                              MV_REFERENCE_FRAME ref_frame,
+                              uint8_t *refmv_count,
+                              CANDIDATE_MV *ref_mv_stack,
+                              int_mv *mv_ref_list,
+                              int block, int mi_row, int mi_col,
+                              int16_t *mode_context) {
+  int idx, nearest_refmv_count = 0;
+  uint8_t newmv_count = 0;
+
+  CANDIDATE_MV tmp_mv;
+  int len, nr_len;
+
+  const MV_REF *const prev_frame_mvs_base = cm->use_prev_frame_mvs ?
+      cm->prev_frame->mvs + mi_row * cm->mi_cols + mi_col : NULL;
+
+  int bs = VPXMAX(xd->n8_w, xd->n8_h);
+  int has_tr = has_top_right(xd, mi_row, mi_col, bs);
+
+  MV_REFERENCE_FRAME rf[2];
+  vp10_set_ref_frame(rf, ref_frame);
+
+  mode_context[ref_frame] = 0;
+  *refmv_count = 0;
+
+  // Scan the first above row mode info.
+  newmv_count = scan_row_mbmi(cm, xd, mi_row, mi_col, block, rf,
+                              -1, ref_mv_stack, refmv_count);
+  // Scan the first left column mode info.
+  newmv_count += scan_col_mbmi(cm, xd, mi_row, mi_col, block, rf,
+                               -1, ref_mv_stack, refmv_count);
+
+  // Check top-right boundary
+  if (has_tr)
+    newmv_count += scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf,
+                                 -1, 1, ref_mv_stack, refmv_count);
+
+  nearest_refmv_count = *refmv_count;
+
+  for (idx = 0; idx < nearest_refmv_count; ++idx) {
+    assert(ref_mv_stack[idx].weight > 0 &&
+           ref_mv_stack[idx].weight < REF_CAT_LEVEL);
+    ref_mv_stack[idx].weight += REF_CAT_LEVEL;
+  }
+
+  if (prev_frame_mvs_base && cm->show_frame && cm->last_show_frame &&
+      rf[1] == NONE) {
+    int ref;
+    int blk_row, blk_col;
+
+    for (blk_row = 0; blk_row < xd->n8_h; ++blk_row) {
+      for (blk_col = 0; blk_col < xd->n8_w; ++blk_col) {
+        const MV_REF *prev_frame_mvs =
+            prev_frame_mvs_base + blk_row * cm->mi_cols + blk_col;
+
+        POSITION mi_pos;
+        mi_pos.row = blk_row;
+        mi_pos.col = blk_col;
+
+        if (!is_inside(&xd->tile, mi_col, mi_row, &mi_pos))
+          continue;
+
+        for (ref = 0; ref < 2; ++ref) {
+          if (prev_frame_mvs->ref_frame[ref] == ref_frame) {
+            int_mv this_refmv = prev_frame_mvs->mv[ref];
+            lower_mv_precision(&this_refmv.as_mv,
+                               cm->allow_high_precision_mv);
+
+            for (idx = 0; idx < *refmv_count; ++idx)
+              if (this_refmv.as_int == ref_mv_stack[idx].this_mv.as_int)
+                break;
+
+            if (idx < *refmv_count)
+              ref_mv_stack[idx].weight += 2;
+
+            if (idx == *refmv_count &&
+                *refmv_count < MAX_REF_MV_STACK_SIZE) {
+              ref_mv_stack[idx].this_mv.as_int = this_refmv.as_int;
+              ref_mv_stack[idx].weight = 2;
+              ++(*refmv_count);
+
+              if (abs(ref_mv_stack[idx].this_mv.as_mv.row) >= 8 ||
+                  abs(ref_mv_stack[idx].this_mv.as_mv.col) >= 8)
+                mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
+            }
+          }
+        }
+      }
+    }
+  }
+
+  if (*refmv_count == nearest_refmv_count)
+    mode_context[ref_frame] |= (1 << ZEROMV_OFFSET);
+
+  // Analyze the top-left corner block mode info.
+//  scan_blk_mbmi(cm, xd, mi_row, mi_col, block, ref_frame,
+//                -1, -1, ref_mv_stack, refmv_count);
+
+  // Scan the second outer area.
+  scan_row_mbmi(cm, xd, mi_row, mi_col, block, rf,
+                -2, ref_mv_stack, refmv_count);
+  scan_col_mbmi(cm, xd, mi_row, mi_col, block, rf,
+                -2, ref_mv_stack, refmv_count);
+
+  // Scan the third outer area.
+  scan_row_mbmi(cm, xd, mi_row, mi_col, block, rf,
+                -3, ref_mv_stack, refmv_count);
+  scan_col_mbmi(cm, xd, mi_row, mi_col, block, rf,
+                -3, ref_mv_stack, refmv_count);
+
+  // Scan the fourth outer area.
+  scan_row_mbmi(cm, xd, mi_row, mi_col, block, rf,
+                -4, ref_mv_stack, refmv_count);
+  // Scan the fourth left column mode info.
+  scan_col_mbmi(cm, xd, mi_row, mi_col, block, rf,
+                -4, ref_mv_stack, refmv_count);
+
+  switch (nearest_refmv_count) {
+    case 0:
+      mode_context[ref_frame] |= 0;
+      if (*refmv_count >= 1)
+        mode_context[ref_frame] |= 1;
+
+      if (*refmv_count == 1)
+        mode_context[ref_frame] |= (1 << REFMV_OFFSET);
+      else if (*refmv_count >= 2)
+        mode_context[ref_frame] |= (2 << REFMV_OFFSET);
+      break;
+    case 1:
+      mode_context[ref_frame] |= (newmv_count > 0) ? 2 : 3;
+
+      if (*refmv_count == 1)
+        mode_context[ref_frame] |= (3 << REFMV_OFFSET);
+      else if (*refmv_count >= 2)
+        mode_context[ref_frame] |= (4 << REFMV_OFFSET);
+      break;
+
+    case 2:
+    default:
+      if (newmv_count >= 2)
+        mode_context[ref_frame] |= 4;
+      else if (newmv_count == 1)
+        mode_context[ref_frame] |= 5;
+      else
+        mode_context[ref_frame] |= 6;
+
+      mode_context[ref_frame] |= (5 << REFMV_OFFSET);
+      break;
+  }
+
+  // Rank the likelihood and assign nearest and near mvs.
+  len = nearest_refmv_count;
+  while (len > 0) {
+    nr_len = 0;
+    for (idx = 1; idx < len; ++idx) {
+      if (ref_mv_stack[idx - 1].weight < ref_mv_stack[idx].weight) {
+        tmp_mv = ref_mv_stack[idx - 1];
+        ref_mv_stack[idx - 1] = ref_mv_stack[idx];
+        ref_mv_stack[idx] = tmp_mv;
+        nr_len = idx;
+      }
+    }
+    len = nr_len;
+  }
+
+  len = *refmv_count;
+  while (len > nearest_refmv_count) {
+    nr_len = nearest_refmv_count;
+    for (idx = nearest_refmv_count + 1; idx < len; ++idx) {
+      if (ref_mv_stack[idx - 1].weight < ref_mv_stack[idx].weight) {
+        tmp_mv = ref_mv_stack[idx - 1];
+        ref_mv_stack[idx - 1] = ref_mv_stack[idx];
+        ref_mv_stack[idx] = tmp_mv;
+        nr_len = idx;
+      }
+    }
+    len = nr_len;
+  }
+
+  // TODO(jingning): Clean-up needed.
+  if (xd->is_sec_rect) {
+    if (xd->n8_w < xd->n8_h) {
+      const MODE_INFO *const candidate_mi = xd->mi[-1];
+      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+      handle_sec_rect_block(candidate, nearest_refmv_count, ref_mv_stack,
+                            ref_frame, mode_context);
+    }
+
+    if (xd->n8_w > xd->n8_h) {
+      const MODE_INFO *const candidate_mi = xd->mi[-xd->mi_stride];
+      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+      handle_sec_rect_block(candidate, nearest_refmv_count, ref_mv_stack,
+                            ref_frame, mode_context);
+    }
+  }
+
+  if (rf[1] > NONE) {
+    for (idx = 0; idx < *refmv_count; ++idx) {
+      clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv,
+                   xd->n8_w << 3, xd->n8_h << 3, xd);
+      clamp_mv_ref(&ref_mv_stack[idx].comp_mv.as_mv,
+                   xd->n8_w << 3, xd->n8_h << 3, xd);
+    }
+  } else {
+    for (idx = 0; idx < VPXMIN(MAX_MV_REF_CANDIDATES, *refmv_count); ++idx) {
+      mv_ref_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
+      clamp_mv_ref(&mv_ref_list[idx].as_mv,
+                   xd->n8_w << 3, xd->n8_h << 3, xd);
+    }
+  }
+}
+#endif
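
The two ranking loops near the end of setup_ref_mv_list() are a bubble sort, descending by weight, with the usual last-swap cutoff. A standalone version of the same loop shape:

    #include <assert.h>

    static void sort_by_weight(int *weight, int len) {
      while (len > 0) {
        int nr_len = 0, idx;
        for (idx = 1; idx < len; ++idx) {
          if (weight[idx - 1] < weight[idx]) {
            const int tmp = weight[idx - 1];
            weight[idx - 1] = weight[idx];
            weight[idx] = tmp;
            nr_len = idx;  /* remember the last swap position */
          }
        }
        len = nr_len;      /* everything at or past the last swap is sorted */
      }
    }

    int main(void) {
      int w[5] = { 2, 9, 4, 9, 1 };
      sort_by_weight(w, 5);
      assert(w[0] == 9 && w[1] == 9 && w[2] == 4 && w[3] == 2 && w[4] == 1);
      return 0;
    }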
+
 // This function searches the neighbourhood of a given MB/SB
 // to try and find candidate reference vectors.
 static void find_mv_refs_idx(const VP10_COMMON *cm, const MACROBLOCKD *xd,
@@ -18,7 +548,7 @@
                              int_mv *mv_ref_list,
                              int block, int mi_row, int mi_col,
                              find_mv_refs_sync sync, void *const data,
-                             uint8_t *mode_context) {
+                             int16_t *mode_context) {
   const int *ref_sign_bias = cm->ref_frame_sign_bias;
   int i, refmv_count = 0;
   const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
@@ -30,17 +560,12 @@
   const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type] << 3;
   const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type] << 3;
 
-#if !CONFIG_MISC_FIXES
-  // Blank the reference vector list
-  memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
-#endif
-
   // The nearest 2 blocks are treated differently:
   // if the size < 8x8, we get the mv from the bmi substructure,
   // and we also need to keep a mode count.
   for (i = 0; i < 2; ++i) {
     const POSITION *const mv_ref = &mv_ref_search[i];
-    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+    if (is_inside(tile, mi_col, mi_row, mv_ref)) {
       const MODE_INFO *const candidate_mi = xd->mi[mv_ref->col + mv_ref->row *
                                                    xd->mi_stride];
       const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
@@ -62,7 +587,7 @@
   // mode counts.
   for (; i < MVREF_NEIGHBOURS; ++i) {
     const POSITION *const mv_ref = &mv_ref_search[i];
-    if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+    if (is_inside(tile, mi_col, mi_row, mv_ref)) {
       const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row *
                                                     xd->mi_stride]->mbmi;
       different_ref_found = 1;
@@ -108,7 +633,7 @@
   if (different_ref_found) {
     for (i = 0; i < MVREF_NEIGHBOURS; ++i) {
       const POSITION *mv_ref = &mv_ref_search[i];
-      if (is_inside(tile, mi_col, mi_row, cm->mi_rows, mv_ref)) {
+      if (is_inside(tile, mi_col, mi_row, mv_ref)) {
         const MB_MODE_INFO *const candidate = &xd->mi[mv_ref->col + mv_ref->row
                                               * xd->mi_stride]->mbmi;
 
@@ -133,9 +658,6 @@
     }
 
     if (prev_frame_mvs->ref_frame[1] > INTRA_FRAME &&
-#if !CONFIG_MISC_FIXES
-        prev_frame_mvs->mv[1].as_int != prev_frame_mvs->mv[0].as_int &&
-#endif
         prev_frame_mvs->ref_frame[1] != ref_frame) {
       int_mv mv = prev_frame_mvs->mv[1];
       if (ref_sign_bias[prev_frame_mvs->ref_frame[1]] !=
@@ -147,38 +669,101 @@
     }
   }
 
- Done:
-
-  mode_context[ref_frame] = counter_to_context[context_counter];
-
-#if CONFIG_MISC_FIXES
+Done:
+  if (mode_context)
+    mode_context[ref_frame] = counter_to_context[context_counter];
   for (i = refmv_count; i < MAX_MV_REF_CANDIDATES; ++i)
       mv_ref_list[i].as_int = 0;
-#else
-  // Clamp vectors
-  for (i = 0; i < MAX_MV_REF_CANDIDATES; ++i)
-    clamp_mv_ref(&mv_ref_list[i].as_mv, bw, bh, xd);
-#endif
 }
 
+#if CONFIG_EXT_INTER
+// This function keeps a mode count for a given MB/SB
+void vp10_update_mv_context(const MACROBLOCKD *xd,
+                            MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                            int_mv *mv_ref_list,
+                            int block, int mi_row, int mi_col,
+                            int16_t *mode_context) {
+  int i, refmv_count = 0;
+  const POSITION *const mv_ref_search = mv_ref_blocks[mi->mbmi.sb_type];
+  int context_counter = 0;
+  const int bw = num_8x8_blocks_wide_lookup[mi->mbmi.sb_type] << 3;
+  const int bh = num_8x8_blocks_high_lookup[mi->mbmi.sb_type] << 3;
+  const TileInfo *const tile = &xd->tile;
+
+  // Blank the reference vector list
+  memset(mv_ref_list, 0, sizeof(*mv_ref_list) * MAX_MV_REF_CANDIDATES);
+
+  // Only the nearest 2 blocks are examined.
+  // If the size < 8x8, we get the mv from the bmi substructure.
+  for (i = 0; i < 2; ++i) {
+    const POSITION *const mv_ref = &mv_ref_search[i];
+    if (is_inside(tile, mi_col, mi_row, mv_ref)) {
+      const MODE_INFO *const candidate_mi =
+          xd->mi[mv_ref->col + mv_ref->row * xd->mi_stride];
+      const MB_MODE_INFO *const candidate = &candidate_mi->mbmi;
+
+      // Keep counts for entropy encoding.
+      context_counter += mode_2_counter[candidate->mode];
+
+      if (candidate->ref_frame[0] == ref_frame) {
+        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 0, mv_ref->col, block),
+                        refmv_count, mv_ref_list, bw, bh, xd, Done);
+      } else if (candidate->ref_frame[1] == ref_frame) {
+        ADD_MV_REF_LIST(get_sub_block_mv(candidate_mi, 1, mv_ref->col, block),
+                        refmv_count, mv_ref_list, bw, bh, xd, Done);
+      }
+    }
+  }
+
+Done:
+
+  if (mode_context)
+    mode_context[ref_frame] = counter_to_context[context_counter];
+}
+#endif  // CONFIG_EXT_INTER
+
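
For readers tracing the counter scheme in vp10_update_mv_context above: each of
the two nearest neighbours contributes a small weight from mode_2_counter[],
and the accumulated sum indexes counter_to_context[] once control reaches Done.
A minimal standalone sketch of that flow; the 0/1/3 weights mirror the
mode_2_counter[] entries visible in mvref_common.h, but the context table here
is purely illustrative:

    #include <assert.h>

    enum { TOY_NEARESTMV, TOY_NEARMV, TOY_ZEROMV, TOY_NEWMV };
    static const int toy_mode_2_counter[4] = { 0, 0, 3, 1 };
    /* One context bucket per reachable two-neighbour sum (0..6). */
    static const int toy_counter_to_context[7] = { 0, 1, 2, 3, 4, 5, 6 };

    static int toy_mode_context(int mode_a, int mode_b) {
      int counter = 0;  /* accumulated exactly as in vp10_update_mv_context */
      counter += toy_mode_2_counter[mode_a];
      counter += toy_mode_2_counter[mode_b];
      return toy_counter_to_context[counter];
    }

    int main(void) {
      assert(toy_mode_context(TOY_ZEROMV, TOY_ZEROMV) == 6);
      assert(toy_mode_context(TOY_NEWMV, TOY_NEARMV) == 1);
      return 0;
    }
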
 void vp10_find_mv_refs(const VP10_COMMON *cm, const MACROBLOCKD *xd,
                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+#if CONFIG_REF_MV
+                      uint8_t *ref_mv_count,
+                      CANDIDATE_MV *ref_mv_stack,
+#if CONFIG_EXT_INTER
+                      int16_t *compound_mode_context,
+#endif  // CONFIG_EXT_INTER
+#endif
                       int_mv *mv_ref_list,
                       int mi_row, int mi_col,
                       find_mv_refs_sync sync, void *const data,
-                      uint8_t *mode_context) {
+                      int16_t *mode_context) {
+#if CONFIG_REF_MV
+  int idx, all_zero = 1;
+#endif
+#if CONFIG_EXT_INTER
+  vp10_update_mv_context(xd, mi, ref_frame, mv_ref_list, -1,
+                         mi_row, mi_col,
+#if CONFIG_REF_MV
+                         compound_mode_context);
+#else
+                         mode_context);
+#endif  // CONFIG_REF_MV
+  find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1,
+                   mi_row, mi_col, sync, data, NULL);
+#else
   find_mv_refs_idx(cm, xd, mi, ref_frame, mv_ref_list, -1,
                    mi_row, mi_col, sync, data, mode_context);
-}
+#endif  // CONFIG_EXT_INTER
 
-static void lower_mv_precision(MV *mv, int allow_hp) {
-  const int use_hp = allow_hp && vp10_use_mv_hp(mv);
-  if (!use_hp) {
-    if (mv->row & 1)
-      mv->row += (mv->row > 0 ? -1 : 1);
-    if (mv->col & 1)
-      mv->col += (mv->col > 0 ? -1 : 1);
-  }
+#if CONFIG_REF_MV
+  setup_ref_mv_list(cm, xd, ref_frame, ref_mv_count, ref_mv_stack,
+                    mv_ref_list, -1, mi_row, mi_col, mode_context);
+
+  for (idx = 0; idx < MAX_MV_REF_CANDIDATES; ++idx)
+    if (mv_ref_list[idx].as_int != 0)
+      all_zero = 0;
+
+  if (all_zero)
+    mode_context[ref_frame] |= (1 << ALL_ZERO_FLAG_OFFSET);
+#endif
 }
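
The all_zero scan above folds an extra flag into the upper bits of the
per-frame mode context. A hedged sketch of how a consumer could test that bit;
ALL_ZERO_FLAG_OFFSET is defined in the entropy-mode headers, so the bit index
used here is an assumption:

    #include <assert.h>
    #include <stdint.h>

    #define TOY_ALL_ZERO_FLAG_OFFSET 7  /* assumed bit index */

    static int all_candidates_zero(int16_t mode_ctx) {
      return (mode_ctx >> TOY_ALL_ZERO_FLAG_OFFSET) & 1;
    }

    int main(void) {
      int16_t mode_ctx = 0;
      mode_ctx |= (1 << TOY_ALL_ZERO_FLAG_OFFSET);  /* as in vp10_find_mv_refs */
      assert(all_candidates_zero(mode_ctx));
      return 0;
    }
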
 
 void vp10_find_best_ref_mvs(int allow_hp,
@@ -194,18 +779,56 @@
 }
 
 void vp10_append_sub8x8_mvs_for_idx(VP10_COMMON *cm, MACROBLOCKD *xd,
-                                   int block, int ref, int mi_row, int mi_col,
-                                   int_mv *nearest_mv, int_mv *near_mv,
-                                   uint8_t *mode_context) {
+                                    int block, int ref, int mi_row, int mi_col,
+#if CONFIG_REF_MV
+                                    CANDIDATE_MV *ref_mv_stack,
+                                    uint8_t *ref_mv_count,
+#endif
+#if CONFIG_EXT_INTER
+                                    int_mv *mv_list,
+#endif  // CONFIG_EXT_INTER
+                                    int_mv *nearest_mv, int_mv *near_mv) {
+#if !CONFIG_EXT_INTER
   int_mv mv_list[MAX_MV_REF_CANDIDATES];
+#endif  // !CONFIG_EXT_INTER
   MODE_INFO *const mi = xd->mi[0];
   b_mode_info *bmi = mi->bmi;
   int n;
+#if CONFIG_REF_MV
+  CANDIDATE_MV tmp_mv;
+  uint8_t idx;
+  uint8_t above_count = 0, left_count = 0;
+  MV_REFERENCE_FRAME rf[2] = { mi->mbmi.ref_frame[ref], NONE };
+  *ref_mv_count = 0;
+#endif
 
   assert(MAX_MV_REF_CANDIDATES == 2);
 
   find_mv_refs_idx(cm, xd, mi, mi->mbmi.ref_frame[ref], mv_list, block,
-                   mi_row, mi_col, NULL, NULL, mode_context);
+                   mi_row, mi_col, NULL, NULL, NULL);
+
+#if CONFIG_REF_MV
+  scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf,
+                -1, 0, ref_mv_stack, ref_mv_count);
+  above_count = *ref_mv_count;
+
+  scan_blk_mbmi(cm, xd, mi_row, mi_col, block, rf,
+                0, -1, ref_mv_stack, ref_mv_count);
+  left_count = *ref_mv_count - above_count;
+
+  if (above_count > 1 && left_count > 0) {
+    tmp_mv = ref_mv_stack[1];
+    ref_mv_stack[1] = ref_mv_stack[above_count];
+    ref_mv_stack[above_count] = tmp_mv;
+  }
+
+  for (idx = 0; idx < *ref_mv_count; ++idx)
+    clamp_mv_ref(&ref_mv_stack[idx].this_mv.as_mv,
+                 xd->n8_w << 3, xd->n8_h << 3, xd);
+
+  for (idx = 0; idx < VPXMIN(MAX_MV_REF_CANDIDATES, *ref_mv_count); ++idx)
+    mv_list[idx].as_int = ref_mv_stack[idx].this_mv.as_int;
+#endif
 
   near_mv->as_int = 0;
   switch (block) {
diff --git a/vp10/common/mvref_common.h b/vp10/common/mvref_common.h
index 0a98866..70ef017 100644
--- a/vp10/common/mvref_common.h
+++ b/vp10/common/mvref_common.h
@@ -55,6 +55,19 @@
   0,  // NEARMV
   3,  // ZEROMV
   1,  // NEWMV
+#if CONFIG_EXT_INTER
+  1,  // NEWFROMNEARMV
+  0,  // NEAREST_NEARESTMV
+  0,  // NEAREST_NEARMV
+  0,  // NEAR_NEARESTMV
+  0,  // NEAR_NEARMV
+  1,  // NEAREST_NEWMV
+  1,  // NEW_NEARESTMV
+  1,  // NEAR_NEWMV
+  1,  // NEW_NEARMV
+  3,  // ZERO_ZEROMV
+  1,  // NEW_NEWMV
+#endif  // CONFIG_EXT_INTER
 };
 
 // There are 3^3 different combinations of 3 counts that can be either 0,1 or
@@ -108,7 +121,16 @@
   // 64X32
   {{-1, 0}, {0, -1}, {-1, 4}, {2, -1}, {-1, -1}, {-3, 0}, {0, -3}, {-1, 2}},
   // 64X64
-  {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}}
+  {{-1, 3}, {3, -1}, {-1, 4}, {4, -1}, {-1, -1}, {-1, 0}, {0, -1}, {-1, 6}},
+#if CONFIG_EXT_PARTITION
+  // TODO(debargha/jingning): Make these twice the 32X64, ... ones above.
+  // 64x128
+  {{0, -2}, {-2, 0}, {8, -2}, {-2, 4}, {-2, -2}, {0, -6}, {-6, 0}, {4, -2}},
+  // 128x64
+  {{-2, 0}, {0, -2}, {-2, 8}, {4, -2}, {-2, -2}, {-6, 0}, {0, -6}, {-2, 4}},
+  // 128x128
+  {{-2, 6}, {6, -2}, {-2, 8}, {8, -2}, {-2, -2}, {-2, 0}, {0, -2}, {-2, 12}},
+#endif  // CONFIG_EXT_PARTITION
 };
 
 static const int idx_n_column_to_subblock[4][2] = {
@@ -119,26 +141,17 @@
 };
 
 // clamp_mv_ref
-#if CONFIG_MISC_FIXES
-#define MV_BORDER (8 << 3)  // Allow 8 pels in 1/8th pel units
+#if CONFIG_EXT_PARTITION
+# define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
 #else
-#define MV_BORDER (16 << 3)  // Allow 16 pels in 1/8th pel units
-#endif
+# define MV_BORDER (8 << 3)   // Allow 8 pels in 1/8th pel units
+#endif  // CONFIG_EXT_PARTITION
 
 static INLINE void clamp_mv_ref(MV *mv, int bw, int bh, const MACROBLOCKD *xd) {
-#if CONFIG_MISC_FIXES
   clamp_mv(mv, xd->mb_to_left_edge - bw * 8 - MV_BORDER,
                xd->mb_to_right_edge + bw * 8 + MV_BORDER,
                xd->mb_to_top_edge - bh * 8 - MV_BORDER,
                xd->mb_to_bottom_edge + bh * 8 + MV_BORDER);
-#else
-  (void) bw;
-  (void) bh;
-  clamp_mv(mv, xd->mb_to_left_edge - MV_BORDER,
-               xd->mb_to_right_edge + MV_BORDER,
-               xd->mb_to_top_edge - MV_BORDER,
-               xd->mb_to_bottom_edge + MV_BORDER);
-#endif
 }
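
To make the new clamp bounds concrete: with the block dimensions folded in, a
reference MV may point at most MV_BORDER eighth-pels beyond the block's own
extent on each side. A self-contained sketch with a simplified MACROBLOCKD
stand-in carrying only the four edge fields, using the non-EXT_PARTITION
border of 8 << 3:

    #include <assert.h>
    #include <stdint.h>

    #define MV_BORDER (8 << 3)  /* 8 pels in 1/8-pel units */

    typedef struct { int16_t row, col; } MV;
    typedef struct {  /* only the fields clamp_mv_ref reads */
      int mb_to_left_edge, mb_to_right_edge;
      int mb_to_top_edge, mb_to_bottom_edge;
    } TOY_XD;

    static int16_t clamp16(int v, int lo, int hi) {
      return (int16_t)(v < lo ? lo : (v > hi ? hi : v));
    }

    static void toy_clamp_mv_ref(MV *mv, int bw, int bh, const TOY_XD *xd) {
      mv->col = clamp16(mv->col, xd->mb_to_left_edge - bw * 8 - MV_BORDER,
                                 xd->mb_to_right_edge + bw * 8 + MV_BORDER);
      mv->row = clamp16(mv->row, xd->mb_to_top_edge - bh * 8 - MV_BORDER,
                                 xd->mb_to_bottom_edge + bh * 8 + MV_BORDER);
    }

    int main(void) {
      TOY_XD xd = { 0, 0, 0, 0 };        /* block flush against all edges */
      MV mv = { -4000, 4000 };
      toy_clamp_mv_ref(&mv, 8, 8, &xd);  /* 8x8-pel block: bw = bh = 8 */
      assert(mv.row == -(8 * 8 + MV_BORDER));  /* clamped to -128 */
      assert(mv.col == 8 * 8 + MV_BORDER);     /* clamped to  128 */
      return 0;
    }
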
 
 // This function returns either the appropriate sub block or block's mv
@@ -151,6 +164,16 @@
           : candidate->mbmi.mv[which_mv];
 }
 
+#if CONFIG_REF_MV
+static INLINE int_mv get_sub_block_pred_mv(const MODE_INFO *candidate,
+                                           int which_mv,
+                                           int search_col, int block_idx) {
+  return block_idx >= 0 && candidate->mbmi.sb_type < BLOCK_8X8
+          ? candidate->bmi[idx_n_column_to_subblock[block_idx][search_col == 0]]
+              .pred_mv_s8[which_mv]
+          : candidate->mbmi.pred_mv[which_mv];
+}
+#endif
 
 // Performs mv sign inversion if indicated by the reference frame combination.
 static INLINE int_mv scale_mv(const MB_MODE_INFO *mbmi, int ref,
@@ -164,11 +187,7 @@
   return mv;
 }
 
-#if CONFIG_MISC_FIXES
 #define CLIP_IN_ADD(mv, bw, bh, xd) clamp_mv_ref(mv, bw, bh, xd)
-#else
-#define CLIP_IN_ADD(mv, bw, bh, xd) do {} while (0)
-#endif
 
 // This macro is used to add a motion vector mv_ref list if it isn't
 // already in the list.  If it's the second motion vector it will also
@@ -194,8 +213,6 @@
         ADD_MV_REF_LIST(scale_mv((mbmi), 0, ref_frame, ref_sign_bias), \
                         refmv_count, mv_ref_list, bw, bh, xd, Done); \
       if (has_second_ref(mbmi) && \
-          (CONFIG_MISC_FIXES || \
-           (mbmi)->mv[1].as_int != (mbmi)->mv[0].as_int) && \
           (mbmi)->ref_frame[1] != ref_frame) \
         ADD_MV_REF_LIST(scale_mv((mbmi), 1, ref_frame, ref_sign_bias), \
                         refmv_count, mv_ref_list, bw, bh, xd, Done); \
@@ -206,20 +223,141 @@
 // Checks that the given mi_row, mi_col and search point
 // are inside the borders of the tile.
 static INLINE int is_inside(const TileInfo *const tile,
-                            int mi_col, int mi_row, int mi_rows,
+                            int mi_col, int mi_row,
                             const POSITION *mi_pos) {
-  return !(mi_row + mi_pos->row < 0 ||
+  return !(mi_row + mi_pos->row < tile->mi_row_start ||
            mi_col + mi_pos->col < tile->mi_col_start ||
-           mi_row + mi_pos->row >= mi_rows ||
+           mi_row + mi_pos->row >= tile->mi_row_end ||
            mi_col + mi_pos->col >= tile->mi_col_end);
 }
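
The is_inside() change above replaces the frame-row bound with the tile's own
row bounds, which matters once a tile can start at a non-zero mi_row. A
minimal standalone check of the new predicate, with a stub TileInfo holding
just the four bound fields:

    #include <assert.h>

    typedef struct {
      int mi_row_start, mi_row_end;
      int mi_col_start, mi_col_end;
    } TileInfo;
    typedef struct { int row, col; } POSITION;

    static int toy_is_inside(const TileInfo *tile, int mi_col, int mi_row,
                             const POSITION *mi_pos) {
      return !(mi_row + mi_pos->row < tile->mi_row_start ||
               mi_col + mi_pos->col < tile->mi_col_start ||
               mi_row + mi_pos->row >= tile->mi_row_end ||
               mi_col + mi_pos->col >= tile->mi_col_end);
    }

    int main(void) {
      const TileInfo tile = { 8, 16, 0, 16 };   /* tile starts at mi_row 8 */
      const POSITION above = { -1, 0 };
      assert(!toy_is_inside(&tile, 4, 8, &above));  /* row 7: outside tile */
      assert(toy_is_inside(&tile, 4, 9, &above));   /* row 8: inside */
      return 0;
    }
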
 
+static INLINE void lower_mv_precision(MV *mv, int allow_hp) {
+  const int use_hp = allow_hp && vp10_use_mv_hp(mv);
+  if (!use_hp) {
+    if (mv->row & 1)
+      mv->row += (mv->row > 0 ? -1 : 1);
+    if (mv->col & 1)
+      mv->col += (mv->col > 0 ? -1 : 1);
+  }
+}
+
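
lower_mv_precision() drops a motion vector from eighth-pel to quarter-pel
precision by nudging each odd 1/8-pel component one step toward zero (taken
only when high precision is disallowed for the vector). A standalone sketch of
that rounding:

    #include <assert.h>
    #include <stdint.h>

    typedef struct { int16_t row, col; } MV;

    static void toy_lower_mv_precision(MV *mv) {  /* the !use_hp branch */
      if (mv->row & 1) mv->row += (mv->row > 0 ? -1 : 1);
      if (mv->col & 1) mv->col += (mv->col > 0 ? -1 : 1);
    }

    int main(void) {
      MV mv = { 5, -3 };            /* odd 1/8-pel components */
      toy_lower_mv_precision(&mv);
      assert(mv.row == 4 && mv.col == -2);  /* rounded toward zero */
      return 0;
    }
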
+#if CONFIG_REF_MV
+static INLINE int vp10_nmv_ctx(const uint8_t ref_mv_count,
+                               const CANDIDATE_MV *ref_mv_stack) {
+#if CONFIG_EXT_INTER
+  (void)ref_mv_count;
+  (void)ref_mv_stack;
+  return 0;
+#else
+  if (ref_mv_count > 0 &&
+      ref_mv_stack[0].weight > REF_CAT_LEVEL) {
+    if (abs(ref_mv_stack[0].this_mv.as_mv.row -
+            ref_mv_stack[0].pred_mv.as_mv.row) <= 4 &&
+        abs(ref_mv_stack[0].this_mv.as_mv.col -
+            ref_mv_stack[0].pred_mv.as_mv.col) <= 4)
+      return 2;
+    else
+      return 1;
+  }
+  return 0;
+#endif  // CONFIG_EXT_INTER
+}
+
+static INLINE int8_t vp10_ref_frame_type(const MV_REFERENCE_FRAME *const rf) {
+  if (rf[1] > INTRA_FRAME) {
+    return MAX_REF_FRAMES + FWD_RF_OFFSET(rf[0]) +
+        BWD_RF_OFFSET(rf[1]) * FWD_REFS;
+  }
+
+  return rf[0];
+}
+
+static MV_REFERENCE_FRAME ref_frame_map[COMP_REFS][2] = {
+#if CONFIG_EXT_REFS
+  {LAST_FRAME, BWDREF_FRAME},
+  {LAST2_FRAME, BWDREF_FRAME},
+  {LAST3_FRAME, BWDREF_FRAME},
+  {GOLDEN_FRAME, BWDREF_FRAME},
+
+  {LAST_FRAME, ALTREF_FRAME},
+  {LAST2_FRAME, ALTREF_FRAME},
+  {LAST3_FRAME, ALTREF_FRAME},
+  {GOLDEN_FRAME, ALTREF_FRAME}
+#else
+  {LAST_FRAME, ALTREF_FRAME},
+  {GOLDEN_FRAME, ALTREF_FRAME}
+#endif
+};
+
+static INLINE void vp10_set_ref_frame(MV_REFERENCE_FRAME *rf,
+                                      int8_t ref_frame_type) {
+  if (ref_frame_type >= MAX_REF_FRAMES) {
+    rf[0] = ref_frame_map[ref_frame_type - MAX_REF_FRAMES][0];
+    rf[1] = ref_frame_map[ref_frame_type - MAX_REF_FRAMES][1];
+  } else {
+    rf[0] = ref_frame_type;
+    rf[1] = NONE;
+    assert(ref_frame_type > INTRA_FRAME && ref_frame_type < MAX_REF_FRAMES);
+  }
+}
+
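
vp10_ref_frame_type() packs a possibly compound reference pair into a single
int8_t, and vp10_set_ref_frame() unpacks it again through ref_frame_map. A
round-trip sketch under the non-EXT_REFS layout; the enum values and the
FWD/BWD offset macros below are assumptions for illustration, as the real
definitions live elsewhere in vp10:

    #include <assert.h>
    #include <stdint.h>

    /* Assumed values for the non-EXT_REFS build; illustrative only. */
    enum { NONE = -1, INTRA_FRAME = 0, LAST_FRAME = 1, GOLDEN_FRAME = 2,
           ALTREF_FRAME = 3, MAX_REF_FRAMES = 4 };
    #define FWD_REFS 2
    #define FWD_RF_OFFSET(rf) ((rf) - LAST_FRAME)    /* LAST->0, GOLDEN->1 */
    #define BWD_RF_OFFSET(rf) ((rf) - ALTREF_FRAME)  /* ALTREF->0 */

    static const int8_t toy_ref_frame_map[2][2] = {
      { LAST_FRAME, ALTREF_FRAME }, { GOLDEN_FRAME, ALTREF_FRAME }
    };

    static int8_t toy_ref_frame_type(const int8_t *rf) {
      if (rf[1] > INTRA_FRAME)
        return MAX_REF_FRAMES + FWD_RF_OFFSET(rf[0]) +
               BWD_RF_OFFSET(rf[1]) * FWD_REFS;
      return rf[0];
    }

    static void toy_set_ref_frame(int8_t *rf, int8_t type) {
      if (type >= MAX_REF_FRAMES) {
        rf[0] = toy_ref_frame_map[type - MAX_REF_FRAMES][0];
        rf[1] = toy_ref_frame_map[type - MAX_REF_FRAMES][1];
      } else {
        rf[0] = type;
        rf[1] = NONE;
      }
    }

    int main(void) {
      int8_t rf[2] = { GOLDEN_FRAME, ALTREF_FRAME }, out[2];
      toy_set_ref_frame(out, toy_ref_frame_type(rf));  /* round-trips */
      assert(out[0] == rf[0] && out[1] == rf[1]);
      return 0;
    }
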
+static INLINE int16_t vp10_mode_context_analyzer(
+    const int16_t *const mode_context, const MV_REFERENCE_FRAME *const rf,
+    BLOCK_SIZE bsize, int block) {
+  int16_t mode_ctx = 0;
+  if (block >= 0) {
+    mode_ctx = mode_context[rf[0]] & 0x00ff;
+
+    if (block > 0 && bsize < BLOCK_8X8 && bsize > BLOCK_4X4)
+      mode_ctx |= (1 << SKIP_NEARESTMV_SUB8X8_OFFSET);
+
+    return mode_ctx;
+  }
+
+  if (rf[1] > INTRA_FRAME)
+    return mode_context[rf[0]] & (mode_context[rf[1]] | 0x00ff);
+  else if (rf[0] != ALTREF_FRAME)
+    return mode_context[rf[0]] & ~(mode_context[ALTREF_FRAME] & 0xfe00);
+  else
+    return mode_context[rf[0]];
+}
+
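
vp10_mode_context_analyzer() treats the int16_t context as a small bitfield:
the low byte carries the NEW/NEAR/NEAREST mode context, while higher bits
carry flags such as SKIP_NEARESTMV_SUB8X8_OFFSET. A simplified sketch of the
sub-8x8 path that ignores the bsize test and assumes a bit index for the flag:

    #include <assert.h>
    #include <stdint.h>

    #define TOY_SKIP_NEARESTMV_SUB8X8_OFFSET 11  /* assumed bit index */

    static int16_t toy_sub8x8_mode_ctx(int16_t per_ref_ctx, int block) {
      int16_t mode_ctx = per_ref_ctx & 0x00ff;  /* keep only the low byte */
      if (block > 0)  /* later sub-blocks of a sub-8x8 partition */
        mode_ctx |= (1 << TOY_SKIP_NEARESTMV_SUB8X8_OFFSET);
      return mode_ctx;
    }

    int main(void) {
      /* High flag bits in the stored context are stripped for block 0 ... */
      assert(toy_sub8x8_mode_ctx(0x0f03, 0) == 0x0003);
      /* ... and block 1 re-adds only the sub-8x8 skip flag. */
      assert(toy_sub8x8_mode_ctx(0x0f03, 1) == (0x0003 | (1 << 11)));
      return 0;
    }
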
+static INLINE uint8_t vp10_drl_ctx(const CANDIDATE_MV *ref_mv_stack,
+                                   int ref_idx) {
+  if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL &&
+      ref_mv_stack[ref_idx + 1].weight >= REF_CAT_LEVEL) {
+    if (ref_mv_stack[ref_idx].weight == ref_mv_stack[ref_idx + 1].weight)
+      return 0;
+    else
+      return 1;
+  }
+
+  if (ref_mv_stack[ref_idx].weight >= REF_CAT_LEVEL &&
+      ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL)
+    return 2;
+
+  if (ref_mv_stack[ref_idx].weight < REF_CAT_LEVEL &&
+      ref_mv_stack[ref_idx + 1].weight < REF_CAT_LEVEL) {
+    if (ref_mv_stack[ref_idx].weight == ref_mv_stack[ref_idx + 1].weight)
+      return 3;
+    else
+      return 4;
+  }
+
+  return 0;
+}
+#endif
+
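
vp10_drl_ctx() classifies a pair of adjacent stack entries purely by whether
each weight clears REF_CAT_LEVEL and whether the two weights tie. A compact
check of three of the five outcomes; the threshold value here is a stand-in:

    #include <assert.h>

    #define TOY_REF_CAT_LEVEL 640  /* stand-in threshold */

    typedef struct { int weight; } TOY_CAND;

    static int toy_drl_ctx(const TOY_CAND *stk, int i) {
      if (stk[i].weight >= TOY_REF_CAT_LEVEL &&
          stk[i + 1].weight >= TOY_REF_CAT_LEVEL)
        return stk[i].weight == stk[i + 1].weight ? 0 : 1;
      if (stk[i].weight >= TOY_REF_CAT_LEVEL &&
          stk[i + 1].weight < TOY_REF_CAT_LEVEL)
        return 2;
      if (stk[i].weight < TOY_REF_CAT_LEVEL &&
          stk[i + 1].weight < TOY_REF_CAT_LEVEL)
        return stk[i].weight == stk[i + 1].weight ? 3 : 4;
      return 0;  /* weight[i] < level <= weight[i+1]: unreachable if sorted */
    }

    int main(void) {
      TOY_CAND both_hi_tie[] = { { 800 }, { 800 } };
      TOY_CAND hi_then_lo[]  = { { 800 }, { 100 } };
      TOY_CAND both_lo[]     = { { 100 }, { 50 } };
      assert(toy_drl_ctx(both_hi_tie, 0) == 0);
      assert(toy_drl_ctx(hi_then_lo, 0) == 2);
      assert(toy_drl_ctx(both_lo, 0) == 4);
      return 0;
    }
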
 typedef void (*find_mv_refs_sync)(void *const data, int mi_row);
 void vp10_find_mv_refs(const VP10_COMMON *cm, const MACROBLOCKD *xd,
-                      MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
-                      int_mv *mv_ref_list, int mi_row, int mi_col,
-                      find_mv_refs_sync sync, void *const data,
-                      uint8_t *mode_context);
+                       MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+#if CONFIG_REF_MV
+                       uint8_t *ref_mv_count,
+                       CANDIDATE_MV *ref_mv_stack,
+#if CONFIG_EXT_INTER
+                       int16_t *compound_mode_context,
+#endif  // CONFIG_EXT_INTER
+#endif
+                       int_mv *mv_ref_list, int mi_row, int mi_col,
+                       find_mv_refs_sync sync, void *const data,
+                       int16_t *mode_context);
 
 // check a list of motion vectors by sad score using a number rows of pixels
 // above and a number cols of pixels in the left to select the one with best
@@ -228,9 +366,24 @@
                            int_mv *mvlist, int_mv *nearest_mv, int_mv *near_mv);
 
 void vp10_append_sub8x8_mvs_for_idx(VP10_COMMON *cm, MACROBLOCKD *xd,
-                                   int block, int ref, int mi_row, int mi_col,
-                                   int_mv *nearest_mv, int_mv *near_mv,
-                                   uint8_t *mode_context);
+                                    int block, int ref, int mi_row, int mi_col,
+#if CONFIG_REF_MV
+                                    CANDIDATE_MV *ref_mv_stack,
+                                    uint8_t *ref_mv_count,
+#endif
+#if CONFIG_EXT_INTER
+                                    int_mv *mv_list,
+#endif  // CONFIG_EXT_INTER
+                                    int_mv *nearest_mv, int_mv *near_mv);
+
+#if CONFIG_EXT_INTER
+// This function keeps a mode count for a given MB/SB
+void vp10_update_mv_context(const MACROBLOCKD *xd,
+                            MODE_INFO *mi, MV_REFERENCE_FRAME ref_frame,
+                            int_mv *mv_ref_list,
+                            int block, int mi_row, int mi_col,
+                            int16_t *mode_context);
+#endif  // CONFIG_EXT_INTER
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/common/onyxc_int.h b/vp10/common/onyxc_int.h
index ffef733..b2d65b5 100644
--- a/vp10/common/onyxc_int.h
+++ b/vp10/common/onyxc_int.h
@@ -20,9 +20,11 @@
 #include "vp10/common/entropymv.h"
 #include "vp10/common/entropy.h"
 #include "vp10/common/entropymode.h"
+#include "vp10/common/mv.h"
 #include "vp10/common/frame_buffers.h"
 #include "vp10/common/quant_common.h"
 #include "vp10/common/tile_common.h"
+#include "vp10/common/restoration.h"
 
 #if CONFIG_VP9_POSTPROC
 #include "vp10/common/postproc.h"
@@ -65,10 +67,6 @@
 
 typedef enum {
   /**
-   * Don't update frame context
-   */
-  REFRESH_FRAME_CONTEXT_OFF,
-  /**
    * Update frame context to values resulting from forward probability
    * updates signaled in the frame header
    */
@@ -144,7 +142,8 @@
   int subsampling_y;
 
 #if CONFIG_VP9_HIGHBITDEPTH
-  int use_highbitdepth;  // Marks if we need to use 16bit frame buffers.
+  // Marks if we need to use 16bit frame buffers (1: yes, 0: no).
+  int use_highbitdepth;
 #endif
 
   YV12_BUFFER_CONFIG *frame_to_show;
@@ -171,13 +170,27 @@
   YV12_BUFFER_CONFIG post_proc_buffer;
   YV12_BUFFER_CONFIG post_proc_buffer_int;
 #endif
+#if CONFIG_LOOP_RESTORATION
+  YV12_BUFFER_CONFIG tmp_loop_buf;
+#endif  // CONFIG_LOOP_RESTORATION
 
   FRAME_TYPE last_frame_type;  /* last frame's frame type for motion search.*/
+#if CONFIG_EXT_REFS
+  // frame type of the frame before last frame
+  FRAME_TYPE last2_frame_type;
+  // TODO(zoeliu): Check whether last3_frame_type is still needed.
+  // frame type of the frame two frames before last frame
+  FRAME_TYPE last3_frame_type;
+#endif  // CONFIG_EXT_REFS
   FRAME_TYPE frame_type;
 
   int show_frame;
   int last_show_frame;
   int show_existing_frame;
+#if CONFIG_EXT_REFS
+  // Flag for a frame used as a reference - not written to the bitstream
+  int is_reference_frame;
+#endif  // CONFIG_EXT_REFS
 
   // Flag signaling that the frame is encoded using only INTRA modes.
   uint8_t intra_only;
@@ -185,6 +198,8 @@
 
   int allow_high_precision_mv;
 
+  int allow_screen_content_tools;
+
   // Flag signaling which frame contexts should be reset to default values.
   RESET_FRAME_CONTEXT_MODE reset_frame_context;
 
@@ -204,6 +219,10 @@
   int uv_ac_delta_q;
   int16_t y_dequant[MAX_SEGMENTS][2];
   int16_t uv_dequant[MAX_SEGMENTS][2];
+#if CONFIG_NEW_QUANT
+  dequant_val_type_nuq y_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES][COEF_BANDS];
+  dequant_val_type_nuq uv_dequant_nuq[MAX_SEGMENTS][QUANT_PROFILES][COEF_BANDS];
+#endif
 
   /* We allocate a MODE_INFO struct for each macroblock, together with
      an extra row on top and column on the left to simplify prediction. */
@@ -243,6 +262,10 @@
   INTERP_FILTER interp_filter;
 
   loop_filter_info_n lf_info;
+#if CONFIG_LOOP_RESTORATION
+  RestorationInfo rst_info;
+  RestorationInternal rst_internal;
+#endif  // CONFIG_LOOP_RESTORATION
 
   // Flag signaling how frame contexts should be updated at the end of
   // a frame decode
@@ -252,15 +275,17 @@
 
   struct loopfilter lf;
   struct segmentation seg;
-#if !CONFIG_MISC_FIXES
-  struct segmentation_probs segp;
-#endif
 
   int frame_parallel_decode;  // frame-based threading.
 
   // Context probabilities for reference frame prediction
+#if CONFIG_EXT_REFS
+  MV_REFERENCE_FRAME comp_fwd_ref[FWD_REFS];
+  MV_REFERENCE_FRAME comp_bwd_ref[BWD_REFS];
+#else
   MV_REFERENCE_FRAME comp_fixed_ref;
-  MV_REFERENCE_FRAME comp_var_ref[2];
+  MV_REFERENCE_FRAME comp_var_ref[COMP_REFS];
+#endif  // CONFIG_EXT_REFS
   REFERENCE_MODE reference_mode;
 
   FRAME_CONTEXT *fc;  /* this frame entropy */
@@ -268,6 +293,18 @@
   unsigned int  frame_context_idx; /* Context to use/update */
   FRAME_COUNTS counts;
 
+#if CONFIG_ENTROPY
+  // The initial probabilities for a frame, before any subframe backward update,
+  // and after forward update.
+  vp10_coeff_probs_model starting_coef_probs[TX_SIZES][PLANE_TYPES];
+  // Number of subframe backward updates already done
+  uint8_t coef_probs_update_idx;
+  // Signal if the backward update is subframe or end-of-frame
+  uint8_t partial_prob_update;
+  // Frame level flag to turn on/off subframe backward update
+  uint8_t do_subframe_update;
+#endif  // CONFIG_ENTROPY
+
   unsigned int current_video_frame;
   BITSTREAM_PROFILE profile;
 
@@ -281,8 +318,12 @@
 
   int error_resilient_mode;
 
+#if !CONFIG_EXT_TILE
   int log2_tile_cols, log2_tile_rows;
-  int tile_sz_mag;
+#endif  // !CONFIG_EXT_TILE
+  int tile_cols, tile_rows;
+  int tile_width, tile_height;  // In MI units
+
   int byte_alignment;
   int skip_loop_filter;
 
@@ -298,13 +339,21 @@
   BufferPool *buffer_pool;
 
   PARTITION_CONTEXT *above_seg_context;
-  ENTROPY_CONTEXT *above_context;
+  ENTROPY_CONTEXT *above_context[MAX_MB_PLANE];
+#if CONFIG_VAR_TX
+  TXFM_CONTEXT *above_txfm_context;
+  TXFM_CONTEXT left_txfm_context[MAX_MIB_SIZE];
+#endif
   int above_context_alloc_cols;
 
   // scratch memory for intraonly/keyframe forward updates from default tables
   // - this is intentionally not placed in FRAME_CONTEXT since it's reset upon
   // each keyframe and not used afterwards
   vpx_prob kf_y_prob[INTRA_MODES][INTRA_MODES][INTRA_MODES - 1];
+
+  BLOCK_SIZE sb_size;   // Size of the superblock used for this frame
+  int mib_size;         // Size of the superblock in units of MI blocks
+  int mib_size_log2;    // Log 2 of mib_size.
 } VP10_COMMON;
 
 // TODO(hkuang): Don't need to lock the whole pool after implementing atomic
@@ -334,7 +383,8 @@
   return &cm->buffer_pool->frame_bufs[cm->ref_frame_map[index]].buf;
 }
 
-static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(VP10_COMMON *cm) {
+static INLINE YV12_BUFFER_CONFIG *get_frame_new_buffer(
+    const VP10_COMMON *const cm) {
   return &cm->buffer_pool->frame_bufs[cm->new_fb_idx].buf;
 }
 
@@ -369,8 +419,12 @@
   bufs[new_idx].ref_count++;
 }
 
-static INLINE int mi_cols_aligned_to_sb(int n_mis) {
-  return ALIGN_POWER_OF_TWO(n_mis, MI_BLOCK_SIZE_LOG2);
+static INLINE int mi_cols_aligned_to_sb(const VP10_COMMON *cm) {
+  return ALIGN_POWER_OF_TWO(cm->mi_cols, cm->mib_size_log2);
+}
+
+static INLINE int mi_rows_aligned_to_sb(const VP10_COMMON *cm) {
+  return ALIGN_POWER_OF_TWO(cm->mi_rows, cm->mib_size_log2);
 }
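
The alignment helpers now round mi_cols/mi_rows up to the frame's configured
superblock size instead of a fixed MI_BLOCK_SIZE. A worked sketch of the
arithmetic, with the rounding macro restated locally in the same form as
vpx's ALIGN_POWER_OF_TWO:

    #include <assert.h>

    #define TOY_ALIGN_POWER_OF_TWO(value, n) \
      (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

    int main(void) {
      /* 64x64 superblocks: mib_size_log2 == 3 (8 MI units of 8 pels). */
      assert(TOY_ALIGN_POWER_OF_TWO(17, 3) == 24);
      /* 128x128 superblocks (CONFIG_EXT_PARTITION): mib_size_log2 == 4. */
      assert(TOY_ALIGN_POWER_OF_TWO(17, 4) == 32);
      return 0;
    }
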
 
 static INLINE int frame_is_intra_only(const VP10_COMMON *const cm) {
@@ -380,28 +434,36 @@
 static INLINE void vp10_init_macroblockd(VP10_COMMON *cm, MACROBLOCKD *xd,
                                         tran_low_t *dqcoeff) {
   int i;
-
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     xd->plane[i].dqcoeff = dqcoeff;
-    xd->above_context[i] = cm->above_context +
-        i * sizeof(*cm->above_context) * 2 * mi_cols_aligned_to_sb(cm->mi_cols);
-
+    xd->above_context[i] = cm->above_context[i];
     if (xd->plane[i].plane_type == PLANE_TYPE_Y) {
       memcpy(xd->plane[i].seg_dequant, cm->y_dequant, sizeof(cm->y_dequant));
+#if CONFIG_NEW_QUANT
+      memcpy(xd->plane[i].seg_dequant_nuq, cm->y_dequant_nuq,
+             sizeof(cm->y_dequant_nuq));
+#endif
     } else {
       memcpy(xd->plane[i].seg_dequant, cm->uv_dequant, sizeof(cm->uv_dequant));
+#if CONFIG_NEW_QUANT
+      memcpy(xd->plane[i].seg_dequant_nuq, cm->uv_dequant_nuq,
+             sizeof(cm->uv_dequant_nuq));
+#endif
     }
     xd->fc = cm->fc;
   }
 
   xd->above_seg_context = cm->above_seg_context;
+#if CONFIG_VAR_TX
+  xd->above_txfm_context = cm->above_txfm_context;
+#endif
   xd->mi_stride = cm->mi_stride;
   xd->error_info = &cm->error;
 }
 
 static INLINE void set_skip_context(MACROBLOCKD *xd, int mi_row, int mi_col) {
   const int above_idx = mi_col * 2;
-  const int left_idx = (mi_row * 2) & 15;
+  const int left_idx = (mi_row * 2) & MAX_MIB_MASK_2;
   int i;
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     struct macroblockd_plane *const pd = &xd->plane[i];
@@ -412,7 +474,7 @@
 
 static INLINE int calc_mi_size(int len) {
   // len is in mi units.
-  return len + MI_BLOCK_SIZE;
+  return len + MAX_MIB_SIZE;
 }
 
 static INLINE void set_mi_row_col(MACROBLOCKD *xd, const TileInfo *const tile,
@@ -425,7 +487,7 @@
   xd->mb_to_right_edge  = ((mi_cols - bw - mi_col) * MI_SIZE) * 8;
 
   // Are edges available for intra prediction?
-  xd->up_available    = (mi_row != 0);
+  xd->up_available    = (mi_row > tile->mi_row_start);
   xd->left_available  = (mi_col > tile->mi_col_start);
   if (xd->up_available) {
     xd->above_mi = xd->mi[-xd->mi_stride];
@@ -444,6 +506,19 @@
     xd->left_mi = NULL;
     xd->left_mbmi = NULL;
   }
+
+  xd->n8_h = bh;
+  xd->n8_w = bw;
+#if CONFIG_REF_MV
+  xd->is_sec_rect = 0;
+  if (xd->n8_w < xd->n8_h)
+    if (mi_col & (xd->n8_h - 1))
+      xd->is_sec_rect = 1;
+
+  if (xd->n8_w > xd->n8_h)
+    if (mi_row & (xd->n8_w - 1))
+      xd->is_sec_rect = 1;
+#endif
 }
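
The is_sec_rect flag set above marks a rectangular block that is the second
(right or lower) half of the square region containing it. The same bit tests,
restated as a standalone sketch with a numeric example:

    #include <assert.h>

    static int toy_is_sec_rect(int n8_w, int n8_h, int mi_row, int mi_col) {
      int is_sec_rect = 0;
      if (n8_w < n8_h && (mi_col & (n8_h - 1)))  /* right half of a tall pair */
        is_sec_rect = 1;
      if (n8_w > n8_h && (mi_row & (n8_w - 1)))  /* lower half of a wide pair */
        is_sec_rect = 1;
      return is_sec_rect;
    }

    int main(void) {
      /* 32x64 block (n8_w=4, n8_h=8): the one at mi_col 4 is the right half. */
      assert(!toy_is_sec_rect(4, 8, 0, 0));
      assert(toy_is_sec_rect(4, 8, 0, 4));
      return 0;
    }
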
 
 static INLINE const vpx_prob *get_y_mode_probs(const VP10_COMMON *cm,
@@ -461,8 +536,15 @@
                                             BLOCK_SIZE subsize,
                                             BLOCK_SIZE bsize) {
   PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
-  PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+  PARTITION_CONTEXT *const left_ctx =
+    xd->left_seg_context + (mi_row & MAX_MIB_MASK);
 
+#if CONFIG_EXT_PARTITION_TYPES
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  memset(above_ctx, partition_context_lookup[subsize].above, bw);
+  memset(left_ctx, partition_context_lookup[subsize].left, bh);
+#else
   // num_4x4_blocks_wide_lookup[bsize] / 2
   const int bs = num_8x8_blocks_wide_lookup[bsize];
 
@@ -471,13 +553,56 @@
   // bits of smaller block sizes to be zero.
   memset(above_ctx, partition_context_lookup[subsize].above, bs);
   memset(left_ctx, partition_context_lookup[subsize].left, bs);
+#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
+#if CONFIG_EXT_PARTITION_TYPES
+static INLINE void update_ext_partition_context(MACROBLOCKD *xd,
+                                                int mi_row, int mi_col,
+                                                BLOCK_SIZE subsize,
+                                                BLOCK_SIZE bsize,
+                                                PARTITION_TYPE partition) {
+  if (bsize >= BLOCK_8X8) {
+    const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+    BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+    switch (partition) {
+      case PARTITION_SPLIT:
+        if (bsize != BLOCK_8X8)
+          break;
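+        // fall through for BLOCK_8X8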
+      case PARTITION_NONE:
+      case PARTITION_HORZ:
+      case PARTITION_VERT:
+        update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+        break;
+      case PARTITION_HORZ_A:
+        update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+        update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize);
+        break;
+      case PARTITION_HORZ_B:
+        update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+        update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize);
+        break;
+      case PARTITION_VERT_A:
+        update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+        update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize);
+        break;
+      case PARTITION_VERT_B:
+        update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+        update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize);
+        break;
+      default:
+        assert(0 && "Invalid partition type");
+    }
+  }
+}
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
 static INLINE int partition_plane_context(const MACROBLOCKD *xd,
                                           int mi_row, int mi_col,
                                           BLOCK_SIZE bsize) {
   const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
-  const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+  const PARTITION_CONTEXT *left_ctx =
+    xd->left_seg_context + (mi_row & MAX_MIB_MASK);
   const int bsl = mi_width_log2_lookup[bsize];
   int above = (*above_ctx >> bsl) & 1 , left = (*left_ctx >> bsl) & 1;
 
@@ -487,6 +612,116 @@
   return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
 }
 
+static INLINE void vp10_zero_above_context(VP10_COMMON *const cm,
+                             int mi_col_start, int mi_col_end) {
+  const int width = mi_col_end - mi_col_start;
+
+  const int offset_y = 2 * mi_col_start;
+  const int width_y = 2 * width;
+  const int offset_uv = offset_y >> cm->subsampling_x;
+  const int width_uv = width_y >> cm->subsampling_x;
+
+  vp10_zero_array(cm->above_context[0] + offset_y, width_y);
+  vp10_zero_array(cm->above_context[1] + offset_uv, width_uv);
+  vp10_zero_array(cm->above_context[2] + offset_uv, width_uv);
+
+  vp10_zero_array(cm->above_seg_context + mi_col_start, width);
+
+#if CONFIG_VAR_TX
+  vp10_zero_array(cm->above_txfm_context + mi_col_start, width);
+#endif  // CONFIG_VAR_TX
+}
+
+static INLINE void vp10_zero_left_context(MACROBLOCKD *const xd) {
+  vp10_zero(xd->left_context);
+  vp10_zero(xd->left_seg_context);
+#if CONFIG_VAR_TX
+  vp10_zero(xd->left_txfm_context_buffer);
+#endif
+}
+
+#if CONFIG_VAR_TX
+static INLINE void set_txfm_ctx(TXFM_CONTEXT *txfm_ctx,
+                                TX_SIZE tx_size,
+                                int len) {
+  int i;
+  for (i = 0; i < len; ++i)
+    txfm_ctx[i] = tx_size;
+}
+
+static INLINE void txfm_partition_update(TXFM_CONTEXT *above_ctx,
+                                         TXFM_CONTEXT *left_ctx,
+                                         TX_SIZE tx_size) {
+  BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  int bs = num_8x8_blocks_high_lookup[bsize];
+  int i;
+  for (i = 0; i < bs; ++i) {
+    above_ctx[i] = tx_size;
+    left_ctx[i] = tx_size;
+  }
+}
+
+static INLINE int txfm_partition_context(TXFM_CONTEXT *above_ctx,
+                                         TXFM_CONTEXT *left_ctx,
+                                         TX_SIZE tx_size) {
+  int above = *above_ctx < tx_size;
+  int left = *left_ctx < tx_size;
+  return (tx_size - 1) * 3 + above + left;
+}
+#endif
+
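
txfm_partition_context() folds the two neighbouring transform-size contexts
into one of three sub-contexts per candidate tx_size. A sketch with tx_size as
a plain integer, assuming the usual vpx ordering (TX_8X8 = 1, TX_16X16 = 2):

    #include <assert.h>

    typedef unsigned char TOY_TXFM_CONTEXT;

    static int toy_txfm_partition_context(const TOY_TXFM_CONTEXT *above_ctx,
                                          const TOY_TXFM_CONTEXT *left_ctx,
                                          int tx_size) {
      const int above = *above_ctx < tx_size;  /* neighbour split finer */
      const int left = *left_ctx < tx_size;
      return (tx_size - 1) * 3 + above + left;
    }

    int main(void) {
      TOY_TXFM_CONTEXT above = 1, left = 2;  /* TX_8X8 above, TX_16X16 left */
      assert(toy_txfm_partition_context(&above, &left, 2) == 4);  /* 1*3+1+0 */
      return 0;
    }
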
+static INLINE PARTITION_TYPE get_partition(const VP10_COMMON *const cm,
+                                           const int mi_row,
+                                           const int mi_col,
+                                           const BLOCK_SIZE bsize) {
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols) {
+    return PARTITION_INVALID;
+  } else {
+    const int offset = mi_row * cm->mi_stride + mi_col;
+    MODE_INFO **mi = cm->mi_grid_visible + offset;
+    const MB_MODE_INFO *const mbmi = &mi[0]->mbmi;
+    const int bsl = b_width_log2_lookup[bsize];
+    const PARTITION_TYPE partition = partition_lookup[bsl][mbmi->sb_type];
+#if !CONFIG_EXT_PARTITION_TYPES
+    return partition;
+#else
+    const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+
+    assert(cm->mi_grid_visible[offset] == &cm->mi[offset]);
+
+    if (partition != PARTITION_NONE &&
+        bsize > BLOCK_8X8 &&
+        mi_row + hbs < cm->mi_rows &&
+        mi_col + hbs < cm->mi_cols) {
+      const BLOCK_SIZE h = get_subsize(bsize, PARTITION_HORZ_A);
+      const BLOCK_SIZE v = get_subsize(bsize, PARTITION_VERT_A);
+      const MB_MODE_INFO *const mbmi_right = &mi[hbs]->mbmi;
+      const MB_MODE_INFO *const mbmi_below = &mi[hbs * cm->mi_stride]->mbmi;
+      if (mbmi->sb_type == h) {
+        return mbmi_below->sb_type == h ? PARTITION_HORZ : PARTITION_HORZ_B;
+      } else if (mbmi->sb_type == v) {
+        return mbmi_right->sb_type == v ? PARTITION_VERT : PARTITION_VERT_B;
+      } else if (mbmi_below->sb_type == h) {
+        return PARTITION_HORZ_A;
+      } else if (mbmi_right->sb_type == v) {
+        return PARTITION_VERT_A;
+      } else {
+        return PARTITION_SPLIT;
+      }
+    }
+
+    return partition;
+#endif  // !CONFIG_EXT_PARTITION_TYPES
+  }
+}
+
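
With CONFIG_EXT_PARTITION_TYPES the stored sb_type alone is ambiguous, so
get_partition() above also inspects the sub-blocks half a step right and below
to separate HORZ/HORZ_A/HORZ_B and the VERT analogues. A table-free sketch of
just that decision chain, with block sizes as plain integers (h and v stand
for the HORZ_A and VERT_A subsizes):

    #include <assert.h>
    #include <string.h>

    /* Mirrors the decision chain in get_partition(), given the top-left,
     * right, and below sub-block sizes. */
    static const char *toy_ext_partition(int sb, int h, int v,
                                         int sb_right, int sb_below) {
      if (sb == h) return sb_below == h ? "HORZ" : "HORZ_B";
      if (sb == v) return sb_right == v ? "VERT" : "VERT_B";
      if (sb_below == h) return "HORZ_A";
      if (sb_right == v) return "VERT_A";
      return "SPLIT";
    }

    int main(void) {
      /* Top-left is a full-width half block and so is the one below: HORZ. */
      assert(!strcmp(toy_ext_partition(1, 1, 2, 0, 1), "HORZ"));
      /* Top-left full-width half, below split further: HORZ_B. */
      assert(!strcmp(toy_ext_partition(1, 1, 2, 0, 3), "HORZ_B"));
      return 0;
    }
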
+static INLINE void set_sb_size(VP10_COMMON *const cm,
+                               const BLOCK_SIZE sb_size) {
+  cm->sb_size = sb_size;
+  cm->mib_size = num_8x8_blocks_wide_lookup[cm->sb_size];
+  cm->mib_size_log2 = mi_width_log2_lookup[cm->sb_size];
+}
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/common/pred_common.c b/vp10/common/pred_common.c
index 236ae54..dd9be87 100644
--- a/vp10/common/pred_common.c
+++ b/vp10/common/pred_common.c
@@ -11,9 +11,63 @@
 
 #include "vp10/common/common.h"
 #include "vp10/common/pred_common.h"
+#include "vp10/common/reconinter.h"
 #include "vp10/common/seg_common.h"
 
 // Returns a context number for the given MB prediction signal
+#if CONFIG_DUAL_FILTER
+static INTERP_FILTER get_ref_filter_type(const MODE_INFO *mi,
+                                         const MACROBLOCKD *xd,
+                                         int dir,
+                                         MV_REFERENCE_FRAME ref_frame) {
+  INTERP_FILTER ref_type = SWITCHABLE_FILTERS;
+  const MB_MODE_INFO *ref_mbmi = &mi->mbmi;
+  int use_subpel[2] = {
+      has_subpel_mv_component(mi, xd, dir),
+      has_subpel_mv_component(mi, xd, dir + 2),
+  };
+
+  if (ref_mbmi->ref_frame[0] == ref_frame && use_subpel[0])
+    ref_type = ref_mbmi->interp_filter[(dir & 0x01)];
+  else if (ref_mbmi->ref_frame[1] == ref_frame && use_subpel[1])
+    ref_type = ref_mbmi->interp_filter[(dir & 0x01) + 2];
+
+  return ref_type;
+}
+
+int vp10_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir) {
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const int ctx_offset =
+      (mbmi->ref_frame[1] > INTRA_FRAME) * INTER_FILTER_COMP_OFFSET;
+  MV_REFERENCE_FRAME ref_frame = (dir < 2) ?
+      mbmi->ref_frame[0] : mbmi->ref_frame[1];
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  int filter_type_ctx = ctx_offset + (dir & 0x01) * INTER_FILTER_DIR_OFFSET;
+  int left_type = SWITCHABLE_FILTERS;
+  int above_type = SWITCHABLE_FILTERS;
+
+  if (xd->left_available)
+    left_type = get_ref_filter_type(xd->mi[-1], xd, dir, ref_frame);
+
+  if (xd->up_available)
+    above_type = get_ref_filter_type(xd->mi[-xd->mi_stride], xd,
+                                     dir, ref_frame);
+
+  if (left_type == above_type)
+    filter_type_ctx += left_type;
+  else if (left_type == SWITCHABLE_FILTERS && above_type != SWITCHABLE_FILTERS)
+    filter_type_ctx += above_type;
+  else if (left_type != SWITCHABLE_FILTERS && above_type == SWITCHABLE_FILTERS)
+    filter_type_ctx += left_type;
+  else
+    filter_type_ctx += SWITCHABLE_FILTERS;
+
+  return filter_type_ctx;
+}
+#else
 int vp10_get_pred_context_switchable_interp(const MACROBLOCKD *xd) {
   // Note:
   // The mode info data structure has a one element border above and to the
@@ -21,10 +75,10 @@
   // The prediction flags in these dummy entries are initialized to 0.
   const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
   const int left_type = xd->left_available && is_inter_block(left_mbmi) ?
-                            left_mbmi->interp_filter : SWITCHABLE_FILTERS;
+      left_mbmi->interp_filter : SWITCHABLE_FILTERS;
   const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
   const int above_type = xd->up_available && is_inter_block(above_mbmi) ?
-                             above_mbmi->interp_filter : SWITCHABLE_FILTERS;
+      above_mbmi->interp_filter : SWITCHABLE_FILTERS;
 
   if (left_type == above_type)
     return left_type;
@@ -35,6 +89,68 @@
   else
     return SWITCHABLE_FILTERS;
 }
+#endif
+
+#if CONFIG_EXT_INTRA
+// Obtain the reference filter type from the above/left neighbor blocks.
+static INTRA_FILTER get_ref_intra_filter(const MB_MODE_INFO *ref_mbmi) {
+  INTRA_FILTER ref_type = INTRA_FILTERS;
+
+  if (ref_mbmi->sb_type >= BLOCK_8X8) {
+    PREDICTION_MODE mode = ref_mbmi->mode;
+    if (is_inter_block(ref_mbmi)) {
+#if CONFIG_DUAL_FILTER
+      switch (ref_mbmi->interp_filter[0]) {
+#else
+      switch (ref_mbmi->interp_filter) {
+#endif
+        case EIGHTTAP_REGULAR:
+          ref_type = INTRA_FILTER_8TAP;
+          break;
+        case EIGHTTAP_SMOOTH:
+          ref_type = INTRA_FILTER_8TAP_SMOOTH;
+          break;
+        case MULTITAP_SHARP:
+          ref_type = INTRA_FILTER_8TAP_SHARP;
+          break;
+        case BILINEAR:
+          ref_type = INTRA_FILTERS;
+          break;
+        default:
+          break;
+      }
+    } else {
+      if (mode != DC_PRED && mode != TM_PRED) {
+        int p_angle = mode_to_angle_map[mode] +
+            ref_mbmi->angle_delta[0] * ANGLE_STEP;
+        if (vp10_is_intra_filter_switchable(p_angle)) {
+          ref_type = ref_mbmi->intra_filter;
+        }
+      }
+    }
+  }
+  return ref_type;
+}
+
+int vp10_get_pred_context_intra_interp(const MACROBLOCKD *xd) {
+  int left_type = INTRA_FILTERS, above_type = INTRA_FILTERS;
+
+  if (xd->left_available)
+    left_type = get_ref_intra_filter(xd->left_mbmi);
+
+  if (xd->up_available)
+    above_type = get_ref_intra_filter(xd->above_mbmi);
+
+  if (left_type == above_type)
+    return left_type;
+  else if (left_type == INTRA_FILTERS && above_type != INTRA_FILTERS)
+    return above_type;
+  else if (left_type != INTRA_FILTERS && above_type == INTRA_FILTERS)
+    return left_type;
+  else
+    return INTRA_FILTERS;
+}
+#endif  // CONFIG_EXT_INTRA
 
 // The mode info data structure has a one element border above and to the
 // left of the entries corresponding to real macroblocks.
@@ -61,8 +177,60 @@
   }
 }
 
+#if CONFIG_EXT_REFS
+
+#define CHECK_BWDREF_OR_ALTREF(ref_frame) \
+  (((ref_frame) == BWDREF_FRAME) || ((ref_frame) == ALTREF_FRAME))
+
 int vp10_get_reference_mode_context(const VP10_COMMON *cm,
-                                   const MACROBLOCKD *xd) {
+                                    const MACROBLOCKD *xd) {
+  int ctx;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  (void)cm;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  if (has_above && has_left) {  // both edges available
+    if (!has_second_ref(above_mbmi) && !has_second_ref(left_mbmi))
+      // neither edge uses comp pred (0/1)
+      ctx = CHECK_BWDREF_OR_ALTREF(above_mbmi->ref_frame[0]) ^
+            CHECK_BWDREF_OR_ALTREF(left_mbmi->ref_frame[0]);
+    else if (!has_second_ref(above_mbmi))
+      // one of two edges uses comp pred (2/3)
+      ctx = 2 + (CHECK_BWDREF_OR_ALTREF(above_mbmi->ref_frame[0]) ||
+                 !is_inter_block(above_mbmi));
+    else if (!has_second_ref(left_mbmi))
+      // one of two edges uses comp pred (2/3)
+      ctx = 2 + (CHECK_BWDREF_OR_ALTREF(left_mbmi->ref_frame[0]) ||
+                 !is_inter_block(left_mbmi));
+    else  // both edges use comp pred (4)
+      ctx = 4;
+  } else if (has_above || has_left) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+    if (!has_second_ref(edge_mbmi))
+      // edge does not use comp pred (0/1)
+      ctx = CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]);
+    else
+      // edge uses comp pred (3)
+      ctx = 3;
+  } else {  // no edges available (1)
+    ctx = 1;
+  }
+  assert(ctx >= 0 && ctx < COMP_INTER_CONTEXTS);
+  return ctx;
+}
+
+#else  // CONFIG_EXT_REFS
+
+int vp10_get_reference_mode_context(const VP10_COMMON *cm,
+                                    const MACROBLOCKD *xd) {
   int ctx;
   const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
   const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
@@ -103,9 +271,426 @@
   return ctx;
 }
 
+#endif  // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+
+// TODO(zoeliu): Optimize the context design for coding the reference frames.
+
+#define CHECK_LAST_OR_LAST2(ref_frame) \
+  (((ref_frame) == LAST_FRAME) || ((ref_frame) == LAST2_FRAME))
+
+#define CHECK_GOLDEN_OR_LAST3(ref_frame) \
+  (((ref_frame) == GOLDEN_FRAME) || ((ref_frame) == LAST3_FRAME))
+
+// Returns a context number for the given MB prediction signal
+// Signals whether the first reference frame for a compound mode is either
+// GOLDEN/LAST3 or LAST/LAST2.
+//
+// NOTE(zoeliu): This models the probability that ref_frame[0] is either
+//               GOLDEN_FRAME or LAST3_FRAME.
+int vp10_get_pred_context_comp_ref_p(const VP10_COMMON *cm,
+                                     const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  const int bwd_ref_sign_idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
+  const int fwd_ref_sign_idx = !bwd_ref_sign_idx;
+
+  if (above_in_image && left_in_image) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra (2)
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
+        pred_context = 1 +
+            2 * (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]));
+      else  // comp pred (1/3)
+        pred_context = 1 + 2 *
+            (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[fwd_ref_sign_idx]));
+    } else {  // inter/inter
+      const int l_sg = !has_second_ref(left_mbmi);
+      const int a_sg = !has_second_ref(above_mbmi);
+      const MV_REFERENCE_FRAME frfa = a_sg ?
+          above_mbmi->ref_frame[0] : above_mbmi->ref_frame[fwd_ref_sign_idx];
+      const MV_REFERENCE_FRAME frfl = l_sg ?
+          left_mbmi->ref_frame[0] : left_mbmi->ref_frame[fwd_ref_sign_idx];
+
+      if (frfa == frfl && CHECK_GOLDEN_OR_LAST3(frfa)) {
+        pred_context = 0;
+      } else if (l_sg && a_sg) {  // single/single
+        if ((CHECK_BWDREF_OR_ALTREF(frfa) && CHECK_LAST_OR_LAST2(frfl)) ||
+            (CHECK_BWDREF_OR_ALTREF(frfl) && CHECK_LAST_OR_LAST2(frfa))) {
+          pred_context = 4;
+        } else if (CHECK_GOLDEN_OR_LAST3(frfa) ||
+                   CHECK_GOLDEN_OR_LAST3(frfl)) {
+          pred_context = 1;
+        } else {
+          pred_context = 3;
+        }
+      } else if (l_sg || a_sg) {  // single/comp
+        const MV_REFERENCE_FRAME frfc = l_sg ? frfa : frfl;
+        const MV_REFERENCE_FRAME rfs = a_sg ? frfa : frfl;
+
+        if (CHECK_GOLDEN_OR_LAST3(frfc) && !CHECK_GOLDEN_OR_LAST3(rfs))
+          pred_context = 1;
+        else if (CHECK_GOLDEN_OR_LAST3(rfs) && !CHECK_GOLDEN_OR_LAST3(frfc))
+          pred_context = 2;
+        else
+          pred_context = 4;
+      } else {  // comp/comp
+        if ((CHECK_LAST_OR_LAST2(frfa) && CHECK_LAST_OR_LAST2(frfl))) {
+          pred_context = 4;
+        } else {
+          // NOTE(zoeliu): The following assert may be removed once confirmed.
+          assert(CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl));
+          pred_context = 2;
+        }
+      }
+    }
+  } else if (above_in_image || left_in_image) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi)) {
+      pred_context = 2;
+    } else {
+      if (has_second_ref(edge_mbmi))
+        pred_context = 4 *
+            (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[fwd_ref_sign_idx]));
+      else
+        pred_context = 3 * (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]));
+    }
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+  return pred_context;
+}
+
+// Returns a context number for the given MB prediction signal
+// Signals whether the first reference frame for a compound mode is LAST,
+// given that it is known to be either LAST or LAST2.
+//
+// NOTE(zoeliu): This models the probability that ref_frame[0] is LAST_FRAME,
+// given that it is either LAST_FRAME or LAST2_FRAME.
+int vp10_get_pred_context_comp_ref_p1(const VP10_COMMON *cm,
+                                      const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  const int bwd_ref_sign_idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
+  const int fwd_ref_sign_idx = !bwd_ref_sign_idx;
+
+  if (above_in_image && left_in_image) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra (2)
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != LAST_FRAME);
+      else  // comp pred (1/3)
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[fwd_ref_sign_idx]
+                                != LAST_FRAME);
+    } else {  // inter/inter
+      const int l_sg = !has_second_ref(left_mbmi);
+      const int a_sg = !has_second_ref(above_mbmi);
+      const MV_REFERENCE_FRAME frfa = a_sg ?
+          above_mbmi->ref_frame[0] : above_mbmi->ref_frame[fwd_ref_sign_idx];
+      const MV_REFERENCE_FRAME frfl = l_sg ?
+          left_mbmi->ref_frame[0] : left_mbmi->ref_frame[fwd_ref_sign_idx];
+
+      if (frfa == frfl && frfa == LAST_FRAME)
+        pred_context = 0;
+      else if (l_sg && a_sg) {  // single/single
+        if (frfa == LAST_FRAME || frfl == LAST_FRAME)
+          pred_context = 1;
+        else if (CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl))
+          pred_context = 2 + (frfa != frfl);
+        else if (frfa == frfl ||
+                 (CHECK_BWDREF_OR_ALTREF(frfa) && CHECK_BWDREF_OR_ALTREF(frfl)))
+          pred_context = 3;
+        else
+          pred_context = 4;
+      } else if (l_sg || a_sg) {  // single/comp
+        const MV_REFERENCE_FRAME frfc = l_sg ? frfa : frfl;
+        const MV_REFERENCE_FRAME rfs = a_sg ? frfa : frfl;
+
+        if (frfc == LAST_FRAME && rfs != LAST_FRAME)
+          pred_context = 1;
+        else if (rfs == LAST_FRAME && frfc != LAST_FRAME)
+          pred_context = 2;
+        else
+          pred_context = 3 +
+              (frfc == LAST2_FRAME || CHECK_GOLDEN_OR_LAST3(rfs));
+      } else {  // comp/comp
+        if (frfa == LAST_FRAME || frfl == LAST_FRAME)
+          pred_context = 2;
+        else
+          pred_context = 3 +
+              (CHECK_GOLDEN_OR_LAST3(frfa) || CHECK_GOLDEN_OR_LAST3(frfl));
+      }
+    }
+  } else if (above_in_image || left_in_image) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi)) {
+      pred_context = 2;
+    } else {
+      if (has_second_ref(edge_mbmi)) {
+        pred_context = 4 *
+            (edge_mbmi->ref_frame[fwd_ref_sign_idx] != LAST_FRAME);
+      } else {
+        if (edge_mbmi->ref_frame[0] == LAST_FRAME)
+          pred_context = 0;
+        else
+          pred_context = 2 + CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]);
+      }
+    }
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+  return pred_context;
+}
+
+// Returns a context number for the given MB prediction signal
+// Signals whether the first reference frame for a compound mode is GOLDEN,
+// given that it is known to be either GOLDEN or LAST3.
+//
+// NOTE(zoeliu): This models the probability that ref_frame[0] is GOLDEN_FRAME,
+// given that it is either GOLDEN_FRAME or LAST3_FRAME.
+int vp10_get_pred_context_comp_ref_p2(const VP10_COMMON *cm,
+                                      const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  const int bwd_ref_sign_idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
+  const int fwd_ref_sign_idx = !bwd_ref_sign_idx;
+
+  if (above_in_image && left_in_image) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra (2)
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] != GOLDEN_FRAME);
+      else  // comp pred (1/3)
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[fwd_ref_sign_idx]
+                                != GOLDEN_FRAME);
+    } else {  // inter/inter
+      const int l_sg = !has_second_ref(left_mbmi);
+      const int a_sg = !has_second_ref(above_mbmi);
+      const MV_REFERENCE_FRAME frfa = a_sg ?
+          above_mbmi->ref_frame[0] : above_mbmi->ref_frame[fwd_ref_sign_idx];
+      const MV_REFERENCE_FRAME frfl = l_sg ?
+          left_mbmi->ref_frame[0] : left_mbmi->ref_frame[fwd_ref_sign_idx];
+
+      if (frfa == frfl && frfa == GOLDEN_FRAME)
+        pred_context = 0;
+      else if (l_sg && a_sg) {  // single/single
+        if (frfa == GOLDEN_FRAME || frfl == GOLDEN_FRAME)
+          pred_context = 1;
+        else if (CHECK_LAST_OR_LAST2(frfa) || CHECK_LAST_OR_LAST2(frfl))
+          pred_context = 2 + (frfa != frfl);
+        else if (frfa == frfl ||
+                 (CHECK_BWDREF_OR_ALTREF(frfa) && CHECK_BWDREF_OR_ALTREF(frfl)))
+          pred_context = 3;
+        else
+          pred_context = 4;
+      } else if (l_sg || a_sg) {  // single/comp
+        const MV_REFERENCE_FRAME frfc = l_sg ? frfa : frfl;
+        const MV_REFERENCE_FRAME rfs = a_sg ? frfa : frfl;
+
+        if (frfc == GOLDEN_FRAME && rfs != GOLDEN_FRAME)
+          pred_context = 1;
+        else if (rfs == GOLDEN_FRAME && frfc != GOLDEN_FRAME)
+          pred_context = 2;
+        else
+          pred_context = 3 +
+              (frfc == LAST3_FRAME || CHECK_LAST_OR_LAST2(rfs));
+      } else {  // comp/comp
+        if (frfa == GOLDEN_FRAME || frfl == GOLDEN_FRAME)
+          pred_context = 2;
+        else
+          pred_context = 3 +
+              (CHECK_LAST_OR_LAST2(frfa) || CHECK_LAST_OR_LAST2(frfl));
+      }
+    }
+  } else if (above_in_image || left_in_image) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi)) {
+      pred_context = 2;
+    } else {
+      if (has_second_ref(edge_mbmi)) {
+        pred_context = 4 *
+            (edge_mbmi->ref_frame[fwd_ref_sign_idx] != GOLDEN_FRAME);
+      } else {
+        if (edge_mbmi->ref_frame[0] == GOLDEN_FRAME)
+          pred_context = 0;
+        else
+          pred_context = 2 + CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]);
+      }
+    }
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+  return pred_context;
+}
+
+// Returns a context number for the given MB prediction signal
+int vp10_get_pred_context_comp_bwdref_p(const VP10_COMMON *cm,
+                                        const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int above_in_image = xd->up_available;
+  const int left_in_image = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  const int bwd_ref_sign_idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
+  const int fwd_ref_sign_idx = !bwd_ref_sign_idx;
+
+  if (above_in_image && left_in_image) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra (2)
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+      if (!has_second_ref(edge_mbmi))  // single pred (1/3)
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[1] != cm->comp_bwd_ref[1]);
+      else  // comp pred (1/3)
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[bwd_ref_sign_idx]
+                                != cm->comp_bwd_ref[1]);
+    } else {  // inter/inter
+      const int l_comp = has_second_ref(left_mbmi);
+      const int a_comp = has_second_ref(above_mbmi);
+
+      const MV_REFERENCE_FRAME l_brf = l_comp ?
+          left_mbmi->ref_frame[bwd_ref_sign_idx] : NONE;
+      const MV_REFERENCE_FRAME a_brf = a_comp ?
+          above_mbmi->ref_frame[bwd_ref_sign_idx] : NONE;
+
+      const MV_REFERENCE_FRAME l_frf = !l_comp ?
+          left_mbmi->ref_frame[0] : left_mbmi->ref_frame[fwd_ref_sign_idx];
+      const MV_REFERENCE_FRAME a_frf = !a_comp ?
+          above_mbmi->ref_frame[0] : above_mbmi->ref_frame[fwd_ref_sign_idx];
+
+      if (l_comp && a_comp) {  // comp/comp
+        if (l_brf == a_brf && l_brf == cm->comp_bwd_ref[1]) {
+          pred_context = 0;
+        } else if (l_brf == cm->comp_bwd_ref[1] ||
+                   a_brf == cm->comp_bwd_ref[1]) {
+          pred_context = 1;
+        } else {
+          // NOTE: Backward ref should be either BWDREF or ALTREF.
+          assert(l_brf == a_brf && l_brf != cm->comp_bwd_ref[1]);
+          pred_context = 3;
+        }
+      } else if (!l_comp && !a_comp) {  // single/single
+        if (l_frf == a_frf && l_frf == cm->comp_bwd_ref[1]) {
+          pred_context = 0;
+        } else if (l_frf == cm->comp_bwd_ref[1] ||
+                   a_frf == cm->comp_bwd_ref[1]) {
+          pred_context = 1;
+        } else if (l_frf == a_frf) {
+          pred_context = 3;
+        } else {
+          assert(l_frf != a_frf &&
+                 l_frf != cm->comp_bwd_ref[1] && a_frf != cm->comp_bwd_ref[1]);
+          pred_context = 4;
+        }
+      } else {  // comp/single
+        assert((l_comp && !a_comp) || (!l_comp && a_comp));
+
+        if ((l_comp && l_brf == cm->comp_bwd_ref[1] &&
+             a_frf == cm->comp_bwd_ref[1]) ||
+            (a_comp && a_brf == cm->comp_bwd_ref[1] &&
+             l_frf == cm->comp_bwd_ref[1])) {
+          pred_context = 1;
+        } else if ((l_comp && l_brf == cm->comp_bwd_ref[1]) ||
+                   (a_comp && a_brf == cm->comp_bwd_ref[1]) ||
+                   (!l_comp && l_frf == cm->comp_bwd_ref[1]) ||
+                   (!a_comp && a_frf == cm->comp_bwd_ref[1])) {
+          pred_context = 2;
+        } else {
+          pred_context = 4;
+        }
+      }
+    }
+  } else if (above_in_image || left_in_image) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = above_in_image ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi)) {
+      pred_context = 2;
+    } else {
+      if (has_second_ref(edge_mbmi)) {
+        pred_context = 4 * (edge_mbmi->ref_frame[bwd_ref_sign_idx]
+                            != cm->comp_bwd_ref[1]);
+      } else {
+        pred_context = 3 * (edge_mbmi->ref_frame[0] != cm->comp_bwd_ref[1]);
+      }
+    }
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+
+  return pred_context;
+}
+
+#else  // CONFIG_EXT_REFS
+
 // Returns a context number for the given MB prediction signal
 int vp10_get_pred_context_comp_ref_p(const VP10_COMMON *cm,
-                                    const MACROBLOCKD *xd) {
+                                     const MACROBLOCKD *xd) {
   int pred_context;
   const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
   const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
@@ -186,6 +771,469 @@
   return pred_context;
 }
 
+#endif  // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+
+// For the bit to signal whether the single reference is an ALTREF_FRAME
+// or a BWDREF_FRAME.
+//
+// NOTE(zoeliu): This models the probability that ref_frame[0] is
+//               ALTREF_FRAME or BWDREF_FRAME.
+int vp10_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialized to 0.
+  if (has_above && has_left) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+
+      if (!has_second_ref(edge_mbmi))
+        pred_context = 4 * (!CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]));
+      else
+        pred_context = 1 + (!CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]) ||
+                            !CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[1]));
+    } else {  // inter/inter
+      const int above_has_second = has_second_ref(above_mbmi);
+      const int left_has_second  = has_second_ref(left_mbmi);
+
+      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+      if (above_has_second && left_has_second) {
+        pred_context = 1 + (!CHECK_BWDREF_OR_ALTREF(above0) ||
+                            !CHECK_BWDREF_OR_ALTREF(above1) ||
+                            !CHECK_BWDREF_OR_ALTREF(left0) ||
+                            !CHECK_BWDREF_OR_ALTREF(left1));
+      } else if (above_has_second || left_has_second) {
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+        if (!CHECK_BWDREF_OR_ALTREF(rfs))
+          pred_context = 3 + (!CHECK_BWDREF_OR_ALTREF(crf1) ||
+                              !CHECK_BWDREF_OR_ALTREF(crf2));
+        else
+          pred_context = !CHECK_BWDREF_OR_ALTREF(crf1) ||
+                         !CHECK_BWDREF_OR_ALTREF(crf2);
+      } else {
+        pred_context = 2 * (!CHECK_BWDREF_OR_ALTREF(above0)) +
+                       2 * (!CHECK_BWDREF_OR_ALTREF(left0));
+      }
+    }
+  } else if (has_above || has_left) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+    if (!is_inter_block(edge_mbmi)) {  // intra
+      pred_context = 2;
+    } else {  // inter
+      if (!has_second_ref(edge_mbmi))
+        pred_context = 4 * (!CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]));
+      else
+        pred_context = 1 + (!CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]) ||
+                            !CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[1]));
+    }
+  } else {  // no edges available
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
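+// Illustrative decode-side sketch (not part of this change; `r` is an
+// assumed vpx_reader): the context computed above indexes the frame-level
+// probability table:
+//   const int ctx = vp10_get_pred_context_single_ref_p1(xd);
+//   const int bit = vpx_read(r, cm->fc->single_ref_prob[ctx][0]);
+//   // bit == 1 selects the BWDREF/ALTREF branch of the reference tree.
+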
+// For the bit to signal whether the single reference is ALTREF_FRAME or
+// BWDREF_FRAME, given that it is one of these two choices.
+//
+// NOTE(zoeliu): The probability that ref_frame[0] is ALTREF_FRAME,
+// conditioned on it being either ALTREF_FRAME or BWDREF_FRAME.
+int vp10_get_pred_context_single_ref_p2(const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialised to 0.
+  if (has_above && has_left) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+      if (!has_second_ref(edge_mbmi)) {
+        if (!CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]))
+          pred_context = 3;
+        else
+          pred_context = 4 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME);
+      } else {
+        pred_context = 1 + 2 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME ||
+                                edge_mbmi->ref_frame[1] == BWDREF_FRAME);
+      }
+    } else {  // inter/inter
+      const int above_has_second = has_second_ref(above_mbmi);
+      const int left_has_second  = has_second_ref(left_mbmi);
+      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+      if (above_has_second && left_has_second) {
+        if (above0 == left0 && above1 == left1)
+          pred_context = 3 * (above0 == BWDREF_FRAME ||
+                              above1 == BWDREF_FRAME ||
+                              left0 == BWDREF_FRAME ||
+                              left1 == BWDREF_FRAME);
+        else
+          pred_context = 2;
+      } else if (above_has_second || left_has_second) {
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+        if (rfs == BWDREF_FRAME)
+          pred_context = 3 + (crf1 == BWDREF_FRAME || crf2 == BWDREF_FRAME);
+        else if (rfs == ALTREF_FRAME)
+          pred_context = (crf1 == BWDREF_FRAME || crf2 == BWDREF_FRAME);
+        else
+          pred_context = 1 + 2 * (crf1 == BWDREF_FRAME || crf2 == BWDREF_FRAME);
+      } else {
+        if (!CHECK_BWDREF_OR_ALTREF(above0) && !CHECK_BWDREF_OR_ALTREF(left0)) {
+          pred_context = 2 + (above0 == left0);
+        } else if (!CHECK_BWDREF_OR_ALTREF(above0) ||
+                   !CHECK_BWDREF_OR_ALTREF(left0)) {
+          const MV_REFERENCE_FRAME edge0 =
+              !CHECK_BWDREF_OR_ALTREF(above0) ? left0 : above0;
+          pred_context = 4 * (edge0 == BWDREF_FRAME);
+        } else {
+          pred_context = 2 * (above0 == BWDREF_FRAME) +
+                         2 * (left0  == BWDREF_FRAME);
+        }
+      }
+    }
+  } else if (has_above || has_left) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi) ||
+        (!CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]) &&
+         !has_second_ref(edge_mbmi)))
+      pred_context = 2;
+    else if (!has_second_ref(edge_mbmi))
+      pred_context = 4 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME);
+    else
+      pred_context = 3 * (edge_mbmi->ref_frame[0] == BWDREF_FRAME ||
+                          edge_mbmi->ref_frame[1] == BWDREF_FRAME);
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+// For the bit to signal whether the single reference is LAST3/GOLDEN or
+// LAST2/LAST, given that it is one of these two choices.
+//
+// NOTE(zoeliu): The probability that ref_frame[0] is LAST3/GOLDEN,
+// conditioned on it being one of LAST3/GOLDEN/LAST2/LAST.
+int vp10_get_pred_context_single_ref_p3(const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialised to 0.
+  if (has_above && has_left) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+      if (!has_second_ref(edge_mbmi)) {
+        if (CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]))
+          pred_context = 3;
+        else
+          pred_context = 4 * CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]);
+      } else {
+        pred_context = 1 +
+            2 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) ||
+                 CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1]));
+      }
+    } else {  // inter/inter
+      const int above_has_second = has_second_ref(above_mbmi);
+      const int left_has_second  = has_second_ref(left_mbmi);
+      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+      if (above_has_second && left_has_second) {
+        if (above0 == left0 && above1 == left1)
+          pred_context = 3 * (CHECK_LAST_OR_LAST2(above0) ||
+                              CHECK_LAST_OR_LAST2(above1) ||
+                              CHECK_LAST_OR_LAST2(left0) ||
+                              CHECK_LAST_OR_LAST2(left1));
+        else
+          pred_context = 2;
+      } else if (above_has_second || left_has_second) {
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+        if (CHECK_LAST_OR_LAST2(rfs))
+          pred_context = 3 + (CHECK_LAST_OR_LAST2(crf1) ||
+                              CHECK_LAST_OR_LAST2(crf2));
+        else if (CHECK_GOLDEN_OR_LAST3(rfs))
+          pred_context = (CHECK_LAST_OR_LAST2(crf1) ||
+                          CHECK_LAST_OR_LAST2(crf2));
+        else
+          pred_context = 1 + 2 * (CHECK_LAST_OR_LAST2(crf1) ||
+                                  CHECK_LAST_OR_LAST2(crf2));
+      } else {
+        if (CHECK_BWDREF_OR_ALTREF(above0) && CHECK_BWDREF_OR_ALTREF(left0)) {
+          pred_context = 2 + (above0 == left0);
+        } else if (CHECK_BWDREF_OR_ALTREF(above0) ||
+                   CHECK_BWDREF_OR_ALTREF(left0)) {
+          const MV_REFERENCE_FRAME edge0 =
+              CHECK_BWDREF_OR_ALTREF(above0) ? left0 : above0;
+          pred_context = 4 * CHECK_LAST_OR_LAST2(edge0);
+        } else {
+          pred_context = 2 * CHECK_LAST_OR_LAST2(above0) +
+                         2 * CHECK_LAST_OR_LAST2(left0);
+        }
+      }
+    }
+  } else if (has_above || has_left) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi) ||
+        (CHECK_BWDREF_OR_ALTREF(edge_mbmi->ref_frame[0]) &&
+         !has_second_ref(edge_mbmi)))
+      pred_context = 2;
+    else if (!has_second_ref(edge_mbmi))
+      pred_context = 4 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]));
+    else
+      pred_context = 3 * (CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) ||
+                          CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[1]));
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+// For the bit to signal whether the single reference is LAST2_FRAME or
+// LAST_FRAME, given that it is one of these two choices.
+//
+// NOTE(zoeliu): The probability that ref_frame[0] is LAST2_FRAME,
+// conditioned on it being either LAST2_FRAME or LAST_FRAME.
+int vp10_get_pred_context_single_ref_p4(const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialised to 0.
+  if (has_above && has_left) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+      if (!has_second_ref(edge_mbmi)) {
+        if (!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]))
+          pred_context = 3;
+        else
+          pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
+      } else {
+        pred_context = 1 +
+            2 * (edge_mbmi->ref_frame[0] == LAST_FRAME ||
+                 edge_mbmi->ref_frame[1] == LAST_FRAME);
+      }
+    } else {  // inter/inter
+      const int above_has_second = has_second_ref(above_mbmi);
+      const int left_has_second  = has_second_ref(left_mbmi);
+      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+      if (above_has_second && left_has_second) {
+        if (above0 == left0 && above1 == left1)
+          pred_context = 3 * (above0 == LAST_FRAME || above1 == LAST_FRAME ||
+                              left0 == LAST_FRAME || left1 == LAST_FRAME);
+        else
+          pred_context = 2;
+      } else if (above_has_second || left_has_second) {
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+        if (rfs == LAST_FRAME)
+          pred_context = 3 + (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+        else if (rfs == LAST2_FRAME)
+          pred_context = (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+        else
+          pred_context = 1 + 2 * (crf1 == LAST_FRAME || crf2 == LAST_FRAME);
+      } else {
+        if (!CHECK_LAST_OR_LAST2(above0) &&
+            !CHECK_LAST_OR_LAST2(left0)) {
+          pred_context = 2 + (above0 == left0);
+        } else if (!CHECK_LAST_OR_LAST2(above0) ||
+                   !CHECK_LAST_OR_LAST2(left0)) {
+          const MV_REFERENCE_FRAME edge0 =
+              !CHECK_LAST_OR_LAST2(above0) ? left0 : above0;
+          pred_context = 4 * (edge0 == LAST_FRAME);
+        } else {
+          pred_context = 2 * (above0 == LAST_FRAME) + 2 * (left0 == LAST_FRAME);
+        }
+      }
+    }
+  } else if (has_above || has_left) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi) ||
+        (!CHECK_LAST_OR_LAST2(edge_mbmi->ref_frame[0]) &&
+         !has_second_ref(edge_mbmi)))
+      pred_context = 2;
+    else if (!has_second_ref(edge_mbmi))
+      pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST_FRAME);
+    else
+      pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST_FRAME ||
+                          edge_mbmi->ref_frame[1] == LAST_FRAME);
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+// For the bit to signal whether the single reference is GOLDEN_FRAME or
+// LAST3_FRAME, given that it is one of these two choices.
+//
+// NOTE(zoeliu): The probability that ref_frame[0] is GOLDEN_FRAME,
+// conditioned on it being either GOLDEN_FRAME or LAST3_FRAME.
+int vp10_get_pred_context_single_ref_p5(const MACROBLOCKD *xd) {
+  int pred_context;
+  const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
+  const MB_MODE_INFO *const left_mbmi = xd->left_mbmi;
+  const int has_above = xd->up_available;
+  const int has_left = xd->left_available;
+
+  // Note:
+  // The mode info data structure has a one element border above and to the
+  // left of the entries corresponding to real macroblocks.
+  // The prediction flags in these dummy entries are initialised to 0.
+  if (has_above && has_left) {  // both edges available
+    const int above_intra = !is_inter_block(above_mbmi);
+    const int left_intra = !is_inter_block(left_mbmi);
+
+    if (above_intra && left_intra) {  // intra/intra
+      pred_context = 2;
+    } else if (above_intra || left_intra) {  // intra/inter or inter/intra
+      const MB_MODE_INFO *edge_mbmi = above_intra ? left_mbmi : above_mbmi;
+      if (!has_second_ref(edge_mbmi)) {
+        if (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]))
+          pred_context = 3;
+        else
+          pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME);
+      } else {
+        pred_context = 1 +
+            2 * (edge_mbmi->ref_frame[0] == LAST3_FRAME ||
+                 edge_mbmi->ref_frame[1] == LAST3_FRAME);
+      }
+    } else {  // inter/inter
+      const int above_has_second = has_second_ref(above_mbmi);
+      const int left_has_second  = has_second_ref(left_mbmi);
+      const MV_REFERENCE_FRAME above0 = above_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME above1 = above_mbmi->ref_frame[1];
+      const MV_REFERENCE_FRAME left0 = left_mbmi->ref_frame[0];
+      const MV_REFERENCE_FRAME left1 = left_mbmi->ref_frame[1];
+
+      if (above_has_second && left_has_second) {
+        if (above0 == left0 && above1 == left1)
+          pred_context = 3 * (above0 == LAST3_FRAME || above1 == LAST3_FRAME ||
+                              left0 == LAST3_FRAME || left1 == LAST3_FRAME);
+        else
+          pred_context = 2;
+      } else if (above_has_second || left_has_second) {
+        const MV_REFERENCE_FRAME rfs = !above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf1 = above_has_second ? above0 : left0;
+        const MV_REFERENCE_FRAME crf2 = above_has_second ? above1 : left1;
+
+        if (rfs == LAST3_FRAME)
+          pred_context = 3 + (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
+        else if (rfs == GOLDEN_FRAME)
+          pred_context = (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
+        else
+          pred_context = 1 + 2 * (crf1 == LAST3_FRAME || crf2 == LAST3_FRAME);
+      } else {
+        if (!CHECK_GOLDEN_OR_LAST3(above0) &&
+            !CHECK_GOLDEN_OR_LAST3(left0)) {
+          pred_context = 2 + (above0 == left0);
+        } else if (!CHECK_GOLDEN_OR_LAST3(above0) ||
+                   !CHECK_GOLDEN_OR_LAST3(left0)) {
+          const MV_REFERENCE_FRAME edge0 =
+              !CHECK_GOLDEN_OR_LAST3(above0) ? left0 : above0;
+          pred_context = 4 * (edge0 == LAST3_FRAME);
+        } else {
+          pred_context = 2 * (above0 == LAST3_FRAME) +
+                         2 * (left0 == LAST3_FRAME);
+        }
+      }
+    }
+  } else if (has_above || has_left) {  // one edge available
+    const MB_MODE_INFO *edge_mbmi = has_above ? above_mbmi : left_mbmi;
+
+    if (!is_inter_block(edge_mbmi) ||
+        (!CHECK_GOLDEN_OR_LAST3(edge_mbmi->ref_frame[0]) &&
+         !has_second_ref(edge_mbmi)))
+      pred_context = 2;
+    else if (!has_second_ref(edge_mbmi))
+      pred_context = 4 * (edge_mbmi->ref_frame[0] == LAST3_FRAME);
+    else
+      pred_context = 3 * (edge_mbmi->ref_frame[0] == LAST3_FRAME ||
+                          edge_mbmi->ref_frame[1] == LAST3_FRAME);
+  } else {  // no edges available (2)
+    pred_context = 2;
+  }
+
+  assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
+  return pred_context;
+}
+
+#else  // CONFIG_EXT_REFS
+
 int vp10_get_pred_context_single_ref_p1(const MACROBLOCKD *xd) {
   int pred_context;
   const MB_MODE_INFO *const above_mbmi = xd->above_mbmi;
@@ -303,7 +1351,7 @@
 
         if (rfs == GOLDEN_FRAME)
           pred_context = 3 + (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
-        else if (rfs == ALTREF_FRAME)
+        else if (rfs != GOLDEN_FRAME && rfs != LAST_FRAME)
           pred_context = crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME;
         else
           pred_context = 1 + 2 * (crf1 == GOLDEN_FRAME || crf2 == GOLDEN_FRAME);
@@ -315,8 +1363,8 @@
                                                                   : above0;
           pred_context = 4 * (edge0 == GOLDEN_FRAME);
         } else {
-          pred_context = 2 * (above0 == GOLDEN_FRAME) +
-                             2 * (left0 == GOLDEN_FRAME);
+          pred_context =
+              2 * (above0 == GOLDEN_FRAME) + 2 * (left0 == GOLDEN_FRAME);
         }
       }
     }
@@ -337,3 +1385,5 @@
   assert(pred_context >= 0 && pred_context < REF_CONTEXTS);
   return pred_context;
 }
+
+#endif  // CONFIG_EXT_REFS
diff --git a/vp10/common/pred_common.h b/vp10/common/pred_common.h
index d6d7146..d4ae980 100644
--- a/vp10/common/pred_common.h
+++ b/vp10/common/pred_common.h
@@ -66,7 +66,15 @@
   return cm->fc->skip_probs[vp10_get_skip_context(xd)];
 }
 
+#if CONFIG_DUAL_FILTER
+int vp10_get_pred_context_switchable_interp(const MACROBLOCKD *xd, int dir);
+#else
 int vp10_get_pred_context_switchable_interp(const MACROBLOCKD *xd);
+#endif
+
+#if CONFIG_EXT_INTRA
+int vp10_get_pred_context_intra_interp(const MACROBLOCKD *xd);
+#endif  // CONFIG_EXT_INTRA
 
 int vp10_get_intra_inter_context(const MACROBLOCKD *xd);
 
@@ -79,7 +87,7 @@
                                     const MACROBLOCKD *xd);
 
 static INLINE vpx_prob vp10_get_reference_mode_prob(const VP10_COMMON *cm,
-                                                   const MACROBLOCKD *xd) {
+                                                    const MACROBLOCKD *xd) {
   return cm->fc->comp_inter_prob[vp10_get_reference_mode_context(cm, xd)];
 }
 
@@ -87,25 +95,78 @@
                                     const MACROBLOCKD *xd);
 
 static INLINE vpx_prob vp10_get_pred_prob_comp_ref_p(const VP10_COMMON *cm,
-                                                    const MACROBLOCKD *xd) {
+                                                     const MACROBLOCKD *xd) {
   const int pred_context = vp10_get_pred_context_comp_ref_p(cm, xd);
-  return cm->fc->comp_ref_prob[pred_context];
+  return cm->fc->comp_ref_prob[pred_context][0];
 }
 
+#if CONFIG_EXT_REFS
+int vp10_get_pred_context_comp_ref_p1(const VP10_COMMON *cm,
+                                      const MACROBLOCKD *xd);
+
+static INLINE vpx_prob vp10_get_pred_prob_comp_ref_p1(const VP10_COMMON *cm,
+                                                     const MACROBLOCKD *xd) {
+  const int pred_context = vp10_get_pred_context_comp_ref_p1(cm, xd);
+  return cm->fc->comp_ref_prob[pred_context][1];
+}
+
+int vp10_get_pred_context_comp_ref_p2(const VP10_COMMON *cm,
+                                      const MACROBLOCKD *xd);
+
+static INLINE vpx_prob vp10_get_pred_prob_comp_ref_p2(const VP10_COMMON *cm,
+                                                     const MACROBLOCKD *xd) {
+  const int pred_context = vp10_get_pred_context_comp_ref_p2(cm, xd);
+  return cm->fc->comp_ref_prob[pred_context][2];
+}
+
+int vp10_get_pred_context_comp_bwdref_p(const VP10_COMMON *cm,
+                                        const MACROBLOCKD *xd);
+
+static INLINE vpx_prob vp10_get_pred_prob_comp_bwdref_p(const VP10_COMMON *cm,
+                                                        const MACROBLOCKD *xd) {
+  const int pred_context = vp10_get_pred_context_comp_bwdref_p(cm, xd);
+  return cm->fc->comp_bwdref_prob[pred_context][0];
+}
+
+#endif  // CONFIG_EXT_REFS
+
 int vp10_get_pred_context_single_ref_p1(const MACROBLOCKD *xd);
 
 static INLINE vpx_prob vp10_get_pred_prob_single_ref_p1(const VP10_COMMON *cm,
-                                                       const MACROBLOCKD *xd) {
+                                                        const MACROBLOCKD *xd) {
   return cm->fc->single_ref_prob[vp10_get_pred_context_single_ref_p1(xd)][0];
 }
 
 int vp10_get_pred_context_single_ref_p2(const MACROBLOCKD *xd);
 
 static INLINE vpx_prob vp10_get_pred_prob_single_ref_p2(const VP10_COMMON *cm,
-                                                       const MACROBLOCKD *xd) {
+                                                        const MACROBLOCKD *xd) {
   return cm->fc->single_ref_prob[vp10_get_pred_context_single_ref_p2(xd)][1];
 }
 
+#if CONFIG_EXT_REFS
+int vp10_get_pred_context_single_ref_p3(const MACROBLOCKD *xd);
+
+static INLINE vpx_prob vp10_get_pred_prob_single_ref_p3(const VP10_COMMON *cm,
+                                                        const MACROBLOCKD *xd) {
+  return cm->fc->single_ref_prob[vp10_get_pred_context_single_ref_p3(xd)][2];
+}
+
+int vp10_get_pred_context_single_ref_p4(const MACROBLOCKD *xd);
+
+static INLINE vpx_prob vp10_get_pred_prob_single_ref_p4(const VP10_COMMON *cm,
+                                                        const MACROBLOCKD *xd) {
+  return cm->fc->single_ref_prob[vp10_get_pred_context_single_ref_p4(xd)][3];
+}
+
+int vp10_get_pred_context_single_ref_p5(const MACROBLOCKD *xd);
+
+static INLINE vpx_prob vp10_get_pred_prob_single_ref_p5(const VP10_COMMON *cm,
+                                                        const MACROBLOCKD *xd) {
+  return cm->fc->single_ref_prob[vp10_get_pred_context_single_ref_p5(xd)][4];
+}
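+
+// Taken together (summarizing the declarations above), the single-reference
+// bits form a tree: p1 splits {LAST, LAST2, LAST3, GOLDEN} from
+// {BWDREF, ALTREF}; p2 splits ALTREF from BWDREF; p3 splits
+// {LAST3, GOLDEN} from {LAST, LAST2}; p4 splits LAST2 from LAST; and p5
+// splits GOLDEN from LAST3.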
+#endif  // CONFIG_EXT_REFS
+
 // Returns a context number for the given MB prediction signal
 // The mode info data structure has a one element border above and to the
 // left of the entries corresponding to real blocks.
@@ -129,41 +190,67 @@
   return (above_ctx + left_ctx) > max_tx_size;
 }
 
-static INLINE const vpx_prob *get_tx_probs(TX_SIZE max_tx_size, int ctx,
-                                           const struct tx_probs *tx_probs) {
-  switch (max_tx_size) {
-    case TX_8X8:
-      return tx_probs->p8x8[ctx];
-    case TX_16X16:
-      return tx_probs->p16x16[ctx];
-    case TX_32X32:
-      return tx_probs->p32x32[ctx];
-    default:
-      assert(0 && "Invalid max_tx_size.");
-      return NULL;
+#if CONFIG_VAR_TX
+static void update_tx_counts(VP10_COMMON *cm, MACROBLOCKD *xd,
+                             MB_MODE_INFO *mbmi, BLOCK_SIZE plane_bsize,
+                             TX_SIZE tx_size, int blk_row, int blk_col,
+                             TX_SIZE max_tx_size, int ctx) {
+  const struct macroblockd_plane *const pd = &xd->plane[0];
+  const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  if (tx_size == plane_tx_size) {
+    ++xd->counts->tx_size[max_tx_size - TX_8X8][ctx][tx_size];
+    mbmi->tx_size = tx_size;
+  } else {
+    int bsl = b_width_log2_lookup[bsize];
+    int i;
+
+    assert(bsl > 0);
+    --bsl;
+
+    for (i = 0; i < 4; ++i) {
+      const int offsetr = blk_row + ((i >> 1) << bsl);
+      const int offsetc = blk_col + ((i & 0x01) << bsl);
+
+      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide)
+        continue;
+      update_tx_counts(cm, xd, mbmi, plane_bsize,
+                       tx_size - 1, offsetr, offsetc, max_tx_size, ctx);
+    }
   }
 }
 
-static INLINE const vpx_prob *get_tx_probs2(TX_SIZE max_tx_size,
-                                            const MACROBLOCKD *xd,
-                                            const struct tx_probs *tx_probs) {
-  return get_tx_probs(max_tx_size, get_tx_size_context(xd), tx_probs);
-}
+static INLINE void inter_block_tx_count_update(VP10_COMMON *cm,
+                                               MACROBLOCKD *xd,
+                                               MB_MODE_INFO *mbmi,
+                                               BLOCK_SIZE plane_bsize,
+                                               int ctx) {
+  const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+  TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+  BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+  int bh = num_4x4_blocks_wide_lookup[txb_size];
+  int idx, idy;
 
-static INLINE unsigned int *get_tx_counts(TX_SIZE max_tx_size, int ctx,
-                                          struct tx_counts *tx_counts) {
-  switch (max_tx_size) {
-    case TX_8X8:
-      return tx_counts->p8x8[ctx];
-    case TX_16X16:
-      return tx_counts->p16x16[ctx];
-    case TX_32X32:
-      return tx_counts->p32x32[ctx];
-    default:
-      assert(0 && "Invalid max_tx_size.");
-      return NULL;
-  }
+  for (idy = 0; idy < mi_height; idy += bh)
+    for (idx = 0; idx < mi_width; idx += bh)
+      update_tx_counts(cm, xd, mbmi, plane_bsize, max_tx_size, idy, idx,
+                       max_tx_size, ctx);
 }
+#endif
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/common/quant_common.c b/vp10/common/quant_common.c
index edf7394..a1ce23e 100644
--- a/vp10/common/quant_common.c
+++ b/vp10/common/quant_common.c
@@ -9,8 +9,160 @@
  */
 
 #include "vp10/common/common.h"
+#include "vp10/common/entropy.h"
 #include "vp10/common/quant_common.h"
 #include "vp10/common/seg_common.h"
+#include "vp10/common/blockd.h"
+
+#if CONFIG_NEW_QUANT
+
+// Bin widths expressed as a fraction over 128 of the quant stepsize,
+// for the quantization bins 0-4.
+// So a value x indicates the bin is actually factor x/128 of the
+// nominal quantization step.  For the zero bin, the width is only
+// for one side of zero, so the actual width is twice that.
+//
+// Functions with "nuq" in the name correspond to "non-uniform quantization".
+// TODO(sarahparker, debargha): Optimize these tables
+
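+// Worked example (illustrative placeholder values): with knots
+// {64, 128, 128} and nominal step q, the zero bin covers q/2 on either
+// side of zero and the next two bins are each a full step q wide, i.e.
+// these placeholder tables reduce to a uniform quantizer.
+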
+typedef struct {
+  uint8_t knots[NUQ_KNOTS];   // bin boundary offsets
+  uint8_t doff;               // dequantization offset
+} qprofile_type;
+
+static const qprofile_type nuq_lossless[COEF_BANDS] = {
+  {{64, 128, 128}, 0},  // dc, band 0
+  {{64, 128, 128}, 0},  // band 1
+  {{64, 128, 128}, 0},  // band 2
+  {{64, 128, 128}, 0},  // band 3
+  {{64, 128, 128}, 0},  // band 4
+  {{64, 128, 128}, 0},  // band 5
+};
+
+static const qprofile_type nuq[QUANT_PROFILES][QUANT_RANGES][COEF_BANDS] = {
+  {
+    {
+      {{64, 128, 128}, 0},  // dc, band 0
+      {{64, 128, 128}, 0},  // band 1
+      {{64, 128, 128}, 0},  // band 2
+      {{64, 128, 128}, 0},  // band 3
+      {{64, 128, 128}, 0},  // band 4
+      {{64, 128, 128}, 0}   // band 5
+    }, {
+      {{64, 128, 128}, 0},  // dc, band 0
+      {{64, 128, 128}, 0},  // band 1
+      {{64, 128, 128}, 0},  // band 2
+      {{64, 128, 128}, 0},  // band 3
+      {{64, 128, 128}, 0},  // band 4
+      {{64, 128, 128}, 0}   // band 5
+    }
+  },
+#if QUANT_PROFILES > 1
+  {
+    {
+      {{64, 128, 128}, 0},  // dc, band 0
+      {{64, 128, 128}, 0},  // band 1
+      {{64, 128, 128}, 0},  // band 2
+      {{64, 128, 128}, 0},  // band 3
+      {{64, 128, 128}, 0},  // band 4
+      {{64, 128, 128}, 0}   // band 5
+    }, {
+      {{64, 128, 128}, 0},  // dc, band 0
+      {{64, 128, 128}, 0},  // band 1
+      {{64, 128, 128}, 0},  // band 2
+      {{64, 128, 128}, 0},  // band 3
+      {{64, 128, 128}, 0},  // band 4
+      {{64, 128, 128}, 0}   // band 5
+    }
+  },
+#if QUANT_PROFILES > 2
+  {
+    {
+      {{64, 128, 128}, 0},  // dc, band 0
+      {{64, 128, 128}, 0},  // band 1
+      {{64, 128, 128}, 0},  // band 2
+      {{64, 128, 128}, 0},  // band 3
+      {{64, 128, 128}, 0},  // band 4
+      {{64, 128, 128}, 0},  // band 5
+    }, {
+      {{64, 128, 128}, 0},  // dc, band 0
+      {{64, 128, 128}, 0},  // band 1
+      {{64, 128, 128}, 0},  // band 2
+      {{64, 128, 128}, 0},  // band 3
+      {{64, 128, 128}, 0},  // band 4
+      {{64, 128, 128}, 0},  // band 5
+    }
+  }
+#endif  // QUANT_PROFILES > 2
+#endif  // QUANT_PROFILES > 1
+};
+
+static INLINE int qrange_from_qindex(int qindex) {
+  // Return 1 for the high-quality range (qindex < 140), 0 otherwise.
+  return qindex < 140 ? 1 : 0;
+}
+
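+// e.g. (illustrative) qindex 139 selects the high-quality tables and
+// qindex 140 the low-quality ones; qindex 0 bypasses the range entirely
+// and uses nuq_lossless (see get_nuq_knots() below).
+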
+static const uint8_t *get_nuq_knots(int qindex, int band, int q_profile) {
+  if (!qindex)
+    return nuq_lossless[band].knots;
+  else
+    return nuq[q_profile][qrange_from_qindex(qindex)][band].knots;
+}
+
+static INLINE int16_t quant_to_doff_fixed(int qindex, int band,
+                                          int q_profile) {
+  if (!qindex)
+    return nuq_lossless[band].doff;
+  else
+    return nuq[q_profile][qrange_from_qindex(qindex)][band].doff;
+}
+
+// Compute the cumulative quantization bin boundaries.
+static INLINE void get_cuml_bins_nuq(int q, int qindex, int band,
+                                     tran_low_t *cuml_bins, int q_profile) {
+  const uint8_t *knots = get_nuq_knots(qindex, band, q_profile);
+  int16_t cuml_knots[NUQ_KNOTS];
+  int i;
+  cuml_knots[0] = knots[0];
+  for (i = 1; i < NUQ_KNOTS; ++i)
+    cuml_knots[i] = cuml_knots[i - 1] + knots[i];
+  for (i = 0; i < NUQ_KNOTS; ++i)
+    cuml_bins[i] = ROUND_POWER_OF_TWO(cuml_knots[i] * q, 7);
+}
+
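+// Worked example (illustrative): knots {64, 128, 128} accumulate to
+// {64, 192, 320}, so after the ROUND_POWER_OF_TWO(.., 7) scaling by
+// q/128 the cumulative bins come out near {q/2, 3*q/2, 5*q/2}.
+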
+void vp10_get_dequant_val_nuq(int q, int qindex, int band,
+                              tran_low_t *dq, tran_low_t *cuml_bins,
+                              int q_profile) {
+  const uint8_t *knots = get_nuq_knots(qindex, band, q_profile);
+  tran_low_t cuml_bins_[NUQ_KNOTS], *cuml_bins_ptr;
+  tran_low_t doff;
+  int i;
+  cuml_bins_ptr = (cuml_bins ? cuml_bins : cuml_bins_);
+  get_cuml_bins_nuq(q, qindex, band, cuml_bins_ptr, q_profile);
+  dq[0] = 0;
+  for (i = 1; i < NUQ_KNOTS; ++i) {
+    doff = quant_to_doff_fixed(qindex, band, q_profile);
+    doff = ROUND_POWER_OF_TWO(doff * knots[i], 7);
+    dq[i] = cuml_bins_ptr[i - 1] +
+        ROUND_POWER_OF_TWO((knots[i] - doff * 2) * q, 8);
+  }
+  doff = quant_to_doff_fixed(qindex, band, q_profile);
+  dq[NUQ_KNOTS] =
+      cuml_bins_ptr[NUQ_KNOTS - 1] + ROUND_POWER_OF_TWO((64 - doff) * q, 7);
+}
+
+tran_low_t vp10_dequant_abscoeff_nuq(int v, int q, const tran_low_t *dq) {
+  if (v <= NUQ_KNOTS)
+    return dq[v];
+  else
+    return dq[NUQ_KNOTS] + (v - NUQ_KNOTS) * q;
+}
+
+tran_low_t vp10_dequant_coeff_nuq(int v, int q, const tran_low_t *dq) {
+  tran_low_t dqmag = vp10_dequant_abscoeff_nuq(abs(v), q, dq);
+  return (v < 0 ? -dqmag : dqmag);
+}
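+
+// Minimal usage sketch (hedged; caller-side setup assumed): build the
+// dequant table once per band, then map coded levels back to coefficients:
+//   tran_low_t dq[NUQ_KNOTS + 1];
+//   vp10_get_dequant_val_nuq(q, qindex, band, dq, NULL, dq_profile);
+//   const tran_low_t coeff = vp10_dequant_coeff_nuq(level, q, dq);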
+#endif  // CONFIG_NEW_QUANT
 
 static const int16_t dc_qlookup[QINDEX_RANGE] = {
   4,       8,    8,    9,   10,   11,   12,   12,
diff --git a/vp10/common/quant_common.h b/vp10/common/quant_common.h
index 6813e17..d955796 100644
--- a/vp10/common/quant_common.h
+++ b/vp10/common/quant_common.h
@@ -29,6 +29,25 @@
 int vp10_get_qindex(const struct segmentation *seg, int segment_id,
                    int base_qindex);
 
+#if CONFIG_NEW_QUANT
+
+#define QUANT_PROFILES 3
+#define QUANT_RANGES   2
+#define NUQ_KNOTS      3
+
+typedef tran_low_t dequant_val_type_nuq[NUQ_KNOTS + 1];
+typedef tran_low_t cuml_bins_type_nuq[NUQ_KNOTS];
+void vp10_get_dequant_val_nuq(int q, int qindex, int band,
+                              tran_low_t *dq, tran_low_t *cuml_bins,
+                              int dq_off_index);
+tran_low_t vp10_dequant_abscoeff_nuq(int v, int q, const tran_low_t *dq);
+tran_low_t vp10_dequant_coeff_nuq(int v, int q, const tran_low_t *dq);
+
+static INLINE int get_dq_profile_from_ctx(int q_ctx) {
+  return VPXMIN(q_ctx, QUANT_PROFILES - 1);
+}
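+// e.g. (illustrative) q_ctx values 0, 1, 2, 3, ... map to dq profiles
+// 0, 1, 2, 2, ... given QUANT_PROFILES == 3.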
+#endif  // CONFIG_NEW_QUANT
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/common/reconinter.c b/vp10/common/reconinter.c
index fdcb967..d2fc980 100644
--- a/vp10/common/reconinter.c
+++ b/vp10/common/reconinter.c
@@ -11,13 +11,626 @@
 #include <assert.h>
 
 #include "./vpx_scale_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
 #include "./vpx_config.h"
 
 #include "vpx/vpx_integer.h"
+#include "vpx_dsp/blend.h"
 
 #include "vp10/common/blockd.h"
 #include "vp10/common/reconinter.h"
 #include "vp10/common/reconintra.h"
+#if CONFIG_OBMC
+#include "vp10/common/onyxc_int.h"
+#endif  // CONFIG_OBMC
+
+#if CONFIG_EXT_INTER
+
+#define NSMOOTHERS  1
+static int get_masked_weight(int m, int smoothness) {
+#define SMOOTHER_LEN  32
+  static const uint8_t smoothfn[NSMOOTHERS][2 * SMOOTHER_LEN + 1] = {
+    {
+      0,  0,  0,  0,  0,  0,  0,  0,
+      0,  0,  0,  0,  0,  0,  0,  0,
+      0,  0,  0,  0,  0,  0,  0,  0,
+      0,  0,  1,  2,  4,  7, 13, 21,
+      32,
+      43, 51, 57, 60, 62, 63, 64, 64,
+      64, 64, 64, 64, 64, 64, 64, 64,
+      64, 64, 64, 64, 64, 64, 64, 64,
+      64, 64, 64, 64, 64, 64, 64, 64,
+    }
+  };
+  if (m < -SMOOTHER_LEN)
+    return 0;
+  else if (m > SMOOTHER_LEN)
+    return (1 << WEDGE_WEIGHT_BITS);
+  else
+    return smoothfn[smoothness][m + SMOOTHER_LEN];
+}
+
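+// Reading the table above (illustrative): m <= -SMOOTHER_LEN gives weight
+// 0, m = 0 gives the midpoint 32, and m >= SMOOTHER_LEN gives the full
+// weight (1 << WEDGE_WEIGHT_BITS), i.e. 64 here, so the mask ramps
+// smoothly across the wedge boundary.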
+
+// [smoother][negative][direction]
+DECLARE_ALIGNED(
+    16, static uint8_t,
+    wedge_mask_obl[NSMOOTHERS][2][WEDGE_DIRECTIONS]
+                  [MASK_MASTER_SIZE * MASK_MASTER_SIZE]);
+
+DECLARE_ALIGNED(
+    16, static uint8_t,
+    wedge_signflip_lookup[BLOCK_SIZES][MAX_WEDGE_TYPES]);
+
+// 3 * MAX_WEDGE_SQUARE is an easy-to-compute and fairly tight upper bound
+// on the sum of all mask sizes up to and including MAX_WEDGE_SQUARE.
+DECLARE_ALIGNED(
+    16, static uint8_t,
+    wedge_mask_buf[2 * MAX_WEDGE_TYPES * 3 * MAX_WEDGE_SQUARE]);
+
+static wedge_masks_type wedge_masks[BLOCK_SIZES][2];
+
+// Some unused wedge codebooks are left in temporarily to facilitate
+// experiments. To be removed when settled.
+static wedge_code_type wedge_codebook_8_hgtw[8] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_OBLIQUE27,  4, 2},
+    {WEDGE_OBLIQUE27,  4, 6},
+    {WEDGE_OBLIQUE153, 4, 2},
+    {WEDGE_OBLIQUE153, 4, 6},
+};
+
+static wedge_code_type wedge_codebook_8_hltw[8] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_OBLIQUE63,  2, 4},
+    {WEDGE_OBLIQUE63,  6, 4},
+    {WEDGE_OBLIQUE117, 2, 4},
+    {WEDGE_OBLIQUE117, 6, 4},
+};
+
+static wedge_code_type wedge_codebook_8_heqw[8] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_HORIZONTAL, 4, 2},
+    {WEDGE_HORIZONTAL, 4, 6},
+    {WEDGE_VERTICAL,   2, 4},
+    {WEDGE_VERTICAL,   6, 4},
+};
+
+#if !USE_LARGE_WEDGE_CODEBOOK
+static const wedge_code_type wedge_codebook_16_hgtw[16] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_HORIZONTAL, 4, 2},
+    {WEDGE_HORIZONTAL, 4, 4},
+    {WEDGE_HORIZONTAL, 4, 6},
+    {WEDGE_VERTICAL,   4, 4},
+    {WEDGE_OBLIQUE27,  4, 2},
+    {WEDGE_OBLIQUE27,  4, 6},
+    {WEDGE_OBLIQUE153, 4, 2},
+    {WEDGE_OBLIQUE153, 4, 6},
+    {WEDGE_OBLIQUE63,  2, 4},
+    {WEDGE_OBLIQUE63,  6, 4},
+    {WEDGE_OBLIQUE117, 2, 4},
+    {WEDGE_OBLIQUE117, 6, 4},
+};
+
+static const wedge_code_type wedge_codebook_16_hltw[16] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_VERTICAL,   2, 4},
+    {WEDGE_VERTICAL,   4, 4},
+    {WEDGE_VERTICAL,   6, 4},
+    {WEDGE_HORIZONTAL, 4, 4},
+    {WEDGE_OBLIQUE27,  4, 2},
+    {WEDGE_OBLIQUE27,  4, 6},
+    {WEDGE_OBLIQUE153, 4, 2},
+    {WEDGE_OBLIQUE153, 4, 6},
+    {WEDGE_OBLIQUE63,  2, 4},
+    {WEDGE_OBLIQUE63,  6, 4},
+    {WEDGE_OBLIQUE117, 2, 4},
+    {WEDGE_OBLIQUE117, 6, 4},
+};
+
+static const wedge_code_type wedge_codebook_16_heqw[16] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_HORIZONTAL, 4, 2},
+    {WEDGE_HORIZONTAL, 4, 6},
+    {WEDGE_VERTICAL,   2, 4},
+    {WEDGE_VERTICAL,   6, 4},
+    {WEDGE_OBLIQUE27,  4, 2},
+    {WEDGE_OBLIQUE27,  4, 6},
+    {WEDGE_OBLIQUE153, 4, 2},
+    {WEDGE_OBLIQUE153, 4, 6},
+    {WEDGE_OBLIQUE63,  2, 4},
+    {WEDGE_OBLIQUE63,  6, 4},
+    {WEDGE_OBLIQUE117, 2, 4},
+    {WEDGE_OBLIQUE117, 6, 4},
+};
+
+const wedge_params_type wedge_params_lookup[BLOCK_SIZES] = {
+  {0, NULL, NULL, 0, NULL},
+  {0, NULL, NULL, 0, NULL},
+  {0, NULL, NULL, 0, NULL},
+  {4, wedge_codebook_16_heqw, wedge_signflip_lookup[3], 0, wedge_masks[3]},
+  {4, wedge_codebook_16_hgtw, wedge_signflip_lookup[4], 0, wedge_masks[4]},
+  {4, wedge_codebook_16_hltw, wedge_signflip_lookup[5], 0, wedge_masks[5]},
+  {4, wedge_codebook_16_heqw, wedge_signflip_lookup[6], 0, wedge_masks[6]},
+  {4, wedge_codebook_16_hgtw, wedge_signflip_lookup[7], 0, wedge_masks[7]},
+  {4, wedge_codebook_16_hltw, wedge_signflip_lookup[8], 0, wedge_masks[8]},
+  {4, wedge_codebook_16_heqw, wedge_signflip_lookup[9], 0, wedge_masks[9]},
+  {0, wedge_codebook_8_hgtw, wedge_signflip_lookup[10], 0, wedge_masks[10]},
+  {0, wedge_codebook_8_hltw, wedge_signflip_lookup[11], 0, wedge_masks[11]},
+  {0, wedge_codebook_8_heqw, wedge_signflip_lookup[12], 0, wedge_masks[12]},
+#if CONFIG_EXT_PARTITION
+  {0, NULL, NULL, 0, NULL},
+  {0, NULL, NULL, 0, NULL},
+  {0, NULL, NULL, 0, NULL},
+#endif  // CONFIG_EXT_PARTITION
+};
+
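+// Field order above (hedged reading of the wedge_params_type initializers):
+// {bits, codebook, signflip, smoother, masks}. A `bits` value of 4 yields
+// the 16-entry codebooks used here; 0 disables wedge prediction for that
+// block size.
+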
+#else
+
+static const wedge_code_type wedge_codebook_32_hgtw[32] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_HORIZONTAL, 4, 2},
+    {WEDGE_HORIZONTAL, 4, 4},
+    {WEDGE_HORIZONTAL, 4, 6},
+    {WEDGE_VERTICAL,   4, 4},
+    {WEDGE_OBLIQUE27,  4, 1},
+    {WEDGE_OBLIQUE27,  4, 2},
+    {WEDGE_OBLIQUE27,  4, 3},
+    {WEDGE_OBLIQUE27,  4, 5},
+    {WEDGE_OBLIQUE27,  4, 6},
+    {WEDGE_OBLIQUE27,  4, 7},
+    {WEDGE_OBLIQUE153, 4, 1},
+    {WEDGE_OBLIQUE153, 4, 2},
+    {WEDGE_OBLIQUE153, 4, 3},
+    {WEDGE_OBLIQUE153, 4, 5},
+    {WEDGE_OBLIQUE153, 4, 6},
+    {WEDGE_OBLIQUE153, 4, 7},
+    {WEDGE_OBLIQUE63,  1, 4},
+    {WEDGE_OBLIQUE63,  2, 4},
+    {WEDGE_OBLIQUE63,  3, 4},
+    {WEDGE_OBLIQUE63,  5, 4},
+    {WEDGE_OBLIQUE63,  6, 4},
+    {WEDGE_OBLIQUE63,  7, 4},
+    {WEDGE_OBLIQUE117, 1, 4},
+    {WEDGE_OBLIQUE117, 2, 4},
+    {WEDGE_OBLIQUE117, 3, 4},
+    {WEDGE_OBLIQUE117, 5, 4},
+    {WEDGE_OBLIQUE117, 6, 4},
+    {WEDGE_OBLIQUE117, 7, 4},
+};
+
+static const wedge_code_type wedge_codebook_32_hltw[32] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_VERTICAL,   2, 4},
+    {WEDGE_VERTICAL,   4, 4},
+    {WEDGE_VERTICAL,   6, 4},
+    {WEDGE_HORIZONTAL, 4, 4},
+    {WEDGE_OBLIQUE27,  4, 1},
+    {WEDGE_OBLIQUE27,  4, 2},
+    {WEDGE_OBLIQUE27,  4, 3},
+    {WEDGE_OBLIQUE27,  4, 5},
+    {WEDGE_OBLIQUE27,  4, 6},
+    {WEDGE_OBLIQUE27,  4, 7},
+    {WEDGE_OBLIQUE153, 4, 1},
+    {WEDGE_OBLIQUE153, 4, 2},
+    {WEDGE_OBLIQUE153, 4, 3},
+    {WEDGE_OBLIQUE153, 4, 5},
+    {WEDGE_OBLIQUE153, 4, 6},
+    {WEDGE_OBLIQUE153, 4, 7},
+    {WEDGE_OBLIQUE63,  1, 4},
+    {WEDGE_OBLIQUE63,  2, 4},
+    {WEDGE_OBLIQUE63,  3, 4},
+    {WEDGE_OBLIQUE63,  5, 4},
+    {WEDGE_OBLIQUE63,  6, 4},
+    {WEDGE_OBLIQUE63,  7, 4},
+    {WEDGE_OBLIQUE117, 1, 4},
+    {WEDGE_OBLIQUE117, 2, 4},
+    {WEDGE_OBLIQUE117, 3, 4},
+    {WEDGE_OBLIQUE117, 5, 4},
+    {WEDGE_OBLIQUE117, 6, 4},
+    {WEDGE_OBLIQUE117, 7, 4},
+};
+
+static const wedge_code_type wedge_codebook_32_heqw[32] = {
+    {WEDGE_OBLIQUE27,  4, 4},
+    {WEDGE_OBLIQUE63,  4, 4},
+    {WEDGE_OBLIQUE117, 4, 4},
+    {WEDGE_OBLIQUE153, 4, 4},
+    {WEDGE_HORIZONTAL, 4, 2},
+    {WEDGE_HORIZONTAL, 4, 6},
+    {WEDGE_VERTICAL,   2, 4},
+    {WEDGE_VERTICAL,   6, 4},
+    {WEDGE_OBLIQUE27,  4, 1},
+    {WEDGE_OBLIQUE27,  4, 2},
+    {WEDGE_OBLIQUE27,  4, 3},
+    {WEDGE_OBLIQUE27,  4, 5},
+    {WEDGE_OBLIQUE27,  4, 6},
+    {WEDGE_OBLIQUE27,  4, 7},
+    {WEDGE_OBLIQUE153, 4, 1},
+    {WEDGE_OBLIQUE153, 4, 2},
+    {WEDGE_OBLIQUE153, 4, 3},
+    {WEDGE_OBLIQUE153, 4, 5},
+    {WEDGE_OBLIQUE153, 4, 6},
+    {WEDGE_OBLIQUE153, 4, 7},
+    {WEDGE_OBLIQUE63,  1, 4},
+    {WEDGE_OBLIQUE63,  2, 4},
+    {WEDGE_OBLIQUE63,  3, 4},
+    {WEDGE_OBLIQUE63,  5, 4},
+    {WEDGE_OBLIQUE63,  6, 4},
+    {WEDGE_OBLIQUE63,  7, 4},
+    {WEDGE_OBLIQUE117, 1, 4},
+    {WEDGE_OBLIQUE117, 2, 4},
+    {WEDGE_OBLIQUE117, 3, 4},
+    {WEDGE_OBLIQUE117, 5, 4},
+    {WEDGE_OBLIQUE117, 6, 4},
+    {WEDGE_OBLIQUE117, 7, 4},
+};
+
+const wedge_params_type wedge_params_lookup[BLOCK_SIZES] = {
+  {0, NULL, NULL, 0, NULL},
+  {0, NULL, NULL, 0, NULL},
+  {0, NULL, NULL, 0, NULL},
+  {5, wedge_codebook_32_heqw, wedge_signflip_lookup[3], 0, wedge_masks[3]},
+  {5, wedge_codebook_32_hgtw, wedge_signflip_lookup[4], 0, wedge_masks[4]},
+  {5, wedge_codebook_32_hltw, wedge_signflip_lookup[5], 0, wedge_masks[5]},
+  {5, wedge_codebook_32_heqw, wedge_signflip_lookup[6], 0, wedge_masks[6]},
+  {5, wedge_codebook_32_hgtw, wedge_signflip_lookup[7], 0, wedge_masks[7]},
+  {5, wedge_codebook_32_hltw, wedge_signflip_lookup[8], 0, wedge_masks[8]},
+  {5, wedge_codebook_32_heqw, wedge_signflip_lookup[9], 0, wedge_masks[9]},
+  {0, wedge_codebook_8_hgtw, wedge_signflip_lookup[10], 0, wedge_masks[10]},
+  {0, wedge_codebook_8_hltw, wedge_signflip_lookup[11], 0, wedge_masks[11]},
+  {0, wedge_codebook_8_heqw, wedge_signflip_lookup[12], 0, wedge_masks[12]},
+#if CONFIG_EXT_PARTITION
+  {0, NULL, NULL, 0, NULL},
+  {0, NULL, NULL, 0, NULL},
+  {0, NULL, NULL, 0, NULL},
+#endif  // CONFIG_EXT_PARTITION
+};
+#endif  // USE_LARGE_WEDGE_CODEBOOK
+
+static const uint8_t *get_wedge_mask_inplace(int wedge_index,
+                                             int neg,
+                                             BLOCK_SIZE sb_type) {
+  const uint8_t *master;
+  const int bh = 4 << b_height_log2_lookup[sb_type];
+  const int bw = 4 << b_width_log2_lookup[sb_type];
+  const wedge_code_type *a =
+      wedge_params_lookup[sb_type].codebook + wedge_index;
+  const int smoother = wedge_params_lookup[sb_type].smoother;
+  int woff, hoff;
+  const uint8_t wsignflip = wedge_params_lookup[sb_type].signflip[wedge_index];
+
+  assert(wedge_index >= 0 &&
+         wedge_index < (1 << get_wedge_bits_lookup(sb_type)));
+  woff = (a->x_offset * bw) >> 3;
+  hoff = (a->y_offset * bh) >> 3;
+  master = wedge_mask_obl[smoother][neg ^ wsignflip][a->direction] +
+      MASK_MASTER_STRIDE * (MASK_MASTER_SIZE / 2 - hoff) +
+      MASK_MASTER_SIZE / 2 - woff;
+  return master;
+}
+
+const uint8_t *vp10_get_soft_mask(int wedge_index,
+                                  int wedge_sign,
+                                  BLOCK_SIZE sb_type,
+                                  int offset_x,
+                                  int offset_y) {
+  const uint8_t *mask =
+      get_wedge_mask_inplace(wedge_index, wedge_sign, sb_type);
+  if (mask)
+    mask -= (offset_x + offset_y * MASK_MASTER_STRIDE);
+  return mask;
+}
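+
+// Note: the pointer adjustment above means a caller reading the returned
+// mask at (offset_y * MASK_MASTER_STRIDE + offset_x) lands back at the
+// block's origin within the master mask.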
+
+static void init_wedge_master_masks() {
+  int i, j, s;
+  const int w = MASK_MASTER_SIZE;
+  const int h = MASK_MASTER_SIZE;
+  const int stride = MASK_MASTER_STRIDE;
+  const int a[2] = {2, 1};
+  const double asqrt = sqrt(a[0] * a[0] + a[1] * a[1]);
+  for (s = 0; s < NSMOOTHERS; s++) {
+    for (i = 0; i < h; ++i)
+      for (j = 0; j < w; ++j) {
+        int x = (2 * j + 1 - w);
+        int y = (2 * i + 1 - h);
+        int m = (int)rint((a[0] * x + a[1] * y) / asqrt);
+        wedge_mask_obl[s][1][WEDGE_OBLIQUE63][i * stride + j] =
+        wedge_mask_obl[s][1][WEDGE_OBLIQUE27][j * stride + i] =
+            get_masked_weight(m, s);
+        wedge_mask_obl[s][1][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
+        wedge_mask_obl[s][1][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
+            (1 << WEDGE_WEIGHT_BITS) - get_masked_weight(m, s);
+        wedge_mask_obl[s][0][WEDGE_OBLIQUE63][i * stride + j] =
+        wedge_mask_obl[s][0][WEDGE_OBLIQUE27][j * stride + i] =
+            (1 << WEDGE_WEIGHT_BITS) - get_masked_weight(m, s);
+        wedge_mask_obl[s][0][WEDGE_OBLIQUE117][i * stride + w - 1 - j] =
+        wedge_mask_obl[s][0][WEDGE_OBLIQUE153][(w - 1 - j) * stride + i] =
+            get_masked_weight(m, s);
+        wedge_mask_obl[s][1][WEDGE_VERTICAL][i * stride + j] =
+        wedge_mask_obl[s][1][WEDGE_HORIZONTAL][j * stride + i] =
+            get_masked_weight(x, s);
+        wedge_mask_obl[s][0][WEDGE_VERTICAL][i * stride + j] =
+        wedge_mask_obl[s][0][WEDGE_HORIZONTAL][j * stride + i] =
+            (1 << WEDGE_WEIGHT_BITS) - get_masked_weight(x, s);
+      }
+  }
+}
+
+// If the signs of the wedges for the various block sizes are
+// inconsistent, flip the sign flag. Do this only once for each
+// wedge codebook.
+static void init_wedge_signs() {
+  BLOCK_SIZE sb_type;
+  memset(wedge_signflip_lookup, 0, sizeof(wedge_signflip_lookup));
+  for (sb_type = BLOCK_4X4; sb_type < BLOCK_SIZES; ++sb_type) {
+    const int bw = 4 * num_4x4_blocks_wide_lookup[sb_type];
+    const int bh = 4 * num_4x4_blocks_high_lookup[sb_type];
+    const wedge_params_type wedge_params = wedge_params_lookup[sb_type];
+    const int wbits = wedge_params.bits;
+    const int wtypes = 1 << wbits;
+    int i, w;
+    if (wbits == 0) continue;
+    for (w = 0; w < wtypes; ++w) {
+      const uint8_t *mask = get_wedge_mask_inplace(w, 0, sb_type);
+      int sum = 0;
+      for (i = 0; i < bw; ++i)
+        sum += mask[i];
+      for (i = 0; i < bh; ++i)
+        sum += mask[i * MASK_MASTER_STRIDE];
+      sum = (sum + (bw + bh) / 2) / (bw + bh);
+      wedge_params.signflip[w] = (sum < 32);
+    }
+  }
+}
+
+static void init_wedge_masks() {
+  uint8_t *dst = wedge_mask_buf;
+  BLOCK_SIZE bsize;
+  memset(wedge_masks, 0, sizeof(wedge_masks));
+  for (bsize = BLOCK_4X4; bsize < BLOCK_SIZES; ++bsize) {
+    const uint8_t *mask;
+    const int bw = 4 * num_4x4_blocks_wide_lookup[bsize];
+    const int bh = 4 * num_4x4_blocks_high_lookup[bsize];
+    const wedge_params_type *wedge_params = &wedge_params_lookup[bsize];
+    const int wbits = wedge_params->bits;
+    const int wtypes = 1 << wbits;
+    int w;
+    if (wbits == 0) continue;
+    for (w = 0; w < wtypes; ++w) {
+      mask = get_wedge_mask_inplace(w, 0, bsize);
+      vpx_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw,
+                        NULL, 0, NULL, 0, bw, bh);
+      wedge_params->masks[0][w] = dst;
+      dst += bw * bh;
+
+      mask = get_wedge_mask_inplace(w, 1, bsize);
+      vpx_convolve_copy(mask, MASK_MASTER_STRIDE, dst, bw,
+                        NULL, 0, NULL, 0, bw, bh);
+      wedge_params->masks[1][w] = dst;
+      dst += bw * bh;
+    }
+    assert(sizeof(wedge_mask_buf) >= (size_t)(dst - wedge_mask_buf));
+  }
+}
+
+// Equation of the wedge boundary line, with a[0], a[1] the direction
+// coefficients and a[2], a[3] the x/y offsets in units of w/8 and h/8:
+//   f(x, y) = a[0] * (x - a[2] * w / 8) + a[1] * (y - a[3] * h / 8) = 0
+void vp10_init_wedge_masks() {
+  init_wedge_master_masks();
+  init_wedge_signs();
+  init_wedge_masks();
+}
+
+
+#if CONFIG_SUPERTX
+static void build_masked_compound_wedge_extend(
+    uint8_t *dst, int dst_stride,
+    const uint8_t *src0, int src0_stride,
+    const uint8_t *src1, int src1_stride,
+    int wedge_index,
+    int wedge_sign,
+    BLOCK_SIZE sb_type,
+    int wedge_offset_x, int wedge_offset_y,
+    int h, int w) {
+  const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
+  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
+  const uint8_t *mask = vp10_get_soft_mask(
+     wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
+  vpx_blend_a64_mask(dst, dst_stride,
+                     src0, src0_stride,
+                     src1, src1_stride,
+                     mask, MASK_MASTER_STRIDE,
+                     h, w, subh, subw);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void build_masked_compound_wedge_extend_highbd(
+    uint8_t *dst_8, int dst_stride,
+    const uint8_t *src0_8, int src0_stride,
+    const uint8_t *src1_8, int src1_stride,
+    int wedge_index, int wedge_sign,
+    BLOCK_SIZE sb_type,
+    int wedge_offset_x, int wedge_offset_y,
+    int h, int w, int bd) {
+  const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
+  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
+  const uint8_t *mask = vp10_get_soft_mask(
+      wedge_index, wedge_sign, sb_type, wedge_offset_x, wedge_offset_y);
+  vpx_highbd_blend_a64_mask(dst_8, dst_stride,
+                            src0_8, src0_stride,
+                            src1_8, src1_stride,
+                            mask, MASK_MASTER_STRIDE,
+                            h, w, subh, subw, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_SUPERTX
+
+static void build_masked_compound_wedge(
+    uint8_t *dst, int dst_stride,
+    const uint8_t *src0, int src0_stride,
+    const uint8_t *src1, int src1_stride,
+    int wedge_index, int wedge_sign,
+    BLOCK_SIZE sb_type,
+    int h, int w) {
+  // Derive subsampling from h and w passed in. May be refactored to
+  // pass in subsampling factors directly.
+  const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
+  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
+  const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign,
+                                                      sb_type);
+  vpx_blend_a64_mask(dst, dst_stride,
+                     src0, src0_stride,
+                     src1, src1_stride,
+                     mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
+                     h, w, subh, subw);
+}
+
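+// Note (hedged reading of vpx_dsp/blend.h): vpx_blend_a64_mask combines
+// the two sources as roughly
+//   dst = (mask * src0 + (64 - mask) * src1 + 32) >> 6,
+// so mask values 64 and 0 select src0 and src1 respectively.
+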
+#if CONFIG_VP9_HIGHBITDEPTH
+static void build_masked_compound_wedge_highbd(
+    uint8_t *dst_8, int dst_stride,
+    const uint8_t *src0_8, int src0_stride,
+    const uint8_t *src1_8, int src1_stride,
+    int wedge_index, int wedge_sign,
+    BLOCK_SIZE sb_type,
+    int h, int w, int bd) {
+  // Derive subsampling from h and w passed in. May be refactored to
+  // pass in subsampling factors directly.
+  const int subh = (2 << b_height_log2_lookup[sb_type]) == h;
+  const int subw = (2 << b_width_log2_lookup[sb_type]) == w;
+  const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign,
+                                                      sb_type);
+  vpx_highbd_blend_a64_mask(dst_8, dst_stride,
+                            src0_8, src0_stride,
+                            src1_8, src1_stride,
+                            mask, 4 * num_4x4_blocks_wide_lookup[sb_type],
+                            h, w, subh, subw, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp10_make_masked_inter_predictor(
+    const uint8_t *pre,
+    int pre_stride,
+    uint8_t *dst,
+    int dst_stride,
+    const int subpel_x,
+    const int subpel_y,
+    const struct scale_factors *sf,
+    int w, int h,
+#if CONFIG_DUAL_FILTER
+    const INTERP_FILTER *interp_filter,
+#else
+    const INTERP_FILTER interp_filter,
+#endif
+    int xs, int ys,
+#if CONFIG_SUPERTX
+    int wedge_offset_x, int wedge_offset_y,
+#endif  // CONFIG_SUPERTX
+    const MACROBLOCKD *xd) {
+  const MODE_INFO *mi = xd->mi[0];
+  // The prediction filter types used here should be those for
+  // the second reference block.
+#if CONFIG_DUAL_FILTER
+  INTERP_FILTER tmp_ipf[4] = {
+    interp_filter[2], interp_filter[3], interp_filter[2], interp_filter[3],
+  };
+#else
+  INTERP_FILTER tmp_ipf = interp_filter;
+#endif  // CONFIG_DUAL_FILTER
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint8_t, tmp_dst_[2 * MAX_SB_SQUARE]);
+  uint8_t *tmp_dst =
+      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
+      CONVERT_TO_BYTEPTR(tmp_dst_) : tmp_dst_;
+  vp10_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE,
+                            subpel_x, subpel_y, sf, w, h, 0,
+                            tmp_ipf, xs, ys, xd);
+#if CONFIG_SUPERTX
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    build_masked_compound_wedge_extend_highbd(
+        dst, dst_stride,
+        dst, dst_stride,
+        tmp_dst, MAX_SB_SIZE,
+        mi->mbmi.interinter_wedge_index,
+        mi->mbmi.interinter_wedge_sign,
+        mi->mbmi.sb_type,
+        wedge_offset_x, wedge_offset_y, h, w, xd->bd);
+  else
+    build_masked_compound_wedge_extend(
+        dst, dst_stride,
+        dst, dst_stride,
+        tmp_dst, MAX_SB_SIZE,
+        mi->mbmi.interinter_wedge_index,
+        mi->mbmi.interinter_wedge_sign,
+        mi->mbmi.sb_type,
+        wedge_offset_x, wedge_offset_y, h, w);
+#else
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    build_masked_compound_wedge_highbd(
+        dst, dst_stride,
+        dst, dst_stride,
+        tmp_dst, MAX_SB_SIZE,
+        mi->mbmi.interinter_wedge_index,
+        mi->mbmi.interinter_wedge_sign,
+        mi->mbmi.sb_type, h, w, xd->bd);
+  else
+    build_masked_compound_wedge(
+        dst, dst_stride,
+        dst, dst_stride,
+        tmp_dst, MAX_SB_SIZE,
+        mi->mbmi.interinter_wedge_index,
+        mi->mbmi.interinter_wedge_sign,
+        mi->mbmi.sb_type, h, w);
+#endif  // CONFIG_SUPERTX
+#else   // CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint8_t, tmp_dst[MAX_SB_SQUARE]);
+  vp10_make_inter_predictor(pre, pre_stride, tmp_dst, MAX_SB_SIZE,
+                            subpel_x, subpel_y, sf, w, h, 0,
+                            tmp_ipf, xs, ys, xd);
+#if CONFIG_SUPERTX
+  build_masked_compound_wedge_extend(
+      dst, dst_stride,
+      dst, dst_stride,
+      tmp_dst, MAX_SB_SIZE,
+      mi->mbmi.interinter_wedge_index,
+      mi->mbmi.interinter_wedge_sign,
+      mi->mbmi.sb_type,
+      wedge_offset_x, wedge_offset_y, h, w);
+#else
+  build_masked_compound_wedge(
+      dst, dst_stride,
+      dst, dst_stride,
+      tmp_dst, MAX_SB_SIZE,
+      mi->mbmi.interinter_wedge_index,
+      mi->mbmi.interinter_wedge_sign,
+      mi->mbmi.sb_type, h, w);
+#endif  // CONFIG_SUPERTX
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+}
+#endif  // CONFIG_EXT_INTER
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp10_highbd_build_inter_predictor(const uint8_t *src, int src_stride,
@@ -25,7 +638,11 @@
                                       const MV *src_mv,
                                       const struct scale_factors *sf,
                                       int w, int h, int ref,
-                                      const InterpKernel *kernel,
+#if CONFIG_DUAL_FILTER
+                                      const INTERP_FILTER *interp_filter,
+#else
+                                      const INTERP_FILTER interp_filter,
+#endif
                                       enum mv_precision precision,
                                       int x, int y, int bd) {
   const int is_q4 = precision == MV_PRECISION_Q4;
@@ -37,8 +654,9 @@
 
   src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
 
-  high_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
-                       sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4, bd);
+  highbd_inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
+                         sf, w, h, ref, interp_filter, sf->x_step_q4,
+                         sf->y_step_q4, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -47,7 +665,11 @@
                                const MV *src_mv,
                                const struct scale_factors *sf,
                                int w, int h, int ref,
-                               const InterpKernel *kernel,
+#if CONFIG_DUAL_FILTER
+                               const INTERP_FILTER *interp_filter,
+#else
+                               const INTERP_FILTER interp_filter,
+#endif
                                enum mv_precision precision,
                                int x, int y) {
   const int is_q4 = precision == MV_PRECISION_Q4;
@@ -60,19 +682,110 @@
   src += (mv.row >> SUBPEL_BITS) * src_stride + (mv.col >> SUBPEL_BITS);
 
   inter_predictor(src, src_stride, dst, dst_stride, subpel_x, subpel_y,
-                  sf, w, h, ref, kernel, sf->x_step_q4, sf->y_step_q4);
+                  sf, w, h, ref, interp_filter, sf->x_step_q4, sf->y_step_q4);
 }
 
-void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
-                                   int bw, int bh,
-                                   int x, int y, int w, int h,
-                                   int mi_x, int mi_y) {
+void build_inter_predictors(MACROBLOCKD *xd, int plane,
+#if CONFIG_OBMC
+                            int mi_col_offset, int mi_row_offset,
+#endif  // CONFIG_OBMC
+                            int block,
+                            int bw, int bh,
+                            int x, int y, int w, int h,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+                            int wedge_offset_x, int wedge_offset_y,
+#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
+                            int mi_x, int mi_y) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
+#if CONFIG_OBMC
+  const MODE_INFO *mi = xd->mi[mi_col_offset + xd->mi_stride * mi_row_offset];
+#else
   const MODE_INFO *mi = xd->mi[0];
+#endif  // CONFIG_OBMC
   const int is_compound = has_second_ref(&mi->mbmi);
-  const InterpKernel *kernel = vp10_filter_kernels[mi->mbmi.interp_filter];
   int ref;
 
+#if CONFIG_DUAL_FILTER
+  if (mi->mbmi.sb_type < BLOCK_8X8 && plane > 0) {
+    // block size in log2
+    const int b4_wl = b_width_log2_lookup[mi->mbmi.sb_type];
+    const int b4_hl = b_height_log2_lookup[mi->mbmi.sb_type];
+    const int b8_sl = b_width_log2_lookup[BLOCK_8X8];
+
+    // block size
+    const int b4_w = 1 << b4_wl;
+    const int b4_h = 1 << b4_hl;
+    const int b8_s = 1 << b8_sl;
+    int idx, idy;
+
+    const int x_base = x;
+    const int y_base = y;
+
+    // processing unit size
+    const int x_step = w >> (b8_sl - b4_wl);
+    const int y_step = h >> (b8_sl - b4_hl);
+
+    for (idy = 0; idy < b8_s; idy += b4_h) {
+      for (idx = 0; idx < b8_s; idx += b4_w) {
+        const int chr_idx = (idy * 2) + idx;
+        for (ref = 0; ref < 1 + is_compound; ++ref) {
+          const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+          struct buf_2d *const pre_buf = &pd->pre[ref];
+          struct buf_2d *const dst_buf = &pd->dst;
+          uint8_t *dst = dst_buf->buf;
+          const MV mv = mi->bmi[chr_idx].as_mv[ref].as_mv;
+          const MV mv_q4 = clamp_mv_to_umv_border_sb(
+              xd, &mv, bw, bh, pd->subsampling_x, pd->subsampling_y);
+          uint8_t *pre;
+          MV32 scaled_mv;
+          int xs, ys, subpel_x, subpel_y;
+          const int is_scaled = vp10_is_scaled(sf);
+
+          x = x_base + idx * x_step;
+          y = y_base + idy * y_step;
+
+          dst += dst_buf->stride * y + x;
+
+          if (is_scaled) {
+            pre =
+                pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
+            scaled_mv = vp10_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+            xs = sf->x_step_q4;
+            ys = sf->y_step_q4;
+          } else {
+            pre = pre_buf->buf + y * pre_buf->stride + x;
+            scaled_mv.row = mv_q4.row;
+            scaled_mv.col = mv_q4.col;
+            xs = ys = 16;
+          }
+
+          subpel_x = scaled_mv.col & SUBPEL_MASK;
+          subpel_y = scaled_mv.row & SUBPEL_MASK;
+          pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride +
+                 (scaled_mv.col >> SUBPEL_BITS);
+
+#if CONFIG_EXT_INTER
+          if (ref && is_interinter_wedge_used(mi->mbmi.sb_type) &&
+              mi->mbmi.use_wedge_interinter)
+            vp10_make_masked_inter_predictor(
+                pre, pre_buf->stride, dst, dst_buf->stride, subpel_x, subpel_y,
+                sf, w, h, mi->mbmi.interp_filter, xs, ys,
+#if CONFIG_SUPERTX
+                wedge_offset_x, wedge_offset_y,
+#endif  // CONFIG_SUPERTX
+                xd);
+          else
+#endif  // CONFIG_EXT_INTER
+            vp10_make_inter_predictor(
+                pre, pre_buf->stride, dst, dst_buf->stride, subpel_x, subpel_y,
+                sf, x_step, y_step, ref, mi->mbmi.interp_filter, xs, ys, xd);
+        }
+      }
+    }
+    return;
+  }
+#endif  // CONFIG_DUAL_FILTER
+
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
     struct buf_2d *const pre_buf = &pd->pre[ref];
@@ -107,24 +820,28 @@
       scaled_mv.col = mv_q4.col;
       xs = ys = 16;
     }
+
     subpel_x = scaled_mv.col & SUBPEL_MASK;
     subpel_y = scaled_mv.row & SUBPEL_MASK;
     pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride
            + (scaled_mv.col >> SUBPEL_BITS);
 
-#if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      high_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
-                           subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys,
-                           xd->bd);
-    } else {
-      inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
-                      subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
-    }
-#else
-    inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
-                    subpel_x, subpel_y, sf, w, h, ref, kernel, xs, ys);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_EXT_INTER
+    if (ref && is_interinter_wedge_used(mi->mbmi.sb_type) &&
+        mi->mbmi.use_wedge_interinter)
+      vp10_make_masked_inter_predictor(
+          pre, pre_buf->stride, dst, dst_buf->stride,
+          subpel_x, subpel_y, sf, w, h,
+          mi->mbmi.interp_filter, xs, ys,
+#if CONFIG_SUPERTX
+          wedge_offset_x, wedge_offset_y,
+#endif  // CONFIG_SUPERTX
+          xd);
+    else
+#endif  // CONFIG_EXT_INTER
+      vp10_make_inter_predictor(pre, pre_buf->stride, dst, dst_buf->stride,
+                                subpel_x, subpel_y, sf, w, h, ref,
+                                mi->mbmi.interp_filter, xs, ys, xd);
   }
 }
 
@@ -140,7 +857,6 @@
   uint8_t *const dst = &pd->dst.buf[(ir * pd->dst.stride + ic) << 2];
   int ref;
   const int is_compound = has_second_ref(&mi->mbmi);
-  const InterpKernel *kernel = vp10_filter_kernels[mi->mbmi.interp_filter];
 
   for (ref = 0; ref < 1 + is_compound; ++ref) {
     const uint8_t *pre =
@@ -151,7 +867,8 @@
                                       dst, pd->dst.stride,
                                       &mi->bmi[i].as_mv[ref].as_mv,
                                       &xd->block_refs[ref]->sf, width, height,
-                                      ref, kernel, MV_PRECISION_Q3,
+                                      ref, mi->mbmi.interp_filter,
+                                      MV_PRECISION_Q3,
                                       mi_col * MI_SIZE + 4 * ic,
                                       mi_row * MI_SIZE + 4 * ir, xd->bd);
   } else {
@@ -159,7 +876,7 @@
                                dst, pd->dst.stride,
                                &mi->bmi[i].as_mv[ref].as_mv,
                                &xd->block_refs[ref]->sf, width, height, ref,
-                               kernel, MV_PRECISION_Q3,
+                               mi->mbmi.interp_filter, MV_PRECISION_Q3,
                                mi_col * MI_SIZE + 4 * ic,
                                mi_row * MI_SIZE + 4 * ir);
   }
@@ -168,7 +885,7 @@
                                dst, pd->dst.stride,
                                &mi->bmi[i].as_mv[ref].as_mv,
                                &xd->block_refs[ref]->sf, width, height, ref,
-                               kernel, MV_PRECISION_Q3,
+                               mi->mbmi.interp_filter, MV_PRECISION_Q3,
                                mi_col * MI_SIZE + 4 * ic,
                                mi_row * MI_SIZE + 4 * ir);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -200,11 +917,27 @@
       assert(pw * num_4x4_w == bw && ph * num_4x4_h == bh);
       for (y = 0; y < num_4x4_h; ++y)
         for (x = 0; x < num_4x4_w; ++x)
-           build_inter_predictors(xd, plane, y * 2 + x, bw, bh,
-                                  4 * x, 4 * y, pw, ph, mi_x, mi_y);
+           build_inter_predictors(xd, plane,
+#if CONFIG_OBMC
+                                  0, 0,
+#endif  // CONFIG_OBMC
+                                  y * 2 + x, bw, bh,
+                                  4 * x, 4 * y, pw, ph,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+                                  0, 0,
+#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
+                                  mi_x, mi_y);
     } else {
-      build_inter_predictors(xd, plane, 0, bw, bh,
-                             0, 0, bw, bh, mi_x, mi_y);
+      build_inter_predictors(xd, plane,
+#if CONFIG_OBMC
+                             0, 0,
+#endif  // CONFIG_OBMC
+                             0, bw, bh,
+                             0, 0, bw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+                             0, 0,
+#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
+                             mi_x, mi_y);
     }
   }
 }
@@ -212,23 +945,65 @@
 void vp10_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
                                     BLOCK_SIZE bsize) {
   build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0, 0);
+#if CONFIG_EXT_INTER
+  if (is_interintra_pred(&xd->mi[0]->mbmi))
+    vp10_build_interintra_predictors_sby(xd,
+                                         xd->plane[0].dst.buf,
+                                         xd->plane[0].dst.stride,
+                                         bsize);
+#endif  // CONFIG_EXT_INTER
 }
 
 void vp10_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                    BLOCK_SIZE bsize, int plane) {
+                                     BLOCK_SIZE bsize, int plane) {
   build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, plane, plane);
+#if CONFIG_EXT_INTER
+  if (is_interintra_pred(&xd->mi[0]->mbmi)) {
+    if (plane == 0) {
+      vp10_build_interintra_predictors_sby(xd,
+                                           xd->plane[0].dst.buf,
+                                           xd->plane[0].dst.stride,
+                                           bsize);
+    } else {
+      vp10_build_interintra_predictors_sbc(xd,
+                                           xd->plane[plane].dst.buf,
+                                           xd->plane[plane].dst.stride,
+                                           plane, bsize);
+    }
+  }
+#endif  // CONFIG_EXT_INTER
 }
 
 void vp10_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                     BLOCK_SIZE bsize) {
+                                      BLOCK_SIZE bsize) {
   build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 1,
                                     MAX_MB_PLANE - 1);
+#if CONFIG_EXT_INTER
+  if (is_interintra_pred(&xd->mi[0]->mbmi))
+    vp10_build_interintra_predictors_sbuv(xd,
+                                          xd->plane[1].dst.buf,
+                                          xd->plane[2].dst.buf,
+                                          xd->plane[1].dst.stride,
+                                          xd->plane[2].dst.stride,
+                                          bsize);
+#endif  // CONFIG_EXT_INTER
 }
 
 void vp10_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
                                    BLOCK_SIZE bsize) {
   build_inter_predictors_for_planes(xd, bsize, mi_row, mi_col, 0,
                                     MAX_MB_PLANE - 1);
+#if CONFIG_EXT_INTER
+  if (is_interintra_pred(&xd->mi[0]->mbmi))
+    vp10_build_interintra_predictors(xd,
+                                     xd->plane[0].dst.buf,
+                                     xd->plane[1].dst.buf,
+                                     xd->plane[2].dst.buf,
+                                     xd->plane[0].dst.stride,
+                                     xd->plane[1].dst.stride,
+                                     xd->plane[2].dst.stride,
+                                     bsize);
+#endif  // CONFIG_EXT_INTER
 }
 
 void vp10_setup_dst_planes(struct macroblockd_plane planes[MAX_MB_PLANE],
@@ -264,3 +1039,1313 @@
     }
   }
 }
+
+#if CONFIG_SUPERTX
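+// Blending masks (alphas out of 64) applied across a supertx partition
+// boundary; each table covers one boundary length and tapers from keeping
+// dst (64) to keeping the incoming prediction (0).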
+static const uint8_t mask_8[8] = {
+  64, 64, 62, 52, 12,  2,  0,  0
+};
+
+static const uint8_t mask_16[16] = {
+  63, 62, 60, 58, 55, 50, 43, 36, 28, 21, 14, 9, 6, 4, 2, 1
+};
+
+static const uint8_t mask_32[32] = {
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 63, 61, 57, 52, 45, 36,
+  28, 19, 12,  7,  3,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+};
+
+static const uint8_t mask_8_uv[8] = {
+  64, 64, 62, 52, 12,  2,  0,  0
+};
+
+static const uint8_t mask_16_uv[16] = {
+  64, 64, 64, 64, 61, 53, 45, 36, 28, 19, 11, 3, 0,  0,  0,  0
+};
+
+static const uint8_t mask_32_uv[32] = {
+  64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 60, 54, 46, 36,
+  28, 18, 10,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
+};
+
+static const uint8_t* get_supertx_mask(int length, int plane) {
+  switch (length) {
+    case 8:
+      return plane ? mask_8_uv : mask_8;
+    case 16:
+      return plane ? mask_16_uv : mask_16;
+    case 32:
+      return plane ? mask_32_uv : mask_32;
+    default:
+      assert(0);
+  }
+  return NULL;
+}
+
+void vp10_build_masked_inter_predictor_complex(
+    MACROBLOCKD *xd,
+    uint8_t *dst, int dst_stride,
+    const uint8_t *pre, int pre_stride,
+    int mi_row, int mi_col,
+    int mi_row_ori, int mi_col_ori, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+    PARTITION_TYPE partition, int plane) {
+  const struct macroblockd_plane *pd = &xd->plane[plane];
+  const int ssx = pd->subsampling_x;
+  const int ssy = pd->subsampling_y;
+  const int top_w = (4 << b_width_log2_lookup[top_bsize]) >> ssx;
+  const int top_h = (4 << b_height_log2_lookup[top_bsize]) >> ssy;
+  const int w = (4 << b_width_log2_lookup[bsize]) >> ssx;
+  const int h = (4 << b_height_log2_lookup[bsize]) >> ssy;
+  const int w_offset = ((mi_col - mi_col_ori) * MI_SIZE) >> ssx;
+  const int h_offset = ((mi_row - mi_row_ori) * MI_SIZE) >> ssy;
+
+  int w_remain, h_remain;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  assert(bsize <= BLOCK_32X32);
+  assert(IMPLIES(plane == 0, ssx == 0));
+  assert(IMPLIES(plane == 0, ssy == 0));
+
+  switch (partition) {
+    case PARTITION_HORZ: {
+      const uint8_t *const mask = get_supertx_mask(h, ssy);
+
+      w_remain = top_w;
+      h_remain = top_h - h_offset - h;
+      dst += h_offset * dst_stride;
+      pre += h_offset * pre_stride;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (is_hbd)
+        vpx_highbd_blend_a64_vmask(dst, dst_stride,
+                                   dst, dst_stride,
+                                   pre, pre_stride,
+                                   mask, h, top_w, xd->bd);
+      else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        vpx_blend_a64_vmask(dst, dst_stride,
+                            dst, dst_stride,
+                            pre, pre_stride,
+                            mask, h, top_w);
+
+      dst += h * dst_stride;
+      pre += h * pre_stride;
+      break;
+    }
+    case PARTITION_VERT: {
+      const uint8_t *const mask = get_supertx_mask(w, ssx);
+
+      w_remain = top_w - w_offset - w;
+      h_remain = top_h;
+      dst += w_offset;
+      pre += w_offset;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (is_hbd)
+        vpx_highbd_blend_a64_hmask(dst, dst_stride,
+                                   dst, dst_stride,
+                                   pre, pre_stride,
+                                   mask, top_h, w, xd->bd);
+      else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        vpx_blend_a64_hmask(dst, dst_stride,
+                            dst, dst_stride,
+                            pre, pre_stride,
+                            mask, top_h, w);
+
+      dst += w;
+      pre += w;
+      break;
+    }
+    default: {
+      assert(0);
+      return;
+    }
+  }
+
+  if (w_remain == 0 || h_remain == 0) {
+    return;
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (is_hbd) {
+    dst = (uint8_t*)CONVERT_TO_SHORTPTR(dst);
+    pre = (const uint8_t*)CONVERT_TO_SHORTPTR(pre);
+    dst_stride *= 2;
+    pre_stride *= 2;
+    w_remain *= 2;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  do {
+    memcpy(dst, pre, w_remain * sizeof(uint8_t));
+    dst += dst_stride;
+    pre += pre_stride;
+  } while (--h_remain);
+}
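+
+// Worked example (editor's sketch): for PARTITION_HORZ with
+// top_bsize == BLOCK_32X32, bsize == BLOCK_16X16 and plane == 0: h == 16, so
+// mask_16 is applied row-wise across the horizontal split; rows nearest the
+// boundary keep mostly dst (alpha near 64) and fade towards pre, and the
+// h_remain rows below the blended band are copied verbatim from pre.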
+
+void vp10_build_inter_predictors_sb_sub8x8_extend(
+    MACROBLOCKD *xd,
+#if CONFIG_EXT_INTER
+    int mi_row_ori, int mi_col_ori,
+#endif  // CONFIG_EXT_INTER
+    int mi_row, int mi_col,
+    BLOCK_SIZE bsize, int block) {
+  // Prediction function used in supertx:
+  // Use the mv of the current block (which is smaller than 8x8) to build a
+  // prediction for a block located at (mi_row, mi_col) of size bsize, where
+  // bsize can be larger than 8x8.
+  // block (0-3): the sub8x8 location of the current block.
+  int plane;
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+#if CONFIG_EXT_INTER
+  const int wedge_offset_x = (mi_col_ori - mi_col) * MI_SIZE;
+  const int wedge_offset_y = (mi_row_ori - mi_row) * MI_SIZE;
+#endif  // CONFIG_EXT_INTER
+
+  // For sub8x8 uv: skip the uv prediction in supertx except for the first
+  // block (block == 0).
+  int max_plane = block ? 1 : MAX_MB_PLANE;
+
+  for (plane = 0; plane < max_plane; plane++) {
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
+                                                        &xd->plane[plane]);
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+    const int bw = 4 * num_4x4_w;
+    const int bh = 4 * num_4x4_h;
+
+    build_inter_predictors(xd, plane,
+#if CONFIG_OBMC
+                           0, 0,
+#endif  // CONFIG_OBMC
+                           block, bw, bh,
+                           0, 0, bw, bh,
+#if CONFIG_EXT_INTER
+                           wedge_offset_x,
+                           wedge_offset_y,
+#endif  // CONFIG_EXT_INTER
+                           mi_x, mi_y);
+  }
+#if CONFIG_EXT_INTER
+  if (is_interintra_pred(&xd->mi[0]->mbmi))
+    vp10_build_interintra_predictors(xd,
+                                     xd->plane[0].dst.buf,
+                                     xd->plane[1].dst.buf,
+                                     xd->plane[2].dst.buf,
+                                     xd->plane[0].dst.stride,
+                                     xd->plane[1].dst.stride,
+                                     xd->plane[2].dst.stride,
+                                     bsize);
+#endif  // CONFIG_EXT_INTER
+}
+
+void vp10_build_inter_predictors_sb_extend(MACROBLOCKD *xd,
+#if CONFIG_EXT_INTER
+                                           int mi_row_ori, int mi_col_ori,
+#endif  // CONFIG_EXT_INTER
+                                           int mi_row, int mi_col,
+                                           BLOCK_SIZE bsize) {
+  int plane;
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+#if CONFIG_EXT_INTER
+  const int wedge_offset_x = (mi_col_ori - mi_col) * MI_SIZE;
+  const int wedge_offset_y = (mi_row_ori - mi_row) * MI_SIZE;
+#endif  // CONFIG_EXT_INTER
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(
+        bsize, &xd->plane[plane]);
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+    const int bw = 4 * num_4x4_w;
+    const int bh = 4 * num_4x4_h;
+
+    if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
+      int x, y;
+      assert(bsize == BLOCK_8X8);
+      for (y = 0; y < num_4x4_h; ++y)
+        for (x = 0; x < num_4x4_w; ++x)
+           build_inter_predictors(
+               xd, plane,
+#if CONFIG_OBMC
+               0, 0,
+#endif  // CONFIG_OBMC
+               y * 2 + x, bw, bh, 4 * x, 4 * y, 4, 4,
+#if CONFIG_EXT_INTER
+               wedge_offset_x,
+               wedge_offset_y,
+#endif  // CONFIG_EXT_INTER
+               mi_x, mi_y);
+    } else {
+      build_inter_predictors(
+          xd, plane,
+#if CONFIG_OBMC
+          0, 0,
+#endif  // CONFIG_OBMC
+          0, bw, bh, 0, 0, bw, bh,
+#if CONFIG_EXT_INTER
+          wedge_offset_x,
+          wedge_offset_y,
+#endif  // CONFIG_EXT_INTER
+          mi_x, mi_y);
+    }
+  }
+}
+#endif  // CONFIG_SUPERTX
+
+#if CONFIG_OBMC
+// obmc_mask_N[overlap_position]
+static const uint8_t obmc_mask_1[1] = {
+  55
+};
+
+static const uint8_t obmc_mask_2[2] = {
+  45, 62
+};
+
+static const uint8_t obmc_mask_4[4] = {
+  39, 50, 59, 64
+};
+
+static const uint8_t obmc_mask_8[8] = {
+  36, 42, 48, 53, 57, 61, 63, 64
+};
+
+static const uint8_t obmc_mask_16[16] = {
+  34, 37, 40, 43, 46, 49, 52, 54, 56, 58, 60, 61, 63, 64, 64, 64
+};
+
+static const uint8_t obmc_mask_32[32] = {
+  33, 35, 36, 38, 40, 41, 43, 44, 45, 47, 48, 50, 51, 52, 53, 55,
+  56, 57, 58, 59, 60, 60, 61, 62, 62, 63, 63, 64, 64, 64, 64, 64
+};
+
+#if CONFIG_EXT_PARTITION
+static const uint8_t obmc_mask_64[64] = {
+  33, 34, 35, 35, 36, 37, 38, 39, 40, 40, 41, 42, 43, 44, 44, 44,
+  45, 46, 47, 47, 48, 49, 50, 51, 51, 51, 52, 52, 53, 54, 55, 56,
+  56, 56, 57, 57, 58, 58, 59, 60, 60, 60, 60, 60, 61, 62, 62, 62,
+  62, 62, 63, 63, 63, 63, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64,
+};
+#endif  // CONFIG_EXT_PARTITION
+
+const uint8_t* vp10_get_obmc_mask(int length) {
+  switch (length) {
+    case 1:
+      return obmc_mask_1;
+    case 2:
+      return obmc_mask_2;
+    case 4:
+      return obmc_mask_4;
+    case 8:
+      return obmc_mask_8;
+    case 16:
+      return obmc_mask_16;
+    case 32:
+      return obmc_mask_32;
+#if CONFIG_EXT_PARTITION
+    case 64:
+      return obmc_mask_64;
+#endif  // CONFIG_EXT_PARTITION
+    default:
+      assert(0);
+      return NULL;
+  }
+}
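+
+// Note: mask values are alphas out of 64 applied to the current block's
+// prediction (src0 in the blend calls below); obmc_mask_N[0] sits nearest
+// the neighboring block, so the neighbor's predictor contributes most
+// (64 - mask) there and nothing once the mask reaches 64.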
+
+// This function combines the motion-compensated predictions generated by the
+// top/left neighboring blocks' inter predictors with the regular inter
+// prediction. We assume the original prediction (bmc) is stored in
+// xd->plane[].dst.buf.
+void vp10_build_obmc_inter_prediction(VP10_COMMON *cm,
+                                      MACROBLOCKD *xd, int mi_row, int mi_col,
+                                      uint8_t *above[MAX_MB_PLANE],
+                                      int above_stride[MAX_MB_PLANE],
+                                      uint8_t *left[MAX_MB_PLANE],
+                                      int left_stride[MAX_MB_PLANE]) {
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  int plane, i;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // handle above row
+  if (xd->up_available) {
+    const int overlap = num_4x4_blocks_high_lookup[bsize] * 2;
+    const int miw = VPXMIN(xd->n8_w, cm->mi_cols - mi_col);
+    const int mi_row_offset = -1;
+
+    assert(miw > 0);
+
+    i = 0;
+    do {  // for each mi in the above row
+      const int mi_col_offset = i;
+      const MB_MODE_INFO *const above_mbmi =
+          &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
+      const int mi_step =
+          VPXMIN(xd->n8_w, num_8x8_blocks_wide_lookup[above_mbmi->sb_type]);
+
+      if (is_neighbor_overlappable(above_mbmi)) {
+        for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+          const struct macroblockd_plane *pd = &xd->plane[plane];
+          const int bw = (mi_step * MI_SIZE) >> pd->subsampling_x;
+          const int bh = overlap >> pd->subsampling_y;
+          const int dst_stride = pd->dst.stride;
+          uint8_t *const dst =
+              &pd->dst.buf[(i * MI_SIZE) >> pd->subsampling_x];
+          const int tmp_stride = above_stride[plane];
+          const uint8_t *const tmp =
+              &above[plane][(i * MI_SIZE) >> pd->subsampling_x];
+          const uint8_t *const mask = vp10_get_obmc_mask(bh);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (is_hbd)
+            vpx_highbd_blend_a64_vmask(dst, dst_stride, dst, dst_stride,
+                                       tmp, tmp_stride, mask, bh, bw, xd->bd);
+          else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+            vpx_blend_a64_vmask(dst, dst_stride, dst, dst_stride,
+                                tmp, tmp_stride, mask, bh, bw);
+        }
+      }
+      i += mi_step;
+    } while (i < miw);
+  }
+
+  // handle left column
+  if (xd->left_available) {
+    const int overlap = num_4x4_blocks_wide_lookup[bsize] * 2;
+    const int mih = VPXMIN(xd->n8_h, cm->mi_rows - mi_row);
+    const int mi_col_offset = -1;
+
+    assert(mih > 0);
+
+    i = 0;
+    do {  // for each mi in the left column
+      const int mi_row_offset = i;
+      const MB_MODE_INFO *const left_mbmi =
+          &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
+      const int mi_step =
+          VPXMIN(xd->n8_h, num_8x8_blocks_high_lookup[left_mbmi->sb_type]);
+
+      if (is_neighbor_overlappable(left_mbmi)) {
+        for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+          const struct macroblockd_plane *pd = &xd->plane[plane];
+          const int bw = overlap >> pd->subsampling_x;
+          const int bh = (mi_step * MI_SIZE) >> pd->subsampling_y;
+          const int dst_stride = pd->dst.stride;
+          uint8_t *const dst =
+              &pd->dst.buf[(i * MI_SIZE * dst_stride) >> pd->subsampling_y];
+          const int tmp_stride = left_stride[plane];
+          const uint8_t *const tmp =
+              &left[plane][(i * MI_SIZE * tmp_stride) >> pd->subsampling_y];
+          const uint8_t *const mask = vp10_get_obmc_mask(bw);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (is_hbd)
+            vpx_highbd_blend_a64_hmask(dst, dst_stride, dst, dst_stride,
+                                       tmp, tmp_stride, mask, bh, bw, xd->bd);
+          else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+            vpx_blend_a64_hmask(dst, dst_stride, dst, dst_stride,
+                                tmp, tmp_stride, mask, bh, bw);
+        }
+      }
+      i += mi_step;
+    } while (i < mih);
+  }
+}
+
+#if CONFIG_EXT_INTER
+void modify_neighbor_predictor_for_obmc(MB_MODE_INFO *mbmi) {
+  if (is_interintra_pred(mbmi)) {
+    mbmi->ref_frame[1] = NONE;
+  } else if (has_second_ref(mbmi) && is_interinter_wedge_used(mbmi->sb_type) &&
+             mbmi->use_wedge_interinter) {
+    mbmi->use_wedge_interinter = 0;
+    mbmi->ref_frame[1] = NONE;
+  }
+}
+#endif  // CONFIG_EXT_INTER
+
+void vp10_build_prediction_by_above_preds(VP10_COMMON *cm,
+                                          MACROBLOCKD *xd,
+                                          int mi_row, int mi_col,
+                                          uint8_t *tmp_buf[MAX_MB_PLANE],
+                                          int tmp_stride[MAX_MB_PLANE]) {
+  const TileInfo *const tile = &xd->tile;
+  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  int i, j, mi_step, ref;
+
+  if (mi_row <= tile->mi_row_start)
+    return;
+
+  for (i = 0; i < VPXMIN(xd->n8_w, cm->mi_cols - mi_col); i += mi_step) {
+    int mi_row_offset = -1;
+    int mi_col_offset = i;
+    int mi_x, mi_y, bw, bh;
+    MODE_INFO *above_mi = xd->mi[mi_col_offset +
+                                 mi_row_offset * xd->mi_stride];
+    MB_MODE_INFO *above_mbmi = &above_mi->mbmi;
+#if CONFIG_EXT_INTER
+    MB_MODE_INFO backup_mbmi;
+#endif  // CONFIG_EXT_INTER
+
+    mi_step = VPXMIN(xd->n8_w,
+                     num_8x8_blocks_wide_lookup[above_mbmi->sb_type]);
+
+    if (!is_neighbor_overlappable(above_mbmi))
+      continue;
+
+#if CONFIG_EXT_INTER
+    backup_mbmi = *above_mbmi;
+    modify_neighbor_predictor_for_obmc(above_mbmi);
+#endif  // CONFIG_EXT_INTER
+
+    for (j = 0; j < MAX_MB_PLANE; ++j) {
+      struct macroblockd_plane *const pd = &xd->plane[j];
+      setup_pred_plane(&pd->dst,
+                       tmp_buf[j], tmp_stride[j],
+                       0, i, NULL,
+                       pd->subsampling_x, pd->subsampling_y);
+    }
+    for (ref = 0; ref < 1 + has_second_ref(above_mbmi); ++ref) {
+      MV_REFERENCE_FRAME frame = above_mbmi->ref_frame[ref];
+      RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+
+      xd->block_refs[ref] = ref_buf;
+      if (!vp10_is_valid_scale(&ref_buf->sf))
+        vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
+                           "Reference frame has invalid dimensions");
+      vp10_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col + i,
+                            &ref_buf->sf);
+    }
+
+    xd->mb_to_left_edge   = -(((mi_col + i) * MI_SIZE) * 8);
+    mi_x = (mi_col + i) << MI_SIZE_LOG2;
+    mi_y = mi_row << MI_SIZE_LOG2;
+
+    for (j = 0; j < MAX_MB_PLANE; ++j) {
+      const struct macroblockd_plane *pd = &xd->plane[j];
+      bw = (mi_step * 8) >> pd->subsampling_x;
+      bh = VPXMAX((num_4x4_blocks_high_lookup[bsize] * 2) >> pd->subsampling_y,
+                  4);
+
+      if (above_mbmi->sb_type < BLOCK_8X8) {
+        const PARTITION_TYPE bp = BLOCK_8X8 - above_mbmi->sb_type;
+        const int have_vsplit = bp != PARTITION_HORZ;
+        const int have_hsplit = bp != PARTITION_VERT;
+        const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
+        const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
+        const int pw = 8 >> (have_vsplit | pd->subsampling_x);
+        int x, y;
+
+        for (y = 0; y < num_4x4_h; ++y)
+          for (x = 0; x < num_4x4_w; ++x) {
+            if ((bp == PARTITION_HORZ || bp == PARTITION_SPLIT)
+                && y == 0 && !pd->subsampling_y)
+              continue;
+
+            build_inter_predictors(xd, j, mi_col_offset, mi_row_offset,
+                                   y * 2 + x, bw, bh,
+                                   4 * x, 0, pw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+                                   0, 0,
+#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
+                                   mi_x, mi_y);
+          }
+      } else {
+        build_inter_predictors(xd, j, mi_col_offset, mi_row_offset,
+                               0, bw, bh, 0, 0, bw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+                               0, 0,
+#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
+                               mi_x, mi_y);
+      }
+    }
+#if CONFIG_EXT_INTER
+    *above_mbmi = backup_mbmi;
+#endif  // CONFIG_EXT_INTER
+  }
+  xd->mb_to_left_edge   = -((mi_col * MI_SIZE) * 8);
+}
+
+void vp10_build_prediction_by_left_preds(VP10_COMMON *cm,
+                                         MACROBLOCKD *xd,
+                                         int mi_row, int mi_col,
+                                         uint8_t *tmp_buf[MAX_MB_PLANE],
+                                         int tmp_stride[MAX_MB_PLANE]) {
+  const TileInfo *const tile = &xd->tile;
+  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  int i, j, mi_step, ref;
+
+  if (mi_col == 0 || (mi_col - 1 < tile->mi_col_start))
+    return;
+
+  for (i = 0; i < VPXMIN(xd->n8_h, cm->mi_rows - mi_row); i += mi_step) {
+    int mi_row_offset = i;
+    int mi_col_offset = -1;
+    int mi_x, mi_y, bw, bh;
+    MODE_INFO *left_mi = xd->mi[mi_col_offset +
+                                mi_row_offset * xd->mi_stride];
+    MB_MODE_INFO *left_mbmi = &left_mi->mbmi;
+#if CONFIG_EXT_INTER
+    MB_MODE_INFO backup_mbmi;
+#endif  // CONFIG_EXT_INTER
+
+    mi_step = VPXMIN(xd->n8_h,
+                     num_8x8_blocks_high_lookup[left_mbmi->sb_type]);
+
+    if (!is_neighbor_overlappable(left_mbmi))
+      continue;
+
+#if CONFIG_EXT_INTER
+    backup_mbmi = *left_mbmi;
+    modify_neighbor_predictor_for_obmc(left_mbmi);
+#endif  // CONFIG_EXT_INTER
+
+    for (j = 0; j < MAX_MB_PLANE; ++j) {
+      struct macroblockd_plane *const pd = &xd->plane[j];
+      setup_pred_plane(&pd->dst,
+                       tmp_buf[j], tmp_stride[j],
+                       i, 0, NULL,
+                       pd->subsampling_x, pd->subsampling_y);
+    }
+    for (ref = 0; ref < 1 + has_second_ref(left_mbmi); ++ref) {
+      MV_REFERENCE_FRAME frame = left_mbmi->ref_frame[ref];
+      RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
+
+      xd->block_refs[ref] = ref_buf;
+      if (!vp10_is_valid_scale(&ref_buf->sf))
+        vpx_internal_error(xd->error_info, VPX_CODEC_UNSUP_BITSTREAM,
+                           "Reference frame has invalid dimensions");
+      vp10_setup_pre_planes(xd, ref, ref_buf->buf, mi_row + i, mi_col,
+                            &ref_buf->sf);
+    }
+
+    xd->mb_to_top_edge    = -(((mi_row + i) * MI_SIZE) * 8);
+    mi_x = mi_col << MI_SIZE_LOG2;
+    mi_y = (mi_row + i) << MI_SIZE_LOG2;
+
+    for (j = 0; j < MAX_MB_PLANE; ++j) {
+      const struct macroblockd_plane *pd = &xd->plane[j];
+      bw = VPXMAX((num_4x4_blocks_wide_lookup[bsize] * 2) >> pd->subsampling_x,
+                  4);
+      bh = (mi_step << MI_SIZE_LOG2) >> pd->subsampling_y;
+
+      if (left_mbmi->sb_type < BLOCK_8X8) {
+        const PARTITION_TYPE bp = BLOCK_8X8 - left_mbmi->sb_type;
+        const int have_vsplit = bp != PARTITION_HORZ;
+        const int have_hsplit = bp != PARTITION_VERT;
+        const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
+        const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
+        const int ph = 8 >> (have_hsplit | pd->subsampling_y);
+        int x, y;
+
+        for (y = 0; y < num_4x4_h; ++y)
+          for (x = 0; x < num_4x4_w; ++x) {
+            if ((bp == PARTITION_VERT || bp == PARTITION_SPLIT)
+                && x == 0 && !pd->subsampling_x)
+              continue;
+
+            build_inter_predictors(xd, j, mi_col_offset, mi_row_offset,
+                                   y * 2 + x, bw, bh,
+                                   0, 4 * y, bw, ph,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+                                   0, 0,
+#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
+                                   mi_x, mi_y);
+          }
+      } else {
+        build_inter_predictors(xd, j, mi_col_offset, mi_row_offset, 0,
+                               bw, bh, 0, 0, bw, bh,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+                               0, 0,
+#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
+                               mi_x, mi_y);
+      }
+    }
+#if CONFIG_EXT_INTER
+    *left_mbmi = backup_mbmi;
+#endif  // CONFIG_EXT_INTER
+  }
+  xd->mb_to_top_edge    = -((mi_row * MI_SIZE) * 8);
+}
+#endif  // CONFIG_OBMC
+
+#if CONFIG_EXT_INTER
+#if CONFIG_EXT_PARTITION
+static const int ii_weights1d[MAX_SB_SIZE] = {
+  102, 100,  97,  95,  92,  90,  88,  86,
+  84,  82,  80,  78,  76,  74,  73,  71,
+  69,  68,  67,  65,  64,  62,  61,  60,
+  59,  58,  57,  55,  54,  53,  52,  52,
+  51,  50,  49,  48,  47,  47,  46,  45,
+  45,  44,  43,  43,  42,  41,  41,  40,
+  40,  39,  39,  38,  38,  38,  37,  37,
+  36,  36,  36,  35,  35,  35,  34,  34,
+  34,  33,  33,  33,  33,  32,  32,  32,
+  32,  32,  31,  31,  31,  31,  31,  30,
+  30,  30,  30,  30,  30,  30,  29,  29,
+  29,  29,  29,  29,  29,  29,  28,  28,
+  28,  28,  28,  28,  28,  28,  28,  28,
+  28,  28,  27,  27,  27,  27,  27,  27,
+  27,  27,  27,  27,  27,  27,  27,  27,
+  27,  27,  27,  27,  27,  27,  27,  27,
+};
+static const int ii_size_scales[BLOCK_SIZES] = {
+  32, 16, 16, 16, 8, 8, 8, 4, 4, 4, 2, 2, 2, 1, 1, 1
+};
+#else
+static const int ii_weights1d[MAX_SB_SIZE] = {
+  102, 100,  97,  95,  92,  90,  88,  86,
+  84,  82,  80,  78,  76,  74,  73,  71,
+  69,  68,  67,  65,  64,  62,  61,  60,
+  59,  58,  57,  55,  54,  53,  52,  52,
+  51,  50,  49,  48,  47,  47,  46,  45,
+  45,  44,  43,  43,  42,  41,  41,  40,
+  40,  39,  39,  38,  38,  38,  37,  37,
+  36,  36,  36,  35,  35,  35,  34,  34,
+};
+static const int ii_size_scales[BLOCK_SIZES] = {
+  16, 8, 8, 8, 4, 4, 4, 2, 2, 2, 1, 1, 1
+};
+#endif  // CONFIG_EXT_PARTITION
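+
+// Editor's note: ii_size_scales stretches the fixed ii_weights1d ramp to the
+// block size: row (or column) i of a plane block uses
+// ii_weights1d[i * size_scale], so every block size samples the same
+// descending ramp at a stride matching its dimensions.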
+
+static void combine_interintra(INTERINTRA_MODE mode,
+                               int use_wedge_interintra,
+                               int wedge_index,
+                               int wedge_sign,
+                               BLOCK_SIZE bsize,
+                               BLOCK_SIZE plane_bsize,
+                               uint8_t *comppred,
+                               int compstride,
+                               const uint8_t *interpred,
+                               int interstride,
+                               const uint8_t *intrapred,
+                               int intrastride) {
+  const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+  const int size_scale = ii_size_scales[plane_bsize];
+  int i, j;
+
+  if (use_wedge_interintra) {
+    if (is_interintra_wedge_used(bsize)) {
+      const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index,
+                                                          wedge_sign,
+                                                          bsize);
+      const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
+      const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
+      vpx_blend_a64_mask(comppred, compstride,
+                         intrapred, intrastride,
+                         interpred, interstride,
+                         mask, 4 * num_4x4_blocks_wide_lookup[bsize],
+                         bh, bw, subh, subw);
+    }
+    return;
+  }
+
+  switch (mode) {
+    case II_V_PRED:
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
+          int scale = ii_weights1d[i * size_scale];
+          comppred[i * compstride + j] =
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
+        }
+      }
+      break;
+
+    case II_H_PRED:
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
+          int scale = ii_weights1d[j * size_scale];
+          comppred[i * compstride + j] =
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
+        }
+      }
+      break;
+
+    case II_D63_PRED:
+    case II_D117_PRED:
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
+          int scale = (ii_weights1d[i * size_scale] * 3 +
+                       ii_weights1d[j * size_scale]) >> 2;
+          comppred[i * compstride + j] =
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
+        }
+      }
+      break;
+
+    case II_D207_PRED:
+    case II_D153_PRED:
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
+          int scale = (ii_weights1d[j * size_scale] * 3 +
+                       ii_weights1d[i * size_scale]) >> 2;
+          comppred[i * compstride + j] =
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
+        }
+      }
+      break;
+
+    case II_D135_PRED:
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
+          int scale = ii_weights1d[(i < j ? i : j) * size_scale];
+          comppred[i * compstride + j] =
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
+        }
+      }
+      break;
+
+    case II_D45_PRED:
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
+          int scale = (ii_weights1d[i * size_scale] +
+                       ii_weights1d[j * size_scale]) >> 1;
+          comppred[i * compstride + j] =
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
+        }
+      }
+      break;
+
+    case II_TM_PRED:
+    case II_DC_PRED:
+    default:
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
+          comppred[i * compstride + j] =
+              VPX_BLEND_AVG(intrapred[i * intrastride + j],
+                            interpred[i * interstride + j]);
+        }
+      }
+      break;
+  }
+}
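+
+// For reference: VPX_BLEND_A256(a, v0, v1) computes
+// (a * v0 + (256 - a) * v1 + 128) >> 8 and VPX_BLEND_AVG(v0, v1) computes
+// (v0 + v1 + 1) >> 1 (see vpx_dsp/blend.h), so the per-mode loops above are
+// weighted averages of the intra and inter predictions with weights summing
+// to 256.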
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void combine_interintra_highbd(INTERINTRA_MODE mode,
+                                      int use_wedge_interintra,
+                                      int wedge_index,
+                                      int wedge_sign,
+                                      BLOCK_SIZE bsize,
+                                      BLOCK_SIZE plane_bsize,
+                                      uint8_t *comppred8,
+                                      int compstride,
+                                      const uint8_t *interpred8,
+                                      int interstride,
+                                      const uint8_t *intrapred8,
+                                      int intrastride, int bd) {
+  const int bw = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int bh = 4 * num_4x4_blocks_high_lookup[plane_bsize];
+  const int size_scale = ii_size_scales[plane_bsize];
+  int i, j;
+
+  uint16_t *comppred = CONVERT_TO_SHORTPTR(comppred8);
+  const uint16_t *interpred = CONVERT_TO_SHORTPTR(interpred8);
+  const uint16_t *intrapred = CONVERT_TO_SHORTPTR(intrapred8);
+
+  if (use_wedge_interintra) {
+    if (is_interintra_wedge_used(bsize)) {
+      const uint8_t *mask = vp10_get_contiguous_soft_mask(wedge_index,
+                                                          wedge_sign,
+                                                          bsize);
+      const int subh = 2 * num_4x4_blocks_high_lookup[bsize] == bh;
+      const int subw = 2 * num_4x4_blocks_wide_lookup[bsize] == bw;
+      vpx_highbd_blend_a64_mask(comppred8, compstride,
+                                intrapred8, intrastride,
+                                interpred8, interstride,
+                                mask, bw,
+                                bh, bw, subh, subw, bd);
+    }
+    return;
+  }
+
+  switch (mode) {
+    case II_V_PRED:
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
+          int scale = ii_weights1d[i * size_scale];
+          comppred[i * compstride + j] =
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
+        }
+      }
+      break;
+
+    case II_H_PRED:
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
+          int scale = ii_weights1d[j * size_scale];
+          comppred[i * compstride + j] =
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
+        }
+      }
+      break;
+
+    case II_D63_PRED:
+    case II_D117_PRED:
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
+          int scale = (ii_weights1d[i * size_scale] * 3 +
+                       ii_weights1d[j * size_scale]) >> 2;
+          comppred[i * compstride + j] =
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
+        }
+      }
+      break;
+
+    case II_D207_PRED:
+    case II_D153_PRED:
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
+          int scale = (ii_weights1d[j * size_scale] * 3 +
+                       ii_weights1d[i * size_scale]) >> 2;
+          comppred[i * compstride + j] =
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
+        }
+      }
+      break;
+
+    case II_D135_PRED:
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
+          int scale = ii_weights1d[(i < j ? i : j) * size_scale];
+          comppred[i * compstride + j] =
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
+        }
+      }
+      break;
+
+    case II_D45_PRED:
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
+          int scale = (ii_weights1d[i * size_scale] +
+                       ii_weights1d[j * size_scale]) >> 1;
+          comppred[i * compstride + j] =
+              VPX_BLEND_A256(scale,
+                             intrapred[i * intrastride + j],
+                             interpred[i * interstride + j]);
+        }
+      }
+      break;
+
+    case II_TM_PRED:
+    case II_DC_PRED:
+    default:
+      for (i = 0; i < bh; ++i) {
+        for (j = 0; j < bw; ++j) {
+          comppred[i * compstride + j] =
+              VPX_BLEND_AVG(interpred[i * interstride + j],
+                            intrapred[i * intrastride + j]);
+        }
+      }
+      break;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Break down rectangular intra prediction for joint spatio-temporal prediction
+// into two square intra predictions.
+static void build_intra_predictors_for_interintra(
+    MACROBLOCKD *xd,
+    uint8_t *ref, int ref_stride,
+    uint8_t *dst, int dst_stride,
+    PREDICTION_MODE mode,
+    BLOCK_SIZE bsize,
+    int plane) {
+  BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
+  const int bwl = b_width_log2_lookup[plane_bsize];
+  const int bhl = b_height_log2_lookup[plane_bsize];
+  const int pxbw = 4 << bwl;
+  const int pxbh = 4 << bhl;
+  TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+
+  if (bwl == bhl) {
+    vp10_predict_intra_block(xd, bwl, bhl, max_tx_size, mode,
+                             ref, ref_stride, dst, dst_stride,
+                             0, 0, plane);
+
+  } else if (bwl < bhl) {
+    uint8_t *src_2 = ref + pxbw * ref_stride;
+    uint8_t *dst_2 = dst + pxbw * dst_stride;
+    vp10_predict_intra_block(xd, bwl, bhl, max_tx_size, mode,
+                             ref, ref_stride, dst, dst_stride,
+                             0, 0, plane);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      uint16_t *src_216 = CONVERT_TO_SHORTPTR(src_2);
+      uint16_t *dst_216 = CONVERT_TO_SHORTPTR(dst_2);
+      memcpy(src_216 - ref_stride, dst_216 - dst_stride,
+             sizeof(*src_216) * pxbw);
+    } else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    {
+      memcpy(src_2 - ref_stride, dst_2 - dst_stride, sizeof(*src_2) * pxbw);
+    }
+    vp10_predict_intra_block(xd, bwl, bhl, max_tx_size, mode,
+                             src_2, ref_stride, dst_2, dst_stride,
+                             0, 1 << bwl, plane);
+  } else {  // bwl > bhl
+    int i;
+    uint8_t *src_2 = ref + pxbh;
+    uint8_t *dst_2 = dst + pxbh;
+    vp10_predict_intra_block(xd, bwl, bhl, max_tx_size, mode,
+                             ref, ref_stride, dst, dst_stride,
+                             0, 0, plane);
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      uint16_t *src_216 = CONVERT_TO_SHORTPTR(src_2);
+      uint16_t *dst_216 = CONVERT_TO_SHORTPTR(dst_2);
+      for (i = 0; i < pxbh; ++i)
+        src_216[i * ref_stride - 1] = dst_216[i * dst_stride - 1];
+    } else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    {
+      for (i = 0; i < pxbh; ++i)
+        src_2[i * ref_stride - 1] = dst_2[i * dst_stride - 1];
+    }
+    vp10_predict_intra_block(xd, bwl, bhl, max_tx_size, mode,
+                             src_2, ref_stride, dst_2, dst_stride,
+                             1 << bhl, 0, plane);
+  }
+}
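+
+// Example (editor's sketch): for a BLOCK_8X16 plane block (bwl < bhl), the
+// top 8x8 square is predicted first; its bottom row is then copied into the
+// reference row above the second 8x8 square, so the lower prediction
+// extends the upper one seamlessly.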
+
+// Mapping of interintra to intra mode for use in the intra component
+static const int interintra_to_intra_mode[INTERINTRA_MODES] = {
+  DC_PRED,
+  V_PRED,
+  H_PRED,
+  D45_PRED,
+  D135_PRED,
+  D117_PRED,
+  D153_PRED,
+  D207_PRED,
+  D63_PRED,
+  TM_PRED
+};
+
+void vp10_build_intra_predictors_for_interintra(
+    MACROBLOCKD *xd,
+    BLOCK_SIZE bsize, int plane,
+    uint8_t *dst, int dst_stride) {
+  build_intra_predictors_for_interintra(
+      xd, xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
+      dst, dst_stride,
+      interintra_to_intra_mode[xd->mi[0]->mbmi.interintra_mode],
+      bsize, plane);
+}
+
+void vp10_combine_interintra(MACROBLOCKD *xd,
+                             BLOCK_SIZE bsize, int plane,
+                             const uint8_t *inter_pred, int inter_stride,
+                             const uint8_t *intra_pred, int intra_stride) {
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, &xd->plane[plane]);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    combine_interintra_highbd(xd->mi[0]->mbmi.interintra_mode,
+                              xd->mi[0]->mbmi.use_wedge_interintra,
+                              xd->mi[0]->mbmi.interintra_wedge_index,
+                              xd->mi[0]->mbmi.interintra_wedge_sign,
+                              bsize,
+                              plane_bsize,
+                              xd->plane[plane].dst.buf,
+                              xd->plane[plane].dst.stride,
+                              inter_pred, inter_stride,
+                              intra_pred, intra_stride,
+                              xd->bd);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  combine_interintra(xd->mi[0]->mbmi.interintra_mode,
+                     xd->mi[0]->mbmi.use_wedge_interintra,
+                     xd->mi[0]->mbmi.interintra_wedge_index,
+                     xd->mi[0]->mbmi.interintra_wedge_sign,
+                     bsize,
+                     plane_bsize,
+                     xd->plane[plane].dst.buf, xd->plane[plane].dst.stride,
+                     inter_pred, inter_stride,
+                     intra_pred, intra_stride);
+}
+
+void vp10_build_interintra_predictors_sby(MACROBLOCKD *xd,
+                                          uint8_t *ypred,
+                                          int ystride,
+                                          BLOCK_SIZE bsize) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    DECLARE_ALIGNED(16, uint16_t, intrapredictor[MAX_SB_SQUARE]);
+    vp10_build_intra_predictors_for_interintra(
+        xd, bsize, 0, CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
+    vp10_combine_interintra(xd, bsize, 0, ypred, ystride,
+                            CONVERT_TO_BYTEPTR(intrapredictor), MAX_SB_SIZE);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  {
+    DECLARE_ALIGNED(16, uint8_t, intrapredictor[MAX_SB_SQUARE]);
+    vp10_build_intra_predictors_for_interintra(
+        xd, bsize, 0, intrapredictor, MAX_SB_SIZE);
+    vp10_combine_interintra(xd, bsize, 0, ypred, ystride,
+                            intrapredictor, MAX_SB_SIZE);
+  }
+}
+
+void vp10_build_interintra_predictors_sbc(MACROBLOCKD *xd,
+                                          uint8_t *upred,
+                                          int ustride,
+                                          int plane,
+                                          BLOCK_SIZE bsize) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    DECLARE_ALIGNED(16, uint16_t, uintrapredictor[MAX_SB_SQUARE]);
+    vp10_build_intra_predictors_for_interintra(
+        xd, bsize, plane, CONVERT_TO_BYTEPTR(uintrapredictor), MAX_SB_SIZE);
+    vp10_combine_interintra(xd, bsize, plane, upred, ustride,
+                            CONVERT_TO_BYTEPTR(uintrapredictor), MAX_SB_SIZE);
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  {
+    DECLARE_ALIGNED(16, uint8_t, uintrapredictor[MAX_SB_SQUARE]);
+    vp10_build_intra_predictors_for_interintra(
+        xd, bsize, plane, uintrapredictor, MAX_SB_SIZE);
+    vp10_combine_interintra(xd, bsize, plane, upred, ustride,
+                            uintrapredictor, MAX_SB_SIZE);
+  }
+}
+
+void vp10_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
+                                           uint8_t *upred,
+                                           uint8_t *vpred,
+                                           int ustride, int vstride,
+                                           BLOCK_SIZE bsize) {
+  vp10_build_interintra_predictors_sbc(xd, upred, ustride, 1, bsize);
+  vp10_build_interintra_predictors_sbc(xd, vpred, vstride, 2, bsize);
+}
+
+void vp10_build_interintra_predictors(MACROBLOCKD *xd,
+                                      uint8_t *ypred,
+                                      uint8_t *upred,
+                                      uint8_t *vpred,
+                                      int ystride, int ustride, int vstride,
+                                      BLOCK_SIZE bsize) {
+  vp10_build_interintra_predictors_sby(xd, ypred, ystride, bsize);
+  vp10_build_interintra_predictors_sbuv(xd, upred, vpred,
+                                        ustride, vstride, bsize);
+}
+
+// Builds the inter-predictor for the single ref case
+// for use in the encoder to search the wedges efficiently.
+static void build_inter_predictors_single_buf(MACROBLOCKD *xd, int plane,
+                                              int block,
+                                              int bw, int bh,
+                                              int x, int y, int w, int h,
+                                              int mi_x, int mi_y,
+                                              int ref,
+                                              uint8_t *const ext_dst,
+                                              int ext_dst_stride) {
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const MODE_INFO *mi = xd->mi[0];
+
+  const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
+  struct buf_2d *const pre_buf = &pd->pre[ref];
+#if CONFIG_VP9_HIGHBITDEPTH
+  uint8_t *const dst =
+      (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH ?
+      CONVERT_TO_BYTEPTR(ext_dst) : ext_dst) + ext_dst_stride * y + x;
+#else
+  uint8_t *const dst = ext_dst + ext_dst_stride * y + x;
+#endif
+  const MV mv = mi->mbmi.sb_type < BLOCK_8X8
+      ? average_split_mvs(pd, mi, ref, block)
+      : mi->mbmi.mv[ref].as_mv;
+
+  // TODO(jkoleszar): This clamping is done in the incorrect place for the
+  // scaling case. It needs to be done on the scaled MV, not the pre-scaling
+  // MV. Note however that it performs the subsampling-aware scaling so
+  // that the result is always q4.
+  // The MV precision here is MV_PRECISION_Q4.
+  const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, &mv, bw, bh,
+                                             pd->subsampling_x,
+                                             pd->subsampling_y);
+
+  uint8_t *pre;
+  MV32 scaled_mv;
+  int xs, ys, subpel_x, subpel_y;
+  const int is_scaled = vp10_is_scaled(sf);
+
+  if (is_scaled) {
+    pre = pre_buf->buf + scaled_buffer_offset(x, y, pre_buf->stride, sf);
+    scaled_mv = vp10_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
+    xs = sf->x_step_q4;
+    ys = sf->y_step_q4;
+  } else {
+    pre = pre_buf->buf + (y * pre_buf->stride + x);
+    scaled_mv.row = mv_q4.row;
+    scaled_mv.col = mv_q4.col;
+    xs = ys = 16;
+  }
+
+  subpel_x = scaled_mv.col & SUBPEL_MASK;
+  subpel_y = scaled_mv.row & SUBPEL_MASK;
+  pre += (scaled_mv.row >> SUBPEL_BITS) * pre_buf->stride
+      + (scaled_mv.col >> SUBPEL_BITS);
+
+  vp10_make_inter_predictor(pre, pre_buf->stride, dst, ext_dst_stride,
+                            subpel_x, subpel_y, sf, w, h, 0,
+                            mi->mbmi.interp_filter, xs, ys, xd);
+}
+
+void vp10_build_inter_predictors_for_planes_single_buf(
+    MACROBLOCKD *xd, BLOCK_SIZE bsize,
+    int plane_from, int plane_to,
+    int mi_row, int mi_col, int ref,
+    uint8_t *ext_dst[3], int ext_dst_stride[3]) {
+  int plane;
+  const int mi_x = mi_col * MI_SIZE;
+  const int mi_y = mi_row * MI_SIZE;
+  for (plane = plane_from; plane <= plane_to; ++plane) {
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
+                                                        &xd->plane[plane]);
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+    const int bw = 4 * num_4x4_w;
+    const int bh = 4 * num_4x4_h;
+
+    if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
+      int x, y;
+      assert(bsize == BLOCK_8X8);
+      for (y = 0; y < num_4x4_h; ++y)
+        for (x = 0; x < num_4x4_w; ++x)
+          build_inter_predictors_single_buf(xd, plane,
+                                            y * 2 + x, bw, bh,
+                                            4 * x, 4 * y, 4, 4,
+                                            mi_x, mi_y, ref,
+                                            ext_dst[plane],
+                                            ext_dst_stride[plane]);
+    } else {
+      build_inter_predictors_single_buf(xd, plane,
+                                        0, bw, bh,
+                                        0, 0, bw, bh,
+                                        mi_x, mi_y, ref,
+                                        ext_dst[plane],
+                                        ext_dst_stride[plane]);
+    }
+  }
+}
+
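+// Writes the final prediction for one block from the two single-reference
+// predictions in ext_dst0/ext_dst1: a wedge-masked blend when compound
+// wedge prediction is active, otherwise a plain copy of ext_dst0.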
+static void build_wedge_inter_predictor_from_buf(MACROBLOCKD *xd, int plane,
+                                                 int x, int y, int w, int h,
+                                                 uint8_t *ext_dst0,
+                                                 int ext_dst_stride0,
+                                                 uint8_t *ext_dst1,
+                                                 int ext_dst_stride1) {
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const int is_compound = has_second_ref(mbmi);
+  MACROBLOCKD_PLANE *const pd = &xd->plane[plane];
+  struct buf_2d *const dst_buf = &pd->dst;
+  uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
+
+  if (is_compound
+      && is_interinter_wedge_used(mbmi->sb_type)
+      && mbmi->use_wedge_interinter) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      build_masked_compound_wedge_highbd(
+          dst, dst_buf->stride,
+          CONVERT_TO_BYTEPTR(ext_dst0), ext_dst_stride0,
+          CONVERT_TO_BYTEPTR(ext_dst1), ext_dst_stride1,
+          mbmi->interinter_wedge_index,
+          mbmi->interinter_wedge_sign,
+          mbmi->sb_type, h, w,
+          xd->bd);
+    else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      build_masked_compound_wedge(
+          dst, dst_buf->stride,
+          ext_dst0, ext_dst_stride0,
+          ext_dst1, ext_dst_stride1,
+          mbmi->interinter_wedge_index,
+          mbmi->interinter_wedge_sign,
+          mbmi->sb_type, h, w);
+  } else {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      vpx_highbd_convolve_copy(CONVERT_TO_BYTEPTR(ext_dst0),  ext_dst_stride0,
+                               dst, dst_buf->stride, NULL, 0, NULL, 0, w, h,
+                               xd->bd);
+    else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      vpx_convolve_copy(ext_dst0, ext_dst_stride0,
+                        dst, dst_buf->stride, NULL, 0, NULL, 0, w, h);
+  }
+}
+
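+// Per-plane wrapper for the above; handles the sub-8x8 case by looping
+// over 4x4 sub-blocks.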
+void vp10_build_wedge_inter_predictor_from_buf(
+    MACROBLOCKD *xd, BLOCK_SIZE bsize,
+    int plane_from, int plane_to,
+    uint8_t *ext_dst0[3], int ext_dst_stride0[3],
+    uint8_t *ext_dst1[3], int ext_dst_stride1[3]) {
+  int plane;
+  for (plane = plane_from; plane <= plane_to; ++plane) {
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize,
+                                                        &xd->plane[plane]);
+    const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+
+    if (xd->mi[0]->mbmi.sb_type < BLOCK_8X8) {
+      int x, y;
+      assert(bsize == BLOCK_8X8);
+      for (y = 0; y < num_4x4_h; ++y)
+        for (x = 0; x < num_4x4_w; ++x)
+          build_wedge_inter_predictor_from_buf(xd, plane,
+                                               4 * x, 4 * y, 4, 4,
+                                               ext_dst0[plane],
+                                               ext_dst_stride0[plane],
+                                               ext_dst1[plane],
+                                               ext_dst_stride1[plane]);
+    } else {
+      const int bw = 4 * num_4x4_w;
+      const int bh = 4 * num_4x4_h;
+      build_wedge_inter_predictor_from_buf(xd, plane,
+                                           0, 0, bw, bh,
+                                           ext_dst0[plane],
+                                           ext_dst_stride0[plane],
+                                           ext_dst1[plane],
+                                           ext_dst_stride1[plane]);
+    }
+  }
+}
+#endif  // CONFIG_EXT_INTER
diff --git a/vp10/common/reconinter.h b/vp10/common/reconinter.h
index 5678f47..c32596e 100644
--- a/vp10/common/reconinter.h
+++ b/vp10/common/reconinter.h
@@ -13,8 +13,8 @@
 
 #include "vp10/common/filter.h"
 #include "vp10/common/onyxc_int.h"
+#include "vp10/common/vp10_convolve.h"
 #include "vpx/vpx_integer.h"
-#include "vpx_dsp/vpx_filter.h"
 
 #ifdef __cplusplus
 extern "C" {
@@ -25,29 +25,264 @@
                                    const int subpel_x,
                                    const int subpel_y,
                                    const struct scale_factors *sf,
-                                   int w, int h, int ref,
-                                   const InterpKernel *kernel,
+                                   int w, int h, int ref_idx,
+#if CONFIG_DUAL_FILTER
+                                   const INTERP_FILTER *interp_filter,
+#else
+                                   const INTERP_FILTER interp_filter,
+#endif
                                    int xs, int ys) {
-  sf->predict[subpel_x != 0][subpel_y != 0][ref](
-      src, src_stride, dst, dst_stride,
-      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h);
+#if CONFIG_DUAL_FILTER
+  InterpFilterParams interp_filter_params_x =
+      vp10_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
+  InterpFilterParams interp_filter_params_y =
+      vp10_get_interp_filter_params(interp_filter[0 + 2 * ref_idx]);
+#else
+  InterpFilterParams interp_filter_params =
+      vp10_get_interp_filter_params(interp_filter);
+#endif
+
+#if CONFIG_DUAL_FILTER
+  if (interp_filter_params_x.taps == SUBPEL_TAPS &&
+      interp_filter_params_y.taps == SUBPEL_TAPS &&
+      w > 2 && h > 2) {
+    const int16_t *kernel_x =
+        vp10_get_interp_filter_subpel_kernel(interp_filter_params_x, subpel_x);
+    const int16_t *kernel_y =
+        vp10_get_interp_filter_subpel_kernel(interp_filter_params_y, subpel_y);
+#else
+  if (interp_filter_params.taps == SUBPEL_TAPS) {
+    const int16_t *kernel_x =
+        vp10_get_interp_filter_subpel_kernel(interp_filter_params, subpel_x);
+    const int16_t *kernel_y =
+        vp10_get_interp_filter_subpel_kernel(interp_filter_params, subpel_y);
+#endif
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+    if (IsInterpolatingFilter(interp_filter)) {
+      // Interpolating filter
+      sf->predict[subpel_x != 0][subpel_y != 0][ref_idx](
+          src, src_stride, dst, dst_stride,
+          kernel_x, xs, kernel_y, ys, w, h);
+    } else {
+      sf->predict_ni[subpel_x != 0][subpel_y != 0][ref_idx](
+          src, src_stride, dst, dst_stride,
+          kernel_x, xs, kernel_y, ys, w, h);
+    }
+#else
+    sf->predict[subpel_x != 0][subpel_y != 0][ref_idx](
+        src, src_stride, dst, dst_stride,
+        kernel_x, xs, kernel_y, ys, w, h);
+#endif  // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+  } else {
+    // ref_idx > 0 means this is the second reference frame: the first
+    // reference's prediction is already in dst, so the convolve averages
+    // the first and second results.
+    vp10_convolve(src, src_stride, dst, dst_stride, w, h, interp_filter,
+                  subpel_x, xs, subpel_y, ys, ref_idx);
+  }
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE void high_inter_predictor(const uint8_t *src, int src_stride,
-                                        uint8_t *dst, int dst_stride,
-                                        const int subpel_x,
-                                        const int subpel_y,
-                                        const struct scale_factors *sf,
-                                        int w, int h, int ref,
-                                        const InterpKernel *kernel,
-                                        int xs, int ys, int bd) {
-  sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
-      src, src_stride, dst, dst_stride,
-      kernel[subpel_x], xs, kernel[subpel_y], ys, w, h, bd);
+static INLINE void highbd_inter_predictor(const uint8_t *src, int src_stride,
+                                          uint8_t *dst, int dst_stride,
+                                          const int subpel_x,
+                                          const int subpel_y,
+                                          const struct scale_factors *sf,
+                                          int w, int h, int ref,
+#if CONFIG_DUAL_FILTER
+                                          const INTERP_FILTER *interp_filter,
+#else
+                                          const INTERP_FILTER interp_filter,
+#endif
+                                          int xs, int ys, int bd) {
+#if CONFIG_DUAL_FILTER
+  InterpFilterParams interp_filter_params_x =
+      vp10_get_interp_filter_params(interp_filter[1 + 2 * ref]);
+  InterpFilterParams interp_filter_params_y =
+      vp10_get_interp_filter_params(interp_filter[0 + 2 * ref]);
+#else
+  InterpFilterParams interp_filter_params =
+      vp10_get_interp_filter_params(interp_filter);
+#endif
+
+#if CONFIG_DUAL_FILTER
+  if (interp_filter_params_x.taps == SUBPEL_TAPS &&
+      interp_filter_params_y.taps == SUBPEL_TAPS &&
+      w > 2 && h > 2) {
+    const int16_t *kernel_x =
+        vp10_get_interp_filter_subpel_kernel(interp_filter_params_x, subpel_x);
+    const int16_t *kernel_y =
+        vp10_get_interp_filter_subpel_kernel(interp_filter_params_y, subpel_y);
+#else
+  if (interp_filter_params.taps == SUBPEL_TAPS) {
+    const int16_t *kernel_x =
+        vp10_get_interp_filter_subpel_kernel(interp_filter_params, subpel_x);
+    const int16_t *kernel_y =
+        vp10_get_interp_filter_subpel_kernel(interp_filter_params, subpel_y);
+#endif  // CONFIG_DUAL_FILTER
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+    if (IsInterpolatingFilter(interp_filter)) {
+      // Interpolating filter
+      sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
+          src, src_stride, dst, dst_stride,
+          kernel_x, xs, kernel_y, ys, w, h, bd);
+    } else {
+      sf->highbd_predict_ni[subpel_x != 0][subpel_y != 0][ref](
+          src, src_stride, dst, dst_stride,
+          kernel_x, xs, kernel_y, ys, w, h, bd);
+    }
+#else
+    sf->highbd_predict[subpel_x != 0][subpel_y != 0][ref](
+        src, src_stride, dst, dst_stride,
+        kernel_x, xs, kernel_y, ys, w, h, bd);
+#endif  // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+  } else {
+    // ref > 0 means this is the second reference frame: the first
+    // reference's prediction is already in dst, so the convolve averages
+    // the first and second results.
+    int avg = ref > 0;
+    vp10_highbd_convolve(src, src_stride, dst, dst_stride, w, h,
+                         interp_filter, subpel_x, xs, subpel_y, ys, avg,
+                         bd);
+  }
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+#if CONFIG_EXT_INTER
+// Set to one to use larger codebooks
+#define USE_LARGE_WEDGE_CODEBOOK  0
+
+#if USE_LARGE_WEDGE_CODEBOOK
+#define MAX_WEDGE_TYPES   (1 << 5)
+#else
+#define MAX_WEDGE_TYPES   (1 << 4)
+#endif
+
+#define MAX_WEDGE_SIZE_LOG2   5   // 32x32
+#define MAX_WEDGE_SIZE        (1 << MAX_WEDGE_SIZE_LOG2)
+#define MAX_WEDGE_SQUARE      (MAX_WEDGE_SIZE * MAX_WEDGE_SIZE)
+
+#define WEDGE_WEIGHT_BITS 6
+
+#define WEDGE_NONE       -1
+
+// Angles are measured anti-clockwise from the horizontal axis.
+typedef enum {
+  WEDGE_HORIZONTAL = 0,
+  WEDGE_VERTICAL = 1,
+  WEDGE_OBLIQUE27 = 2,
+  WEDGE_OBLIQUE63 = 3,
+  WEDGE_OBLIQUE117 = 4,
+  WEDGE_OBLIQUE153 = 5,
+  WEDGE_DIRECTIONS
+} WedgeDirectionType;
+
+// 3-tuple: {direction, x_offset, y_offset}
+typedef struct {
+  WedgeDirectionType direction;
+  int x_offset;
+  int y_offset;
+} wedge_code_type;
+
+typedef uint8_t *wedge_masks_type[MAX_WEDGE_TYPES];
+
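+// Per-block-size wedge parameters: codebook size in bits, the codebook
+// itself, per-wedge sign flips, a smoothing parameter, and precomputed
+// masks.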
+typedef struct {
+  int bits;
+  const wedge_code_type *codebook;
+  uint8_t *signflip;
+  int smoother;
+  wedge_masks_type *masks;
+} wedge_params_type;
+
+extern const wedge_params_type wedge_params_lookup[BLOCK_SIZES];
+
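+// Number of bits used to signal the wedge index for a block size; zero
+// means no wedge codebook is defined for that size.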
+static INLINE int get_wedge_bits_lookup(BLOCK_SIZE sb_type) {
+  return wedge_params_lookup[sb_type].bits;
+}
+
+static INLINE int is_interinter_wedge_used(BLOCK_SIZE sb_type) {
+  return wedge_params_lookup[sb_type].bits > 0;
+}
+
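+// Inter-inter wedge signaling cost: the index bits plus one sign bit.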
+static INLINE int get_interinter_wedge_bits(BLOCK_SIZE sb_type) {
+  const int wbits = wedge_params_lookup[sb_type].bits;
+  return (wbits > 0) ? wbits + 1 : 0;
+}
+
+static INLINE int is_interintra_wedge_used(BLOCK_SIZE sb_type) {
+  return wedge_params_lookup[sb_type].bits > 0;
+}
+
+static INLINE int get_interintra_wedge_bits(BLOCK_SIZE sb_type) {
+  return wedge_params_lookup[sb_type].bits;
+}
+#endif  // CONFIG_EXT_INTER
+
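+// Low-level worker that builds the inter prediction for a single
+// (sub-)block of one plane.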
+void build_inter_predictors(MACROBLOCKD *xd, int plane,
+#if CONFIG_OBMC
+                            int mi_col_offset, int mi_row_offset,
+#endif  // CONFIG_OBMC
+                            int block,
+                            int bw, int bh,
+                            int x, int y, int w, int h,
+#if CONFIG_SUPERTX && CONFIG_EXT_INTER
+                            int wedge_offset_x, int wedge_offset_y,
+#endif  // CONFIG_SUPERTX && CONFIG_EXT_INTER
+                            int mi_x, int mi_y);
+
+static INLINE void vp10_make_inter_predictor(
+    const uint8_t *src,
+    int src_stride,
+    uint8_t *dst,
+    int dst_stride,
+    const int subpel_x,
+    const int subpel_y,
+    const struct scale_factors *sf,
+    int w, int h, int ref,
+#if CONFIG_DUAL_FILTER
+    const INTERP_FILTER *interp_filter,
+#else
+    const INTERP_FILTER interp_filter,
+#endif
+    int xs, int ys,
+    const MACROBLOCKD *xd) {
+  (void) xd;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    highbd_inter_predictor(src, src_stride, dst, dst_stride,
+                           subpel_x, subpel_y, sf, w, h, ref,
+                           interp_filter, xs, ys, xd->bd);
+  else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    inter_predictor(src, src_stride, dst, dst_stride,
+                    subpel_x, subpel_y, sf, w, h, ref,
+                    interp_filter, xs, ys);
+}
+
+#if CONFIG_EXT_INTER
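+// Masked (wedge) variant of vp10_make_inter_predictor.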
+void vp10_make_masked_inter_predictor(
+    const uint8_t *pre,
+    int pre_stride,
+    uint8_t *dst,
+    int dst_stride,
+    const int subpel_x,
+    const int subpel_y,
+    const struct scale_factors *sf,
+    int w, int h,
+#if CONFIG_DUAL_FILTER
+    const INTERP_FILTER *interp_filter,
+#else
+    const INTERP_FILTER interp_filter,
+#endif
+    int xs, int ys,
+#if CONFIG_SUPERTX
+    int wedge_offset_x, int wedge_offset_y,
+#endif  // CONFIG_SUPERTX
+    const MACROBLOCKD *xd);
+#endif  // CONFIG_EXT_INTER
+
 static INLINE int round_mv_comp_q4(int value) {
   return (value < 0 ? value - 2 : value + 2) / 4;
 }
@@ -57,10 +292,10 @@
                               mi->bmi[1].as_mv[idx].as_mv.row +
                               mi->bmi[2].as_mv[idx].as_mv.row +
                               mi->bmi[3].as_mv[idx].as_mv.row),
-             round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.col +
-                              mi->bmi[1].as_mv[idx].as_mv.col +
-                              mi->bmi[2].as_mv[idx].as_mv.col +
-                              mi->bmi[3].as_mv[idx].as_mv.col) };
+     round_mv_comp_q4(mi->bmi[0].as_mv[idx].as_mv.col +
+                      mi->bmi[1].as_mv[idx].as_mv.col +
+                      mi->bmi[2].as_mv[idx].as_mv.col +
+                      mi->bmi[3].as_mv[idx].as_mv.col) };
   return res;
 }
 
@@ -126,33 +361,58 @@
   return res;
 }
 
-void build_inter_predictors(MACROBLOCKD *xd, int plane, int block,
-                                   int bw, int bh,
-                                   int x, int y, int w, int h,
-                                   int mi_x, int mi_y);
-
 void vp10_build_inter_predictor_sub8x8(MACROBLOCKD *xd, int plane,
                                        int i, int ir, int ic,
                                        int mi_row, int mi_col);
 
 void vp10_build_inter_predictors_sby(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                    BLOCK_SIZE bsize);
-
-void vp10_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                    BLOCK_SIZE bsize, int plane);
-
-void vp10_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
                                      BLOCK_SIZE bsize);
 
+void vp10_build_inter_predictors_sbp(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                     BLOCK_SIZE bsize, int plane);
+
+void vp10_build_inter_predictors_sbuv(MACROBLOCKD *xd, int mi_row, int mi_col,
+                                      BLOCK_SIZE bsize);
+
 void vp10_build_inter_predictors_sb(MACROBLOCKD *xd, int mi_row, int mi_col,
-                                   BLOCK_SIZE bsize);
+                                    BLOCK_SIZE bsize);
+
+#if CONFIG_SUPERTX
+void vp10_build_inter_predictors_sb_sub8x8_extend(
+    MACROBLOCKD *xd,
+#if CONFIG_EXT_INTER
+    int mi_row_ori, int mi_col_ori,
+#endif  // CONFIG_EXT_INTER
+    int mi_row, int mi_col,
+    BLOCK_SIZE bsize, int block);
+
+void vp10_build_inter_predictors_sb_extend(
+    MACROBLOCKD *xd,
+#if CONFIG_EXT_INTER
+    int mi_row_ori, int mi_col_ori,
+#endif  // CONFIG_EXT_INTER
+    int mi_row, int mi_col,
+    BLOCK_SIZE bsize);
+struct macroblockd_plane;
+void vp10_build_masked_inter_predictor_complex(
+    MACROBLOCKD *xd,
+    uint8_t *dst, int dst_stride,
+    const uint8_t *pre, int pre_stride,
+    int mi_row, int mi_col, int mi_row_ori, int mi_col_ori,
+    BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+    PARTITION_TYPE partition, int plane);
+#endif  // CONFIG_SUPERTX
 
 void vp10_build_inter_predictor(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride,
                                const MV *mv_q3,
                                const struct scale_factors *sf,
                                int w, int h, int do_avg,
-                               const InterpKernel *kernel,
+#if CONFIG_DUAL_FILTER
+                               const INTERP_FILTER *interp_filter,
+#else
+                               const INTERP_FILTER interp_filter,
+#endif
                                enum mv_precision precision,
                                int x, int y);
 
@@ -162,7 +422,11 @@
                                       const MV *mv_q3,
                                       const struct scale_factors *sf,
                                       int w, int h, int do_avg,
-                                      const InterpKernel *kernel,
+#if CONFIG_DUAL_FILTER
+                                      const INTERP_FILTER *interp_filter,
+#else
+                                      const INTERP_FILTER interp_filter,
+#endif
                                       enum mv_precision precision,
                                       int x, int y, int bd);
 #endif
@@ -193,6 +457,203 @@
                           const YV12_BUFFER_CONFIG *src, int mi_row, int mi_col,
                           const struct scale_factors *sf);
 
+#if CONFIG_DUAL_FILTER
+// Detects whether the block has sub-pixel motion vectors, checked per MV
+// component: bit 0 of dir selects the component (0 = row, 1 = col) and
+// bit 1 selects the reference frame.
+static INLINE int has_subpel_mv_component(const MODE_INFO *const mi,
+                                          const MACROBLOCKD *const xd,
+                                          int dir) {
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  int plane;
+  int ref = (dir >> 1);
+
+  if (bsize >= BLOCK_8X8) {
+    if (dir & 0x01) {
+      if (mbmi->mv[ref].as_mv.col & SUBPEL_MASK)
+        return 1;
+    } else {
+      if (mbmi->mv[ref].as_mv.row & SUBPEL_MASK)
+        return 1;
+    }
+  } else {
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      const PARTITION_TYPE bp = BLOCK_8X8 - bsize;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const int have_vsplit = bp != PARTITION_HORZ;
+      const int have_hsplit = bp != PARTITION_VERT;
+      const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
+      const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
+
+      int x, y;
+      for (y = 0; y < num_4x4_h; ++y) {
+        for (x = 0; x < num_4x4_w; ++x) {
+          const MV mv = average_split_mvs(pd, mi, ref, y * 2 + x);
+          if (dir & 0x01) {
+            if (mv.col & SUBPEL_MASK)
+              return 1;
+          } else {
+            if (mv.row & SUBPEL_MASK)
+              return 1;
+          }
+        }
+      }
+    }
+  }
+
+  return 0;
+}
+#endif
+
+#if CONFIG_EXT_INTERP
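+// Returns 1 if an interpolation filter must be signaled for the current
+// block: scaled references and any sub-pixel MV component require it.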
+static INLINE int vp10_is_interp_needed(const MACROBLOCKD *const xd) {
+  MODE_INFO *const mi = xd->mi[0];
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int is_compound = has_second_ref(mbmi);
+  int intpel_mv = 1;
+  int plane;
+
+#if SUPPORT_NONINTERPOLATING_FILTERS
+  // TODO(debargha): This is currently only for experimentation
+  // with non-interpolating filters. Remove later.
+  // If any of the filters are non-interpolating, then always signal the
+  // interpolation filter.
+  int i;
+  for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+    if (!IsInterpolatingFilter(i)) return 1;
+  }
+#endif
+
+  // For scaled references, the interpolation filter is always signaled.
+  if (vp10_is_scaled(&xd->block_refs[0]->sf))
+    return 1;
+  if (is_compound && vp10_is_scaled(&xd->block_refs[1]->sf))
+    return 1;
+
+  if (bsize < BLOCK_8X8) {
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      const PARTITION_TYPE bp = BLOCK_8X8 - bsize;
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      const int have_vsplit = bp != PARTITION_HORZ;
+      const int have_hsplit = bp != PARTITION_VERT;
+      const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
+      const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
+      int ref;
+      for (ref = 0; ref < 1 + is_compound; ++ref) {
+        int x, y;
+        for (y = 0; y < num_4x4_h; ++y)
+          for (x = 0; x < num_4x4_w; ++x) {
+            const MV mv = average_split_mvs(pd, mi, ref, y * 2 + x);
+            if (mv_has_subpel(&mv))
+              return 1;
+          }
+      }
+    }
+    return 0;
+  } else {
+    intpel_mv = !mv_has_subpel(&mbmi->mv[0].as_mv);
+    if (is_compound && intpel_mv) {
+      intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
+    }
+  }
+  return !intpel_mv;
+}
+#endif  // CONFIG_EXT_INTERP
+
+#if CONFIG_OBMC
+const uint8_t* vp10_get_obmc_mask(int length);
+void vp10_build_obmc_inter_prediction(VP10_COMMON *cm,
+                                      MACROBLOCKD *xd, int mi_row, int mi_col,
+                                      uint8_t *above[MAX_MB_PLANE],
+                                      int above_stride[MAX_MB_PLANE],
+                                      uint8_t *left[MAX_MB_PLANE],
+                                      int left_stride[MAX_MB_PLANE]);
+void vp10_build_prediction_by_above_preds(VP10_COMMON *cm,
+                                          MACROBLOCKD *xd,
+                                          int mi_row, int mi_col,
+                                          uint8_t *tmp_buf[MAX_MB_PLANE],
+                                          int tmp_stride[MAX_MB_PLANE]);
+void vp10_build_prediction_by_left_preds(VP10_COMMON *cm,
+                                         MACROBLOCKD *xd,
+                                         int mi_row, int mi_col,
+                                         uint8_t *tmp_buf[MAX_MB_PLANE],
+                                         int tmp_stride[MAX_MB_PLANE]);
+#endif  // CONFIG_OBMC
+
+#if CONFIG_EXT_INTER
+#define MASK_MASTER_SIZE   (2 * MAX_SB_SIZE)
+#define MASK_MASTER_STRIDE (2 * MAX_SB_SIZE)
+
+void vp10_init_wedge_masks(void);
+
+static INLINE const uint8_t *vp10_get_contiguous_soft_mask(int wedge_index,
+                                                           int wedge_sign,
+                                                           BLOCK_SIZE sb_type) {
+  return wedge_params_lookup[sb_type].masks[wedge_sign][wedge_index];
+}
+
+const uint8_t *vp10_get_soft_mask(int wedge_index,
+                                  int wedge_sign,
+                                  BLOCK_SIZE sb_type,
+                                  int wedge_offset_x,
+                                  int wedge_offset_y);
+
+void vp10_build_interintra_predictors(MACROBLOCKD *xd,
+                                      uint8_t *ypred,
+                                      uint8_t *upred,
+                                      uint8_t *vpred,
+                                      int ystride,
+                                      int ustride,
+                                      int vstride,
+                                      BLOCK_SIZE bsize);
+void vp10_build_interintra_predictors_sby(MACROBLOCKD *xd,
+                                          uint8_t *ypred,
+                                          int ystride,
+                                          BLOCK_SIZE bsize);
+void vp10_build_interintra_predictors_sbc(MACROBLOCKD *xd,
+                                          uint8_t *upred,
+                                          int ustride,
+                                          int plane,
+                                          BLOCK_SIZE bsize);
+void vp10_build_interintra_predictors_sbuv(MACROBLOCKD *xd,
+                                           uint8_t *upred,
+                                           uint8_t *vpred,
+                                           int ustride, int vstride,
+                                           BLOCK_SIZE bsize);
+
+void vp10_build_intra_predictors_for_interintra(
+    MACROBLOCKD *xd,
+    BLOCK_SIZE bsize, int plane,
+    uint8_t *intra_pred, int intra_stride);
+void vp10_combine_interintra(
+    MACROBLOCKD *xd,
+    BLOCK_SIZE bsize, int plane,
+    const uint8_t *inter_pred, int inter_stride,
+    const uint8_t *intra_pred, int intra_stride);
+
+// Encoder only
+void vp10_build_inter_predictors_for_planes_single_buf(
+    MACROBLOCKD *xd, BLOCK_SIZE bsize,
+    int plane_from, int plane_to,
+    int mi_row, int mi_col, int ref,
+    uint8_t *ext_dst[3], int ext_dst_stride[3]);
+void vp10_build_wedge_inter_predictor_from_buf(
+    MACROBLOCKD *xd, BLOCK_SIZE bsize,
+    int plane_from, int plane_to,
+    uint8_t *ext_dst0[3], int ext_dst_stride0[3],
+    uint8_t *ext_dst1[3], int ext_dst_stride1[3]);
+#endif  // CONFIG_EXT_INTER
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/common/reconintra.c b/vp10/common/reconintra.c
index e9e3949..89ff13b 100644
--- a/vp10/common/reconintra.c
+++ b/vp10/common/reconintra.c
@@ -8,8 +8,11 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <math.h>
+
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/system_state.h"
 
 #if CONFIG_VP9_HIGHBITDEPTH
 #include "vpx_dsp/vpx_dsp_common.h"
@@ -21,7 +24,6 @@
 #include "vp10/common/reconintra.h"
 #include "vp10/common/onyxc_int.h"
 
-#if CONFIG_MISC_FIXES
 enum {
   NEED_LEFT = 1 << 1,
   NEED_ABOVE = 1 << 2,
@@ -42,52 +44,31 @@
   NEED_ABOVE | NEED_ABOVERIGHT,             // D63
   NEED_LEFT | NEED_ABOVE | NEED_ABOVELEFT,  // TM
 };
-#else
-enum {
-  NEED_LEFT = 1 << 1,
-  NEED_ABOVE = 1 << 2,
-  NEED_ABOVERIGHT = 1 << 3,
-};
 
-static const uint8_t extend_modes[INTRA_MODES] = {
-  NEED_ABOVE | NEED_LEFT,       // DC
-  NEED_ABOVE,                   // V
-  NEED_LEFT,                    // H
-  NEED_ABOVERIGHT,              // D45
-  NEED_LEFT | NEED_ABOVE,       // D135
-  NEED_LEFT | NEED_ABOVE,       // D117
-  NEED_LEFT | NEED_ABOVE,       // D153
-  NEED_LEFT,                    // D207
-  NEED_ABOVERIGHT,              // D63
-  NEED_LEFT | NEED_ABOVE,       // TM
-};
-#endif
-
-#if CONFIG_MISC_FIXES
-static const uint8_t orders_64x64[1] = { 0 };
-static const uint8_t orders_64x32[2] = { 0, 1 };
-static const uint8_t orders_32x64[2] = { 0, 1 };
-static const uint8_t orders_32x32[4] = {
+static const uint8_t orders_128x128[1] = { 0 };
+static const uint8_t orders_128x64[2] = { 0, 1 };
+static const uint8_t orders_64x128[2] = { 0, 1 };
+static const uint8_t orders_64x64[4] = {
   0, 1,
   2, 3,
 };
-static const uint8_t orders_32x16[8] = {
+static const uint8_t orders_64x32[8] = {
   0, 2,
   1, 3,
   4, 6,
   5, 7,
 };
-static const uint8_t orders_16x32[8] = {
+static const uint8_t orders_32x64[8] = {
   0, 1, 2, 3,
   4, 5, 6, 7,
 };
-static const uint8_t orders_16x16[16] = {
+static const uint8_t orders_32x32[16] = {
   0,   1,  4,  5,
   2,   3,  6,  7,
   8,   9, 12, 13,
   10, 11, 14, 15,
 };
-static const uint8_t orders_16x8[32] = {
+static const uint8_t orders_32x16[32] = {
   0,   2,  8, 10,
   1,   3,  9, 11,
   4,   6, 12, 14,
@@ -97,13 +78,13 @@
   20, 22, 28, 30,
   21, 23, 29, 31,
 };
-static const uint8_t orders_8x16[32] = {
+static const uint8_t orders_16x32[32] = {
   0,   1,  2,  3,  8,  9, 10, 11,
   4,   5,  6,  7, 12, 13, 14, 15,
   16, 17, 18, 19, 24, 25, 26, 27,
   20, 21, 22, 23, 28, 29, 30, 31,
 };
-static const uint8_t orders_8x8[64] = {
+static const uint8_t orders_16x16[64] = {
   0,   1,  4,  5, 16, 17, 20, 21,
   2,   3,  6,  7, 18, 19, 22, 23,
   8,   9, 12, 13, 24, 25, 28, 29,
@@ -113,82 +94,241 @@
   40, 41, 44, 45, 56, 57, 60, 61,
   42, 43, 46, 47, 58, 59, 62, 63,
 };
-static const uint8_t *const orders[BLOCK_SIZES] = {
-  orders_8x8, orders_8x8, orders_8x8, orders_8x8,
-  orders_8x16, orders_16x8, orders_16x16,
-  orders_16x32, orders_32x16, orders_32x32,
-  orders_32x64, orders_64x32, orders_64x64,
+
+#if CONFIG_EXT_PARTITION
+static const uint8_t orders_16x8[128] = {
+  0,   2,  8, 10,  32,  34,  40,  42,
+  1,   3,  9, 11,  33,  35,  41,  43,
+  4,   6, 12, 14,  36,  38,  44,  46,
+  5,   7, 13, 15,  37,  39,  45,  47,
+  16, 18, 24, 26,  48,  50,  56,  58,
+  17, 19, 25, 27,  49,  51,  57,  59,
+  20, 22, 28, 30,  52,  54,  60,  62,
+  21, 23, 29, 31,  53,  55,  61,  63,
+  64, 66, 72, 74,  96,  98, 104, 106,
+  65, 67, 73, 75,  97,  99, 105, 107,
+  68, 70, 76, 78, 100, 102, 108, 110,
+  69, 71, 77, 79, 101, 103, 109, 111,
+  80, 82, 88, 90, 112, 114, 120, 122,
+  81, 83, 89, 91, 113, 115, 121, 123,
+  84, 86, 92, 94, 116, 118, 124, 126,
+  85, 87, 93, 95, 117, 119, 125, 127,
 };
+static const uint8_t orders_8x16[128] = {
+  0,   1,  2,  3,  8,  9, 10, 11,  32,  33,  34,  35,  40,  41,  42,  43,
+  4,   5,  6,  7, 12, 13, 14, 15,  36,  37,  38,  39,  44,  45,  46,  47,
+  16, 17, 18, 19, 24, 25, 26, 27,  48,  49,  50,  51,  56,  57,  58,  59,
+  20, 21, 22, 23, 28, 29, 30, 31,  52,  53,  54,  55,  60,  61,  62,  63,
+  64, 65, 66, 67, 72, 73, 74, 75,  96,  97,  98,  99, 104, 105, 106, 107,
+  68, 69, 70, 71, 76, 77, 78, 79, 100, 101, 102, 103, 108, 109, 110, 111,
+  80, 81, 82, 83, 88, 89, 90, 91, 112, 113, 114, 115, 120, 121, 122, 123,
+  84, 85, 86, 87, 92, 93, 94, 95, 116, 117, 118, 119, 124, 125, 126, 127,
+};
+static const uint8_t orders_8x8[256] = {
+0,     1,   4,   5,  16,  17,  20,  21,  64,  65,  68,  69,  80,  81,  84,  85,
+2,     3,   6,   7,  18,  19,  22,  23,  66,  67,  70,  71,  82,  83,  86,  87,
+8,     9,  12,  13,  24,  25,  28,  29,  72,  73,  76,  77,  88,  89,  92,  93,
+10,   11,  14,  15,  26,  27,  30,  31,  74,  75,  78,  79,  90,  91,  94,  95,
+32,   33,  36,  37,  48,  49,  52,  53,  96,  97, 100, 101, 112, 113, 116, 117,
+34,   35,  38,  39,  50,  51,  54,  55,  98,  99, 102, 103, 114, 115, 118, 119,
+40,   41,  44,  45,  56,  57,  60,  61, 104, 105, 108, 109, 120, 121, 124, 125,
+42,   43,  46,  47,  58,  59,  62,  63, 106, 107, 110, 111, 122, 123, 126, 127,
+128, 129, 132, 133, 144, 145, 148, 149, 192, 193, 196, 197, 208, 209, 212, 213,
+130, 131, 134, 135, 146, 147, 150, 151, 194, 195, 198, 199, 210, 211, 214, 215,
+136, 137, 140, 141, 152, 153, 156, 157, 200, 201, 204, 205, 216, 217, 220, 221,
+138, 139, 142, 143, 154, 155, 158, 159, 202, 203, 206, 207, 218, 219, 222, 223,
+160, 161, 164, 165, 176, 177, 180, 181, 224, 225, 228, 229, 240, 241, 244, 245,
+162, 163, 166, 167, 178, 179, 182, 183, 226, 227, 230, 231, 242, 243, 246, 247,
+168, 169, 172, 173, 184, 185, 188, 189, 232, 233, 236, 237, 248, 249, 252, 253,
+170, 171, 174, 175, 186, 187, 190, 191, 234, 235, 238, 239, 250, 251, 254, 255,
+};
+
+static const uint8_t *const orders[BLOCK_SIZES] = {
+  //                              4X4
+                                  orders_8x8,
+  // 4X8,         8X4,            8X8
+  orders_8x8,     orders_8x8,     orders_8x8,
+  // 8X16,        16X8,           16X16
+  orders_8x16,    orders_16x8,    orders_16x16,
+  // 16X32,       32X16,          32X32
+  orders_16x32,   orders_32x16,   orders_32x32,
+  // 32X64,       64X32,          64X64
+  orders_32x64,   orders_64x32,   orders_64x64,
+  // 64x128,      128x64,         128x128
+  orders_64x128,  orders_128x64,  orders_128x128
+};
+#else
+static const uint8_t *const orders[BLOCK_SIZES] = {
+  //                              4X4
+                                  orders_16x16,
+  // 4X8,         8X4,            8X8
+  orders_16x16,   orders_16x16,   orders_16x16,
+  // 8X16,        16X8,           16X16
+  orders_16x32,   orders_32x16,   orders_32x32,
+  // 16X32,       32X16,          32X32
+  orders_32x64,   orders_64x32,   orders_64x64,
+  // 32X64,       64X32,          64X64
+  orders_64x128,  orders_128x64,  orders_128x128
+};
+#endif  // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_PARTITION_TYPES
+static const uint8_t orders_verta_64x64[4] = {
+  0, 2,
+  1, 2,
+};
+static const uint8_t orders_verta_32x32[16] = {
+  0,   2,  4,  6,
+  1,   2,  5,  6,
+  8,  10, 12, 14,
+  9,  10, 13, 14,
+};
+static const uint8_t orders_verta_16x16[64] = {
+  0,   2,  4,  6, 16, 18, 20, 22,
+  1,   2,  5,  6, 17, 18, 21, 22,
+  8,  10, 12, 14, 24, 26, 28, 30,
+  9,  10, 13, 14, 25, 26, 29, 30,
+  32, 34, 36, 38, 48, 50, 52, 54,
+  33, 34, 37, 38, 49, 50, 53, 54,
+  40, 42, 44, 46, 56, 58, 60, 62,
+  41, 42, 45, 46, 57, 58, 61, 62,
+};
+#if CONFIG_EXT_PARTITION
+static const uint8_t orders_verta_8x8[256] = {
+0,     2,   4,   6,  16,  18,  20,  22,  64,  66,  68,  70,  80,  82,  84,  86,
+1,     2,   5,   6,  17,  18,  21,  22,  65,  66,  69,  70,  81,  82,  85,  86,
+8,    10,  12,  14,  24,  26,  28,  30,  72,  74,  76,  78,  88,  90,  92,  94,
+9,    10,  13,  14,  25,  26,  29,  30,  73,  74,  77,  78,  89,  90,  93,  94,
+32,   34,  36,  38,  48,  50,  52,  54,  96,  98, 100, 102, 112, 114, 116, 118,
+33,   34,  37,  38,  49,  50,  53,  54,  97,  98, 101, 102, 113, 114, 117, 118,
+40,   42,  44,  46,  56,  58,  60,  62, 104, 106, 108, 110, 120, 122, 124, 126,
+41,   42,  45,  46,  57,  58,  61,  62, 105, 106, 109, 110, 121, 122, 125, 126,
+128, 130, 132, 134, 144, 146, 148, 150, 192, 194, 196, 198, 208, 210, 212, 214,
+129, 130, 133, 134, 145, 146, 149, 150, 193, 194, 197, 198, 209, 210, 213, 214,
+136, 138, 140, 142, 152, 154, 156, 158, 200, 202, 204, 206, 216, 218, 220, 222,
+137, 138, 141, 142, 153, 154, 157, 158, 201, 202, 205, 206, 217, 218, 221, 222,
+160, 162, 164, 166, 176, 178, 180, 182, 224, 226, 228, 230, 240, 242, 244, 246,
+161, 162, 165, 166, 177, 178, 181, 182, 225, 226, 229, 230, 241, 242, 245, 246,
+168, 170, 172, 174, 184, 186, 188, 190, 232, 234, 236, 238, 248, 250, 252, 254,
+169, 170, 173, 174, 185, 186, 189, 190, 233, 234, 237, 238, 249, 250, 253, 254,
+};
+static const uint8_t *const orders_verta[BLOCK_SIZES] = {
+  //                                  4X4
+                                      orders_verta_8x8,
+  // 4X8,           8X4,              8X8
+  orders_verta_8x8, orders_verta_8x8, orders_verta_8x8,
+  // 8X16,          16X8,             16X16
+  orders_8x16,      orders_16x8,      orders_verta_16x16,
+  // 16X32,         32X16,            32X32
+  orders_16x32,     orders_32x16,     orders_verta_32x32,
+  // 32X64,         64X32,            64X64
+  orders_32x64,     orders_64x32,     orders_verta_64x64,
+  // 64x128,        128x64,           128x128
+  orders_64x128,    orders_128x64,    orders_128x128
+};
+#else
+static const uint8_t *const orders_verta[BLOCK_SIZES] = {
+  //                                      4X4
+                                          orders_verta_16x16,
+  // 4X8,             8X4,                8X8
+  orders_verta_16x16, orders_verta_16x16, orders_verta_16x16,
+  // 8X16,            16X8,               16X16
+  orders_16x32,       orders_32x16,       orders_verta_32x32,
+  // 16X32,           32X16,              32X32
+  orders_32x64,       orders_64x32,       orders_verta_64x64,
+  // 32X64,           64X32,              64X64
+  orders_64x128,      orders_128x64,      orders_128x128
+};
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
 static int vp10_has_right(BLOCK_SIZE bsize, int mi_row, int mi_col,
                           int right_available,
+#if CONFIG_EXT_PARTITION_TYPES
+                          PARTITION_TYPE partition,
+#endif
                           TX_SIZE txsz, int y, int x, int ss_x) {
-  if (y == 0) {
-    int wl = mi_width_log2_lookup[bsize];
-    int hl = mi_height_log2_lookup[bsize];
-    int w = 1 << (wl + 1 - ss_x);
-    int step = 1 << txsz;
-    const uint8_t *order = orders[bsize];
-    int my_order, tr_order;
+  const int wl = mi_width_log2_lookup[bsize];
+  const int w = VPXMAX(num_4x4_blocks_wide_lookup[bsize] >> ss_x, 1);
+  const int step = 1 << txsz;
 
-    if (x + step < w)
+  if (!right_available) {
+    return 0;
+  } else {
+    // Handle block size 4x8 and 4x4
+    if (ss_x == 0 && num_4x4_blocks_wide_lookup[bsize] < 2 && x == 0)
       return 1;
 
-    mi_row = (mi_row & 7) >> hl;
-    mi_col = (mi_col & 7) >> wl;
+    if (y == 0) {
+      const int hl = mi_height_log2_lookup[bsize];
+      const uint8_t *order;
+      int my_order, tr_order;
+#if CONFIG_EXT_PARTITION_TYPES
+      if (partition == PARTITION_VERT_A)
+        order = orders_verta[bsize];
+      else
+#endif  // CONFIG_EXT_PARTITION_TYPES
+      order = orders[bsize];
 
-    if (mi_row == 0)
-      return right_available;
+      if (x + step < w)
+        return 1;
 
-    if (((mi_col + 1) << wl) >= 8)
-      return 0;
+      mi_row = (mi_row & MAX_MIB_MASK) >> hl;
+      mi_col = (mi_col & MAX_MIB_MASK) >> wl;
 
-    my_order = order[((mi_row + 0) << (3 - wl)) + mi_col + 0];
-    tr_order = order[((mi_row - 1) << (3 - wl)) + mi_col + 1];
+      // If top row of coding unit
+      if (mi_row == 0)
+        return 1;
 
-    return my_order > tr_order && right_available;
-  } else {
-    int wl = mi_width_log2_lookup[bsize];
-    int w = 1 << (wl + 1 - ss_x);
-    int step = 1 << txsz;
+      // If rightmost column of coding unit
+      if (((mi_col + 1) << wl) >= MAX_MIB_SIZE)
+        return 0;
 
-    return x + step < w;
+      my_order = order[((mi_row + 0) << (MAX_MIB_SIZE_LOG2 - wl)) + mi_col + 0];
+      tr_order = order[((mi_row - 1) << (MAX_MIB_SIZE_LOG2 - wl)) + mi_col + 1];
+
+      return my_order > tr_order;
+    } else {
+      return x + step < w;
+    }
   }
 }
 
 static int vp10_has_bottom(BLOCK_SIZE bsize, int mi_row, int mi_col,
                            int bottom_available, TX_SIZE txsz,
                            int y, int x, int ss_y) {
-  if (x == 0) {
-    int wl = mi_width_log2_lookup[bsize];
-    int hl = mi_height_log2_lookup[bsize];
-    int h = 1 << (hl + 1 - ss_y);
-    int step = 1 << txsz;
+  if (!bottom_available || x != 0) {
+    return 0;
+  } else {
+    const int wl = mi_width_log2_lookup[bsize];
+    const int hl = mi_height_log2_lookup[bsize];
+    const int h = 1 << (hl + 1 - ss_y);
+    const int step = 1 << txsz;
     const uint8_t *order = orders[bsize];
     int my_order, bl_order;
 
-    mi_row = (mi_row & 7) >> hl;
-    mi_col = (mi_col & 7) >> wl;
-
-    if (mi_col == 0)
-      return bottom_available &&
-             (mi_row << (hl + !ss_y)) + y + step < (8 << !ss_y);
-
-    if (((mi_row + 1) << hl) >= 8)
-      return 0;
+    // Handle block size 8x4 and 4x4
+    if (ss_y == 0 && num_4x4_blocks_high_lookup[bsize] < 2 && y == 0)
+      return 1;
 
     if (y + step < h)
       return 1;
 
-    my_order = order[((mi_row + 0) << (3 - wl)) + mi_col + 0];
-    bl_order = order[((mi_row + 1) << (3 - wl)) + mi_col - 1];
+    mi_row = (mi_row & MAX_MIB_MASK) >> hl;
+    mi_col = (mi_col & MAX_MIB_MASK) >> wl;
 
-    return bl_order < my_order && bottom_available;
-  } else {
-    return 0;
+    if (mi_col == 0)
+      return (mi_row << (hl + !ss_y)) + y + step < (MAX_MIB_SIZE << !ss_y);
+
+    if (((mi_row + 1) << hl) >= MAX_MIB_SIZE)
+      return 0;
+
+    my_order = order[((mi_row + 0) << (MAX_MIB_SIZE_LOG2 - wl)) + mi_col + 0];
+    bl_order = order[((mi_row + 1) << (MAX_MIB_SIZE_LOG2 - wl)) + mi_col - 1];
+
+    return bl_order < my_order;
   }
 }
-#endif
 
 typedef void (*intra_pred_fn)(uint8_t *dst, ptrdiff_t stride,
                               const uint8_t *above, const uint8_t *left);
@@ -216,15 +356,9 @@
 
   INIT_ALL_SIZES(pred[V_PRED], v);
   INIT_ALL_SIZES(pred[H_PRED], h);
-#if CONFIG_MISC_FIXES
   INIT_ALL_SIZES(pred[D207_PRED], d207e);
   INIT_ALL_SIZES(pred[D45_PRED], d45e);
   INIT_ALL_SIZES(pred[D63_PRED], d63e);
-#else
-  INIT_ALL_SIZES(pred[D207_PRED], d207);
-  INIT_ALL_SIZES(pred[D45_PRED], d45);
-  INIT_ALL_SIZES(pred[D63_PRED], d63);
-#endif
   INIT_ALL_SIZES(pred[D117_PRED], d117);
   INIT_ALL_SIZES(pred[D135_PRED], d135);
   INIT_ALL_SIZES(pred[D153_PRED], d153);
@@ -238,15 +372,9 @@
 #if CONFIG_VP9_HIGHBITDEPTH
   INIT_ALL_SIZES(pred_high[V_PRED], highbd_v);
   INIT_ALL_SIZES(pred_high[H_PRED], highbd_h);
-#if CONFIG_MISC_FIXES
   INIT_ALL_SIZES(pred_high[D207_PRED], highbd_d207e);
   INIT_ALL_SIZES(pred_high[D45_PRED], highbd_d45e);
-  INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63);
-#else
-  INIT_ALL_SIZES(pred_high[D207_PRED], highbd_d207);
-  INIT_ALL_SIZES(pred_high[D45_PRED], highbd_d45);
-  INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63);
-#endif
+  INIT_ALL_SIZES(pred_high[D63_PRED], highbd_d63e);
   INIT_ALL_SIZES(pred_high[D117_PRED], highbd_d117);
   INIT_ALL_SIZES(pred_high[D135_PRED], highbd_d135);
   INIT_ALL_SIZES(pred_high[D153_PRED], highbd_d153);
@@ -261,12 +389,757 @@
 #undef intra_pred_allsizes
 }
 
-#if CONFIG_MISC_FIXES
-static INLINE void memset16(uint16_t *dst, int val, int n) {
-  while (n--)
-    *dst++ = val;
+#if CONFIG_EXT_INTRA
+#define FILTER_INTRA_PREC_BITS 10
+
+static const uint8_t ext_intra_extend_modes[FILTER_INTRA_MODES] = {
+  NEED_LEFT | NEED_ABOVE,      // FILTER_DC
+  NEED_LEFT | NEED_ABOVE,      // FILTER_V
+  NEED_LEFT | NEED_ABOVE,      // FILTER_H
+  NEED_LEFT | NEED_ABOVE,      // FILTER_D45
+  NEED_LEFT | NEED_ABOVE,      // FILTER_D135
+  NEED_LEFT | NEED_ABOVE,      // FILTER_D117
+  NEED_LEFT | NEED_ABOVE,      // FILTER_D153
+  NEED_LEFT | NEED_ABOVE,      // FILTER_D207
+  NEED_LEFT | NEED_ABOVE,      // FILTER_D63
+  NEED_LEFT | NEED_ABOVE,      // FILTER_TM
+};
+
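+// Interpolates a reference sample at fractional position base + shift/256,
+// using either bilinear weighting or the selected intra filter kernel;
+// reference indices are clamped to [ref_start_idx, ref_end_idx].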
+static int intra_subpel_interp(int base, int shift, const uint8_t *ref,
+                               int ref_start_idx, int ref_end_idx,
+                               INTRA_FILTER filter_type) {
+  int val, k, idx, filter_idx = 0;
+  const int16_t *filter = NULL;
+
+  if (filter_type == INTRA_FILTER_LINEAR) {
+    val = ref[base] * (256 - shift) + ref[base + 1] * shift;
+    val = ROUND_POWER_OF_TWO(val, 8);
+  } else {
+    filter_idx = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
+    filter = vp10_intra_filter_kernels[filter_type][filter_idx];
+
+    if (filter_idx < (1 << SUBPEL_BITS)) {
+      val = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k) {
+        idx = base + 1 - (SUBPEL_TAPS / 2) + k;
+        idx = VPXMAX(VPXMIN(idx, ref_end_idx), ref_start_idx);
+        val += ref[idx] * filter[k];
+      }
+      val = ROUND_POWER_OF_TWO(val, FILTER_BITS);
+    } else {
+      val = ref[base + 1];
+    }
+  }
+
+  return val;
 }
-#endif
+
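+// The directional predictors below track positions in 1/256-pixel units:
+// base = x >> 8 is the integer sample index and shift = x & 0xFF the
+// fractional offset.
+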
+// Directional prediction, zone 1: 0 < angle < 90
+static void dr_prediction_z1(uint8_t *dst, ptrdiff_t stride, int bs,
+                             const uint8_t *above, const uint8_t *left,
+                             int dx, int dy, INTRA_FILTER filter_type) {
+  int r, c, x, base, shift, val;
+
+  (void)left;
+  (void)dy;
+  assert(dy == 1);
+  assert(dx < 0);
+
+  if (filter_type != INTRA_FILTER_LINEAR) {
+    const int pad_size = SUBPEL_TAPS >> 1;
+    int len;
+    DECLARE_ALIGNED(16, uint8_t, buf[SUBPEL_SHIFTS][MAX_SB_SIZE]);
+    DECLARE_ALIGNED(16, uint8_t, src[MAX_SB_SIZE + SUBPEL_TAPS]);
+    uint8_t flags[SUBPEL_SHIFTS];
+
+    memset(flags, 0, SUBPEL_SHIFTS * sizeof(flags[0]));
+    memset(src, above[0], pad_size * sizeof(above[0]));
+    memcpy(src + pad_size, above, 2 * bs * sizeof(above[0]));
+    memset(src + pad_size + 2 * bs, above[2 * bs - 1],
+           pad_size * sizeof(above[0]));
+    flags[0] = 1;
+    x = -dx;
+    for (r = 0; r < bs; ++r, dst += stride, x -= dx) {
+      base = x >> 8;
+      shift = x & 0xFF;
+      shift = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
+      if (shift == SUBPEL_SHIFTS) {
+        base += 1;
+        shift = 0;
+      }
+      len = VPXMIN(bs, 2 * bs - 1 - base);
+      if (len <= 0) {
+        int i;
+        for (i = r; i < bs; ++i) {
+          memset(dst, above[2 * bs - 1], bs * sizeof(dst[0]));
+          dst += stride;
+        }
+        return;
+      }
+
+      if (len <= (bs >> 1) && !flags[shift]) {
+        base = x >> 8;
+        shift = x & 0xFF;
+        for (c = 0; c < len; ++c) {
+          val = intra_subpel_interp(base, shift, above, 0, 2 * bs - 1,
+                                    filter_type);
+          dst[c] = clip_pixel(val);
+          ++base;
+        }
+      } else {
+        if (!flags[shift]) {
+          const int16_t *filter = vp10_intra_filter_kernels[filter_type][shift];
+          vpx_convolve8_horiz(src + pad_size, 2 * bs, buf[shift], 2 * bs,
+                              filter, 16,
+                              NULL, 16, 2 * bs, 2 * bs < 16 ? 2 : 1);
+          flags[shift] = 1;
+        }
+        memcpy(dst, shift == 0 ? src + pad_size + base : &buf[shift][base],
+            len * sizeof(dst[0]));
+      }
+
+      if (len < bs)
+        memset(dst + len, above[2 * bs - 1], (bs - len) * sizeof(dst[0]));
+    }
+    return;
+  }
+
+  // For linear filter, C code is faster.
+  x = -dx;
+  for (r = 0; r < bs; ++r, dst += stride, x -= dx) {
+    base = x >> 8;
+    shift = x & 0xFF;
+
+    if (base >= 2 * bs - 1) {
+      int i;
+      for (i = r; i < bs; ++i) {
+        memset(dst, above[2 * bs - 1], bs * sizeof(dst[0]));
+        dst += stride;
+      }
+      return;
+    }
+
+    for (c = 0; c < bs; ++c, ++base) {
+      if (base < 2 * bs - 1) {
+        val = above[base] * (256 - shift) + above[base + 1] * shift;
+        val = ROUND_POWER_OF_TWO(val, 8);
+        dst[c] = clip_pixel(val);
+      } else {
+        dst[c] = above[2 * bs - 1];
+      }
+    }
+  }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+static void dr_prediction_z2(uint8_t *dst, ptrdiff_t stride, int bs,
+                             const uint8_t *above, const uint8_t *left,
+                             int dx, int dy, INTRA_FILTER filter_type) {
+  int r, c, x, y, shift1, shift2, val, base1, base2;
+
+  assert(dx > 0);
+  assert(dy > 0);
+
+  x = -dx;
+  for (r = 0; r < bs; ++r, x -= dx, dst += stride) {
+    base1 = x >> 8;
+    y = (r << 8) - dy;
+    for (c = 0; c < bs; ++c, ++base1, y -= dy) {
+      if (base1 >= -1) {
+        shift1 = x & 0xFF;
+        val = intra_subpel_interp(base1, shift1, above, -1, bs - 1,
+                                  filter_type);
+      } else {
+        base2 = y >> 8;
+        if (base2 >= 0) {
+          shift2 = y & 0xFF;
+          val = intra_subpel_interp(base2, shift2, left, 0, bs - 1,
+                                    filter_type);
+        } else {
+          val = left[0];
+        }
+      }
+      dst[c] = clip_pixel(val);
+    }
+  }
+}
+
+// Directional prediction, zone 3: 180 < angle < 270
+static void dr_prediction_z3(uint8_t *dst, ptrdiff_t stride, int bs,
+                             const uint8_t *above, const uint8_t *left,
+                             int dx, int dy, INTRA_FILTER filter_type) {
+  int r, c, y, base, shift, val;
+
+  (void)above;
+  (void)dx;
+
+  assert(dx == 1);
+  assert(dy < 0);
+
+  if (filter_type != INTRA_FILTER_LINEAR) {
+    const int pad_size = SUBPEL_TAPS >> 1;
+    int len, i;
+    DECLARE_ALIGNED(16, uint8_t, buf[MAX_SB_SIZE][4 * SUBPEL_SHIFTS]);
+    DECLARE_ALIGNED(16, uint8_t, src[(MAX_SB_SIZE + SUBPEL_TAPS) * 4]);
+    uint8_t flags[SUBPEL_SHIFTS];
+
+    memset(flags, 0, SUBPEL_SHIFTS * sizeof(flags[0]));
+    for (i = 0; i < pad_size; ++i)
+      src[4 * i] = left[0];
+    for (i = 0; i < 2 * bs; ++i)
+      src[4 * (i + pad_size)] = left[i];
+    for (i = 0; i < pad_size; ++i)
+      src[4 * (i + 2 * bs + pad_size)] = left[2 * bs - 1];
+    flags[0] = 1;
+    y = -dy;
+    for (c = 0; c < bs; ++c, y -= dy) {
+      base = y >> 8;
+      shift = y & 0xFF;
+      shift = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
+      if (shift == SUBPEL_SHIFTS) {
+        base += 1;
+        shift = 0;
+      }
+      len = VPXMIN(bs, 2 * bs - 1 - base);
+
+      if (len <= 0) {
+        for (r = 0; r < bs; ++r) {
+          dst[r * stride + c] = left[2 * bs - 1];
+        }
+        continue;
+      }
+
+      if (len <= (bs >> 1) && !flags[shift]) {
+        base = y >> 8;
+        shift = y & 0xFF;
+        for (r = 0; r < len; ++r) {
+          val = intra_subpel_interp(base, shift, left, 0, 2 * bs - 1,
+                                    filter_type);
+          dst[r * stride + c] = clip_pixel(val);
+          ++base;
+        }
+      } else {
+        if (!flags[shift]) {
+          const int16_t *filter = vp10_intra_filter_kernels[filter_type][shift];
+          vpx_convolve8_vert(src + 4 * pad_size, 4,
+                             buf[0] + 4 * shift, 4 * SUBPEL_SHIFTS, NULL, 16,
+                             filter, 16,
+                             4, 2 * bs);
+          flags[shift] = 1;
+        }
+
+        if (shift == 0) {
+          for (r = 0; r < len; ++r) {
+            dst[r * stride + c] = left[r + base];
+          }
+        } else {
+          for (r = 0; r < len; ++r) {
+            dst[r * stride + c] = buf[r + base][4 * shift];
+          }
+        }
+      }
+
+      if (len < bs) {
+        for (r = len; r < bs; ++r) {
+          dst[r * stride + c] = left[2 * bs - 1];
+        }
+      }
+    }
+    return;
+  }
+
+  // For linear filter, C code is faster.
+  y = -dy;
+  for (c = 0; c < bs; ++c, y -= dy) {
+    base = y >> 8;
+    shift = y & 0xFF;
+
+    for (r = 0; r < bs; ++r, ++base) {
+      if (base < 2 * bs - 1) {
+        val = left[base] * (256 - shift) + left[base + 1] * shift;
+        val = ROUND_POWER_OF_TWO(val, 8);
+        dst[r * stride + c] = clip_pixel(val);
+      } else {
+        for (; r < bs; ++r)
+          dst[r * stride + c] = left[2 * bs - 1];
+        break;
+      }
+    }
+  }
+}
+
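+// Dispatches directional prediction to the appropriate zone; angles of
+// exactly 90 and 180 degrees use the plain V and H predictors.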
+static void dr_predictor(uint8_t *dst, ptrdiff_t stride, TX_SIZE tx_size,
+                         const uint8_t *above, const uint8_t *left, int angle,
+                         INTRA_FILTER filter_type) {
+  const int dx = (int)dr_intra_derivative[angle][0];
+  const int dy = (int)dr_intra_derivative[angle][1];
+  const int bs = 4 << tx_size;
+  assert(angle > 0 && angle < 270);
+
+  if (angle > 0 && angle < 90) {
+    dr_prediction_z1(dst, stride, bs, above, left, dx, dy, filter_type);
+  } else if (angle > 90 && angle < 180) {
+    dr_prediction_z2(dst, stride, bs, above, left, dx, dy, filter_type);
+  } else if (angle > 180 && angle < 270) {
+    dr_prediction_z3(dst, stride, bs, above, left, dx, dy, filter_type);
+  } else if (angle == 90) {
+    pred[V_PRED][tx_size](dst, stride, above, left);
+  } else if (angle == 180) {
+    pred[H_PRED][tx_size](dst, stride, above, left);
+  }
+}
+
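+// 4-tap filter-intra coefficients per transform size and intra mode,
+// in Q10 precision (see FILTER_INTRA_PREC_BITS).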
+static const int filter_intra_taps_4[TX_SIZES][INTRA_MODES][4] = {
+    {
+        {735, 881, -537, -54},
+        {1005, 519, -488, -11},
+        {383, 990, -343, -6},
+        {442, 805, -542, 319},
+        {658, 616, -133, -116},
+        {875, 442, -141, -151},
+        {386, 741, -23, -80},
+        {390, 1027, -446, 51},
+        {679, 606, -523, 262},
+        {903, 922, -778, -23},
+    },
+    {
+        {648, 803, -444, 16},
+        {972, 620, -576, 7},
+        {561, 967, -499, -5},
+        {585, 762, -468, 144},
+        {596, 619, -182, -9},
+        {895, 459, -176, -153},
+        {557, 722, -126, -129},
+        {601, 839, -523, 105},
+        {562, 709, -499, 251},
+        {803, 872, -695, 43},
+    },
+    {
+        {423, 728, -347, 111},
+        {963, 685, -665, 23},
+        {281, 1024, -480, 216},
+        {640, 596, -437, 78},
+        {429, 669, -259, 99},
+        {740, 646, -415, 23},
+        {568, 771, -346, 40},
+        {404, 833, -486, 209},
+        {398, 712, -423, 307},
+        {939, 935, -887, 17},
+    },
+    {
+        {477, 737, -393, 150},
+        {881, 630, -546, 67},
+        {506, 984, -443, -20},
+        {114, 459, -270, 528},
+        {433, 528, 14, 3},
+        {837, 470, -301, -30},
+        {181, 777, 89, -107},
+        {-29, 716, -232, 259},
+        {589, 646, -495, 255},
+        {740, 884, -728, 77},
+    },
+};
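+
+// The taps are in FILTER_INTRA_PREC_BITS fixed point (assuming
+// FILTER_INTRA_PREC_BITS == 10, a tap of 1024 represents 1.0): the 4x4
+// DC_PRED row {735, 881, -537, -54} corresponds to weights of roughly
+// {0.72, 0.86, -0.52, -0.05} on the mean-removed neighbors.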
+
+static void filter_intra_predictors_4tap(uint8_t *dst, ptrdiff_t stride, int bs,
+                                         const uint8_t *above,
+                                         const uint8_t *left,
+                                         int mode) {
+  int k, r, c;
+  int pred[33][65];
+  int mean, ipred;
+  const TX_SIZE tx_size = (bs == 32) ? TX_32X32 :
+      ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4)));
+  const int c0 = filter_intra_taps_4[tx_size][mode][0];
+  const int c1 = filter_intra_taps_4[tx_size][mode][1];
+  const int c2 = filter_intra_taps_4[tx_size][mode][2];
+  const int c3 = filter_intra_taps_4[tx_size][mode][3];
+
+  k = 0;
+  mean = 0;
+  while (k < bs) {
+    mean = mean + (int)left[k];
+    mean = mean + (int)above[k];
+    k++;
+  }
+  mean = (mean + bs) / (2 * bs);
+
+  for (r = 0; r < bs; ++r)
+    pred[r + 1][0] = (int)left[r] - mean;
+
+  for (c = 0; c < 2 * bs + 1; ++c)
+    pred[0][c] = (int)above[c - 1] - mean;
+
+  for (r = 1; r < bs + 1; ++r)
+    for (c = 1; c < 2 * bs + 1 - r; ++c) {
+      ipred = c0 * pred[r - 1][c] + c1 * pred[r][c - 1] +
+          c2 * pred[r - 1][c - 1] + c3 * pred[r - 1][c + 1];
+      pred[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
+    }
+
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {
+      ipred = pred[r + 1][c + 1] + mean;
+      dst[c] = clip_pixel(ipred);
+    }
+    dst += stride;
+  }
+}
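+
+// Sketch of the recursion above for the top-left output pixel:
+//   pred[1][1] = ROUND_POWER_OF_TWO_SIGNED(
+//       c0 * pred[0][1] + c1 * pred[1][0] +
+//       c2 * pred[0][0] + c3 * pred[0][2], FILTER_INTRA_PREC_BITS);
+// i.e. each mean-removed pixel is predicted from its above, left, above-left
+// and above-right neighbors; the mean is added back and clipped on output.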
+
+static void dc_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                               const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, DC_PRED);
+}
+
+static void v_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                               const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, V_PRED);
+}
+
+static void h_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                               const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, H_PRED);
+}
+
+static void d45_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                 const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, D45_PRED);
+}
+
+static void d135_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, D135_PRED);
+}
+
+static void d117_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, D117_PRED);
+}
+
+static void d153_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, D153_PRED);
+}
+
+static void d207_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                  const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, D207_PRED);
+}
+
+static void d63_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                 const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, D63_PRED);
+}
+
+static void tm_filter_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
+                                const uint8_t *above, const uint8_t *left) {
+  filter_intra_predictors_4tap(dst, stride, bs, above, left, TM_PRED);
+}
+
+static void (*filter_intra_predictors[EXT_INTRA_MODES])(uint8_t *dst,
+    ptrdiff_t stride, int bs, const uint8_t *above, const uint8_t *left) = {
+        dc_filter_predictor, v_filter_predictor, h_filter_predictor,
+        d45_filter_predictor, d135_filter_predictor, d117_filter_predictor,
+        d153_filter_predictor, d207_filter_predictor, d63_filter_predictor,
+        tm_filter_predictor,
+};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static int highbd_intra_subpel_interp(int base, int shift, const uint16_t *ref,
+                                      int ref_start_idx, int ref_end_idx,
+                                      INTRA_FILTER filter_type) {
+  int val, k, idx, filter_idx = 0;
+  const int16_t *filter = NULL;
+
+  if (filter_type == INTRA_FILTER_LINEAR) {
+    val = ref[base] * (256 - shift) + ref[base + 1] * shift;
+    val = ROUND_POWER_OF_TWO(val, 8);
+  } else {
+    filter_idx = ROUND_POWER_OF_TWO(shift, 8 - SUBPEL_BITS);
+    filter = vp10_intra_filter_kernels[filter_type][filter_idx];
+
+    if (filter_idx < (1 << SUBPEL_BITS)) {
+      val = 0;
+      for (k = 0; k < SUBPEL_TAPS; ++k) {
+        idx = base + 1 - (SUBPEL_TAPS / 2) + k;
+        idx = VPXMAX(VPXMIN(idx, ref_end_idx), ref_start_idx);
+        val += ref[idx] * filter[k];
+      }
+      val = ROUND_POWER_OF_TWO(val, FILTER_BITS);
+    } else {
+      val = ref[base + 1];
+    }
+  }
+
+  return val;
+}
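+
+// The non-linear branch rounds the 1/256-pel shift to 1/16-pel (assuming
+// SUBPEL_BITS == 4), fetches the matching 8-tap kernel and clamps tap indices
+// to [ref_start_idx, ref_end_idx]; a shift that rounds all the way up to
+// 1 << SUBPEL_BITS falls through to the next integer sample.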
+
+// Directional prediction, zone 1: 0 < angle < 90
+static void highbd_dr_prediction_z1(uint16_t *dst, ptrdiff_t stride, int bs,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int dx, int dy, int bd,
+                                    INTRA_FILTER filter_type) {
+  int r, c, x, y, base, shift, val;
+
+  (void)left;
+  (void)dy;
+  assert(dy == 1);
+  assert(dx < 0);
+
+  for (r = 0; r < bs; ++r) {
+    y = r + 1;
+    for (c = 0; c < bs; ++c) {
+      x = (c << 8) - y * dx;
+      base = x >> 8;
+      shift = x - (base << 8);
+      if (base < 2 * bs - 1) {
+        val = highbd_intra_subpel_interp(base, shift, above, 0, 2 * bs - 1,
+                                         filter_type);
+        dst[c] = clip_pixel_highbd(val, bd);
+      } else {
+        dst[c] = above[2 * bs - 1];
+      }
+    }
+    dst += stride;
+  }
+}
+
+// Directional prediction, zone 2: 90 < angle < 180
+static void highbd_dr_prediction_z2(uint16_t *dst, ptrdiff_t stride, int bs,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int dx, int dy, int bd,
+                                    INTRA_FILTER filter_type) {
+  int r, c, x, y, shift, val, base;
+
+  assert(dx > 0);
+  assert(dy > 0);
+
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {
+      y = r + 1;
+      x = (c << 8) - y * dx;
+      base = x >> 8;
+      if (base >= -1) {
+        shift = x - (base << 8);
+        val = highbd_intra_subpel_interp(base, shift, above, -1, bs - 1,
+                                         filter_type);
+      } else {
+        x = c + 1;
+        y = (r << 8) - x * dy;
+        base = y >> 8;
+        if (base >= 0) {
+          shift = y - (base << 8);
+          val = highbd_intra_subpel_interp(base, shift, left, 0, bs - 1,
+                                           filter_type);
+        } else {
+          val = left[0];
+        }
+      }
+      dst[c] = clip_pixel_highbd(val, bd);
+    }
+    dst += stride;
+  }
+}
+
+// Directional prediction, zone 3: 180 < angle < 270
+static void highbd_dr_prediction_z3(uint16_t *dst, ptrdiff_t stride, int bs,
+                                    const uint16_t *above, const uint16_t *left,
+                                    int dx, int dy, int bd,
+                                    INTRA_FILTER filter_type) {
+  int r, c, x, y, base, shift, val;
+
+  (void)above;
+  (void)dx;
+  assert(dx == 1);
+  assert(dy < 0);
+
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {
+      x = c + 1;
+      y = (r << 8) - x * dy;
+      base = y >> 8;
+      shift = y - (base << 8);
+      if (base < 2 * bs - 1) {
+        val = highbd_intra_subpel_interp(base, shift, left, 0, 2 * bs - 1,
+                                         filter_type);
+        dst[c] = clip_pixel_highbd(val, bd);
+      } else {
+        dst[c] = left[2 * bs - 1];
+      }
+    }
+    dst += stride;
+  }
+}
+
+static INLINE void highbd_v_predictor(uint16_t *dst, ptrdiff_t stride,
+                                      int bs, const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  int r;
+  (void) left;
+  (void) bd;
+  for (r = 0; r < bs; r++) {
+    memcpy(dst, above, bs * sizeof(uint16_t));
+    dst += stride;
+  }
+}
+
+static INLINE void highbd_h_predictor(uint16_t *dst, ptrdiff_t stride,
+                                      int bs, const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  int r;
+  (void) above;
+  (void) bd;
+  for (r = 0; r < bs; r++) {
+    vpx_memset16(dst, left[r], bs);
+    dst += stride;
+  }
+}
+
+static void highbd_dr_predictor(uint16_t *dst, ptrdiff_t stride, int bs,
+                                const uint16_t *above, const uint16_t *left,
+                                int angle, int bd, INTRA_FILTER filter) {
+  const int dx = (int)dr_intra_derivative[angle][0];
+  const int dy = (int)dr_intra_derivative[angle][1];
+  assert(angle > 0 && angle < 270);
+
+  if (angle > 0 && angle < 90) {
+    highbd_dr_prediction_z1(dst, stride, bs, above, left, dx, dy, bd, filter);
+  } else if (angle > 90 && angle < 180) {
+    highbd_dr_prediction_z2(dst, stride, bs, above, left, dx, dy, bd, filter);
+  } else if (angle > 180 && angle < 270) {
+    highbd_dr_prediction_z3(dst, stride, bs, above, left, dx, dy, bd, filter);
+  } else if (angle == 90) {
+    highbd_v_predictor(dst, stride, bs, above, left, bd);
+  } else if (angle == 180) {
+    highbd_h_predictor(dst, stride, bs, above, left, bd);
+  }
+}
+
+static void highbd_filter_intra_predictors_4tap(uint16_t *dst, ptrdiff_t stride,
+                                                int bs, const uint16_t *above,
+                                                const uint16_t *left, int mode,
+                                                int bd) {
+  int k, r, c;
+  int pred[33][65];
+  int mean, ipred;
+  const TX_SIZE tx_size = (bs == 32) ? TX_32X32 :
+      ((bs == 16) ? TX_16X16 : ((bs == 8) ? TX_8X8 : (TX_4X4)));
+  const int c0 = filter_intra_taps_4[tx_size][mode][0];
+  const int c1 = filter_intra_taps_4[tx_size][mode][1];
+  const int c2 = filter_intra_taps_4[tx_size][mode][2];
+  const int c3 = filter_intra_taps_4[tx_size][mode][3];
+
+  k = 0;
+  mean = 0;
+  while (k < bs) {
+    mean = mean + (int)left[k];
+    mean = mean + (int)above[k];
+    k++;
+  }
+  mean = (mean + bs) / (2 * bs);
+
+  for (r = 0; r < bs; ++r)
+    pred[r + 1][0] = (int)left[r] - mean;
+
+  for (c = 0; c < 2 * bs + 1; ++c)
+    pred[0][c] = (int)above[c - 1] - mean;
+
+  for (r = 1; r < bs + 1; ++r)
+    for (c = 1; c < 2 * bs + 1 - r; ++c) {
+      ipred = c0 * pred[r - 1][c] + c1 * pred[r][c - 1] +
+          c2 * pred[r - 1][c - 1] + c3 * pred[r - 1][c + 1];
+      pred[r][c] = ROUND_POWER_OF_TWO_SIGNED(ipred, FILTER_INTRA_PREC_BITS);
+    }
+
+  for (r = 0; r < bs; ++r) {
+    for (c = 0; c < bs; ++c) {
+      ipred = pred[r + 1][c + 1] + mean;
+      dst[c] = clip_pixel_highbd(ipred, bd);
+    }
+    dst += stride;
+  }
+}
+
+static void highbd_dc_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                       int bs, const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, DC_PRED,
+                                      bd);
+}
+
+static void highbd_v_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                      int bs, const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, V_PRED,
+                                      bd);
+}
+
+static void highbd_h_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                      int bs, const uint16_t *above,
+                                      const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, H_PRED,
+                                      bd);
+}
+
+static void highbd_d45_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                        int bs, const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D45_PRED,
+                                      bd);
+}
+
+static void highbd_d135_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D135_PRED,
+                                      bd);
+}
+
+static void highbd_d117_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D117_PRED,
+                                      bd);
+}
+
+static void highbd_d153_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D153_PRED,
+                                      bd);
+}
+
+static void highbd_d207_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                         int bs, const uint16_t *above,
+                                         const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D207_PRED,
+                                      bd);
+}
+
+static void highbd_d63_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                        int bs, const uint16_t *above,
+                                        const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, D63_PRED,
+                                      bd);
+}
+
+static void highbd_tm_filter_predictor(uint16_t *dst, ptrdiff_t stride,
+                                       int bs, const uint16_t *above,
+                                       const uint16_t *left, int bd) {
+  highbd_filter_intra_predictors_4tap(dst, stride, bs, above, left, TM_PRED,
+                                      bd);
+}
+
+static void (*highbd_filter_intra_predictors[EXT_INTRA_MODES])(uint16_t *dst,
+    ptrdiff_t stride, int bs, const uint16_t *above, const uint16_t *left,
+    int bd) = {
+        highbd_dc_filter_predictor, highbd_v_filter_predictor,
+        highbd_h_filter_predictor, highbd_d45_filter_predictor,
+        highbd_d135_filter_predictor, highbd_d117_filter_predictor,
+        highbd_d153_filter_predictor, highbd_d207_filter_predictor,
+        highbd_d63_filter_predictor, highbd_tm_filter_predictor,
+};
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_EXT_INTRA
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static void build_intra_predictors_high(const MACROBLOCKD *xd,
@@ -276,56 +1149,85 @@
                                         int dst_stride,
                                         PREDICTION_MODE mode,
                                         TX_SIZE tx_size,
-#if CONFIG_MISC_FIXES
                                         int n_top_px, int n_topright_px,
                                         int n_left_px, int n_bottomleft_px,
-#else
-                                        int up_available,
-                                        int left_available,
-                                        int right_available,
-#endif
-                                        int x, int y,
-                                        int plane, int bd) {
+                                        int plane) {
   int i;
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
-#if CONFIG_MISC_FIXES
-  DECLARE_ALIGNED(16, uint16_t, left_col[32]);
-#else
-  DECLARE_ALIGNED(16, uint16_t, left_col[64]);
-#endif
-  DECLARE_ALIGNED(16, uint16_t, above_data[64 + 16]);
+  DECLARE_ALIGNED(16, uint16_t, left_col[MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, uint16_t, above_data[MAX_SB_SIZE + 16]);
   uint16_t *above_row = above_data + 16;
   const uint16_t *const_above_row = above_row;
   const int bs = 4 << tx_size;
-#if CONFIG_MISC_FIXES
+  int need_left = extend_modes[mode] & NEED_LEFT;
+  int need_above = extend_modes[mode] & NEED_ABOVE;
   const uint16_t *above_ref = ref - ref_stride;
-#else
-  int frame_width, frame_height;
-  int x0, y0;
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-#endif
-  const int need_left = extend_modes[mode] & NEED_LEFT;
-  const int need_above = extend_modes[mode] & NEED_ABOVE;
-  const int need_aboveright = extend_modes[mode] & NEED_ABOVERIGHT;
-  int base = 128 << (bd - 8);
+  int base = 128 << (xd->bd - 8);
   // 127 127 127 .. 127 127 127 127 127 127
   // 129  A   B  ..  Y   Z
   // 129  C   D  ..  W   X
   // 129  E   F  ..  U   V
   // 129  G   H  ..  S   T   T   T   T   T
 
-#if CONFIG_MISC_FIXES
-  (void) x;
-  (void) y;
+#if CONFIG_EXT_INTRA
+  const EXT_INTRA_MODE_INFO *ext_intra_mode_info =
+      &xd->mi[0]->mbmi.ext_intra_mode_info;
+  const EXT_INTRA_MODE ext_intra_mode =
+      ext_intra_mode_info->ext_intra_mode[plane != 0];
+  int p_angle = 0;
+
+  if (mode != DC_PRED && mode != TM_PRED &&
+      xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+    p_angle = mode_to_angle_map[mode] +
+        xd->mi[0]->mbmi.angle_delta[plane != 0] * ANGLE_STEP;
+    if (p_angle <= 90)
+      need_above = 1, need_left = 0;
+    else if (p_angle < 180)
+      need_above = 1, need_left = 1;
+    else
+      need_above = 0, need_left = 1;
+  }
+
+  if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
+    need_left = ext_intra_extend_modes[ext_intra_mode] & NEED_LEFT;
+    need_above = ext_intra_extend_modes[ext_intra_mode] & NEED_ABOVE;
+  }
+#endif  // CONFIG_EXT_INTRA
+
   (void) plane;
-  (void) need_left;
-  (void) need_above;
-  (void) need_aboveright;
+  assert(n_top_px >= 0);
+  assert(n_topright_px >= 0);
+  assert(n_left_px >= 0);
+  assert(n_bottomleft_px >= 0);
+
+  if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
+    const int val = (n_left_px == 0) ? base + 1 : base - 1;
+    for (i = 0; i < bs; ++i) {
+      vpx_memset16(dst, val, bs);
+      dst += dst_stride;
+    }
+    return;
+  }
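+  // With bd == 8, base is 128 and the fill values base + 1 / base - 1 are the
+  // familiar 129 / 127 from the diagram above; higher bit depths use the same
+  // pattern around 128 << (xd->bd - 8).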
 
   // NEED_LEFT
-  if (extend_modes[mode] & NEED_LEFT) {
+  if (need_left) {
+#if CONFIG_EXT_INTRA
+    int need_bottom;
+    if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
+      need_bottom = 0;
+    } else if (mode != DC_PRED && mode != TM_PRED &&
+        xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+      need_bottom = p_angle > 180;
+    } else {
+      need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+    }
+#else
     const int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+#endif  // CONFIG_EXT_INTRA
     i = 0;
     if (n_left_px > 0) {
       for (; i < n_left_px; i++)
@@ -336,15 +1238,27 @@
           left_col[i] = ref[i * ref_stride - 1];
       }
       if (i < (bs << need_bottom))
-        memset16(&left_col[i], left_col[i - 1], (bs << need_bottom) - i);
+        vpx_memset16(&left_col[i], left_col[i - 1], (bs << need_bottom) - i);
     } else {
-      memset16(left_col, base + 1, bs << need_bottom);
+      vpx_memset16(left_col, base + 1, bs << need_bottom);
     }
   }
 
   // NEED_ABOVE
-  if (extend_modes[mode] & NEED_ABOVE) {
+  if (need_above) {
+#if CONFIG_EXT_INTRA
+    int need_right;
+    if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
+      need_right = 1;
+    } else if (mode != DC_PRED && mode != TM_PRED &&
+        xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+      need_right = p_angle < 90;
+    } else {
+      need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+    }
+#else
     const int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+#endif  // CONFIG_EXT_INTRA
     if (n_top_px > 0) {
       memcpy(above_row, above_ref, n_top_px * 2);
       i = n_top_px;
@@ -354,148 +1268,50 @@
         i += n_topright_px;
       }
       if (i < (bs << need_right))
-        memset16(&above_row[i], above_row[i - 1], (bs << need_right) - i);
+        vpx_memset16(&above_row[i], above_row[i - 1], (bs << need_right) - i);
     } else {
-      memset16(above_row, base - 1, bs << need_right);
+      vpx_memset16(above_row, base - 1, bs << need_right);
     }
   }
 
-  if (extend_modes[mode] & NEED_ABOVELEFT) {
+#if CONFIG_EXT_INTRA
+  if (ext_intra_mode_info->use_ext_intra_mode[plane != 0] ||
+      (extend_modes[mode] & NEED_ABOVELEFT) ||
+      (mode != DC_PRED && mode != TM_PRED &&
+        xd->mi[0]->mbmi.sb_type >= BLOCK_8X8)) {
     above_row[-1] = n_top_px > 0 ?
         (n_left_px > 0 ? above_ref[-1] : base + 1) : base - 1;
   }
 #else
-  // Get current frame pointer, width and height.
-  if (plane == 0) {
-    frame_width = xd->cur_buf->y_width;
-    frame_height = xd->cur_buf->y_height;
-  } else {
-    frame_width = xd->cur_buf->uv_width;
-    frame_height = xd->cur_buf->uv_height;
+  if ((extend_modes[mode] & NEED_ABOVELEFT)) {
+    above_row[-1] = n_top_px > 0 ?
+        (n_left_px > 0 ? above_ref[-1] : base + 1) : base - 1;
+  }
+#endif  // CONFIG_EXT_INTRA
+
+#if CONFIG_EXT_INTRA
+  if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
+    highbd_filter_intra_predictors[ext_intra_mode](dst, dst_stride, bs,
+        const_above_row, left_col, xd->bd);
+    return;
   }
 
-  // Get block position in current frame.
-  x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
-  y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
-
-  // NEED_LEFT
-  if (need_left) {
-    if (left_available) {
-      if (xd->mb_to_bottom_edge < 0) {
-        /* slower path if the block needs border extension */
-        if (y0 + bs <= frame_height) {
-          for (i = 0; i < bs; ++i)
-            left_col[i] = ref[i * ref_stride - 1];
-        } else {
-          const int extend_bottom = frame_height - y0;
-          for (i = 0; i < extend_bottom; ++i)
-            left_col[i] = ref[i * ref_stride - 1];
-          for (; i < bs; ++i)
-            left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
-        }
-      } else {
-        /* faster path if the block does not need extension */
-        for (i = 0; i < bs; ++i)
-          left_col[i] = ref[i * ref_stride - 1];
-      }
-    } else {
-      // TODO(Peter): this value should probably change for high bitdepth
-      vpx_memset16(left_col, base + 1, bs);
-    }
+  if (mode != DC_PRED && mode != TM_PRED &&
+      xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+    INTRA_FILTER filter = INTRA_FILTER_LINEAR;
+    if (plane == 0 && vp10_is_intra_filter_switchable(p_angle))
+      filter = xd->mi[0]->mbmi.intra_filter;
+    highbd_dr_predictor(dst, dst_stride, bs, const_above_row, left_col,
+                        p_angle, xd->bd, filter);
+    return;
   }
-
-  // NEED_ABOVE
-  if (need_above) {
-    if (up_available) {
-      const uint16_t *above_ref = ref - ref_stride;
-      if (xd->mb_to_right_edge < 0) {
-        /* slower path if the block needs border extension */
-        if (x0 + bs <= frame_width) {
-          memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
-        } else if (x0 <= frame_width) {
-          const int r = frame_width - x0;
-          memcpy(above_row, above_ref, r * sizeof(above_row[0]));
-          vpx_memset16(above_row + r, above_row[r - 1], x0 + bs - frame_width);
-        }
-      } else {
-        /* faster path if the block does not need extension */
-        if (bs == 4 && right_available && left_available) {
-          const_above_row = above_ref;
-        } else {
-          memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
-        }
-      }
-      above_row[-1] = left_available ? above_ref[-1] : (base + 1);
-    } else {
-      vpx_memset16(above_row, base - 1, bs);
-      above_row[-1] = base - 1;
-    }
-  }
-
-  // NEED_ABOVERIGHT
-  if (need_aboveright) {
-    if (up_available) {
-      const uint16_t *above_ref = ref - ref_stride;
-      if (xd->mb_to_right_edge < 0) {
-        /* slower path if the block needs border extension */
-        if (x0 + 2 * bs <= frame_width) {
-          if (right_available && bs == 4) {
-            memcpy(above_row, above_ref, 2 * bs * sizeof(above_row[0]));
-          } else {
-            memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
-            vpx_memset16(above_row + bs, above_row[bs - 1], bs);
-          }
-        } else if (x0 + bs <= frame_width) {
-          const int r = frame_width - x0;
-          if (right_available && bs == 4) {
-            memcpy(above_row, above_ref, r * sizeof(above_row[0]));
-            vpx_memset16(above_row + r, above_row[r - 1],
-                         x0 + 2 * bs - frame_width);
-          } else {
-            memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
-            vpx_memset16(above_row + bs, above_row[bs - 1], bs);
-          }
-        } else if (x0 <= frame_width) {
-          const int r = frame_width - x0;
-          memcpy(above_row, above_ref, r * sizeof(above_row[0]));
-          vpx_memset16(above_row + r, above_row[r - 1],
-                       x0 + 2 * bs - frame_width);
-        }
-        // TODO(Peter) this value should probably change for high bitdepth
-        above_row[-1] = left_available ? above_ref[-1] : (base + 1);
-      } else {
-        /* faster path if the block does not need extension */
-        if (bs == 4 && right_available && left_available) {
-          const_above_row = above_ref;
-        } else {
-          memcpy(above_row, above_ref, bs * sizeof(above_row[0]));
-          if (bs == 4 && right_available)
-            memcpy(above_row + bs, above_ref + bs, bs * sizeof(above_row[0]));
-          else
-            vpx_memset16(above_row + bs, above_row[bs - 1], bs);
-          // TODO(Peter): this value should probably change for high bitdepth
-          above_row[-1] = left_available ? above_ref[-1] : (base + 1);
-        }
-      }
-    } else {
-      vpx_memset16(above_row, base - 1, bs * 2);
-      // TODO(Peter): this value should probably change for high bitdepth
-      above_row[-1] = base - 1;
-    }
-  }
-#endif
+#endif  // CONFIG_EXT_INTRA
 
   // predict
   if (mode == DC_PRED) {
-#if CONFIG_MISC_FIXES
     dc_pred_high[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride,
                                                        const_above_row,
                                                        left_col, xd->bd);
-#else
-    dc_pred_high[left_available][up_available][tx_size](dst, dst_stride,
-                                                        const_above_row,
-                                                        left_col, xd->bd);
-#endif
   } else {
     pred_high[mode][tx_size](dst, dst_stride, const_above_row, left_col,
                              xd->bd);
@@ -506,28 +1322,44 @@
 static void build_intra_predictors(const MACROBLOCKD *xd, const uint8_t *ref,
                                    int ref_stride, uint8_t *dst, int dst_stride,
                                    PREDICTION_MODE mode, TX_SIZE tx_size,
-#if CONFIG_MISC_FIXES
                                    int n_top_px, int n_topright_px,
                                    int n_left_px, int n_bottomleft_px,
-#else
-                                   int up_available, int left_available,
-                                   int right_available,
-#endif
-                                   int x, int y, int plane) {
+                                   int plane) {
   int i;
-#if CONFIG_MISC_FIXES
-  DECLARE_ALIGNED(16, uint8_t, left_col[64]);
+  DECLARE_ALIGNED(16, uint8_t, left_col[MAX_SB_SIZE]);
   const uint8_t *above_ref = ref - ref_stride;
-#else
-  DECLARE_ALIGNED(16, uint8_t, left_col[32]);
-  int frame_width, frame_height;
-  int x0, y0;
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-#endif
-  DECLARE_ALIGNED(16, uint8_t, above_data[64 + 16]);
+  DECLARE_ALIGNED(16, uint8_t, above_data[MAX_SB_SIZE + 16]);
   uint8_t *above_row = above_data + 16;
   const uint8_t *const_above_row = above_row;
   const int bs = 4 << tx_size;
+  int need_left = extend_modes[mode] & NEED_LEFT;
+  int need_above = extend_modes[mode] & NEED_ABOVE;
+#if CONFIG_EXT_INTRA
+  const EXT_INTRA_MODE_INFO *ext_intra_mode_info =
+      &xd->mi[0]->mbmi.ext_intra_mode_info;
+  const EXT_INTRA_MODE ext_intra_mode =
+      ext_intra_mode_info->ext_intra_mode[plane != 0];
+  int p_angle = 0;
+
+  if (mode != DC_PRED && mode != TM_PRED &&
+      xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+    p_angle = mode_to_angle_map[mode] +
+        xd->mi[0]->mbmi.angle_delta[plane != 0] * ANGLE_STEP;
+    if (p_angle <= 90)
+      need_above = 1, need_left = 0;
+    else if (p_angle < 180)
+      need_above = 1, need_left = 1;
+    else
+      need_above = 0, need_left = 1;
+  }
+
+  if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
+    need_left = ext_intra_extend_modes[ext_intra_mode] & NEED_LEFT;
+    need_above = ext_intra_extend_modes[ext_intra_mode] & NEED_ABOVE;
+  }
+#endif  // CONFIG_EXT_INTRA
 
   // 127 127 127 .. 127 127 127 127 127 127
   // 129  A   B  ..  Y   Z
@@ -536,34 +1368,38 @@
   // 129  G   H  ..  S   T   T   T   T   T
   // ..
 
-#if CONFIG_MISC_FIXES
   (void) xd;
-  (void) x;
-  (void) y;
   (void) plane;
   assert(n_top_px >= 0);
   assert(n_topright_px >= 0);
   assert(n_left_px >= 0);
   assert(n_bottomleft_px >= 0);
-#else
-  // Get current frame pointer, width and height.
-  if (plane == 0) {
-    frame_width = xd->cur_buf->y_width;
-    frame_height = xd->cur_buf->y_height;
-  } else {
-    frame_width = xd->cur_buf->uv_width;
-    frame_height = xd->cur_buf->uv_height;
+
+  if ((!need_above && n_left_px == 0) || (!need_left && n_top_px == 0)) {
+    const int val = (n_left_px == 0) ? 129 : 127;
+    for (i = 0; i < bs; ++i) {
+      memset(dst, val, bs);
+      dst += dst_stride;
+    }
+    return;
   }
 
-  // Get block position in current frame.
-  x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
-  y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
-#endif
-
   // NEED_LEFT
-  if (extend_modes[mode] & NEED_LEFT) {
-#if CONFIG_MISC_FIXES
+  if (need_left) {
+#if CONFIG_EXT_INTRA
+    int need_bottom;
+    if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
+      need_bottom = 0;
+    } else if (mode != DC_PRED && mode != TM_PRED &&
+        xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+      need_bottom = p_angle > 180;
+    } else {
+      need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+    }
+#else
     const int need_bottom = !!(extend_modes[mode] & NEED_BOTTOMLEFT);
+#endif  // CONFIG_EXT_INTRA
     i = 0;
     if (n_left_px > 0) {
       for (; i < n_left_px; i++)
@@ -578,35 +1414,23 @@
     } else {
       memset(left_col, 129, bs << need_bottom);
     }
-#else
-    if (left_available) {
-      if (xd->mb_to_bottom_edge < 0) {
-        /* slower path if the block needs border extension */
-        if (y0 + bs <= frame_height) {
-          for (i = 0; i < bs; ++i)
-            left_col[i] = ref[i * ref_stride - 1];
-        } else {
-          const int extend_bottom = frame_height - y0;
-          for (i = 0; i < extend_bottom; ++i)
-            left_col[i] = ref[i * ref_stride - 1];
-          for (; i < bs; ++i)
-            left_col[i] = ref[(extend_bottom - 1) * ref_stride - 1];
-        }
-      } else {
-        /* faster path if the block does not need extension */
-        for (i = 0; i < bs; ++i)
-          left_col[i] = ref[i * ref_stride - 1];
-      }
-    } else {
-      memset(left_col, 129, bs);
-    }
-#endif
   }
 
   // NEED_ABOVE
-  if (extend_modes[mode] & NEED_ABOVE) {
-#if CONFIG_MISC_FIXES
+  if (need_above) {
+#if CONFIG_EXT_INTRA
+    int need_right;
+    if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
+      need_right = 1;
+    } else if (mode != DC_PRED && mode != TM_PRED &&
+        xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+      need_right = p_angle < 90;
+    } else {
+      need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+    }
+#else
     const int need_right = !!(extend_modes[mode] & NEED_ABOVERIGHT);
+#endif  // CONFIG_EXT_INTRA
     if (n_top_px > 0) {
       memcpy(above_row, above_ref, n_top_px);
       i = n_top_px;
@@ -620,140 +1444,128 @@
     } else {
       memset(above_row, 127, bs << need_right);
     }
-#else
-    if (up_available) {
-      const uint8_t *above_ref = ref - ref_stride;
-      if (xd->mb_to_right_edge < 0) {
-        /* slower path if the block needs border extension */
-        if (x0 + bs <= frame_width) {
-          memcpy(above_row, above_ref, bs);
-        } else if (x0 <= frame_width) {
-          const int r = frame_width - x0;
-          memcpy(above_row, above_ref, r);
-          memset(above_row + r, above_row[r - 1], x0 + bs - frame_width);
-        }
-      } else {
-        /* faster path if the block does not need extension */
-        if (bs == 4 && right_available && left_available) {
-          const_above_row = above_ref;
-        } else {
-          memcpy(above_row, above_ref, bs);
-        }
-      }
-      above_row[-1] = left_available ? above_ref[-1] : 129;
-    } else {
-      memset(above_row, 127, bs);
-      above_row[-1] = 127;
-    }
-#endif
   }
 
-#if CONFIG_MISC_FIXES
-  if (extend_modes[mode] & NEED_ABOVELEFT) {
+#if CONFIG_EXT_INTRA
+  if (ext_intra_mode_info->use_ext_intra_mode[plane != 0] ||
+      (extend_modes[mode] & NEED_ABOVELEFT) ||
+      (mode != DC_PRED && mode != TM_PRED &&
+          xd->mi[0]->mbmi.sb_type >= BLOCK_8X8)) {
     above_row[-1] = n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : 129) : 127;
   }
 #else
-  // NEED_ABOVERIGHT
-  if (extend_modes[mode] & NEED_ABOVERIGHT) {
-    if (up_available) {
-      const uint8_t *above_ref = ref - ref_stride;
-      if (xd->mb_to_right_edge < 0) {
-        /* slower path if the block needs border extension */
-        if (x0 + 2 * bs <= frame_width) {
-          if (right_available && bs == 4) {
-            memcpy(above_row, above_ref, 2 * bs);
-          } else {
-            memcpy(above_row, above_ref, bs);
-            memset(above_row + bs, above_row[bs - 1], bs);
-          }
-        } else if (x0 + bs <= frame_width) {
-          const int r = frame_width - x0;
-          if (right_available && bs == 4) {
-            memcpy(above_row, above_ref, r);
-            memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width);
-          } else {
-            memcpy(above_row, above_ref, bs);
-            memset(above_row + bs, above_row[bs - 1], bs);
-          }
-        } else if (x0 <= frame_width) {
-          const int r = frame_width - x0;
-          memcpy(above_row, above_ref, r);
-          memset(above_row + r, above_row[r - 1], x0 + 2 * bs - frame_width);
-        }
-      } else {
-        /* faster path if the block does not need extension */
-        if (bs == 4 && right_available && left_available) {
-          const_above_row = above_ref;
-        } else {
-          memcpy(above_row, above_ref, bs);
-          if (bs == 4 && right_available)
-            memcpy(above_row + bs, above_ref + bs, bs);
-          else
-            memset(above_row + bs, above_row[bs - 1], bs);
-        }
-      }
-      above_row[-1] = left_available ? above_ref[-1] : 129;
-    } else {
-      memset(above_row, 127, bs * 2);
-      above_row[-1] = 127;
-    }
+  if ((extend_modes[mode] & NEED_ABOVELEFT)) {
+    above_row[-1] = n_top_px > 0 ? (n_left_px > 0 ? above_ref[-1] : 129) : 127;
   }
-#endif
+#endif  // CONFIG_EXT_INTRA
+
+#if CONFIG_EXT_INTRA
+  if (ext_intra_mode_info->use_ext_intra_mode[plane != 0]) {
+    filter_intra_predictors[ext_intra_mode](dst, dst_stride, bs,
+        const_above_row, left_col);
+    return;
+  }
+
+  if (mode != DC_PRED && mode != TM_PRED &&
+      xd->mi[0]->mbmi.sb_type >= BLOCK_8X8) {
+    INTRA_FILTER filter = INTRA_FILTER_LINEAR;
+    if (plane == 0 && vp10_is_intra_filter_switchable(p_angle))
+      filter = xd->mi[0]->mbmi.intra_filter;
+    dr_predictor(dst, dst_stride, tx_size, const_above_row, left_col, p_angle,
+                 filter);
+    return;
+  }
+#endif  // CONFIG_EXT_INTRA
 
   // predict
   if (mode == DC_PRED) {
-#if CONFIG_MISC_FIXES
     dc_pred[n_left_px > 0][n_top_px > 0][tx_size](dst, dst_stride,
                                                   const_above_row, left_col);
-#else
-    dc_pred[left_available][up_available][tx_size](dst, dst_stride,
-                                                   const_above_row, left_col);
-#endif
   } else {
     pred[mode][tx_size](dst, dst_stride, const_above_row, left_col);
   }
 }
 
 void vp10_predict_intra_block(const MACROBLOCKD *xd, int bwl_in, int bhl_in,
-                             TX_SIZE tx_size, PREDICTION_MODE mode,
-                             const uint8_t *ref, int ref_stride,
-                             uint8_t *dst, int dst_stride,
-                             int aoff, int loff, int plane) {
+                              TX_SIZE tx_size, PREDICTION_MODE mode,
+                              const uint8_t *ref, int ref_stride,
+                              uint8_t *dst, int dst_stride,
+                              int col_off, int row_off, int plane) {
   const int txw = (1 << tx_size);
-  const int have_top = loff || xd->up_available;
-  const int have_left = aoff || xd->left_available;
-  const int x = aoff * 4;
-  const int y = loff * 4;
-#if CONFIG_MISC_FIXES
+  const int have_top = row_off || xd->up_available;
+  const int have_left = col_off || xd->left_available;
+  const int x = col_off * 4;
+  const int y = row_off * 4;
   const int bw = VPXMAX(2, 1 << bwl_in);
   const int bh = VPXMAX(2, 1 << bhl_in);
-  const int mi_row = -xd->mb_to_top_edge >> 6;
-  const int mi_col = -xd->mb_to_left_edge >> 6;
+  const int mi_row = -xd->mb_to_top_edge >> (3 + MI_SIZE_LOG2);
+  const int mi_col = -xd->mb_to_left_edge >> (3 + MI_SIZE_LOG2);
   const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   const int right_available =
-      mi_col + (bw >> !pd->subsampling_x) < xd->tile.mi_col_end;
+      mi_col + (1 << mi_width_log2_lookup[bsize]) < xd->tile.mi_col_end;
+#if CONFIG_EXT_PARTITION_TYPES
+  const PARTITION_TYPE partition = xd->mi[0]->mbmi.partition;
+#endif
   const int have_right = vp10_has_right(bsize, mi_row, mi_col,
-                                        right_available,
-                                        tx_size, loff, aoff,
-                                        pd->subsampling_x);
+                                          right_available,
+#if CONFIG_EXT_PARTITION_TYPES
+                                          partition,
+#endif
+                                          tx_size, row_off, col_off,
+                                          pd->subsampling_x);
   const int have_bottom = vp10_has_bottom(bsize, mi_row, mi_col,
                                           xd->mb_to_bottom_edge > 0,
-                                          tx_size, loff, aoff,
+                                          tx_size, row_off, col_off,
                                           pd->subsampling_y);
   const int wpx = 4 * bw;
   const int hpx = 4 * bh;
   const int txpx = 4 * txw;
+  // Distance from the right edge of this prediction block to the frame's
+  // right edge.
+  const int xr = (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) +
+      (wpx - x - txpx);
+  // Distance from the bottom edge of this prediction block to the frame's
+  // bottom edge.
+  const int yd = (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) +
+      (hpx - y - txpx);
 
-  int xr = (xd->mb_to_right_edge >> (3 + pd->subsampling_x)) + (wpx - x - txpx);
-  int yd =
-      (xd->mb_to_bottom_edge >> (3 + pd->subsampling_y)) + (hpx - y - txpx);
+  if (xd->mi[0]->mbmi.palette_mode_info.palette_size[plane != 0] > 0) {
+    const int bs = 4 * (1 << tx_size);
+    const int stride = 4 * (1 << bwl_in);
+    int r, c;
+    uint8_t *map = NULL;
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint16_t *palette = xd->mi[0]->mbmi.palette_mode_info.palette_colors +
+        plane * PALETTE_MAX_SIZE;
 #else
-  const int bw = (1 << bwl_in);
-  const int have_right = (aoff + txw) < bw;
-#endif  // CONFIG_MISC_FIXES
+    uint8_t *palette = xd->mi[0]->mbmi.palette_mode_info.palette_colors +
+        plane * PALETTE_MAX_SIZE;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
-#if CONFIG_MISC_FIXES
+    map = xd->plane[plane != 0].color_index_map;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+      for (r = 0; r < bs; ++r)
+        for (c = 0; c < bs; ++c)
+          dst16[r * dst_stride + c] =
+              palette[map[(r + y) * stride + c + x]];
+    } else {
+      for (r = 0; r < bs; ++r)
+        for (c = 0; c < bs; ++c)
+          dst[r * dst_stride + c] =
+              (uint8_t)(palette[map[(r + y) * stride + c + x]]);
+    }
+#else
+    for (r = 0; r < bs; ++r)
+      for (c = 0; c < bs; ++c)
+        dst[r * dst_stride + c] = palette[map[(r + y) * stride + c + x]];
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    return;
+  }
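+  // Palette sketch: with palette_size 4 and (hypothetical) colors
+  // {16, 64, 128, 255}, a color_index_map entry of 2 writes 128 to the
+  // matching dst pixel; note the map is laid out at the prediction-block
+  // stride (4 << bwl_in), not at dst_stride.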
+
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode,
@@ -762,7 +1574,7 @@
                                 have_top && have_right ? VPXMIN(txpx, xr) : 0,
                                 have_left ? VPXMIN(txpx, yd + txpx) : 0,
                                 have_bottom && have_left ? VPXMIN(txpx, yd) : 0,
-                                x, y, plane, xd->bd);
+                                plane);
     return;
   }
 #endif
@@ -772,20 +1584,7 @@
                          have_top && have_right ? VPXMIN(txpx, xr) : 0,
                          have_left ? VPXMIN(txpx, yd + txpx) : 0,
                          have_bottom && have_left ? VPXMIN(txpx, yd) : 0,
-                         x, y, plane);
-#else  // CONFIG_MISC_FIXES
-  (void) bhl_in;
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    build_intra_predictors_high(xd, ref, ref_stride, dst, dst_stride, mode,
-                                tx_size, have_top, have_left, have_right,
-                                x, y, plane, xd->bd);
-    return;
-  }
-#endif
-  build_intra_predictors(xd, ref, ref_stride, dst, dst_stride, mode, tx_size,
-                         have_top, have_left, have_right, x, y, plane);
-#endif  // CONFIG_MISC_FIXES
+                         plane);
 }
 
 void vp10_init_intra_predictors(void) {
diff --git a/vp10/common/reconintra.h b/vp10/common/reconintra.h
index f451fb8..b53c2bf 100644
--- a/vp10/common/reconintra.h
+++ b/vp10/common/reconintra.h
@@ -25,6 +25,9 @@
                              const uint8_t *ref, int ref_stride,
                              uint8_t *dst, int dst_stride,
                              int aoff, int loff, int plane);
+#if CONFIG_EXT_INTRA
+int vp10_is_intra_filter_switchable(int angle);
+#endif  // CONFIG_EXT_INTRA
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/common/restoration.c b/vp10/common/restoration.c
new file mode 100644
index 0000000..71abd7c
--- /dev/null
+++ b/vp10/common/restoration.c
@@ -0,0 +1,479 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vp10/common/onyxc_int.h"
+#include "vp10/common/restoration.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#define RESTORATION_PARAM_PRECISION     16
+#define RESTORATION_RANGE               256
+#define RESTORATION_RANGE_SYM           (2 * RESTORATION_RANGE + 1)
+
+static uint8_t restoration_filters_r_kf[RESTORATION_LEVELS_KF]
+                                       [RESTORATION_RANGE_SYM];
+static uint8_t restoration_filters_r[RESTORATION_LEVELS]
+                                    [RESTORATION_RANGE_SYM];
+static uint8_t restoration_filters_s_kf[RESTORATION_LEVELS_KF]
+                                       [RESTORATION_WIN][RESTORATION_WIN];
+static uint8_t restoration_filters_s[RESTORATION_LEVELS]
+                                    [RESTORATION_WIN][RESTORATION_WIN];
+
+typedef struct restoration_params {
+  int sigma_x;  // spatial variance x
+  int sigma_y;  // spatial variance y
+  int sigma_r;  // range variance
+} RestorationParamsType;
+
+static RestorationParamsType
+    restoration_level_to_params_arr[RESTORATION_LEVELS] = {
+  // Values are rounded to 1/16th precision.
+  {8, 9, 30},
+  {9, 8, 30},
+  {9, 11, 32},
+  {11, 9, 32},
+  {14, 14, 32},
+  {18, 18, 36},
+  {24, 24, 40},
+  {32, 32, 40},
+};
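+
+// Example reading of the tables: {8, 9, 30} encodes sigma_x = 8/16 = 0.5 and
+// sigma_y = 9/16 = 0.5625 pixels of spatial spread and sigma_r = 30/16 =
+// 1.875 intensity steps of range spread (RESTORATION_PARAM_PRECISION is 16).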
+
+static RestorationParamsType
+    restoration_level_to_params_arr_kf[RESTORATION_LEVELS_KF] = {
+  // Values are rounded to 1/16th precision.
+  {8, 8, 30},
+  {9, 9, 32},
+  {10, 10, 32},
+  {12, 12, 32},
+  {14, 14, 32},
+  {18, 18, 36},
+  {24, 24, 40},
+  {30, 30, 44},
+  {36, 36, 48},
+  {42, 42, 48},
+  {48, 48, 48},
+  {48, 48, 56},
+  {56, 56, 48},
+  {56, 56, 56},
+  {56, 56, 64},
+  {64, 64, 48},
+};
+
+typedef void (*restore_func_type)(
+    uint8_t *data8, int width, int height,
+    int stride, RestorationInternal *rst,
+    uint8_t *tmpdata8, int tmpstride);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef void (*restore_func_highbd_type)(
+    uint8_t *data8, int width, int height,
+    int stride, RestorationInternal *rst,
+    uint8_t *tmpdata8, int tmpstride,
+    int bit_depth);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE RestorationParamsType vp10_restoration_level_to_params(
+    int index, int kf) {
+  return kf ? restoration_level_to_params_arr_kf[index] :
+              restoration_level_to_params_arr[index];
+}
+
+void vp10_loop_restoration_precal(void) {
+  int i;
+  for (i = 0; i < RESTORATION_LEVELS_KF; i++) {
+    const RestorationParamsType param = vp10_restoration_level_to_params(i, 1);
+    const int sigma_x = param.sigma_x;
+    const int sigma_y = param.sigma_y;
+    const int sigma_r = param.sigma_r;
+    const double sigma_r_d = (double)sigma_r / RESTORATION_PARAM_PRECISION;
+    const double sigma_x_d = (double)sigma_x / RESTORATION_PARAM_PRECISION;
+    const double sigma_y_d = (double)sigma_y / RESTORATION_PARAM_PRECISION;
+
+    uint8_t *fr = restoration_filters_r_kf[i] + RESTORATION_RANGE;
+    int j, x, y;
+    for (j = 0; j <= RESTORATION_RANGE; j++) {
+      fr[j] = (uint8_t)(0.5 + RESTORATION_FILT_STEP *
+                        exp(-(j * j) / (2 * sigma_r_d * sigma_r_d)));
+      fr[-j] = fr[j];
+    }
+    for (y = -RESTORATION_HALFWIN; y <= RESTORATION_HALFWIN; y++) {
+      for (x = -RESTORATION_HALFWIN; x <= RESTORATION_HALFWIN; x++) {
+        restoration_filters_s_kf[i][y + RESTORATION_HALFWIN]
+                                   [x + RESTORATION_HALFWIN] =
+          (uint8_t)(0.5 + RESTORATION_FILT_STEP *
+                    exp(-(x * x) / (2 * sigma_x_d * sigma_x_d)
+                        -(y * y) / (2 * sigma_y_d * sigma_y_d)));
+      }
+    }
+  }
+  for (i = 0; i < RESTORATION_LEVELS; i++) {
+    const RestorationParamsType param = vp10_restoration_level_to_params(i, 0);
+    const int sigma_x = param.sigma_x;
+    const int sigma_y = param.sigma_y;
+    const int sigma_r = param.sigma_r;
+    const double sigma_r_d = (double)sigma_r / RESTORATION_PARAM_PRECISION;
+    const double sigma_x_d = (double)sigma_x / RESTORATION_PARAM_PRECISION;
+    const double sigma_y_d = (double)sigma_y / RESTORATION_PARAM_PRECISION;
+
+    uint8_t *fr = restoration_filters_r[i] + RESTORATION_RANGE;
+    int j, x, y;
+    for (j = 0; j <= RESTORATION_RANGE; j++) {
+      fr[j] = (uint8_t)(0.5 + RESTORATION_FILT_STEP *
+                        exp(-(j * j) / (2 * sigma_r_d * sigma_r_d)));
+      fr[-j] = fr[j];
+    }
+    for (y = -RESTORATION_HALFWIN; y <= RESTORATION_HALFWIN; y++) {
+      for (x = -RESTORATION_HALFWIN; x <= RESTORATION_HALFWIN; x++) {
+        restoration_filters_s[i][y + RESTORATION_HALFWIN]
+                                [x + RESTORATION_HALFWIN] =
+            (uint8_t)(0.5 + RESTORATION_FILT_STEP *
+                      exp(-(x * x) / (2 * sigma_x_d * sigma_x_d)
+                          -(y * y) / (2 * sigma_y_d * sigma_y_d)));
+      }
+    }
+  }
+}
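+
+// Example of the LUTs built above (a sketch): for key-frame level 0,
+// {8, 8, 30}, sigma_r_d = 30.0 / 16 = 1.875, so fr[0] equals
+// RESTORATION_FILT_STEP and the range weight falls to about half of that
+// near |j| == 2 (exp(-4 / 7.03) is roughly 0.57).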
+
+int vp10_restoration_level_bits(const VP10_COMMON *const cm) {
+  return cm->frame_type == KEY_FRAME ?
+      RESTORATION_LEVEL_BITS_KF : RESTORATION_LEVEL_BITS;
+}
+
+void vp10_loop_restoration_init(RestorationInternal *rst,
+                                RestorationInfo *rsi, int kf) {
+  int i;
+  rst->restoration_type = rsi->restoration_type;
+  if (rsi->restoration_type == RESTORE_BILATERAL) {
+    const int level = rsi->restoration_level;
+    assert(level >= 0);
+    rst->wr_lut = kf ? restoration_filters_r_kf[level] :
+                       restoration_filters_r[level];
+    for (i = 0; i < RESTORATION_WIN; i++)
+      rst->wx_lut[i] = kf ? restoration_filters_s_kf[level][i] :
+                            restoration_filters_s[level][i];
+  } else if (rsi->restoration_type == RESTORE_WIENER) {
+    rst->vfilter[RESTORATION_HALFWIN] = rst->hfilter[RESTORATION_HALFWIN] =
+        RESTORATION_FILT_STEP;
+    for (i = 0; i < RESTORATION_HALFWIN; ++i) {
+      rst->vfilter[i] = rst->vfilter[RESTORATION_WIN - 1 - i] = rsi->vfilter[i];
+      rst->hfilter[i] = rst->hfilter[RESTORATION_WIN - 1 - i] = rsi->hfilter[i];
+      rst->vfilter[RESTORATION_HALFWIN] -= 2 * rsi->vfilter[i];
+      rst->hfilter[RESTORATION_HALFWIN] -= 2 * rsi->hfilter[i];
+    }
+  }
+}
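+
+// The Wiener branch mirrors the RESTORATION_HALFWIN coded taps symmetrically
+// and derives the center tap so that the full kernel sums to
+// RESTORATION_FILT_STEP; assuming that equals 1 << RESTORATION_FILT_BITS,
+// the symmetric filters below then have unit DC gain.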
+
+static void loop_bilateral_filter(uint8_t *data, int width, int height,
+                                  int stride, RestorationInternal *rst,
+                                  uint8_t *tmpdata, int tmpstride) {
+  int i, j;
+  const uint8_t *wr_lut_ = rst->wr_lut + RESTORATION_RANGE;
+
+  uint8_t *data_p = data + RESTORATION_HALFWIN * stride;
+  uint8_t *tmpdata_p = tmpdata + RESTORATION_HALFWIN * tmpstride;
+  for (i = RESTORATION_HALFWIN; i < height - RESTORATION_HALFWIN; ++i) {
+    for (j = RESTORATION_HALFWIN; j < width - RESTORATION_HALFWIN; ++j) {
+      int x, y;
+      int flsum = 0, wtsum = 0, wt;
+      uint8_t *data_p2 = data_p + j - RESTORATION_HALFWIN * stride;
+      for (y = -RESTORATION_HALFWIN; y <= RESTORATION_HALFWIN; ++y) {
+        for (x = -RESTORATION_HALFWIN; x <= RESTORATION_HALFWIN; ++x) {
+          wt = (int)rst->wx_lut[y + RESTORATION_HALFWIN]
+                               [x + RESTORATION_HALFWIN] *
+               (int)wr_lut_[data_p2[x] - data_p[j]];
+          wtsum += wt;
+          flsum += wt * data_p2[x];
+        }
+        data_p2 += stride;
+      }
+      if (wtsum > 0)
+        tmpdata_p[j] = clip_pixel((int)((flsum + wtsum / 2) / wtsum));
+      else
+        tmpdata_p[j] = data_p[j];
+    }
+    tmpdata_p += tmpstride;
+    data_p += stride;
+  }
+
+  for (i = RESTORATION_HALFWIN; i < height - RESTORATION_HALFWIN; ++i) {
+    memcpy(data + i * stride + RESTORATION_HALFWIN,
+           tmpdata + i * tmpstride + RESTORATION_HALFWIN,
+           (width - RESTORATION_HALFWIN * 2) * sizeof(*data));
+  }
+}
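+
+// In effect each output pixel above is the classic bilateral estimate
+//   round(sum(w_s(x, y) * w_r(I(x, y) - I(0, 0)) * I(x, y)) /
+//         sum(w_s(x, y) * w_r(I(x, y) - I(0, 0)))),
+// with the spatial (wx_lut) and range (wr_lut) factors read from the
+// precomputed 8-bit tables, and a copy-through when all weights are zero.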
+
+uint8_t hor_sym_filter(uint8_t *d, int *hfilter) {
+  int32_t s = (1 << (RESTORATION_FILT_BITS - 1)) +
+      d[0] * hfilter[RESTORATION_HALFWIN];
+  int i;
+  for (i = 1; i <= RESTORATION_HALFWIN; ++i)
+    s += (d[i] + d[-i]) * hfilter[RESTORATION_HALFWIN + i];
+  return clip_pixel(s >> RESTORATION_FILT_BITS);
+}
+
+uint8_t ver_sym_filter(uint8_t *d, int stride, int *vfilter) {
+  int32_t s = (1 << (RESTORATION_FILT_BITS - 1)) +
+      d[0] * vfilter[RESTORATION_HALFWIN];
+  int i;
+  for (i = 1; i <= RESTORATION_HALFWIN; ++i)
+    s += (d[i * stride] + d[-i * stride]) * vfilter[RESTORATION_HALFWIN + i];
+  return clip_pixel(s >> RESTORATION_FILT_BITS);
+}
+
+static void loop_wiener_filter(uint8_t *data, int width, int height,
+                               int stride, RestorationInternal *rst,
+                               uint8_t *tmpdata, int tmpstride) {
+  uint8_t *data_p = data;
+  uint8_t *tmpdata_p = tmpdata;
+  int i, j;
+
+  for (i = 0; i < height; ++i) {
+    memcpy(tmpdata_p, data_p, sizeof(*data_p) * RESTORATION_HALFWIN);
+    data_p += RESTORATION_HALFWIN;
+    tmpdata_p += RESTORATION_HALFWIN;
+    for (j = RESTORATION_HALFWIN; j < width - RESTORATION_HALFWIN; ++j) {
+      *tmpdata_p++ = hor_sym_filter(data_p++, rst->hfilter);
+    }
+    memcpy(tmpdata_p, data_p, sizeof(*data_p) * RESTORATION_HALFWIN);
+    data_p += RESTORATION_HALFWIN - width + stride;
+    tmpdata_p += RESTORATION_HALFWIN - width + tmpstride;
+  }
+  data_p = data;
+  tmpdata_p = tmpdata;
+  for (i = 0; i < RESTORATION_HALFWIN; ++i) {
+    memcpy(data_p, tmpdata_p, sizeof(*data_p) * width);
+    data_p += stride;
+    tmpdata_p += tmpstride;
+  }
+  for (; i < height - RESTORATION_HALFWIN; ++i) {
+    for (j = 0; j < width; ++j)
+      *data_p++ = ver_sym_filter(tmpdata_p++, tmpstride, rst->vfilter);
+    data_p += stride - width;
+    tmpdata_p += tmpstride - width;
+  }
+  for (; i < height; ++i) {
+    memcpy(data_p, tmpdata_p, sizeof(*data_p) * width);
+    data_p += stride;
+    tmpdata_p += tmpstride;
+  }
+}
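+
+// The Wiener filter above is applied separably: a horizontal pass writes into
+// tmpdata (copying the RESTORATION_HALFWIN left/right border columns through
+// unfiltered), then a vertical pass writes back into data, with the top and
+// bottom RESTORATION_HALFWIN rows copied unchanged.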
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void loop_bilateral_filter_highbd(
+    uint8_t *data8, int width, int height,
+    int stride, RestorationInternal *rst,
+    uint8_t *tmpdata8, int tmpstride, int bit_depth) {
+  int i, j;
+  const uint8_t *wr_lut_ = rst->wr_lut + RESTORATION_RANGE;
+
+  uint16_t *data = CONVERT_TO_SHORTPTR(data8);
+  uint16_t *tmpdata = CONVERT_TO_SHORTPTR(tmpdata8);
+  uint16_t *data_p = data + RESTORATION_HALFWIN * stride;
+  uint16_t *tmpdata_p = tmpdata + RESTORATION_HALFWIN * tmpstride;
+  for (i = RESTORATION_HALFWIN; i < height - RESTORATION_HALFWIN; ++i) {
+    for (j = RESTORATION_HALFWIN; j < width - RESTORATION_HALFWIN; ++j) {
+      int x, y, diff_r;
+      int flsum = 0, wtsum = 0, wt;
+      uint16_t *data_p2 = data_p + j - RESTORATION_HALFWIN * stride;
+      for (y = -RESTORATION_HALFWIN; y <= RESTORATION_HALFWIN; ++y) {
+        for (x = -RESTORATION_HALFWIN; x <= RESTORATION_HALFWIN; ++x) {
+          diff_r = (data_p2[x] - data_p[j]) >> (bit_depth - 8);
+          assert(diff_r >= -RESTORATION_RANGE && diff_r <= RESTORATION_RANGE);
+          wt = (int)rst->wx_lut[y + RESTORATION_HALFWIN]
+                               [x + RESTORATION_HALFWIN] *
+               (int)wr_lut_[diff_r];
+          wtsum += wt;
+          flsum += wt * data_p2[x];
+        }
+        data_p2 += stride;
+      }
+      if (wtsum > 0)
+        tmpdata_p[j] = clip_pixel_highbd((int)((flsum + wtsum / 2) / wtsum),
+                                         bit_depth);
+      else
+        tmpdata_p[j] = data_p[j];
+    }
+    tmpdata_p += tmpstride;
+    data_p += stride;
+  }
+
+  for (i = RESTORATION_HALFWIN; i < height - RESTORATION_HALFWIN; ++i) {
+    memcpy(data + i * stride + RESTORATION_HALFWIN,
+           tmpdata + i * tmpstride + RESTORATION_HALFWIN,
+           (width - RESTORATION_HALFWIN * 2) * sizeof(*data));
+  }
+}
+
+uint16_t hor_sym_filter_highbd(uint16_t *d, int *hfilter, int bd) {
+  int32_t s = (1 << (RESTORATION_FILT_BITS - 1)) +
+      d[0] * hfilter[RESTORATION_HALFWIN];
+  int i;
+  for (i = 1; i <= RESTORATION_HALFWIN; ++i)
+    s += (d[i] + d[-i]) * hfilter[RESTORATION_HALFWIN + i];
+  return clip_pixel_highbd(s >> RESTORATION_FILT_BITS, bd);
+}
+
+uint16_t ver_sym_filter_highbd(uint16_t *d, int stride, int *vfilter, int bd) {
+  int32_t s = (1 << (RESTORATION_FILT_BITS - 1)) +
+      d[0] * vfilter[RESTORATION_HALFWIN];
+  int i;
+  for (i = 1; i <= RESTORATION_HALFWIN; ++i)
+    s += (d[i * stride] + d[-i * stride]) * vfilter[RESTORATION_HALFWIN + i];
+  return clip_pixel_highbd(s >> RESTORATION_FILT_BITS, bd);
+}
+
+static void loop_wiener_filter_highbd(uint8_t *data8, int width, int height,
+                                      int stride, RestorationInternal *rst,
+                                      uint8_t *tmpdata8, int tmpstride,
+                                      int bit_depth) {
+  uint16_t *data = CONVERT_TO_SHORTPTR(data8);
+  uint16_t *tmpdata = CONVERT_TO_SHORTPTR(tmpdata8);
+  uint16_t *data_p = data;
+  uint16_t *tmpdata_p = tmpdata;
+  int i, j;
+  for (i = 0; i < height; ++i) {
+    memcpy(tmpdata_p, data_p, sizeof(*data_p) * RESTORATION_HALFWIN);
+    data_p += RESTORATION_HALFWIN;
+    tmpdata_p += RESTORATION_HALFWIN;
+    for (j = RESTORATION_HALFWIN; j < width - RESTORATION_HALFWIN; ++j) {
+      *tmpdata_p++ = hor_sym_filter_highbd(data_p++, rst->hfilter, bit_depth);
+    }
+    memcpy(tmpdata_p, data_p, sizeof(*data_p) * RESTORATION_HALFWIN);
+    data_p += RESTORATION_HALFWIN - width + stride;
+    tmpdata_p += RESTORATION_HALFWIN - width + tmpstride;
+  }
+  data_p = data;
+  tmpdata_p = tmpdata;
+  for (i = 0; i < RESTORATION_HALFWIN; ++i) {
+    memcpy(data_p, tmpdata_p, sizeof(*data_p) * width);
+    data_p += stride;
+    tmpdata_p += tmpstride;
+  }
+  for (; i < height - RESTORATION_HALFWIN; ++i) {
+    for (j = 0; j < width; ++j)
+      *data_p++ = ver_sym_filter_highbd(
+          tmpdata_p++, tmpstride, rst->vfilter, bit_depth);
+    data_p += stride - width;
+    tmpdata_p += tmpstride - width;
+  }
+  for (; i < height; ++i) {
+    memcpy(data_p, tmpdata_p, sizeof(*data_p) * width);
+    data_p += stride;
+    tmpdata_p += tmpstride;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
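+// Applies the configured restoration filter (bilateral or Wiener) to the
+// rows covered by [start_mi_row, end_mi_row): always to the luma plane,
+// and to both chroma planes unless y_only is set.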
+void vp10_loop_restoration_rows(YV12_BUFFER_CONFIG *frame,
+                                VP10_COMMON *cm,
+                                int start_mi_row, int end_mi_row,
+                                int y_only) {
+  const int ywidth = frame->y_crop_width;
+  const int ystride = frame->y_stride;
+  const int uvwidth = frame->uv_crop_width;
+  const int uvstride = frame->uv_stride;
+  const int ystart = start_mi_row << MI_SIZE_LOG2;
+  const int uvstart = ystart >> cm->subsampling_y;
+  int yend = end_mi_row << MI_SIZE_LOG2;
+  int uvend = yend >> cm->subsampling_y;
+  restore_func_type restore_func =
+      cm->rst_internal.restoration_type == RESTORE_BILATERAL ?
+      loop_bilateral_filter : loop_wiener_filter;
+#if CONFIG_VP9_HIGHBITDEPTH
+  restore_func_highbd_type restore_func_highbd =
+      cm->rst_internal.restoration_type == RESTORE_BILATERAL ?
+      loop_bilateral_filter_highbd : loop_wiener_filter_highbd;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  YV12_BUFFER_CONFIG *tmp_buf;
+
+  yend = VPXMIN(yend, cm->height);
+  uvend = VPXMIN(uvend, cm->subsampling_y ? (cm->height + 1) >> 1 : cm->height);
+
+  if (vpx_realloc_frame_buffer(&cm->tmp_loop_buf, cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_DEC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL) < 0)
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate tmp restoration buffer");
+
+  tmp_buf = &cm->tmp_loop_buf;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth)
+    restore_func_highbd(
+        frame->y_buffer + ystart * ystride,
+        ywidth, yend - ystart, ystride, &cm->rst_internal,
+        tmp_buf->y_buffer + ystart * tmp_buf->y_stride,
+        tmp_buf->y_stride, cm->bit_depth);
+  else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    restore_func(
+        frame->y_buffer + ystart * ystride,
+        ywidth, yend - ystart, ystride, &cm->rst_internal,
+        tmp_buf->y_buffer + ystart * tmp_buf->y_stride,
+        tmp_buf->y_stride);
+  if (!y_only) {
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cm->use_highbitdepth) {
+      restore_func_highbd(
+          frame->u_buffer + uvstart * uvstride,
+          uvwidth, uvend - uvstart, uvstride, &cm->rst_internal,
+          tmp_buf->u_buffer + uvstart * tmp_buf->uv_stride,
+          tmp_buf->uv_stride, cm->bit_depth);
+      restore_func_highbd(
+          frame->v_buffer + uvstart * uvstride,
+          uvwidth, uvend - uvstart, uvstride, &cm->rst_internal,
+          tmp_buf->v_buffer + uvstart * tmp_buf->uv_stride,
+          tmp_buf->uv_stride, cm->bit_depth);
+    } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      restore_func(
+          frame->u_buffer + uvstart * uvstride,
+          uvwidth, uvend - uvstart, uvstride, &cm->rst_internal,
+          tmp_buf->u_buffer + uvstart * tmp_buf->uv_stride,
+          tmp_buf->uv_stride);
+      restore_func(
+          frame->v_buffer + uvstart * uvstride,
+          uvwidth, uvend - uvstart, uvstride, &cm->rst_internal,
+          tmp_buf->v_buffer + uvstart * tmp_buf->uv_stride,
+          tmp_buf->uv_stride);
+#if CONFIG_VP9_HIGHBITDEPTH
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
+}
+
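+// Top-level entry point: initializes the restoration state from rsi and
+// filters the whole frame, or, when partial_frame is set, a band of at
+// least 8 mi rows starting at the (8-aligned) middle of the frame.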
+void vp10_loop_restoration_frame(YV12_BUFFER_CONFIG *frame,
+                                 VP10_COMMON *cm,
+                                 RestorationInfo *rsi,
+                                 int y_only, int partial_frame) {
+  int start_mi_row, end_mi_row, mi_rows_to_filter;
+  if (rsi->restoration_type != RESTORE_NONE) {
+    start_mi_row = 0;
+    mi_rows_to_filter = cm->mi_rows;
+    if (partial_frame && cm->mi_rows > 8) {
+      start_mi_row = cm->mi_rows >> 1;
+      start_mi_row &= 0xfffffff8;
+      mi_rows_to_filter = VPXMAX(cm->mi_rows / 8, 8);
+    }
+    end_mi_row = start_mi_row + mi_rows_to_filter;
+    vp10_loop_restoration_init(&cm->rst_internal, rsi,
+                               cm->frame_type == KEY_FRAME);
+    vp10_loop_restoration_rows(frame, cm, start_mi_row, end_mi_row, y_only);
+  }
+}
diff --git a/vp10/common/restoration.h b/vp10/common/restoration.h
new file mode 100644
index 0000000..8c0f143
--- /dev/null
+++ b/vp10/common/restoration.h
@@ -0,0 +1,90 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_RESTORATION_H_
+#define VP10_COMMON_RESTORATION_H_
+
+#include "vpx_ports/mem.h"
+#include "./vpx_config.h"
+
+#include "vp10/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define RESTORATION_LEVEL_BITS_KF 4
+#define RESTORATION_LEVELS_KF     (1 << RESTORATION_LEVEL_BITS_KF)
+#define RESTORATION_LEVEL_BITS    3
+#define RESTORATION_LEVELS        (1 << RESTORATION_LEVEL_BITS)
+#define DEF_RESTORATION_LEVEL     2
+
+#define RESTORATION_HALFWIN       3
+#define RESTORATION_HALFWIN1      (RESTORATION_HALFWIN + 1)
+#define RESTORATION_WIN           (2 * RESTORATION_HALFWIN + 1)
+#define RESTORATION_WIN2          ((RESTORATION_WIN) * (RESTORATION_WIN))
+
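+// Fixed-point precision of the restoration filter taps; filter outputs
+// are rounded and shifted back down by RESTORATION_FILT_BITS.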
+#define RESTORATION_FILT_BITS 7
+#define RESTORATION_FILT_STEP (1 << RESTORATION_FILT_BITS)
+
+#define WIENER_FILT_TAP0_MINV     (-5)
+#define WIENER_FILT_TAP1_MINV     (-23)
+#define WIENER_FILT_TAP2_MINV     (-20)
+
+#define WIENER_FILT_TAP0_BITS     4
+#define WIENER_FILT_TAP1_BITS     5
+#define WIENER_FILT_TAP2_BITS     6
+
+#define WIENER_FILT_BITS \
+  ((WIENER_FILT_TAP0_BITS + WIENER_FILT_TAP1_BITS + WIENER_FILT_TAP2_BITS) * 2)
+
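+// Each Wiener tap is signaled with WIENER_FILT_TAPn_BITS bits, so its
+// maximum value is MINV + 2^BITS - 1.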
+#define WIENER_FILT_TAP0_MAXV \
+  (WIENER_FILT_TAP0_MINV - 1 + (1 << WIENER_FILT_TAP0_BITS))
+#define WIENER_FILT_TAP1_MAXV \
+  (WIENER_FILT_TAP1_MINV - 1 + (1 << WIENER_FILT_TAP1_BITS))
+#define WIENER_FILT_TAP2_MAXV \
+  (WIENER_FILT_TAP2_MINV - 1 + (1 << WIENER_FILT_TAP2_BITS))
+
+typedef enum {
+  RESTORE_NONE,
+  RESTORE_BILATERAL,
+  RESTORE_WIENER,
+} RestorationType;
+
+typedef struct {
+  RestorationType restoration_type;
+  int restoration_level;
+  int vfilter[RESTORATION_HALFWIN], hfilter[RESTORATION_HALFWIN];
+} RestorationInfo;
+
+typedef struct {
+  RestorationType restoration_type;
+  uint8_t *wx_lut[RESTORATION_WIN];
+  uint8_t *wr_lut;
+  int vfilter[RESTORATION_WIN], hfilter[RESTORATION_WIN];
+} RestorationInternal;
+
+int  vp10_restoration_level_bits(const struct VP10Common *const cm);
+void vp10_loop_restoration_init(RestorationInternal *rst,
+                                RestorationInfo *rsi, int kf);
+void vp10_loop_restoration_frame(YV12_BUFFER_CONFIG *frame,
+                                 struct VP10Common *cm,
+                                 RestorationInfo *rsi,
+                                 int y_only, int partial_frame);
+void vp10_loop_restoration_rows(YV12_BUFFER_CONFIG *frame,
+                                struct VP10Common *cm,
+                                int start_mi_row, int end_mi_row,
+                                int y_only);
+void vp10_loop_restoration_precal(void);
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_RESTORATION_H_
diff --git a/vp10/common/scale.c b/vp10/common/scale.c
index ce6062c..65e14a9 100644
--- a/vp10/common/scale.c
+++ b/vp10/common/scale.c
@@ -46,15 +46,15 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp10_setup_scale_factors_for_frame(struct scale_factors *sf,
-                                       int other_w, int other_h,
-                                       int this_w, int this_h,
-                                       int use_highbd) {
+                                        int other_w, int other_h,
+                                        int this_w, int this_h,
+                                        int use_highbd) {
 #else
 void vp10_setup_scale_factors_for_frame(struct scale_factors *sf,
-                                       int other_w, int other_h,
-                                       int this_w, int this_h) {
+                                        int other_w, int other_h,
+                                        int this_w, int this_h) {
 #endif
-  if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
+  if (!valid_ref_frame_size(other_w, other_h, this_w, this_h)) {
     sf->x_scale_fp = REF_INVALID_SCALE;
     sf->y_scale_fp = REF_INVALID_SCALE;
     return;
@@ -79,6 +79,16 @@
   // applied in one direction only, and not at all for 0,0, seems to give the
   // best quality, but it may be worth trying an additional mode that does
   // do the filtering on full-pel.
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+  sf->predict_ni[0][0][0] = vpx_convolve8_c;
+  sf->predict_ni[0][0][1] = vpx_convolve8_avg_c;
+  sf->predict_ni[0][1][0] = vpx_convolve8_c;
+  sf->predict_ni[0][1][1] = vpx_convolve8_avg_c;
+  sf->predict_ni[1][0][0] = vpx_convolve8_c;
+  sf->predict_ni[1][0][1] = vpx_convolve8_avg_c;
+  sf->predict_ni[1][1][0] = vpx_convolve8;
+  sf->predict_ni[1][1][1] = vpx_convolve8_avg;
+#endif  // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
   if (sf->x_step_q4 == 16) {
     if (sf->y_step_q4 == 16) {
       // No scaling in either direction.
@@ -119,8 +129,19 @@
   // 2D subpel motion always gets filtered in both directions
   sf->predict[1][1][0] = vpx_convolve8;
   sf->predict[1][1][1] = vpx_convolve8_avg;
+
 #if CONFIG_VP9_HIGHBITDEPTH
   if (use_highbd) {
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+    sf->highbd_predict_ni[0][0][0] = vpx_highbd_convolve8_c;
+    sf->highbd_predict_ni[0][0][1] = vpx_highbd_convolve8_avg_c;
+    sf->highbd_predict_ni[0][1][0] = vpx_highbd_convolve8_c;
+    sf->highbd_predict_ni[0][1][1] = vpx_highbd_convolve8_avg_c;
+    sf->highbd_predict_ni[1][0][0] = vpx_highbd_convolve8_c;
+    sf->highbd_predict_ni[1][0][1] = vpx_highbd_convolve8_avg_c;
+    sf->highbd_predict_ni[1][1][0] = vpx_highbd_convolve8;
+    sf->highbd_predict_ni[1][1][1] = vpx_highbd_convolve8_avg;
+#endif  // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
     if (sf->x_step_q4 == 16) {
       if (sf->y_step_q4 == 16) {
         // No scaling in either direction.
@@ -162,5 +183,5 @@
     sf->highbd_predict[1][1][0] = vpx_highbd_convolve8;
     sf->highbd_predict[1][1][1] = vpx_highbd_convolve8_avg;
   }
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 }
diff --git a/vp10/common/scale.h b/vp10/common/scale.h
index 833f6c4..604b9d2 100644
--- a/vp10/common/scale.h
+++ b/vp10/common/scale.h
@@ -34,7 +34,15 @@
   convolve_fn_t predict[2][2][2];  // horiz, vert, avg
 #if CONFIG_VP9_HIGHBITDEPTH
   highbd_convolve_fn_t highbd_predict[2][2][2];  // horiz, vert, avg
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Functions for non-interpolating filters (those that also filter at
+  // zero offsets)
+#if CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
+  convolve_fn_t predict_ni[2][2][2];  // horiz, vert, avg
+#if CONFIG_VP9_HIGHBITDEPTH
+  highbd_convolve_fn_t highbd_predict_ni[2][2][2];  // horiz, vert, avg
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_EXT_INTERP && SUPPORT_NONINTERPOLATING_FILTERS
 };
 
 MV32 vp10_scale_mv(const MV *mv, int x, int y, const struct scale_factors *sf);
@@ -48,7 +56,7 @@
 void vp10_setup_scale_factors_for_frame(struct scale_factors *sf,
                                        int other_w, int other_h,
                                        int this_w, int this_h);
-#endif
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static INLINE int vp10_is_valid_scale(const struct scale_factors *sf) {
   return sf->x_scale_fp != REF_INVALID_SCALE &&
diff --git a/vp10/common/scan.c b/vp10/common/scan.c
index a04f649..8cfeb97 100644
--- a/vp10/common/scan.c
+++ b/vp10/common/scan.c
@@ -19,6 +19,22 @@
   7, 14, 11, 15,
 };
 
+#if CONFIG_EXT_TX
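+// Plain row-major (mrow) and column-major (mcol) scan orders, used when
+// the EXT_TX experiment is enabled.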
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_4x4[16]) = {
+  0, 4, 8, 12,
+  1, 5, 9, 13,
+  2, 6, 10, 14,
+  3, 7, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_4x4[16]) = {
+  0, 1, 2, 3,
+  4, 5, 6, 7,
+  8, 9, 10, 11,
+  12, 13, 14, 15,
+};
+#endif  // CONFIG_EXT_TX
+
 DECLARE_ALIGNED(16, static const int16_t, col_scan_4x4[16]) = {
   0,  4,  8,  1,
   12,  5,  9,  2,
@@ -34,7 +50,7 @@
 };
 
 DECLARE_ALIGNED(16, static const int16_t, default_scan_8x8[64]) = {
-  0,  8,  1, 16,  9,  2, 17, 24,
+  0,   8,  1, 16,  9,  2, 17, 24,
   10,  3, 18, 25, 32, 11,  4, 26,
   33, 19, 40, 12, 34, 27,  5, 41,
   20, 48, 13, 35, 42, 28, 21,  6,
@@ -44,6 +60,30 @@
   46, 39, 61, 54, 47, 62, 55, 63,
 };
 
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_8x8[64]) = {
+  0,   8,  16,  24,  32,  40,  48,  56,
+  1,   9,  17,  25,  33,  41,  49,  57,
+  2,  10,  18,  26,  34,  42,  50,  58,
+  3,  11,  19,  27,  35,  43,  51,  59,
+  4,  12,  20,  28,  36,  44,  52,  60,
+  5,  13,  21,  29,  37,  45,  53,  61,
+  6,  14,  22,  30,  38,  46,  54,  62,
+  7,  15,  23,  31,  39,  47,  55,  63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_8x8[64]) = {
+  0,    1,   2,   3,   4,   5,   6,   7,
+  8,    9,  10,  11,  12,  13,  14,  15,
+  16,  17,  18,  19,  20,  21,  22,  23,
+  24,  25,  26,  27,  28,  29,  30,  31,
+  32,  33,  34,  35,  36,  37,  38,  39,
+  40,  41,  42,  43,  44,  45,  46,  47,
+  48,  49,  50,  51,  52,  53,  54,  55,
+  56,  57,  58,  59,  60,  61,  62,  63,
+};
+#endif  // CONFIG_EXT_TX
+
 DECLARE_ALIGNED(16, static const int16_t, col_scan_8x8[64]) = {
   0,  8, 16,  1, 24,  9, 32, 17,
   2, 40, 25, 10, 33, 18, 48,  3,
@@ -87,6 +127,55 @@
   255,
 };
 
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_16x16[256]) = {
+  0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+  1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+  2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+  3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+  4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+  5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+  6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+  7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+  8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+  9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+  10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+  11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+  12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+  13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+  14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+  15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_16x16[256]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124,
+  125, 126, 127,
+  128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
+  141, 142, 143,
+  144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
+  157, 158, 159,
+  160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
+  173, 174, 175,
+  176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188,
+  189, 190, 191,
+  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204,
+  205, 206, 207,
+  208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220,
+  221, 222, 223,
+  224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236,
+  237, 238, 239,
+  240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252,
+  253, 254, 255,
+};
+#endif  // CONFIG_EXT_TX
+
 DECLARE_ALIGNED(16, static const int16_t, col_scan_16x16[256]) = {
   0, 16, 32, 48, 1, 64, 17, 80, 33, 96, 49, 2, 65, 112, 18, 81,
   34, 128, 50, 97, 3, 66, 144, 19, 113, 35, 82, 160, 98, 51, 129, 4,
@@ -130,6 +219,203 @@
   255,
 };
 
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, mcol_scan_32x32[1024]) = {
+  0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480,
+  512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832, 864, 896, 928, 960,
+  992,
+  1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417, 449, 481,
+  513, 545, 577, 609, 641, 673, 705, 737, 769, 801, 833, 865, 897, 929, 961,
+  993,
+  2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418, 450, 482,
+  514, 546, 578, 610, 642, 674, 706, 738, 770, 802, 834, 866, 898, 930, 962,
+  994,
+  3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419, 451, 483,
+  515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835, 867, 899, 931, 963,
+  995,
+  4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420, 452, 484,
+  516, 548, 580, 612, 644, 676, 708, 740, 772, 804, 836, 868, 900, 932, 964,
+  996,
+  5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421, 453, 485,
+  517, 549, 581, 613, 645, 677, 709, 741, 773, 805, 837, 869, 901, 933, 965,
+  997,
+  6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422, 454, 486,
+  518, 550, 582, 614, 646, 678, 710, 742, 774, 806, 838, 870, 902, 934, 966,
+  998,
+  7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423, 455, 487,
+  519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839, 871, 903, 935, 967,
+  999,
+  8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424, 456, 488,
+  520, 552, 584, 616, 648, 680, 712, 744, 776, 808, 840, 872, 904, 936, 968,
+  1000,
+  9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425, 457, 489,
+  521, 553, 585, 617, 649, 681, 713, 745, 777, 809, 841, 873, 905, 937, 969,
+  1001,
+  10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394, 426, 458,
+  490, 522, 554, 586, 618, 650, 682, 714, 746, 778, 810, 842, 874, 906,
+  938, 970, 1002,
+  11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395, 427, 459,
+  491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811, 843, 875, 907,
+  939, 971, 1003,
+  12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396, 428, 460,
+  492, 524, 556, 588, 620, 652, 684, 716, 748, 780, 812, 844, 876, 908,
+  940, 972, 1004,
+  13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397, 429, 461,
+  493, 525, 557, 589, 621, 653, 685, 717, 749, 781, 813, 845, 877, 909,
+  941, 973, 1005,
+  14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398, 430, 462,
+  494, 526, 558, 590, 622, 654, 686, 718, 750, 782, 814, 846, 878, 910,
+  942, 974, 1006,
+  15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399, 431, 463,
+  495, 527, 559, 591, 623, 655, 687, 719, 751, 783, 815, 847, 879, 911,
+  943, 975, 1007,
+  16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400, 432, 464,
+  496, 528, 560, 592, 624, 656, 688, 720, 752, 784, 816, 848, 880, 912,
+  944, 976, 1008,
+  17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401, 433, 465,
+  497, 529, 561, 593, 625, 657, 689, 721, 753, 785, 817, 849, 881, 913,
+  945, 977, 1009,
+  18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402, 434, 466,
+  498, 530, 562, 594, 626, 658, 690, 722, 754, 786, 818, 850, 882, 914,
+  946, 978, 1010,
+  19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403, 435, 467,
+  499, 531, 563, 595, 627, 659, 691, 723, 755, 787, 819, 851, 883, 915,
+  947, 979, 1011,
+  20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404, 436, 468,
+  500, 532, 564, 596, 628, 660, 692, 724, 756, 788, 820, 852, 884, 916,
+  948, 980, 1012,
+  21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405, 437, 469,
+  501, 533, 565, 597, 629, 661, 693, 725, 757, 789, 821, 853, 885, 917,
+  949, 981, 1013,
+  22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406, 438, 470,
+  502, 534, 566, 598, 630, 662, 694, 726, 758, 790, 822, 854, 886, 918,
+  950, 982, 1014,
+  23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407, 439, 471,
+  503, 535, 567, 599, 631, 663, 695, 727, 759, 791, 823, 855, 887, 919,
+  951, 983, 1015,
+  24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408, 440, 472,
+  504, 536, 568, 600, 632, 664, 696, 728, 760, 792, 824, 856, 888, 920,
+  952, 984, 1016,
+  25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409, 441, 473,
+  505, 537, 569, 601, 633, 665, 697, 729, 761, 793, 825, 857, 889, 921,
+  953, 985, 1017,
+  26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410, 442, 474,
+  506, 538, 570, 602, 634, 666, 698, 730, 762, 794, 826, 858, 890, 922,
+  954, 986, 1018,
+  27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411, 443, 475,
+  507, 539, 571, 603, 635, 667, 699, 731, 763, 795, 827, 859, 891, 923,
+  955, 987, 1019,
+  28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412, 444, 476,
+  508, 540, 572, 604, 636, 668, 700, 732, 764, 796, 828, 860, 892, 924,
+  956, 988, 1020,
+  29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413, 445, 477,
+  509, 541, 573, 605, 637, 669, 701, 733, 765, 797, 829, 861, 893, 925,
+  957, 989, 1021,
+  30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414, 446, 478,
+  510, 542, 574, 606, 638, 670, 702, 734, 766, 798, 830, 862, 894, 926,
+  958, 990, 1022,
+  31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415, 447, 479,
+  511, 543, 575, 607, 639, 671, 703, 735, 767, 799, 831, 863, 895, 927,
+  959, 991, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, mrow_scan_32x32[1024]) = {
+    0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+    19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+    64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+    80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+    96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+    110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
+    123, 124, 125, 126, 127,
+    128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140,
+    141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153,
+    154, 155, 156, 157, 158, 159,
+    160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172,
+    173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185,
+    186, 187, 188, 189, 190, 191,
+    192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204,
+    205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217,
+    218, 219, 220, 221, 222, 223,
+    224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236,
+    237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249,
+    250, 251, 252, 253, 254, 255,
+    256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268,
+    269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281,
+    282, 283, 284, 285, 286, 287,
+    288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300,
+    301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313,
+    314, 315, 316, 317, 318, 319,
+    320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332,
+    333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345,
+    346, 347, 348, 349, 350, 351,
+    352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364,
+    365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377,
+    378, 379, 380, 381, 382, 383,
+    384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396,
+    397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409,
+    410, 411, 412, 413, 414, 415,
+    416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428,
+    429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441,
+    442, 443, 444, 445, 446, 447,
+    448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458, 459, 460,
+    461, 462, 463, 464, 465, 466, 467, 468, 469, 470, 471, 472, 473,
+    474, 475, 476, 477, 478, 479,
+    480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490, 491, 492,
+    493, 494, 495, 496, 497, 498, 499, 500, 501, 502, 503, 504, 505,
+    506, 507, 508, 509, 510, 511,
+    512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522, 523, 524,
+    525, 526, 527, 528, 529, 530, 531, 532, 533, 534, 535, 536, 537,
+    538, 539, 540, 541, 542, 543,
+    544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554, 555, 556,
+    557, 558, 559, 560, 561, 562, 563, 564, 565, 566, 567, 568, 569,
+    570, 571, 572, 573, 574, 575,
+    576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586, 587, 588,
+    589, 590, 591, 592, 593, 594, 595, 596, 597, 598, 599, 600, 601,
+    602, 603, 604, 605, 606, 607,
+    608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620,
+    621, 622, 623, 624, 625, 626, 627, 628, 629, 630, 631, 632, 633,
+    634, 635, 636, 637, 638, 639,
+    640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652,
+    653, 654, 655, 656, 657, 658, 659, 660, 661, 662, 663, 664, 665,
+    666, 667, 668, 669, 670, 671,
+    672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684,
+    685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697,
+    698, 699, 700, 701, 702, 703,
+    704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714, 715, 716,
+    717, 718, 719, 720, 721, 722, 723, 724, 725, 726, 727, 728, 729,
+    730, 731, 732, 733, 734, 735,
+    736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746, 747, 748,
+    749, 750, 751, 752, 753, 754, 755, 756, 757, 758, 759, 760, 761,
+    762, 763, 764, 765, 766, 767,
+    768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778, 779, 780,
+    781, 782, 783, 784, 785, 786, 787, 788, 789, 790, 791, 792, 793,
+    794, 795, 796, 797, 798, 799,
+    800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810, 811, 812,
+    813, 814, 815, 816, 817, 818, 819, 820, 821, 822, 823, 824, 825,
+    826, 827, 828, 829, 830, 831,
+    832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842, 843, 844,
+    845, 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857,
+    858, 859, 860, 861, 862, 863,
+    864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874, 875, 876,
+    877, 878, 879, 880, 881, 882, 883, 884, 885, 886, 887, 888, 889,
+    890, 891, 892, 893, 894, 895,
+    896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906, 907, 908,
+    909, 910, 911, 912, 913, 914, 915, 916, 917, 918, 919, 920, 921,
+    922, 923, 924, 925, 926, 927,
+    928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938, 939, 940,
+    941, 942, 943, 944, 945, 946, 947, 948, 949, 950, 951, 952, 953,
+    954, 955, 956, 957, 958, 959,
+    960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972,
+    973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985,
+    986, 987, 988, 989, 990, 991,
+    992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001, 1002, 1003,
+    1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014,
+    1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023,
+};
+#endif  // CONFIG_EXT_TX
+
 DECLARE_ALIGNED(16, static const int16_t, default_scan_32x32[1024]) = {
   0, 32, 1, 64, 33, 2, 96, 65, 34, 128, 3, 97, 66, 160,
   129, 35, 98, 4, 67, 130, 161, 192, 36, 99, 224, 5, 162, 193,
@@ -229,311 +515,1741 @@
   990, 959, 1022, 991, 1023,
 };
 
+#if CONFIG_EXT_TX
+// Scan over two rectangular vertical partitions one after the other
+DECLARE_ALIGNED(16, static const int16_t, v2_scan_32x32[1024]) = {
+  0,  1, 32, 33,  2, 64, 34, 65, 66,  3, 96, 35,
+  97,  67,  98,   4, 128,  36, 129,  99,  68, 130,   5, 100,
+  131, 160,  37, 161,  69, 162, 132, 101, 163,   6, 192,  38,
+  193,  70, 194, 133, 164, 102, 195,   7, 224,  39, 165, 225,
+  134, 196,  71, 226, 103, 227, 166, 197,   8, 256,  40, 135,
+  228, 257,  72, 258, 198, 104, 259, 167, 229, 136, 260,   9,
+  288,  41, 289,  73, 199, 230, 290, 168, 261, 105, 291, 137,
+  292, 231,  10, 200, 262, 320,  42, 321,  74, 322, 169, 293,
+  106, 323, 232, 263, 138, 324, 201, 294,  11, 352,  43, 353,
+  75, 170, 325, 354, 264, 107, 233, 295, 355, 202, 326, 139,
+  356,  12, 384,  44, 265, 296, 385, 171, 357,  76, 386, 234,
+  327, 108, 387, 203, 358, 140, 388, 297, 266, 328,  13, 172,
+  389, 416,  45, 235, 359, 417,  77, 418, 109, 419, 204, 390,
+  298, 329, 141, 267, 360, 420, 236, 391, 173, 421,  14, 448,
+  46, 449,  78, 330, 450, 299, 361, 110, 205, 422, 451, 268,
+  392, 142, 452, 237, 423, 174, 331, 362, 453,  15, 300, 393,
+  480,  47, 481,  79, 482, 206, 454, 269, 424, 111, 483, 143,
+  484, 363, 332, 394, 238, 455, 175, 301, 425, 485, 512, 513,
+  270, 456, 514, 207, 486, 364, 395, 515, 333, 426, 516, 239,
+  487, 302, 457, 517, 396, 271, 488, 544, 365, 427, 545, 518,
+  546, 334, 458, 547, 519, 548, 303, 489, 397, 428, 549, 366,
+  459, 520, 576, 335, 490, 550, 577, 578, 579, 521, 429, 551,
+  398, 460, 580, 367, 491, 581, 552, 522, 582, 608, 609, 430,
+  461, 610, 399, 492, 553, 611, 583, 523, 612, 613, 584, 554,
+  462, 431, 493, 614, 524, 640, 641, 642, 585, 643, 555, 615,
+  644, 463, 494, 586, 525, 616, 645, 556, 646, 672, 617, 673,
+  587, 674, 647, 495, 675, 526, 676, 557, 618, 648, 677, 588,
+  678, 527, 649, 619, 704, 558, 705, 706, 679, 589, 707, 650,
+  708, 620, 680, 709, 559, 590, 710, 651, 681, 736, 621, 737,
+  711, 738, 739, 682, 652, 740, 712, 591, 741, 622, 683, 713,
+  742, 653, 768, 769, 743, 770, 714, 684, 771, 623, 772, 744,
+  654, 773, 715, 685, 745, 774, 655, 775, 800, 801, 716, 746,
+  802, 803, 686, 776, 804, 747, 805, 717, 777, 806, 687, 748,
+  807, 778, 832, 833, 718, 834, 835, 808, 836, 779, 749, 837,
+  809, 719, 838, 780, 750, 810, 839, 864, 865, 866, 867, 840,
+  781, 868, 811, 751, 869, 841, 870, 812, 782, 842, 871, 896,
+  897, 898, 872, 899, 813, 843, 900, 783, 901, 873, 844, 902,
+  814, 874, 903, 928, 929, 845, 930, 904, 815, 875, 931, 932,
+  905, 933, 846, 876, 934, 906, 935, 877, 960, 847, 961, 962,
+  907, 936, 963, 964, 937, 878, 965, 908, 966, 938, 967, 909,
+  879, 992, 939, 993, 968, 994, 995, 996, 910, 969, 940, 997,
+  998,  970,  911,  941,  999,  971, 1000,  942, 1001,  972, 1002,  943,
+  973, 1003,  974, 1004,  975, 1005, 1006, 1007,   16,   48,   80,  112,
+  144, 176,  17,  49, 208,  81, 113, 145, 240, 177, 272,  18,
+  50, 209,  82, 114, 304, 241, 146, 178, 273, 336, 210,  19,
+  51,  83, 115, 305, 242, 147, 368, 179, 274, 337, 211,  20,
+  400,  52,  84, 306, 116, 243, 369, 148, 338, 180, 275, 432,
+  401, 212,  21,  53, 307,  85, 370, 244, 117, 464, 149, 433,
+  339, 276, 181, 402, 213, 308, 496, 371,  22,  54, 465,  86,
+  245, 118, 434, 150, 340, 277, 403, 182, 528, 497, 214, 466,
+  372, 309,  23,  55, 435,  87, 246, 119, 341, 404, 151, 529,
+  560, 278, 498, 183, 467, 373, 215, 310, 436,  24,  56, 247,
+  561,  88, 530, 592, 342, 120, 405, 499, 152, 279, 468, 184,
+  374, 311, 437, 216, 562, 593, 531, 624,  25, 248, 500,  57,
+  406,  89, 343, 121, 469, 280, 153, 594, 185, 375, 563, 625,
+  438, 532, 656, 312, 217, 501, 407, 249,  26, 344,  58,  90,
+  470, 122, 595, 626, 281, 564, 657, 154, 376, 533, 688, 439,
+  186, 313, 502, 218, 408, 627, 596, 658, 250, 345, 471,  27,
+  59, 565, 689,  91, 123, 282, 534, 720, 155, 440, 377, 187,
+  503, 314, 628, 659, 219, 597, 690, 409, 472, 566, 721, 346,
+  251,  28,  60, 535, 752,  92, 124, 283, 441, 378, 156, 660,
+  504, 629, 691, 598, 722, 188, 315, 567, 753, 220, 410, 473,
+  347, 536, 784, 252,  29, 661, 692,  61,  93, 442, 630, 723,
+  284, 125, 379, 505, 599, 754, 157, 316, 568, 785, 189, 474,
+  411, 221, 537, 816, 693, 348, 662, 724, 253, 631, 755, 443,
+  30, 600, 786,  62, 506,  94, 285, 380, 126, 569, 817, 158,
+  317, 190, 475, 694, 725, 412, 663, 756, 538, 848, 222, 632,
+  787, 349, 254, 601, 818, 444, 507,  31,  63, 381, 286,  95,
+  570, 849, 726, 127, 695, 757, 664, 788, 159, 476, 318, 413,
+  539, 880, 191, 633, 819, 223, 350, 602, 850, 508, 255, 445,
+  727, 758, 696, 789, 571, 881, 382, 287, 665, 820, 477, 634,
+  851, 540, 912, 319, 414, 603, 882, 759, 728, 790, 351, 509,
+  697, 821, 446, 572, 913, 666, 852, 383, 635, 883, 478, 541,
+  944, 415, 760, 791, 604, 914, 729, 822, 698, 853, 510, 667,
+  884, 447, 573, 945, 636, 915, 792, 761, 823, 542, 976, 479,
+  730, 854, 605, 946, 699, 885, 668, 916, 511, 574, 977, 793,
+  824,  637,  947,  762,  855,  731,  886,  543, 1008,  606,  978,  700,
+  917,  669,  948,  575,  825, 1009,  794,  856,  763,  887,  638,  979,
+  732,  918,  701,  949,  607, 1010,  670,  980,  826,  857,  795,  888,
+  764,  919,  639, 1011,  733,  950,  702,  981,  858,  827,  889,  796,
+  920,  671, 1012,  765,  951,  734,  982,  703, 1013,  859,  890,  828,
+  921,  797,  952,  766,  983,  735, 1014,  891,  860,  922,  829,  953,
+  798,  984,  767, 1015,  892,  923,  861,  954,  830,  985,  799, 1016,
+  924,  893,  955,  862,  986,  831, 1017,  925,  956,  894,  987,  863,
+  1018,  957,  926,  988,  895, 1019,  958,  989,  927, 1020,  990,  959,
+  1021,  991, 1022, 1023,
+};
+
+// Scan over two rectangular horizontal partitions one after the other
+DECLARE_ALIGNED(16, static const int16_t, h2_scan_32x32[1024]) = {
+  0,  1, 32, 33,  2, 64, 34, 65, 66,  3, 96, 35,
+  97,  67,  98,   4, 128,  36, 129,  99,  68, 130,   5, 100,
+  131, 160,  37, 161,  69, 162, 132, 101, 163,   6, 192,  38,
+  193,  70, 194, 133, 164, 102, 195,   7, 224,  39, 165, 225,
+  134, 196,  71, 226, 103, 227, 166, 197,   8, 256,  40, 135,
+  228, 257,  72, 258, 198, 104, 259, 167, 229, 136, 260,   9,
+  288,  41, 289,  73, 199, 230, 290, 168, 261, 105, 291, 137,
+  292, 231,  10, 200, 262, 320,  42, 321,  74, 322, 169, 293,
+  106, 323, 232, 263, 138, 324, 201, 294,  11, 352,  43, 353,
+  75, 170, 325, 354, 264, 107, 233, 295, 355, 202, 326, 139,
+  356,  12, 384,  44, 265, 296, 385, 171, 357,  76, 386, 234,
+  327, 108, 387, 203, 358, 140, 388, 297, 266, 328,  13, 172,
+  389, 416,  45, 235, 359, 417,  77, 418, 109, 419, 204, 390,
+  298, 329, 141, 267, 360, 420, 236, 391, 173, 421,  14, 448,
+  46, 449,  78, 330, 450, 299, 361, 110, 205, 422, 451, 268,
+  392, 142, 452, 237, 423, 174, 331, 362, 453,  15, 300, 393,
+  480,  47, 481,  79, 482, 206, 454, 269, 424, 111, 483, 143,
+  484, 363, 332, 394, 238, 455, 175, 301, 425, 485,  16,  48,
+  80, 270, 456, 207, 486, 112, 364, 395, 333, 426, 144, 239,
+  487, 302, 457, 176, 396,  17, 271, 488,  49, 365, 427, 208,
+  81, 334, 458, 113, 145, 240, 303, 489, 397, 428, 177, 366,
+  459, 272,  18,  50, 209, 335, 490,  82, 114, 304, 241, 429,
+  146, 398, 460, 367, 491, 178, 273, 336, 210,  19,  51,  83,
+  430, 461, 399, 492, 115, 305, 242, 147, 368, 179, 274, 337,
+  462, 431, 493, 211,  20, 400,  52,  84, 306, 116, 243, 369,
+  148, 463, 494, 338, 180, 275, 432, 401, 212,  21,  53, 307,
+  85, 370, 244, 117, 495, 464, 149, 433, 339, 276, 181, 402,
+  213, 308, 496, 371,  22,  54, 465,  86, 245, 118, 434, 150,
+  340, 277, 403, 182, 497, 214, 466, 372, 309,  23,  55, 435,
+  87, 246, 119, 341, 404, 151, 278, 498, 183, 467, 373, 215,
+  310, 436,  24,  56, 247,  88, 342, 120, 405, 499, 152, 279,
+  468, 184, 374, 311, 437, 216,  25, 248, 500,  57, 406,  89,
+  343, 121, 469, 280, 153, 185, 375, 438, 312, 217, 501, 407,
+  249,  26, 344,  58,  90, 470, 122, 281, 154, 376, 439, 186,
+  313, 502, 218, 408, 250, 345, 471,  27,  59,  91, 123, 282,
+  155, 440, 377, 187, 503, 314, 219, 409, 472, 346, 251,  28,
+  60,  92, 124, 283, 441, 378, 156, 504, 188, 315, 220, 410,
+  473, 347, 252,  29,  61,  93, 442, 284, 125, 379, 505, 157,
+  316, 189, 474, 411, 221, 348, 253, 443,  30,  62, 506,  94,
+  285, 380, 126, 158, 317, 190, 475, 412, 222, 349, 254, 444,
+  507,  31,  63, 381, 286,  95, 127, 159, 476, 318, 413, 191,
+  223, 350, 508, 255, 445, 382, 287, 477, 319, 414, 351, 509,
+  446, 383, 478, 415, 510, 447, 479, 511, 512, 513, 514, 515,
+  516, 517, 544, 545, 518, 546, 547, 519, 548, 549, 520, 576,
+  550, 577, 578, 579, 521, 551, 580, 581, 552, 522, 582, 608,
+  609, 610, 553, 611, 583, 523, 612, 613, 584, 554, 614, 524,
+  640, 641, 642, 585, 643, 555, 615, 644, 586, 525, 616, 645,
+  556, 646, 672, 617, 673, 587, 674, 647, 675, 526, 676, 557,
+  618, 648, 677, 588, 678, 527, 649, 619, 704, 558, 705, 706,
+  679, 589, 707, 650, 708, 620, 680, 709, 528, 559, 590, 710,
+  651, 681, 736, 621, 737, 711, 738, 739, 682, 652, 529, 560,
+  740, 712, 591, 741, 622, 683, 713, 742, 653, 768, 769, 561,
+  743, 530, 592, 770, 714, 684, 771, 623, 772, 744, 654, 773,
+  715, 685, 745, 774, 562, 593, 531, 624, 655, 775, 800, 801,
+  716, 746, 802, 803, 686, 776, 804, 594, 563, 625, 747, 805,
+  717, 532, 656, 777, 806, 687, 748, 807, 778, 832, 833, 718,
+  834, 595, 626, 835, 564, 657, 808, 836, 533, 688, 779, 749,
+  837, 809, 719, 838, 780, 627, 596, 658, 750, 810, 839, 864,
+  565, 689, 865, 866, 867, 534, 720, 840, 781, 868, 811, 751,
+  869, 841, 628, 659, 597, 690, 870, 812, 782, 566, 721, 842,
+  871, 896, 535, 752, 897, 898, 872, 899, 813, 843, 660, 900,
+  783, 629, 691, 598, 722, 901, 873, 567, 753, 844, 902, 814,
+  874, 536, 784, 903, 661, 692, 928, 929, 630, 723, 845, 930,
+  904, 815, 875, 931, 599, 754, 932, 568, 785, 905, 933, 846,
+  876, 934, 537, 816, 693, 662, 724, 906, 631, 755, 935, 877,
+  600, 786, 960, 847, 961, 962, 907, 936, 963, 569, 817, 964,
+  937, 694, 725, 878, 965, 908, 663, 756, 538, 848, 966, 632,
+  787, 938, 601, 818, 967, 909, 879, 992, 939, 993, 968, 570,
+  849, 994, 726, 695, 757, 995, 664, 788, 996, 910, 969, 539,
+  880, 940, 633, 819, 997, 998, 602, 850, 970, 911, 941, 999,
+  727,  758,  696,  789,  571,  881,  971,  665,  820, 1000,  634,  851,
+  942,  540,  912, 1001,  972,  603,  882,  759,  728,  790, 1002,  697,
+  821,  943,  973,  572,  913,  666,  852, 1003,  635,  883,  974,  541,
+  944,  760,  791, 1004,  604,  914,  729,  822,  698,  853,  975,  667,
+  884,  573,  945, 1005,  636,  915,  792,  761,  823,  542,  976, 1006,
+  730,  854,  605,  946,  699,  885,  668,  916, 1007,  574,  977,  793,
+  824,  637,  947,  762,  855,  731,  886,  543, 1008,  606,  978,  700,
+  917,  669,  948,  575,  825, 1009,  794,  856,  763,  887,  638,  979,
+  732,  918,  701,  949,  607, 1010,  670,  980,  826,  857,  795,  888,
+  764,  919,  639, 1011,  733,  950,  702,  981,  858,  827,  889,  796,
+  920,  671, 1012,  765,  951,  734,  982,  703, 1013,  859,  890,  828,
+  921,  797,  952,  766,  983,  735, 1014,  891,  860,  922,  829,  953,
+  798,  984,  767, 1015,  892,  923,  861,  954,  830,  985,  799, 1016,
+  924,  893,  955,  862,  986,  831, 1017,  925,  956,  894,  987,  863,
+  1018,  957,  926,  988,  895, 1019,  958,  989,  927, 1020,  990,  959,
+  1021,  991, 1022, 1023,
+};
+
+// Scan where the top-left quarter is scanned first
+DECLARE_ALIGNED(16, static const int16_t, qtr_scan_32x32[1024]) = {
+  0,  1, 32, 33,  2, 64, 34, 65, 66,  3, 96, 35,
+  97,  67,  98,   4, 128,  36, 129,  99,  68, 130,   5, 100,
+  131, 160,  37, 161,  69, 162, 132, 101, 163,   6, 192,  38,
+  193,  70, 194, 133, 164, 102, 195,   7, 224,  39, 165, 225,
+  134, 196,  71, 226, 103, 227, 166, 197,   8, 256,  40, 135,
+  228, 257,  72, 258, 198, 104, 259, 167, 229, 136, 260,   9,
+  288,  41, 289,  73, 199, 230, 290, 168, 261, 105, 291, 137,
+  292, 231,  10, 200, 262, 320,  42, 321,  74, 322, 169, 293,
+  106, 323, 232, 263, 138, 324, 201, 294,  11, 352,  43, 353,
+  75, 170, 325, 354, 264, 107, 233, 295, 355, 202, 326, 139,
+  356,  12, 384,  44, 265, 296, 385, 171, 357,  76, 386, 234,
+  327, 108, 387, 203, 358, 140, 388, 297, 266, 328,  13, 172,
+  389, 416,  45, 235, 359, 417,  77, 418, 109, 419, 204, 390,
+  298, 329, 141, 267, 360, 420, 236, 391, 173, 421,  14, 448,
+  46, 449,  78, 330, 450, 299, 361, 110, 205, 422, 451, 268,
+  392, 142, 452, 237, 423, 174, 331, 362, 453,  15, 300, 393,
+  480,  47, 481,  79, 482, 206, 454, 269, 424, 111, 483, 143,
+  484, 363, 332, 394, 238, 455, 175, 301, 425, 485, 270, 456,
+  207, 486, 364, 395, 333, 426, 239, 487, 302, 457, 396, 271,
+  488, 365, 427, 334, 458, 303, 489, 397, 428, 366, 459, 335,
+  490, 429, 398, 460, 367, 491, 430, 461, 399, 492, 462, 431,
+  493, 463, 494, 495,  16, 512,  48, 513,  80, 514, 112, 515,
+  144, 516, 176, 517,  17, 544,  49, 545, 208, 518,  81, 546,
+  113, 547, 145, 240, 519, 548, 177, 549, 272, 520,  18, 576,
+  50, 209, 550, 577,  82, 578, 114, 579, 304, 521, 241, 551,
+  146, 580, 178, 581, 273, 552, 336, 522, 210, 582,  19, 608,
+  51, 609,  83, 610, 115, 305, 553, 611, 242, 583, 147, 368,
+  523, 612, 179, 613, 274, 584, 337, 554, 211, 614,  20, 400,
+  524, 640,  52, 641,  84, 642, 306, 585, 116, 643, 243, 369,
+  555, 615, 148, 644, 338, 586, 180, 275, 432, 525, 616, 645,
+  401, 556, 212, 646,  21, 672,  53, 307, 617, 673,  85, 370,
+  587, 674, 244, 647, 117, 675, 464, 526, 149, 676, 433, 557,
+  339, 618, 276, 648, 181, 677, 402, 588, 213, 678, 308, 496,
+  527, 649, 371, 619,  22, 704,  54, 465, 558, 705,  86, 706,
+  245, 679, 118, 434, 589, 707, 150, 340, 650, 708, 277, 403,
+  620, 680, 182, 709, 528, 497, 559, 214, 466, 590, 710, 372,
+  651, 309, 681,  23, 736,  55, 435, 621, 737,  87, 246, 711,
+  738, 119, 739, 341, 682, 404, 652, 151, 529, 560, 740, 278,
+  712, 498, 591, 183, 741, 467, 622, 373, 683, 215, 310, 713,
+  742, 436, 653,  24, 768,  56, 769, 247, 561, 743,  88, 530,
+  592, 770, 342, 714, 120, 405, 684, 771, 499, 623, 152, 772,
+  279, 744, 468, 654, 184, 773, 374, 715, 311, 437, 685, 745,
+  216, 774, 562, 593, 531, 624,  25, 248, 500, 655, 775, 800,
+  57, 801, 406, 716,  89, 343, 746, 802, 121, 803, 469, 686,
+  280, 776, 153, 804, 594, 185, 375, 563, 625, 747, 805, 438,
+  717, 532, 656, 312, 777, 217, 806, 501, 687, 407, 748, 249,
+  807,  26, 344, 778, 832,  58, 833,  90, 470, 718, 834, 122,
+  595, 626, 835, 281, 564, 657, 808, 154, 836, 376, 533, 688,
+  779, 439, 749, 186, 837, 313, 809, 502, 719, 218, 838, 408,
+  780, 627, 596, 658, 250, 345, 471, 750, 810, 839,  27, 864,
+  59, 565, 689, 865,  91, 866, 123, 867, 282, 534, 720, 840,
+  155, 440, 781, 868, 377, 811, 187, 503, 751, 869, 314, 841,
+  628, 659, 219, 597, 690, 870, 409, 812, 472, 782, 566, 721,
+  346, 842, 251, 871,  28, 896,  60, 535, 752, 897,  92, 898,
+  124, 283, 872, 899, 441, 813, 378, 843, 156, 660, 900, 504,
+  783, 629, 691, 598, 722, 188, 901, 315, 873, 567, 753, 220,
+  410, 844, 902, 473, 814, 347, 874, 536, 784, 252, 903,  29,
+  661, 692, 928,  61, 929,  93, 442, 630, 723, 845, 930, 284,
+  904, 125, 379, 505, 815, 875, 931, 599, 754, 157, 932, 316,
+  568, 785, 905, 189, 933, 474, 846, 411, 876, 221, 934, 537,
+  816, 693, 348, 662, 724, 906, 253, 631, 755, 935, 443, 877,
+  30, 600, 786, 960,  62, 506, 847, 961,  94, 962, 285, 380,
+  907, 936, 126, 963, 569, 817, 158, 964, 317, 937, 190, 475,
+  694, 725, 878, 965, 412, 908, 663, 756, 538, 848, 222, 966,
+  632, 787, 349, 938, 254, 601, 818, 967, 444, 909, 507, 879,
+  31, 992,  63, 381, 939, 993, 286, 968,  95, 570, 849, 994,
+  726, 127, 695, 757, 995, 664, 788, 159, 996, 476, 910, 318,
+  969, 413, 539, 880, 940, 191, 633, 819, 997, 223, 998, 350,
+  602, 850, 970, 508, 911, 255, 445, 941, 999, 727, 758, 696,
+  789,  571,  881,  382,  971,  287,  665,  820, 1000,  477,  634,  851,
+  942,  540,  912,  319, 1001,  414,  972,  603,  882,  759,  728,  790,
+  351, 1002,  509,  697,  821,  943,  446,  973,  572,  913,  666,  852,
+  383, 1003,  635,  883,  478,  974,  541,  944,  415,  760,  791, 1004,
+  604, 914, 729, 822, 698, 853, 510, 975, 667, 884, 447, 573,
+  945, 1005,  636,  915,  792,  761,  823,  542,  976,  479, 1006,  730,
+  854,  605,  946,  699,  885,  668,  916,  511, 1007,  574,  977,  793,
+  824,  637,  947,  762,  855,  731,  886,  543, 1008,  606,  978,  700,
+  917,  669,  948,  575,  825, 1009,  794,  856,  763,  887,  638,  979,
+  732,  918,  701,  949,  607, 1010,  670,  980,  826,  857,  795,  888,
+  764,  919,  639, 1011,  733,  950,  702,  981,  858,  827,  889,  796,
+  920,  671, 1012,  765,  951,  734,  982,  703, 1013,  859,  890,  828,
+  921,  797,  952,  766,  983,  735, 1014,  891,  860,  922,  829,  953,
+  798,  984,  767, 1015,  892,  923,  861,  954,  830,  985,  799, 1016,
+  924,  893,  955,  862,  986,  831, 1017,  925,  956,  894,  987,  863,
+  1018,  957,  926,  988,  895, 1019,  958,  989,  927, 1020,  990,  959,
+  1021,  991, 1022, 1023,
+};
+#endif  // CONFIG_EXT_TX
+
 // Neighborhood 2-tuples for various scans and blocksizes,
 // in {top, left} order for each position in corresponding scan order.
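+// Each scan position contributes one {top, left} pair (MAX_NEIGHBORS == 2
+// entries), and every table ends with a {0, 0} sentinel pair.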
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 0, 0, 1, 4, 4, 4, 1, 1, 8, 8, 5, 8, 2, 2, 2, 5, 9, 12, 6, 9,
-  3, 6, 10, 13, 7, 10, 11, 14, 0, 0,
+  0,   0,   0,   0,   4,   0,   1,   4,   4,   5,   5,   1,
+  8,   8,   5,   8,   2,   2,   2,   5,   9,  12,   6,   9,
+  3,   6,  10,  13,   7,  10,  11,  14,   0,   0,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+                mcol_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 4, 4, 8, 8, 0, 0, 1, 4, 5, 8, 9, 12, 1, 1, 2, 5, 6, 9, 10, 13,
+  2, 2, 3, 6, 7, 10, 11, 14, 0, 0,
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
+                mrow_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 2, 2, 0, 0, 1, 4, 2, 5, 3, 6, 4, 4, 5, 8, 6, 9, 7, 10, 8,
+  8, 9, 12, 10, 13, 11, 14, 0, 0,
+};
+#endif  // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t,
                 col_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 4, 4, 0, 0, 8, 8, 1, 1, 5, 5, 1, 1, 9, 9, 2, 2, 6, 6, 2, 2, 3,
-  3, 10, 10, 7, 7, 11, 11, 0, 0,
+  0,   0,   0,   0,   4,   4,   4,   0,   8,   8,   1,   4,
+  5,   8,   5,   1,   9,  12,   2,   5,   6,   9,   6,   2,
+  3,   6,  10,  13,   7,  10,  11,  14,   0,   0,
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
                 row_scan_4x4_neighbors[17 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 0, 0, 1, 1, 4, 4, 2, 2, 5, 5, 4, 4, 8, 8, 6, 6, 8, 8, 9, 9, 12,
-  12, 10, 10, 13, 13, 14, 14, 0, 0,
+  0,   0,   0,   0,   0,   1,   1,   1,   1,   4,   2,   2,
+  2,   5,   4,   5,   5,   8,   3,   6,   8,   9,   6,   9,
+  9,  12,   7,  10,  10,  13,  11,  14,   0,   0,
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
                 col_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 8, 8, 0, 0, 16, 16, 1, 1, 24, 24, 9, 9, 1, 1, 32, 32, 17, 17, 2,
-  2, 25, 25, 10, 10, 40, 40, 2, 2, 18, 18, 33, 33, 3, 3, 48, 48, 11, 11, 26,
-  26, 3, 3, 41, 41, 19, 19, 34, 34, 4, 4, 27, 27, 12, 12, 49, 49, 42, 42, 20,
-  20, 4, 4, 35, 35, 5, 5, 28, 28, 50, 50, 43, 43, 13, 13, 36, 36, 5, 5, 21, 21,
-  51, 51, 29, 29, 6, 6, 44, 44, 14, 14, 6, 6, 37, 37, 52, 52, 22, 22, 7, 7, 30,
-  30, 45, 45, 15, 15, 38, 38, 23, 23, 53, 53, 31, 31, 46, 46, 39, 39, 54, 54,
-  47, 47, 55, 55, 0, 0,
+  0,   0,   0,   0,   8,   8,   8,   0,  16,  16,   1,   8,
+  24,  24,   9,  16,   9,   1,  32,  32,  17,  24,   2,   9,
+  25,  32,  10,  17,  40,  40,  10,   2,  18,  25,  33,  40,
+  3,  10,  48,  48,  11,  18,  26,  33,  11,   3,  41,  48,
+  19,  26,  34,  41,   4,  11,  27,  34,  12,  19,  49,  56,
+  42,  49,  20,  27,  12,   4,  35,  42,   5,  12,  28,  35,
+  50,  57,  43,  50,  13,  20,  36,  43,  13,   5,  21,  28,
+  51,  58,  29,  36,   6,  13,  44,  51,  14,  21,  14,   6,
+  37,  44,  52,  59,  22,  29,   7,  14,  30,  37,  45,  52,
+  15,  22,  38,  45,  23,  30,  53,  60,  31,  38,  46,  53,
+  39,  46,  54,  61,  47,  54,  55,  62,   0,   0,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+                mcol_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 8, 8, 16, 16, 24, 24, 32, 32, 40, 40, 48, 48, 0, 0, 1, 8, 9,
+  16, 17, 24, 25, 32, 33, 40, 41, 48, 49, 56, 1, 1, 2, 9, 10, 17, 18, 25,
+  26, 33, 34, 41, 42, 49, 50, 57, 2, 2, 3, 10, 11, 18, 19, 26, 27, 34, 35,
+  42, 43, 50, 51, 58, 3, 3, 4, 11, 12, 19, 20, 27, 28, 35, 36, 43, 44, 51,
+  52, 59, 4, 4, 5, 12, 13, 20, 21, 28, 29, 36, 37, 44, 45, 52, 53, 60, 5,
+  5, 6, 13, 14, 21, 22, 29, 30, 37, 38, 45, 46, 53, 54, 61, 6, 6, 7, 14,
+  15, 22, 23, 30, 31, 38, 39, 46, 47, 54, 55, 62, 0, 0,
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
+                mrow_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 0, 0, 1, 8, 2, 9, 3, 10,
+  4, 11, 5, 12, 6, 13, 7, 14, 8, 8, 9, 16, 10, 17, 11, 18, 12, 19, 13, 20,
+  14, 21, 15, 22, 16, 16, 17, 24, 18, 25, 19, 26, 20, 27, 21, 28, 22, 29,
+  23, 30, 24, 24, 25, 32, 26, 33, 27, 34, 28, 35, 29, 36, 30, 37, 31, 38,
+  32, 32, 33, 40, 34, 41, 35, 42, 36, 43, 37, 44, 38, 45, 39, 46, 40, 40,
+  41, 48, 42, 49, 43, 50, 44, 51, 45, 52, 46, 53, 47, 54, 48, 48, 49, 56,
+  50, 57, 51, 58, 52, 59, 53, 60, 54, 61, 55, 62, 0, 0,
+};
+#endif  // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t,
                 row_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 1, 1, 0, 0, 8, 8, 2, 2, 8, 8, 9, 9, 3, 3, 16, 16, 10, 10, 16, 16,
-  4, 4, 17, 17, 24, 24, 11, 11, 18, 18, 25, 25, 24, 24, 5, 5, 12, 12, 19, 19,
-  32, 32, 26, 26, 6, 6, 33, 33, 32, 32, 20, 20, 27, 27, 40, 40, 13, 13, 34, 34,
-  40, 40, 41, 41, 28, 28, 35, 35, 48, 48, 21, 21, 42, 42, 14, 14, 48, 48, 36,
-  36, 49, 49, 43, 43, 29, 29, 56, 56, 22, 22, 50, 50, 57, 57, 44, 44, 37, 37,
-  51, 51, 30, 30, 58, 58, 52, 52, 45, 45, 59, 59, 38, 38, 60, 60, 46, 46, 53,
-  53, 54, 54, 61, 61, 62, 62, 0, 0,
+  0,   0,   0,   0,   1,   1,   0,   1,   1,   8,   2,   2,
+  8,   9,   2,   9,   3,   3,   9,  16,   3,  10,  16,  17,
+  4,   4,  10,  17,  17,  24,   4,  11,  11,  18,  18,  25,
+  24,  25,   5,   5,   5,  12,  12,  19,  25,  32,  19,  26,
+  6,   6,  26,  33,  32,  33,  13,  20,  20,  27,  33,  40,
+  6,  13,  27,  34,  40,  41,  34,  41,  21,  28,  28,  35,
+  41,  48,  14,  21,  35,  42,   7,  14,  48,  49,  29,  36,
+  42,  49,  36,  43,  22,  29,  49,  56,  15,  22,  43,  50,
+  50,  57,  37,  44,  30,  37,  44,  51,  23,  30,  51,  58,
+  45,  52,  38,  45,  52,  59,  31,  38,  53,  60,  39,  46,
+  46,  53,  47,  54,  54,  61,  55,  62,   0,   0,
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_8x8_neighbors[65 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 0, 0, 8, 8, 1, 8, 1, 1, 9, 16, 16, 16, 2, 9, 2, 2, 10, 17, 17,
-  24, 24, 24, 3, 10, 3, 3, 18, 25, 25, 32, 11, 18, 32, 32, 4, 11, 26, 33, 19,
-  26, 4, 4, 33, 40, 12, 19, 40, 40, 5, 12, 27, 34, 34, 41, 20, 27, 13, 20, 5,
-  5, 41, 48, 48, 48, 28, 35, 35, 42, 21, 28, 6, 6, 6, 13, 42, 49, 49, 56, 36,
-  43, 14, 21, 29, 36, 7, 14, 43, 50, 50, 57, 22, 29, 37, 44, 15, 22, 44, 51,
-  51, 58, 30, 37, 23, 30, 52, 59, 45, 52, 38, 45, 31, 38, 53, 60, 46, 53, 39,
-  46, 54, 61, 47, 54, 55, 62, 0, 0,
+  0,   0,   0,   0,   8,   0,   8,   8,   1,   8,   9,   1,
+  9,  16,  16,  17,   2,   9,  10,   2,  10,  17,  17,  24,
+  24,  25,   3,  10,  11,   3,  18,  25,  25,  32,  11,  18,
+  32,  33,   4,  11,  26,  33,  19,  26,  12,   4,  33,  40,
+  12,  19,  40,  41,   5,  12,  27,  34,  34,  41,  20,  27,
+  13,  20,  13,   5,  41,  48,  48,  49,  28,  35,  35,  42,
+  21,  28,   6,   6,   6,  13,  42,  49,  49,  56,  36,  43,
+  14,  21,  29,  36,   7,  14,  43,  50,  50,  57,  22,  29,
+  37,  44,  15,  22,  44,  51,  51,  58,  30,  37,  23,  30,
+  52,  59,  45,  52,  38,  45,  31,  38,  53,  60,  46,  53,
+  39,  46,  54,  61,  47,  54,  55,  62,   0,   0,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+                mcol_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 16, 16, 32, 32, 48, 48, 64, 64, 80, 80, 96, 96,
+  112, 112, 128, 128, 144, 144, 160, 160, 176, 176, 192, 192, 208,
+  208, 224, 224,
+  0, 0, 1, 16, 17, 32, 33, 48, 49, 64, 65, 80, 81, 96, 97, 112,
+  113, 128, 129, 144, 145, 160, 161, 176, 177, 192, 193, 208, 209,
+  224, 225, 240,
+  1, 1, 2, 17, 18, 33, 34, 49, 50, 65, 66, 81, 82, 97, 98, 113,
+  114, 129, 130, 145, 146, 161, 162, 177, 178, 193, 194, 209, 210,
+  225, 226, 241,
+  2, 2, 3, 18, 19, 34, 35, 50, 51, 66, 67, 82, 83, 98, 99, 114,
+  115, 130, 131, 146, 147, 162, 163, 178, 179, 194, 195, 210, 211,
+  226, 227, 242,
+  3, 3, 4, 19, 20, 35, 36, 51, 52, 67, 68, 83, 84, 99, 100, 115,
+  116, 131, 132, 147, 148, 163, 164, 179, 180, 195, 196, 211, 212,
+  227, 228, 243,
+  4, 4, 5, 20, 21, 36, 37, 52, 53, 68, 69, 84, 85, 100, 101, 116,
+  117, 132, 133, 148, 149, 164, 165, 180, 181, 196, 197, 212, 213,
+  228, 229, 244,
+  5, 5, 6, 21, 22, 37, 38, 53, 54, 69, 70, 85, 86, 101, 102, 117,
+  118, 133, 134, 149, 150, 165, 166, 181, 182, 197, 198, 213, 214,
+  229, 230, 245,
+  6, 6, 7, 22, 23, 38, 39, 54, 55, 70, 71, 86, 87, 102, 103, 118,
+  119, 134, 135, 150, 151, 166, 167, 182, 183, 198, 199, 214, 215,
+  230, 231, 246,
+  7, 7, 8, 23, 24, 39, 40, 55, 56, 71, 72, 87, 88, 103, 104, 119,
+  120, 135, 136, 151, 152, 167, 168, 183, 184, 199, 200, 215, 216,
+  231, 232, 247,
+  8, 8, 9, 24, 25, 40, 41, 56, 57, 72, 73, 88, 89, 104, 105, 120,
+  121, 136, 137, 152, 153, 168, 169, 184, 185, 200, 201, 216, 217,
+  232, 233, 248,
+  9, 9, 10, 25, 26, 41, 42, 57, 58, 73, 74, 89, 90, 105, 106, 121,
+  122, 137, 138, 153, 154, 169, 170, 185, 186, 201, 202, 217, 218,
+  233, 234, 249,
+  10, 10, 11, 26, 27, 42, 43, 58, 59, 74, 75, 90, 91, 106, 107, 122,
+  123, 138, 139, 154, 155, 170, 171, 186, 187, 202, 203, 218, 219,
+  234, 235, 250,
+  11, 11, 12, 27, 28, 43, 44, 59, 60, 75, 76, 91, 92, 107, 108, 123,
+  124, 139, 140, 155, 156, 171, 172, 187, 188, 203, 204, 219, 220,
+  235, 236, 251,
+  12, 12, 13, 28, 29, 44, 45, 60, 61, 76, 77, 92, 93, 108, 109, 124,
+  125, 140, 141, 156, 157, 172, 173, 188, 189, 204, 205, 220, 221,
+  236, 237, 252,
+  13, 13, 14, 29, 30, 45, 46, 61, 62, 77, 78, 93, 94, 109, 110, 125,
+  126, 141, 142, 157, 158, 173, 174, 189, 190, 205, 206, 221, 222,
+  237, 238, 253,
+  14, 14, 15, 30, 31, 46, 47, 62, 63, 78, 79, 94, 95, 110, 111, 126,
+  127, 142, 143, 158, 159, 174, 175, 190, 191, 206, 207, 222, 223,
+  238, 239, 254,
+  0, 0,
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
+                mrow_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
+  7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
+  0, 0, 1, 16, 2, 17, 3, 18, 4, 19, 5, 20, 6, 21, 7, 22,
+  8, 23, 9, 24, 10, 25, 11, 26, 12, 27, 13, 28, 14, 29, 15, 30,
+  16, 16, 17, 32, 18, 33, 19, 34, 20, 35, 21, 36, 22, 37, 23, 38,
+  24, 39, 25, 40, 26, 41, 27, 42, 28, 43, 29, 44, 30, 45, 31, 46,
+  32, 32, 33, 48, 34, 49, 35, 50, 36, 51, 37, 52, 38, 53, 39, 54,
+  40, 55, 41, 56, 42, 57, 43, 58, 44, 59, 45, 60, 46, 61, 47, 62,
+  48, 48, 49, 64, 50, 65, 51, 66, 52, 67, 53, 68, 54, 69, 55, 70,
+  56, 71, 57, 72, 58, 73, 59, 74, 60, 75, 61, 76, 62, 77, 63, 78,
+  64, 64, 65, 80, 66, 81, 67, 82, 68, 83, 69, 84, 70, 85, 71, 86,
+  72, 87, 73, 88, 74, 89, 75, 90, 76, 91, 77, 92, 78, 93, 79, 94,
+  80, 80, 81, 96, 82, 97, 83, 98, 84, 99, 85, 100, 86, 101, 87, 102,
+  88, 103, 89, 104, 90, 105, 91, 106, 92, 107, 93, 108, 94, 109, 95, 110,
+  96, 96, 97, 112, 98, 113, 99, 114, 100, 115, 101, 116, 102, 117,
+  103, 118,
+  104, 119, 105, 120, 106, 121, 107, 122, 108, 123, 109, 124, 110,
+  125, 111, 126,
+  112, 112, 113, 128, 114, 129, 115, 130, 116, 131, 117, 132, 118,
+  133, 119, 134,
+  120, 135, 121, 136, 122, 137, 123, 138, 124, 139, 125, 140, 126,
+  141, 127, 142,
+  128, 128, 129, 144, 130, 145, 131, 146, 132, 147, 133, 148, 134,
+  149, 135, 150,
+  136, 151, 137, 152, 138, 153, 139, 154, 140, 155, 141, 156, 142,
+  157, 143, 158,
+  144, 144, 145, 160, 146, 161, 147, 162, 148, 163, 149, 164, 150,
+  165, 151, 166,
+  152, 167, 153, 168, 154, 169, 155, 170, 156, 171, 157, 172, 158,
+  173, 159, 174,
+  160, 160, 161, 176, 162, 177, 163, 178, 164, 179, 165, 180, 166,
+  181, 167, 182,
+  168, 183, 169, 184, 170, 185, 171, 186, 172, 187, 173, 188, 174,
+  189, 175, 190,
+  176, 176, 177, 192, 178, 193, 179, 194, 180, 195, 181, 196, 182,
+  197, 183, 198,
+  184, 199, 185, 200, 186, 201, 187, 202, 188, 203, 189, 204, 190,
+  205, 191, 206,
+  192, 192, 193, 208, 194, 209, 195, 210, 196, 211, 197, 212, 198,
+  213, 199, 214,
+  200, 215, 201, 216, 202, 217, 203, 218, 204, 219, 205, 220, 206,
+  221, 207, 222,
+  208, 208, 209, 224, 210, 225, 211, 226, 212, 227, 213, 228, 214,
+  229, 215, 230,
+  216, 231, 217, 232, 218, 233, 219, 234, 220, 235, 221, 236, 222,
+  237, 223, 238,
+  224, 224, 225, 240, 226, 241, 227, 242, 228, 243, 229, 244, 230,
+  245, 231, 246,
+  232, 247, 233, 248, 234, 249, 235, 250, 236, 251, 237, 252, 238,
+  253, 239, 254,
+  0, 0,
+};
+#endif  // CONFIG_EXT_TX
+
+DECLARE_ALIGNED(16, static const int16_t,
                 col_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 16, 16, 32, 32, 0, 0, 48, 48, 1, 1, 64, 64,
-  17, 17, 80, 80, 33, 33, 1, 1, 49, 49, 96, 96, 2, 2, 65, 65,
-  18, 18, 112, 112, 34, 34, 81, 81, 2, 2, 50, 50, 128, 128, 3, 3,
-  97, 97, 19, 19, 66, 66, 144, 144, 82, 82, 35, 35, 113, 113, 3, 3,
-  51, 51, 160, 160, 4, 4, 98, 98, 129, 129, 67, 67, 20, 20, 83, 83,
-  114, 114, 36, 36, 176, 176, 4, 4, 145, 145, 52, 52, 99, 99, 5, 5,
-  130, 130, 68, 68, 192, 192, 161, 161, 21, 21, 115, 115, 84, 84, 37, 37,
-  146, 146, 208, 208, 53, 53, 5, 5, 100, 100, 177, 177, 131, 131, 69, 69,
-  6, 6, 224, 224, 116, 116, 22, 22, 162, 162, 85, 85, 147, 147, 38, 38,
-  193, 193, 101, 101, 54, 54, 6, 6, 132, 132, 178, 178, 70, 70, 163, 163,
-  209, 209, 7, 7, 117, 117, 23, 23, 148, 148, 7, 7, 86, 86, 194, 194,
-  225, 225, 39, 39, 179, 179, 102, 102, 133, 133, 55, 55, 164, 164, 8, 8,
-  71, 71, 210, 210, 118, 118, 149, 149, 195, 195, 24, 24, 87, 87, 40, 40,
-  56, 56, 134, 134, 180, 180, 226, 226, 103, 103, 8, 8, 165, 165, 211, 211,
-  72, 72, 150, 150, 9, 9, 119, 119, 25, 25, 88, 88, 196, 196, 41, 41,
-  135, 135, 181, 181, 104, 104, 57, 57, 227, 227, 166, 166, 120, 120, 151, 151,
-  197, 197, 73, 73, 9, 9, 212, 212, 89, 89, 136, 136, 182, 182, 10, 10,
-  26, 26, 105, 105, 167, 167, 228, 228, 152, 152, 42, 42, 121, 121, 213, 213,
-  58, 58, 198, 198, 74, 74, 137, 137, 183, 183, 168, 168, 10, 10, 90, 90,
-  229, 229, 11, 11, 106, 106, 214, 214, 153, 153, 27, 27, 199, 199, 43, 43,
-  184, 184, 122, 122, 169, 169, 230, 230, 59, 59, 11, 11, 75, 75, 138, 138,
-  200, 200, 215, 215, 91, 91, 12, 12, 28, 28, 185, 185, 107, 107, 154, 154,
-  44, 44, 231, 231, 216, 216, 60, 60, 123, 123, 12, 12, 76, 76, 201, 201,
-  170, 170, 232, 232, 139, 139, 92, 92, 13, 13, 108, 108, 29, 29, 186, 186,
-  217, 217, 155, 155, 45, 45, 13, 13, 61, 61, 124, 124, 14, 14, 233, 233,
-  77, 77, 14, 14, 171, 171, 140, 140, 202, 202, 30, 30, 93, 93, 109, 109,
-  46, 46, 156, 156, 62, 62, 187, 187, 15, 15, 125, 125, 218, 218, 78, 78,
-  31, 31, 172, 172, 47, 47, 141, 141, 94, 94, 234, 234, 203, 203, 63, 63,
-  110, 110, 188, 188, 157, 157, 126, 126, 79, 79, 173, 173, 95, 95, 219, 219,
-  142, 142, 204, 204, 235, 235, 111, 111, 158, 158, 127, 127, 189, 189, 220,
-  220, 143, 143, 174, 174, 205, 205, 236, 236, 159, 159, 190, 190, 221, 221,
-  175, 175, 237, 237, 206, 206, 222, 222, 191, 191, 238, 238, 207, 207, 223,
-  223, 239, 239, 0, 0,
+  0,   0,   0,   0,  16,  16,  32,  32,  16,   0,  48,  48,
+  1,  16,  64,  64,  17,  32,  80,  80,  33,  48,  17,   1,
+  49,  64,  96,  96,   2,  17,  65,  80,  18,  33, 112, 112,
+  34,  49,  81,  96,  18,   2,  50,  65, 128, 128,   3,  18,
+  97, 112,  19,  34,  66,  81, 144, 144,  82,  97,  35,  50,
+  113, 128,  19,   3,  51,  66, 160, 160,   4,  19,  98, 113,
+  129, 144,  67,  82,  20,  35,  83,  98, 114, 129,  36,  51,
+  176, 176,  20,   4, 145, 160,  52,  67,  99, 114,   5,  20,
+  130, 145,  68,  83, 192, 192, 161, 176,  21,  36, 115, 130,
+  84,  99,  37,  52, 146, 161, 208, 208,  53,  68,  21,   5,
+  100, 115, 177, 192, 131, 146,  69,  84,   6,  21, 224, 224,
+  116, 131,  22,  37, 162, 177,  85, 100, 147, 162,  38,  53,
+  193, 208, 101, 116,  54,  69,  22,   6, 132, 147, 178, 193,
+  70,  85, 163, 178, 209, 224,   7,  22, 117, 132,  23,  38,
+  148, 163,  23,   7,  86, 101, 194, 209, 225, 240,  39,  54,
+  179, 194, 102, 117, 133, 148,  55,  70, 164, 179,   8,  23,
+  71,  86, 210, 225, 118, 133, 149, 164, 195, 210,  24,  39,
+  87, 102,  40,  55,  56,  71, 134, 149, 180, 195, 226, 241,
+  103, 118,  24,   8, 165, 180, 211, 226,  72,  87, 150, 165,
+  9,  24, 119, 134,  25,  40,  88, 103, 196, 211,  41,  56,
+  135, 150, 181, 196, 104, 119,  57,  72, 227, 242, 166, 181,
+  120, 135, 151, 166, 197, 212,  73,  88,  25,   9, 212, 227,
+  89, 104, 136, 151, 182, 197,  10,  25,  26,  41, 105, 120,
+  167, 182, 228, 243, 152, 167,  42,  57, 121, 136, 213, 228,
+  58,  73, 198, 213,  74,  89, 137, 152, 183, 198, 168, 183,
+  26,  10,  90, 105, 229, 244,  11,  26, 106, 121, 214, 229,
+  153, 168,  27,  42, 199, 214,  43,  58, 184, 199, 122, 137,
+  169, 184, 230, 245,  59,  74,  27,  11,  75,  90, 138, 153,
+  200, 215, 215, 230,  91, 106,  12,  27,  28,  43, 185, 200,
+  107, 122, 154, 169,  44,  59, 231, 246, 216, 231,  60,  75,
+  123, 138,  28,  12,  76,  91, 201, 216, 170, 185, 232, 247,
+  139, 154,  92, 107,  13,  28, 108, 123,  29,  44, 186, 201,
+  217, 232, 155, 170,  45,  60,  29,  13,  61,  76, 124, 139,
+  14,  14, 233, 248,  77,  92,  14,  29, 171, 186, 140, 155,
+  202, 217,  30,  45,  93, 108, 109, 124,  46,  61, 156, 171,
+  62,  77, 187, 202,  15,  30, 125, 140, 218, 233,  78,  93,
+  31,  46, 172, 187,  47,  62, 141, 156,  94, 109, 234, 249,
+  203, 218,  63,  78, 110, 125, 188, 203, 157, 172, 126, 141,
+  79,  94, 173, 188,  95, 110, 219, 234, 142, 157, 204, 219,
+  235, 250, 111, 126, 158, 173, 127, 142, 189, 204, 220, 235,
+  143, 158, 174, 189, 205, 220, 236, 251, 159, 174, 190, 205,
+  221, 236, 175, 190, 237, 252, 206, 221, 222, 237, 191, 206,
+  238, 253, 207, 222, 223, 238, 239, 254,   0,   0,
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
                 row_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 16, 16, 3, 3, 17, 17,
-  16, 16, 4, 4, 32, 32, 18, 18, 5, 5, 33, 33, 32, 32, 19, 19,
-  48, 48, 6, 6, 34, 34, 20, 20, 49, 49, 48, 48, 7, 7, 35, 35,
-  64, 64, 21, 21, 50, 50, 36, 36, 64, 64, 8, 8, 65, 65, 51, 51,
-  22, 22, 37, 37, 80, 80, 66, 66, 9, 9, 52, 52, 23, 23, 81, 81,
-  67, 67, 80, 80, 38, 38, 10, 10, 53, 53, 82, 82, 96, 96, 68, 68,
-  24, 24, 97, 97, 83, 83, 39, 39, 96, 96, 54, 54, 11, 11, 69, 69,
-  98, 98, 112, 112, 84, 84, 25, 25, 40, 40, 55, 55, 113, 113, 99, 99,
-  12, 12, 70, 70, 112, 112, 85, 85, 26, 26, 114, 114, 100, 100, 128, 128,
-  41, 41, 56, 56, 71, 71, 115, 115, 13, 13, 86, 86, 129, 129, 101, 101,
-  128, 128, 72, 72, 130, 130, 116, 116, 27, 27, 57, 57, 14, 14, 87, 87,
-  42, 42, 144, 144, 102, 102, 131, 131, 145, 145, 117, 117, 73, 73, 144, 144,
-  88, 88, 132, 132, 103, 103, 28, 28, 58, 58, 146, 146, 118, 118, 43, 43,
-  160, 160, 147, 147, 89, 89, 104, 104, 133, 133, 161, 161, 119, 119, 160, 160,
-  74, 74, 134, 134, 148, 148, 29, 29, 59, 59, 162, 162, 176, 176, 44, 44,
-  120, 120, 90, 90, 105, 105, 163, 163, 177, 177, 149, 149, 176, 176, 135, 135,
-  164, 164, 178, 178, 30, 30, 150, 150, 192, 192, 75, 75, 121, 121, 60, 60,
-  136, 136, 193, 193, 106, 106, 151, 151, 179, 179, 192, 192, 45, 45, 165, 165,
-  166, 166, 194, 194, 91, 91, 180, 180, 137, 137, 208, 208, 122, 122, 152, 152,
-  208, 208, 195, 195, 76, 76, 167, 167, 209, 209, 181, 181, 224, 224, 107, 107,
-  196, 196, 61, 61, 153, 153, 224, 224, 182, 182, 168, 168, 210, 210, 46, 46,
-  138, 138, 92, 92, 183, 183, 225, 225, 211, 211, 240, 240, 197, 197, 169, 169,
-  123, 123, 154, 154, 198, 198, 77, 77, 212, 212, 184, 184, 108, 108, 226, 226,
-  199, 199, 62, 62, 227, 227, 241, 241, 139, 139, 213, 213, 170, 170, 185, 185,
-  155, 155, 228, 228, 242, 242, 124, 124, 93, 93, 200, 200, 243, 243, 214, 214,
-  215, 215, 229, 229, 140, 140, 186, 186, 201, 201, 78, 78, 171, 171, 109, 109,
-  156, 156, 244, 244, 216, 216, 230, 230, 94, 94, 245, 245, 231, 231, 125, 125,
-  202, 202, 246, 246, 232, 232, 172, 172, 217, 217, 141, 141, 110, 110, 157,
-  157, 187, 187, 247, 247, 126, 126, 233, 233, 218, 218, 248, 248, 188, 188,
-  203, 203, 142, 142, 173, 173, 158, 158, 249, 249, 234, 234, 204, 204, 219,
-  219, 174, 174, 189, 189, 250, 250, 220, 220, 190, 190, 205, 205, 235, 235,
-  206, 206, 236, 236, 251, 251, 221, 221, 252, 252, 222, 222, 237, 237, 238,
-  238, 253, 253, 254, 254, 0, 0,
+  0,   0,   0,   0,   1,   1,   0,   1,   2,   2,   1,  16,
+  3,   3,   2,  17,  16,  17,   4,   4,  17,  32,   3,  18,
+  5,   5,  18,  33,  32,  33,   4,  19,  33,  48,   6,   6,
+  19,  34,   5,  20,  34,  49,  48,  49,   7,   7,  20,  35,
+  49,  64,   6,  21,  35,  50,  21,  36,  64,  65,   8,   8,
+  50,  65,  36,  51,   7,  22,  22,  37,  65,  80,  51,  66,
+  9,   9,  37,  52,   8,  23,  66,  81,  52,  67,  80,  81,
+  23,  38,  10,  10,  38,  53,  67,  82,  81,  96,  53,  68,
+  9,  24,  82,  97,  68,  83,  24,  39,  96,  97,  39,  54,
+  11,  11,  54,  69,  83,  98,  97, 112,  69,  84,  10,  25,
+  25,  40,  40,  55,  98, 113,  84,  99,  12,  12,  55,  70,
+  112, 113,  70,  85,  11,  26,  99, 114,  85, 100, 113, 128,
+  26,  41,  41,  56,  56,  71, 100, 115,  13,  13,  71,  86,
+  114, 129,  86, 101, 128, 129,  57,  72, 115, 130, 101, 116,
+  12,  27,  42,  57,  14,  14,  72,  87,  27,  42, 129, 144,
+  87, 102, 116, 131, 130, 145, 102, 117,  58,  73, 144, 145,
+  73,  88, 117, 132,  88, 103,  13,  28,  43,  58, 131, 146,
+  103, 118,  28,  43, 145, 160, 132, 147,  74,  89,  89, 104,
+  118, 133, 146, 161, 104, 119, 160, 161,  59,  74, 119, 134,
+  133, 148,  14,  29,  44,  59, 147, 162, 161, 176,  29,  44,
+  105, 120,  75,  90,  90, 105, 148, 163, 162, 177, 134, 149,
+  176, 177, 120, 135, 149, 164, 163, 178,  15,  30, 135, 150,
+  177, 192,  60,  75, 106, 121,  45,  60, 121, 136, 178, 193,
+  91, 106, 136, 151, 164, 179, 192, 193,  30,  45, 150, 165,
+  151, 166, 179, 194,  76,  91, 165, 180, 122, 137, 193, 208,
+  107, 122, 137, 152, 208, 209, 180, 195,  61,  76, 152, 167,
+  194, 209, 166, 181, 224, 224,  92, 107, 181, 196,  46,  61,
+  138, 153, 209, 224, 167, 182, 153, 168, 195, 210,  31,  46,
+  123, 138,  77,  92, 168, 183, 210, 225, 196, 211, 225, 240,
+  182, 197, 154, 169, 108, 123, 139, 154, 183, 198,  62,  77,
+  197, 212, 169, 184,  93, 108, 211, 226, 184, 199,  47,  62,
+  212, 227, 226, 241, 124, 139, 198, 213, 155, 170, 170, 185,
+  140, 155, 213, 228, 227, 242, 109, 124,  78,  93, 185, 200,
+  228, 243, 199, 214, 200, 215, 214, 229, 125, 140, 171, 186,
+  186, 201,  63,  78, 156, 171,  94, 109, 141, 156, 229, 244,
+  201, 216, 215, 230,  79,  94, 230, 245, 216, 231, 110, 125,
+  187, 202, 231, 246, 217, 232, 157, 172, 202, 217, 126, 141,
+  95, 110, 142, 157, 172, 187, 232, 247, 111, 126, 218, 233,
+  203, 218, 233, 248, 173, 188, 188, 203, 127, 142, 158, 173,
+  143, 158, 234, 249, 219, 234, 189, 204, 204, 219, 159, 174,
+  174, 189, 235, 250, 205, 220, 175, 190, 190, 205, 220, 235,
+  191, 206, 221, 236, 236, 251, 206, 221, 237, 252, 207, 222,
+  222, 237, 223, 238, 238, 253, 239, 254,   0,   0,
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
                 default_scan_16x16_neighbors[257 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 0, 0, 16, 16, 1, 16, 1, 1, 32, 32, 17, 32,
-  2, 17, 2, 2, 48, 48, 18, 33, 33, 48, 3, 18, 49, 64, 64, 64,
-  34, 49, 3, 3, 19, 34, 50, 65, 4, 19, 65, 80, 80, 80, 35, 50,
-  4, 4, 20, 35, 66, 81, 81, 96, 51, 66, 96, 96, 5, 20, 36, 51,
-  82, 97, 21, 36, 67, 82, 97, 112, 5, 5, 52, 67, 112, 112, 37, 52,
-  6, 21, 83, 98, 98, 113, 68, 83, 6, 6, 113, 128, 22, 37, 53, 68,
-  84, 99, 99, 114, 128, 128, 114, 129, 69, 84, 38, 53, 7, 22, 7, 7,
-  129, 144, 23, 38, 54, 69, 100, 115, 85, 100, 115, 130, 144, 144, 130, 145,
-  39, 54, 70, 85, 8, 23, 55, 70, 116, 131, 101, 116, 145, 160, 24, 39,
-  8, 8, 86, 101, 131, 146, 160, 160, 146, 161, 71, 86, 40, 55, 9, 24,
-  117, 132, 102, 117, 161, 176, 132, 147, 56, 71, 87, 102, 25, 40, 147, 162,
-  9, 9, 176, 176, 162, 177, 72, 87, 41, 56, 118, 133, 133, 148, 103, 118,
-  10, 25, 148, 163, 57, 72, 88, 103, 177, 192, 26, 41, 163, 178, 192, 192,
-  10, 10, 119, 134, 73, 88, 149, 164, 104, 119, 134, 149, 42, 57, 178, 193,
-  164, 179, 11, 26, 58, 73, 193, 208, 89, 104, 135, 150, 120, 135, 27, 42,
-  74, 89, 208, 208, 150, 165, 179, 194, 165, 180, 105, 120, 194, 209, 43, 58,
-  11, 11, 136, 151, 90, 105, 151, 166, 180, 195, 59, 74, 121, 136, 209, 224,
-  195, 210, 224, 224, 166, 181, 106, 121, 75, 90, 12, 27, 181, 196, 12, 12,
-  210, 225, 152, 167, 167, 182, 137, 152, 28, 43, 196, 211, 122, 137, 91, 106,
-  225, 240, 44, 59, 13, 28, 107, 122, 182, 197, 168, 183, 211, 226, 153, 168,
-  226, 241, 60, 75, 197, 212, 138, 153, 29, 44, 76, 91, 13, 13, 183, 198,
-  123, 138, 45, 60, 212, 227, 198, 213, 154, 169, 169, 184, 227, 242, 92, 107,
-  61, 76, 139, 154, 14, 29, 14, 14, 184, 199, 213, 228, 108, 123, 199, 214,
-  228, 243, 77, 92, 30, 45, 170, 185, 155, 170, 185, 200, 93, 108, 124, 139,
-  214, 229, 46, 61, 200, 215, 229, 244, 15, 30, 109, 124, 62, 77, 140, 155,
-  215, 230, 31, 46, 171, 186, 186, 201, 201, 216, 78, 93, 230, 245, 125, 140,
-  47, 62, 216, 231, 156, 171, 94, 109, 231, 246, 141, 156, 63, 78, 202, 217,
-  187, 202, 110, 125, 217, 232, 172, 187, 232, 247, 79, 94, 157, 172, 126, 141,
-  203, 218, 95, 110, 233, 248, 218, 233, 142, 157, 111, 126, 173, 188, 188, 203,
-  234, 249, 219, 234, 127, 142, 158, 173, 204, 219, 189, 204, 143, 158, 235,
-  250, 174, 189, 205, 220, 159, 174, 220, 235, 221, 236, 175, 190, 190, 205,
-  236, 251, 206, 221, 237, 252, 191, 206, 222, 237, 207, 222, 238, 253, 223,
-  238, 239, 254, 0, 0,
+  0,   0,   0,   0,  16,   0,  16,  16,   1,  16,  17,   1,
+  32,  32,  17,  32,   2,  17,  18,   2,  48,  48,  18,  33,
+  33,  48,   3,  18,  49,  64,  64,  65,  34,  49,  19,   3,
+  19,  34,  50,  65,   4,  19,  65,  80,  80,  81,  35,  50,
+  20,   4,  20,  35,  66,  81,  81,  96,  51,  66,  96,  97,
+  5,  20,  36,  51,  82,  97,  21,  36,  67,  82,  97, 112,
+  21,   5,  52,  67, 112, 113,  37,  52,   6,  21,  83,  98,
+  98, 113,  68,  83,  22,   6, 113, 128,  22,  37,  53,  68,
+  84,  99,  99, 114, 128, 129, 114, 129,  69,  84,  38,  53,
+  7,  22,  23,   7, 129, 144,  23,  38,  54,  69, 100, 115,
+  85, 100, 115, 130, 144, 145, 130, 145,  39,  54,  70,  85,
+  8,  23,  55,  70, 116, 131, 101, 116, 145, 160,  24,  39,
+  24,   8,  86, 101, 131, 146, 160, 161, 146, 161,  71,  86,
+  40,  55,   9,  24, 117, 132, 102, 117, 161, 176, 132, 147,
+  56,  71,  87, 102,  25,  40, 147, 162,  25,   9, 176, 177,
+  162, 177,  72,  87,  41,  56, 118, 133, 133, 148, 103, 118,
+  10,  25, 148, 163,  57,  72,  88, 103, 177, 192,  26,  41,
+  163, 178, 192, 193,  26,  10, 119, 134,  73,  88, 149, 164,
+  104, 119, 134, 149,  42,  57, 178, 193, 164, 179,  11,  26,
+  58,  73, 193, 208,  89, 104, 135, 150, 120, 135,  27,  42,
+  74,  89, 208, 209, 150, 165, 179, 194, 165, 180, 105, 120,
+  194, 209,  43,  58,  27,  11, 136, 151,  90, 105, 151, 166,
+  180, 195,  59,  74, 121, 136, 209, 224, 195, 210, 224, 225,
+  166, 181, 106, 121,  75,  90,  12,  27, 181, 196,  28,  12,
+  210, 225, 152, 167, 167, 182, 137, 152,  28,  43, 196, 211,
+  122, 137,  91, 106, 225, 240,  44,  59,  13,  28, 107, 122,
+  182, 197, 168, 183, 211, 226, 153, 168, 226, 241,  60,  75,
+  197, 212, 138, 153,  29,  44,  76,  91,  29,  13, 183, 198,
+  123, 138,  45,  60, 212, 227, 198, 213, 154, 169, 169, 184,
+  227, 242,  92, 107,  61,  76, 139, 154,  14,  29,  30,  14,
+  184, 199, 213, 228, 108, 123, 199, 214, 228, 243,  77,  92,
+  30,  45, 170, 185, 155, 170, 185, 200,  93, 108, 124, 139,
+  214, 229,  46,  61, 200, 215, 229, 244,  15,  30, 109, 124,
+  62,  77, 140, 155, 215, 230,  31,  46, 171, 186, 186, 201,
+  201, 216,  78,  93, 230, 245, 125, 140,  47,  62, 216, 231,
+  156, 171,  94, 109, 231, 246, 141, 156,  63,  78, 202, 217,
+  187, 202, 110, 125, 217, 232, 172, 187, 232, 247,  79,  94,
+  157, 172, 126, 141, 203, 218,  95, 110, 233, 248, 218, 233,
+  142, 157, 111, 126, 173, 188, 188, 203, 234, 249, 219, 234,
+  127, 142, 158, 173, 204, 219, 189, 204, 143, 158, 235, 250,
+  174, 189, 205, 220, 159, 174, 220, 235, 221, 236, 175, 190,
+  190, 205, 236, 251, 206, 221, 237, 252, 191, 206, 222, 237,
+  207, 222, 238, 253, 223, 238, 239, 254,   0,   0,
+};
+
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t,
+                mcol_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 32, 32, 64, 64, 96, 96, 128, 128, 160, 160, 192, 192,
+  224, 224, 256, 256, 288, 288, 320, 320, 352, 352, 384, 384, 416,
+  416, 448, 448,
+  480, 480, 512, 512, 544, 544, 576, 576, 608, 608, 640, 640, 672,
+  672, 704, 704, 736, 736, 768, 768, 800, 800, 832, 832, 864, 864,
+  896, 896, 928, 928, 960, 960,
+  0, 0, 1, 32, 33, 64, 65, 96, 97, 128, 129, 160, 161, 192, 193,
+  224, 225, 256, 257, 288, 289, 320, 321, 352, 353, 384, 385, 416,
+  417, 448, 449, 480,
+  481, 512, 513, 544, 545, 576, 577, 608, 609, 640, 641, 672, 673,
+  704, 705, 736, 737, 768, 769, 800, 801, 832, 833, 864, 865, 896,
+  897, 928, 929, 960, 961, 992,
+  1, 1, 2, 33, 34, 65, 66, 97, 98, 129, 130, 161, 162, 193, 194,
+  225, 226, 257, 258, 289, 290, 321, 322, 353, 354, 385, 386, 417,
+  418, 449, 450, 481,
+  482, 513, 514, 545, 546, 577, 578, 609, 610, 641, 642, 673, 674,
+  705, 706, 737, 738, 769, 770, 801, 802, 833, 834, 865, 866, 897,
+  898, 929, 930, 961, 962, 993,
+  2, 2, 3, 34, 35, 66, 67, 98, 99, 130, 131, 162, 163, 194, 195,
+  226, 227, 258, 259, 290, 291, 322, 323, 354, 355, 386, 387, 418,
+  419, 450, 451, 482,
+  483, 514, 515, 546, 547, 578, 579, 610, 611, 642, 643, 674, 675,
+  706, 707, 738, 739, 770, 771, 802, 803, 834, 835, 866, 867, 898,
+  899, 930, 931, 962, 963, 994,
+  3, 3, 4, 35, 36, 67, 68, 99, 100, 131, 132, 163, 164, 195, 196,
+  227, 228, 259, 260, 291, 292, 323, 324, 355, 356, 387, 388, 419,
+  420, 451, 452, 483,
+  484, 515, 516, 547, 548, 579, 580, 611, 612, 643, 644, 675, 676,
+  707, 708, 739, 740, 771, 772, 803, 804, 835, 836, 867, 868, 899,
+  900, 931, 932, 963, 964, 995,
+  4, 4, 5, 36, 37, 68, 69, 100, 101, 132, 133, 164, 165, 196, 197,
+  228, 229, 260, 261, 292, 293, 324, 325, 356, 357, 388, 389, 420,
+  421, 452, 453, 484,
+  485, 516, 517, 548, 549, 580, 581, 612, 613, 644, 645, 676, 677,
+  708, 709, 740, 741, 772, 773, 804, 805, 836, 837, 868, 869, 900,
+  901, 932, 933, 964, 965, 996,
+  5, 5, 6, 37, 38, 69, 70, 101, 102, 133, 134, 165, 166, 197, 198,
+  229, 230, 261, 262, 293, 294, 325, 326, 357, 358, 389, 390, 421,
+  422, 453, 454, 485,
+  486, 517, 518, 549, 550, 581, 582, 613, 614, 645, 646, 677, 678,
+  709, 710, 741, 742, 773, 774, 805, 806, 837, 838, 869, 870, 901,
+  902, 933, 934, 965, 966, 997,
+  6, 6, 7, 38, 39, 70, 71, 102, 103, 134, 135, 166, 167, 198, 199,
+  230, 231, 262, 263, 294, 295, 326, 327, 358, 359, 390, 391, 422,
+  423, 454, 455, 486,
+  487, 518, 519, 550, 551, 582, 583, 614, 615, 646, 647, 678, 679,
+  710, 711, 742, 743, 774, 775, 806, 807, 838, 839, 870, 871, 902,
+  903, 934, 935, 966, 967, 998,
+  7, 7, 8, 39, 40, 71, 72, 103, 104, 135, 136, 167, 168, 199, 200,
+  231, 232, 263, 264, 295, 296, 327, 328, 359, 360, 391, 392, 423,
+  424, 455, 456, 487,
+  488, 519, 520, 551, 552, 583, 584, 615, 616, 647, 648, 679, 680,
+  711, 712, 743, 744, 775, 776, 807, 808, 839, 840, 871, 872, 903,
+  904, 935, 936, 967, 968, 999,
+  8, 8, 9, 40, 41, 72, 73, 104, 105, 136, 137, 168, 169, 200, 201,
+  232, 233, 264, 265, 296, 297, 328, 329, 360, 361, 392, 393, 424,
+  425, 456, 457, 488,
+  489, 520, 521, 552, 553, 584, 585, 616, 617, 648, 649, 680, 681,
+  712, 713, 744, 745, 776, 777, 808, 809, 840, 841, 872, 873, 904,
+  905, 936, 937, 968, 969, 1000,
+  9, 9, 10, 41, 42, 73, 74, 105, 106, 137, 138, 169, 170, 201, 202,
+  233, 234, 265, 266, 297, 298, 329, 330, 361, 362, 393, 394, 425,
+  426, 457, 458, 489,
+  490, 521, 522, 553, 554, 585, 586, 617, 618, 649, 650, 681, 682,
+  713, 714, 745, 746, 777, 778, 809, 810, 841, 842, 873, 874, 905,
+  906, 937, 938, 969, 970, 1001,
+  10, 10, 11, 42, 43, 74, 75, 106, 107, 138, 139, 170, 171, 202,
+  203, 234, 235, 266, 267, 298, 299, 330, 331, 362, 363, 394, 395,
+  426, 427, 458, 459, 490,
+  491, 522, 523, 554, 555, 586, 587, 618, 619, 650, 651, 682, 683,
+  714, 715, 746, 747, 778, 779, 810, 811, 842, 843, 874, 875, 906,
+  907, 938, 939, 970, 971, 1002,
+  11, 11, 12, 43, 44, 75, 76, 107, 108, 139, 140, 171, 172, 203,
+  204, 235, 236, 267, 268, 299, 300, 331, 332, 363, 364, 395, 396,
+  427, 428, 459, 460, 491,
+  492, 523, 524, 555, 556, 587, 588, 619, 620, 651, 652, 683, 684,
+  715, 716, 747, 748, 779, 780, 811, 812, 843, 844, 875, 876, 907,
+  908, 939, 940, 971, 972, 1003,
+  12, 12, 13, 44, 45, 76, 77, 108, 109, 140, 141, 172, 173, 204,
+  205, 236, 237, 268, 269, 300, 301, 332, 333, 364, 365, 396, 397,
+  428, 429, 460, 461, 492,
+  493, 524, 525, 556, 557, 588, 589, 620, 621, 652, 653, 684, 685,
+  716, 717, 748, 749, 780, 781, 812, 813, 844, 845, 876, 877, 908,
+  909, 940, 941, 972, 973, 1004,
+  13, 13, 14, 45, 46, 77, 78, 109, 110, 141, 142, 173, 174, 205,
+  206, 237, 238, 269, 270, 301, 302, 333, 334, 365, 366, 397, 398,
+  429, 430, 461, 462, 493,
+  494, 525, 526, 557, 558, 589, 590, 621, 622, 653, 654, 685, 686,
+  717, 718, 749, 750, 781, 782, 813, 814, 845, 846, 877, 878, 909,
+  910, 941, 942, 973, 974, 1005,
+  14, 14, 15, 46, 47, 78, 79, 110, 111, 142, 143, 174, 175, 206,
+  207, 238, 239, 270, 271, 302, 303, 334, 335, 366, 367, 398, 399,
+  430, 431, 462, 463, 494,
+  495, 526, 527, 558, 559, 590, 591, 622, 623, 654, 655, 686, 687,
+  718, 719, 750, 751, 782, 783, 814, 815, 846, 847, 878, 879, 910,
+  911, 942, 943, 974, 975, 1006,
+  15, 15, 16, 47, 48, 79, 80, 111, 112, 143, 144, 175, 176, 207,
+  208, 239, 240, 271, 272, 303, 304, 335, 336, 367, 368, 399, 400,
+  431, 432, 463, 464, 495,
+  496, 527, 528, 559, 560, 591, 592, 623, 624, 655, 656, 687, 688,
+  719, 720, 751, 752, 783, 784, 815, 816, 847, 848, 879, 880, 911,
+  912, 943, 944, 975, 976, 1007,
+  16, 16, 17, 48, 49, 80, 81, 112, 113, 144, 145, 176, 177, 208,
+  209, 240, 241, 272, 273, 304, 305, 336, 337, 368, 369, 400, 401,
+  432, 433, 464, 465, 496,
+  497, 528, 529, 560, 561, 592, 593, 624, 625, 656, 657, 688, 689,
+  720, 721, 752, 753, 784, 785, 816, 817, 848, 849, 880, 881, 912,
+  913, 944, 945, 976, 977, 1008,
+  17, 17, 18, 49, 50, 81, 82, 113, 114, 145, 146, 177, 178, 209,
+  210, 241, 242, 273, 274, 305, 306, 337, 338, 369, 370, 401, 402,
+  433, 434, 465, 466, 497,
+  498, 529, 530, 561, 562, 593, 594, 625, 626, 657, 658, 689, 690,
+  721, 722, 753, 754, 785, 786, 817, 818, 849, 850, 881, 882, 913,
+  914, 945, 946, 977, 978, 1009,
+  18, 18, 19, 50, 51, 82, 83, 114, 115, 146, 147, 178, 179, 210,
+  211, 242, 243, 274, 275, 306, 307, 338, 339, 370, 371, 402, 403,
+  434, 435, 466, 467, 498,
+  499, 530, 531, 562, 563, 594, 595, 626, 627, 658, 659, 690, 691,
+  722, 723, 754, 755, 786, 787, 818, 819, 850, 851, 882, 883, 914,
+  915, 946, 947, 978, 979, 1010,
+  19, 19, 20, 51, 52, 83, 84, 115, 116, 147, 148, 179, 180, 211,
+  212, 243, 244, 275, 276, 307, 308, 339, 340, 371, 372, 403, 404,
+  435, 436, 467, 468, 499,
+  500, 531, 532, 563, 564, 595, 596, 627, 628, 659, 660, 691, 692,
+  723, 724, 755, 756, 787, 788, 819, 820, 851, 852, 883, 884, 915,
+  916, 947, 948, 979, 980, 1011,
+  20, 20, 21, 52, 53, 84, 85, 116, 117, 148, 149, 180, 181, 212,
+  213, 244, 245, 276, 277, 308, 309, 340, 341, 372, 373, 404, 405,
+  436, 437, 468, 469, 500,
+  501, 532, 533, 564, 565, 596, 597, 628, 629, 660, 661, 692, 693,
+  724, 725, 756, 757, 788, 789, 820, 821, 852, 853, 884, 885, 916,
+  917, 948, 949, 980, 981, 1012,
+  21, 21, 22, 53, 54, 85, 86, 117, 118, 149, 150, 181, 182, 213,
+  214, 245, 246, 277, 278, 309, 310, 341, 342, 373, 374, 405, 406,
+  437, 438, 469, 470, 501,
+  502, 533, 534, 565, 566, 597, 598, 629, 630, 661, 662, 693, 694,
+  725, 726, 757, 758, 789, 790, 821, 822, 853, 854, 885, 886, 917,
+  918, 949, 950, 981, 982, 1013,
+  22, 22, 23, 54, 55, 86, 87, 118, 119, 150, 151, 182, 183, 214,
+  215, 246, 247, 278, 279, 310, 311, 342, 343, 374, 375, 406, 407,
+  438, 439, 470, 471, 502,
+  503, 534, 535, 566, 567, 598, 599, 630, 631, 662, 663, 694, 695,
+  726, 727, 758, 759, 790, 791, 822, 823, 854, 855, 886, 887, 918,
+  919, 950, 951, 982, 983, 1014,
+  23, 23, 24, 55, 56, 87, 88, 119, 120, 151, 152, 183, 184, 215,
+  216, 247, 248, 279, 280, 311, 312, 343, 344, 375, 376, 407, 408,
+  439, 440, 471, 472, 503,
+  504, 535, 536, 567, 568, 599, 600, 631, 632, 663, 664, 695, 696,
+  727, 728, 759, 760, 791, 792, 823, 824, 855, 856, 887, 888, 919,
+  920, 951, 952, 983, 984, 1015,
+  24, 24, 25, 56, 57, 88, 89, 120, 121, 152, 153, 184, 185, 216,
+  217, 248, 249, 280, 281, 312, 313, 344, 345, 376, 377, 408, 409,
+  440, 441, 472, 473, 504,
+  505, 536, 537, 568, 569, 600, 601, 632, 633, 664, 665, 696, 697,
+  728, 729, 760, 761, 792, 793, 824, 825, 856, 857, 888, 889, 920,
+  921, 952, 953, 984, 985, 1016,
+  25, 25, 26, 57, 58, 89, 90, 121, 122, 153, 154, 185, 186, 217,
+  218, 249, 250, 281, 282, 313, 314, 345, 346, 377, 378, 409, 410,
+  441, 442, 473, 474, 505,
+  506, 537, 538, 569, 570, 601, 602, 633, 634, 665, 666, 697, 698,
+  729, 730, 761, 762, 793, 794, 825, 826, 857, 858, 889, 890, 921,
+  922, 953, 954, 985, 986, 1017,
+  26, 26, 27, 58, 59, 90, 91, 122, 123, 154, 155, 186, 187, 218,
+  219, 250, 251, 282, 283, 314, 315, 346, 347, 378, 379, 410, 411,
+  442, 443, 474, 475, 506,
+  507, 538, 539, 570, 571, 602, 603, 634, 635, 666, 667, 698, 699,
+  730, 731, 762, 763, 794, 795, 826, 827, 858, 859, 890, 891, 922,
+  923, 954, 955, 986, 987, 1018,
+  27, 27, 28, 59, 60, 91, 92, 123, 124, 155, 156, 187, 188, 219,
+  220, 251, 252, 283, 284, 315, 316, 347, 348, 379, 380, 411, 412,
+  443, 444, 475, 476, 507,
+  508, 539, 540, 571, 572, 603, 604, 635, 636, 667, 668, 699, 700,
+  731, 732, 763, 764, 795, 796, 827, 828, 859, 860, 891, 892, 923,
+  924, 955, 956, 987, 988, 1019,
+  28, 28, 29, 60, 61, 92, 93, 124, 125, 156, 157, 188, 189, 220,
+  221, 252, 253, 284, 285, 316, 317, 348, 349, 380, 381, 412, 413,
+  444, 445, 476, 477, 508,
+  509, 540, 541, 572, 573, 604, 605, 636, 637, 668, 669, 700, 701,
+  732, 733, 764, 765, 796, 797, 828, 829, 860, 861, 892, 893, 924,
+  925, 956, 957, 988, 989, 1020,
+  29, 29, 30, 61, 62, 93, 94, 125, 126, 157, 158, 189, 190, 221,
+  222, 253, 254, 285, 286, 317, 318, 349, 350, 381, 382, 413, 414,
+  445, 446, 477, 478, 509,
+  510, 541, 542, 573, 574, 605, 606, 637, 638, 669, 670, 701, 702,
+  733, 734, 765, 766, 797, 798, 829, 830, 861, 862, 893, 894, 925,
+  926, 957, 958, 989, 990, 1021,
+  30, 30, 31, 62, 63, 94, 95, 126, 127, 158, 159, 190, 191, 222,
+  223, 254, 255, 286, 287, 318, 319, 350, 351, 382, 383, 414, 415,
+  446, 447, 478, 479, 510,
+  511, 542, 543, 574, 575, 606, 607, 638, 639, 670, 671, 702, 703,
+  734, 735, 766, 767, 798, 799, 830, 831, 862, 863, 894, 895, 926,
+  927, 958, 959, 990, 991, 1022,
+  0, 0,
 };
 
 DECLARE_ALIGNED(16, static const int16_t,
-                default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
-  0, 0, 0, 0, 0, 0, 32, 32, 1, 32, 1, 1, 64, 64, 33, 64,
-  2, 33, 96, 96, 2, 2, 65, 96, 34, 65, 128, 128, 97, 128, 3, 34,
-  66, 97, 3, 3, 35, 66, 98, 129, 129, 160, 160, 160, 4, 35, 67, 98,
-  192, 192, 4, 4, 130, 161, 161, 192, 36, 67, 99, 130, 5, 36, 68, 99,
-  193, 224, 162, 193, 224, 224, 131, 162, 37, 68, 100, 131, 5, 5, 194, 225,
-  225, 256, 256, 256, 163, 194, 69, 100, 132, 163, 6, 37, 226, 257, 6, 6,
-  195, 226, 257, 288, 101, 132, 288, 288, 38, 69, 164, 195, 133, 164, 258, 289,
-  227, 258, 196, 227, 7, 38, 289, 320, 70, 101, 320, 320, 7, 7, 165, 196,
-  39, 70, 102, 133, 290, 321, 259, 290, 228, 259, 321, 352, 352, 352, 197, 228,
-  134, 165, 71, 102, 8, 39, 322, 353, 291, 322, 260, 291, 103, 134, 353, 384,
-  166, 197, 229, 260, 40, 71, 8, 8, 384, 384, 135, 166, 354, 385, 323, 354,
-  198, 229, 292, 323, 72, 103, 261, 292, 9, 40, 385, 416, 167, 198, 104, 135,
-  230, 261, 355, 386, 416, 416, 293, 324, 324, 355, 9, 9, 41, 72, 386, 417,
-  199, 230, 136, 167, 417, 448, 262, 293, 356, 387, 73, 104, 387, 418, 231, 262,
-  10, 41, 168, 199, 325, 356, 418, 449, 105, 136, 448, 448, 42, 73, 294, 325,
-  200, 231, 10, 10, 357, 388, 137, 168, 263, 294, 388, 419, 74, 105, 419, 450,
-  449, 480, 326, 357, 232, 263, 295, 326, 169, 200, 11, 42, 106, 137, 480, 480,
-  450, 481, 358, 389, 264, 295, 201, 232, 138, 169, 389, 420, 43, 74, 420, 451,
-  327, 358, 11, 11, 481, 512, 233, 264, 451, 482, 296, 327, 75, 106, 170, 201,
-  482, 513, 512, 512, 390, 421, 359, 390, 421, 452, 107, 138, 12, 43, 202, 233,
-  452, 483, 265, 296, 328, 359, 139, 170, 44, 75, 483, 514, 513, 544, 234, 265,
-  297, 328, 422, 453, 12, 12, 391, 422, 171, 202, 76, 107, 514, 545, 453, 484,
-  544, 544, 266, 297, 203, 234, 108, 139, 329, 360, 298, 329, 140, 171, 515,
-  546, 13, 44, 423, 454, 235, 266, 545, 576, 454, 485, 45, 76, 172, 203, 330,
-  361, 576, 576, 13, 13, 267, 298, 546, 577, 77, 108, 204, 235, 455, 486, 577,
-  608, 299, 330, 109, 140, 547, 578, 14, 45, 14, 14, 141, 172, 578, 609, 331,
-  362, 46, 77, 173, 204, 15, 15, 78, 109, 205, 236, 579, 610, 110, 141, 15, 46,
-  142, 173, 47, 78, 174, 205, 16, 16, 79, 110, 206, 237, 16, 47, 111, 142,
-  48, 79, 143, 174, 80, 111, 175, 206, 17, 48, 17, 17, 207, 238, 49, 80,
-  81, 112, 18, 18, 18, 49, 50, 81, 82, 113, 19, 50, 51, 82, 83, 114, 608, 608,
-  484, 515, 360, 391, 236, 267, 112, 143, 19, 19, 640, 640, 609, 640, 516, 547,
-  485, 516, 392, 423, 361, 392, 268, 299, 237, 268, 144, 175, 113, 144, 20, 51,
-  20, 20, 672, 672, 641, 672, 610, 641, 548, 579, 517, 548, 486, 517, 424, 455,
-  393, 424, 362, 393, 300, 331, 269, 300, 238, 269, 176, 207, 145, 176, 114,
-  145, 52, 83, 21, 52, 21, 21, 704, 704, 673, 704, 642, 673, 611, 642, 580,
-  611, 549, 580, 518, 549, 487, 518, 456, 487, 425, 456, 394, 425, 363, 394,
-  332, 363, 301, 332, 270, 301, 239, 270, 208, 239, 177, 208, 146, 177, 115,
-  146, 84, 115, 53, 84, 22, 53, 22, 22, 705, 736, 674, 705, 643, 674, 581, 612,
-  550, 581, 519, 550, 457, 488, 426, 457, 395, 426, 333, 364, 302, 333, 271,
-  302, 209, 240, 178, 209, 147, 178, 85, 116, 54, 85, 23, 54, 706, 737, 675,
-  706, 582, 613, 551, 582, 458, 489, 427, 458, 334, 365, 303, 334, 210, 241,
-  179, 210, 86, 117, 55, 86, 707, 738, 583, 614, 459, 490, 335, 366, 211, 242,
-  87, 118, 736, 736, 612, 643, 488, 519, 364, 395, 240, 271, 116, 147, 23, 23,
-  768, 768, 737, 768, 644, 675, 613, 644, 520, 551, 489, 520, 396, 427, 365,
-  396, 272, 303, 241, 272, 148, 179, 117, 148, 24, 55, 24, 24, 800, 800, 769,
-  800, 738, 769, 676, 707, 645, 676, 614, 645, 552, 583, 521, 552, 490, 521,
-  428, 459, 397, 428, 366, 397, 304, 335, 273, 304, 242, 273, 180, 211, 149,
-  180, 118, 149, 56, 87, 25, 56, 25, 25, 832, 832, 801, 832, 770, 801, 739,
-  770, 708, 739, 677, 708, 646, 677, 615, 646, 584, 615, 553, 584, 522, 553,
-  491, 522, 460, 491, 429, 460, 398, 429, 367, 398, 336, 367, 305, 336, 274,
-  305, 243, 274, 212, 243, 181, 212, 150, 181, 119, 150, 88, 119, 57, 88, 26,
-  57, 26, 26, 833, 864, 802, 833, 771, 802, 709, 740, 678, 709, 647, 678, 585,
-  616, 554, 585, 523, 554, 461, 492, 430, 461, 399, 430, 337, 368, 306, 337,
-  275, 306, 213, 244, 182, 213, 151, 182, 89, 120, 58, 89, 27, 58, 834, 865,
-  803, 834, 710, 741, 679, 710, 586, 617, 555, 586, 462, 493, 431, 462, 338,
-  369, 307, 338, 214, 245, 183, 214, 90, 121, 59, 90, 835, 866, 711, 742, 587,
-  618, 463, 494, 339, 370, 215, 246, 91, 122, 864, 864, 740, 771, 616, 647,
-  492, 523, 368, 399, 244, 275, 120, 151, 27, 27, 896, 896, 865, 896, 772, 803,
-  741, 772, 648, 679, 617, 648, 524, 555, 493, 524, 400, 431, 369, 400, 276,
-  307, 245, 276, 152, 183, 121, 152, 28, 59, 28, 28, 928, 928, 897, 928, 866,
-  897, 804, 835, 773, 804, 742, 773, 680, 711, 649, 680, 618, 649, 556, 587,
-  525, 556, 494, 525, 432, 463, 401, 432, 370, 401, 308, 339, 277, 308, 246,
-  277, 184, 215, 153, 184, 122, 153, 60, 91, 29, 60, 29, 29, 960, 960, 929,
-  960, 898, 929, 867, 898, 836, 867, 805, 836, 774, 805, 743, 774, 712, 743,
-  681, 712, 650, 681, 619, 650, 588, 619, 557, 588, 526, 557, 495, 526, 464,
-  495, 433, 464, 402, 433, 371, 402, 340, 371, 309, 340, 278, 309, 247, 278,
-  216, 247, 185, 216, 154, 185, 123, 154, 92, 123, 61, 92, 30, 61, 30, 30,
-  961, 992, 930, 961, 899, 930, 837, 868, 806, 837, 775, 806, 713, 744, 682,
-  713, 651, 682, 589, 620, 558, 589, 527, 558, 465, 496, 434, 465, 403, 434,
-  341, 372, 310, 341, 279, 310, 217, 248, 186, 217, 155, 186, 93, 124, 62, 93,
-  31, 62, 962, 993, 931, 962, 838, 869, 807, 838, 714, 745, 683, 714, 590, 621,
-  559, 590, 466, 497, 435, 466, 342, 373, 311, 342, 218, 249, 187, 218, 94,
-  125, 63, 94, 963, 994, 839, 870, 715, 746, 591, 622, 467, 498, 343, 374, 219,
-  250, 95, 126, 868, 899, 744, 775, 620, 651, 496, 527, 372, 403, 248, 279,
-  124, 155, 900, 931, 869, 900, 776, 807, 745, 776, 652, 683, 621, 652, 528,
-  559, 497, 528, 404, 435, 373, 404, 280, 311, 249, 280, 156, 187, 125, 156,
-  932, 963, 901, 932, 870, 901, 808, 839, 777, 808, 746, 777, 684, 715, 653,
-  684, 622, 653, 560, 591, 529, 560, 498, 529, 436, 467, 405, 436, 374, 405,
-  312, 343, 281, 312, 250, 281, 188, 219, 157, 188, 126, 157, 964, 995, 933,
-  964, 902, 933, 871, 902, 840, 871, 809, 840, 778, 809, 747, 778, 716, 747,
-  685, 716, 654, 685, 623, 654, 592, 623, 561, 592, 530, 561, 499, 530, 468,
-  499, 437, 468, 406, 437, 375, 406, 344, 375, 313, 344, 282, 313, 251, 282,
-  220, 251, 189, 220, 158, 189, 127, 158, 965, 996, 934, 965, 903, 934, 841,
-  872, 810, 841, 779, 810, 717, 748, 686, 717, 655, 686, 593, 624, 562, 593,
-  531, 562, 469, 500, 438, 469, 407, 438, 345, 376, 314, 345, 283, 314, 221,
-  252, 190, 221, 159, 190, 966, 997, 935, 966, 842, 873, 811, 842, 718, 749,
-  687, 718, 594, 625, 563, 594, 470, 501, 439, 470, 346, 377, 315, 346, 222,
-  253, 191, 222, 967, 998, 843, 874, 719, 750, 595, 626, 471, 502, 347, 378,
-  223, 254, 872, 903, 748, 779, 624, 655, 500, 531, 376, 407, 252, 283, 904,
-  935, 873, 904, 780, 811, 749, 780, 656, 687, 625, 656, 532, 563, 501, 532,
-  408, 439, 377, 408, 284, 315, 253, 284, 936, 967, 905, 936, 874, 905, 812,
-  843, 781, 812, 750, 781, 688, 719, 657, 688, 626, 657, 564, 595, 533, 564,
-  502, 533, 440, 471, 409, 440, 378, 409, 316, 347, 285, 316, 254, 285, 968,
-  999, 937, 968, 906, 937, 875, 906, 844, 875, 813, 844, 782, 813, 751, 782,
-  720, 751, 689, 720, 658, 689, 627, 658, 596, 627, 565, 596, 534, 565, 503,
-  534, 472, 503, 441, 472, 410, 441, 379, 410, 348, 379, 317, 348, 286, 317,
-  255, 286, 969, 1000, 938, 969, 907, 938, 845, 876, 814, 845, 783, 814, 721,
-  752, 690, 721, 659, 690, 597, 628, 566, 597, 535, 566, 473, 504, 442, 473,
-  411, 442, 349, 380, 318, 349, 287, 318, 970, 1001, 939, 970, 846, 877, 815,
-  846, 722, 753, 691, 722, 598, 629, 567, 598, 474, 505, 443, 474, 350, 381,
-  319, 350, 971, 1002, 847, 878, 723, 754, 599, 630, 475, 506, 351, 382, 876,
-  907, 752, 783, 628, 659, 504, 535, 380, 411, 908, 939, 877, 908, 784, 815,
-  753, 784, 660, 691, 629, 660, 536, 567, 505, 536, 412, 443, 381, 412, 940,
-  971, 909, 940, 878, 909, 816, 847, 785, 816, 754, 785, 692, 723, 661, 692,
-  630, 661, 568, 599, 537, 568, 506, 537, 444, 475, 413, 444, 382, 413, 972,
-  1003, 941, 972, 910, 941, 879, 910, 848, 879, 817, 848, 786, 817, 755, 786,
-  724, 755, 693, 724, 662, 693, 631, 662, 600, 631, 569, 600, 538, 569, 507,
-  538, 476, 507, 445, 476, 414, 445, 383, 414, 973, 1004, 942, 973, 911, 942,
-  849, 880, 818, 849, 787, 818, 725, 756, 694, 725, 663, 694, 601, 632, 570,
-  601, 539, 570, 477, 508, 446, 477, 415, 446, 974, 1005, 943, 974, 850, 881,
-  819, 850, 726, 757, 695, 726, 602, 633, 571, 602, 478, 509, 447, 478, 975,
-  1006, 851, 882, 727, 758, 603, 634, 479, 510, 880, 911, 756, 787, 632, 663,
-  508, 539, 912, 943, 881, 912, 788, 819, 757, 788, 664, 695, 633, 664, 540,
-  571, 509, 540, 944, 975, 913, 944, 882, 913, 820, 851, 789, 820, 758, 789,
-  696, 727, 665, 696, 634, 665, 572, 603, 541, 572, 510, 541, 976, 1007, 945,
-  976, 914, 945, 883, 914, 852, 883, 821, 852, 790, 821, 759, 790, 728, 759,
-  697, 728, 666, 697, 635, 666, 604, 635, 573, 604, 542, 573, 511, 542, 977,
-  1008, 946, 977, 915, 946, 853, 884, 822, 853, 791, 822, 729, 760, 698, 729,
-  667, 698, 605, 636, 574, 605, 543, 574, 978, 1009, 947, 978, 854, 885, 823,
-  854, 730, 761, 699, 730, 606, 637, 575, 606, 979, 1010, 855, 886, 731, 762,
-  607, 638, 884, 915, 760, 791, 636, 667, 916, 947, 885, 916, 792, 823, 761,
-  792, 668, 699, 637, 668, 948, 979, 917, 948, 886, 917, 824, 855, 793, 824,
-  762, 793, 700, 731, 669, 700, 638, 669, 980, 1011, 949, 980, 918, 949, 887,
-  918, 856, 887, 825, 856, 794, 825, 763, 794, 732, 763, 701, 732, 670, 701,
-  639, 670, 981, 1012, 950, 981, 919, 950, 857, 888, 826, 857, 795, 826, 733,
-  764, 702, 733, 671, 702, 982, 1013, 951, 982, 858, 889, 827, 858, 734, 765,
-  703, 734, 983, 1014, 859, 890, 735, 766, 888, 919, 764, 795, 920, 951, 889,
-  920, 796, 827, 765, 796, 952, 983, 921, 952, 890, 921, 828, 859, 797, 828,
-  766, 797, 984, 1015, 953, 984, 922, 953, 891, 922, 860, 891, 829, 860, 798,
-  829, 767, 798, 985, 1016, 954, 985, 923, 954, 861, 892, 830, 861, 799, 830,
-  986, 1017, 955, 986, 862, 893, 831, 862, 987, 1018, 863, 894, 892, 923, 924,
-  955, 893, 924, 956, 987, 925, 956, 894, 925, 988, 1019, 957, 988, 926, 957,
-  895, 926, 989, 1020, 958, 989, 927, 958, 990, 1021, 959, 990, 991, 1022, 0, 0,
+                mrow_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+  0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9,
+  9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14,
+  15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 21, 21, 22, 22,
+  23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, 30,
+  0, 0, 1, 32, 2, 33, 3, 34, 4, 35, 5, 36, 6, 37, 7, 38, 8, 39, 9,
+  40, 10, 41, 11, 42, 12, 43, 13, 44, 14, 45, 15, 46,
+  16, 47, 17, 48, 18, 49, 19, 50, 20, 51, 21, 52, 22, 53, 23, 54,
+  24, 55, 25, 56, 26, 57, 27, 58, 28, 59, 29, 60, 30, 61, 31, 62,
+  32, 32, 33, 64, 34, 65, 35, 66, 36, 67, 37, 68, 38, 69, 39, 70,
+  40, 71, 41, 72, 42, 73, 43, 74, 44, 75, 45, 76, 46, 77, 47, 78,
+  48, 79, 49, 80, 50, 81, 51, 82, 52, 83, 53, 84, 54, 85, 55, 86,
+  56, 87, 57, 88, 58, 89, 59, 90, 60, 91, 61, 92, 62, 93, 63, 94,
+  64, 64, 65, 96, 66, 97, 67, 98, 68, 99, 69, 100, 70, 101, 71,
+  102, 72, 103, 73, 104, 74, 105, 75, 106, 76, 107, 77, 108, 78,
+  109, 79, 110,
+  80, 111, 81, 112, 82, 113, 83, 114, 84, 115, 85, 116, 86, 117,
+  87, 118, 88, 119, 89, 120, 90, 121, 91, 122, 92, 123, 93, 124,
+  94, 125, 95, 126,
+  96, 96, 97, 128, 98, 129, 99, 130, 100, 131, 101, 132, 102,
+  133, 103, 134, 104, 135, 105, 136, 106, 137, 107, 138, 108,
+  139, 109, 140, 110, 141, 111, 142,
+  112, 143, 113, 144, 114, 145, 115, 146, 116, 147, 117, 148,
+  118, 149, 119, 150, 120, 151, 121, 152, 122, 153, 123, 154,
+  124, 155, 125, 156, 126, 157, 127, 158,
+  128, 128, 129, 160, 130, 161, 131, 162, 132, 163, 133, 164,
+  134, 165, 135, 166, 136, 167, 137, 168, 138, 169, 139, 170,
+  140, 171, 141, 172, 142, 173, 143, 174,
+  144, 175, 145, 176, 146, 177, 147, 178, 148, 179, 149, 180,
+  150, 181, 151, 182, 152, 183, 153, 184, 154, 185, 155, 186,
+  156, 187, 157, 188, 158, 189, 159, 190,
+  160, 160, 161, 192, 162, 193, 163, 194, 164, 195, 165, 196,
+  166, 197, 167, 198, 168, 199, 169, 200, 170, 201, 171, 202,
+  172, 203, 173, 204, 174, 205, 175, 206,
+  176, 207, 177, 208, 178, 209, 179, 210, 180, 211, 181, 212,
+  182, 213, 183, 214, 184, 215, 185, 216, 186, 217, 187, 218,
+  188, 219, 189, 220, 190, 221, 191, 222,
+  192, 192, 193, 224, 194, 225, 195, 226, 196, 227, 197, 228,
+  198, 229, 199, 230, 200, 231, 201, 232, 202, 233, 203, 234,
+  204, 235, 205, 236, 206, 237, 207, 238,
+  208, 239, 209, 240, 210, 241, 211, 242, 212, 243, 213, 244,
+  214, 245, 215, 246, 216, 247, 217, 248, 218, 249, 219, 250,
+  220, 251, 221, 252, 222, 253, 223, 254,
+  224, 224, 225, 256, 226, 257, 227, 258, 228, 259, 229, 260,
+  230, 261, 231, 262, 232, 263, 233, 264, 234, 265, 235, 266,
+  236, 267, 237, 268, 238, 269, 239, 270,
+  240, 271, 241, 272, 242, 273, 243, 274, 244, 275, 245, 276,
+  246, 277, 247, 278, 248, 279, 249, 280, 250, 281, 251, 282,
+  252, 283, 253, 284, 254, 285, 255, 286,
+  256, 256, 257, 288, 258, 289, 259, 290, 260, 291, 261, 292,
+  262, 293, 263, 294, 264, 295, 265, 296, 266, 297, 267, 298,
+  268, 299, 269, 300, 270, 301, 271, 302,
+  272, 303, 273, 304, 274, 305, 275, 306, 276, 307, 277, 308,
+  278, 309, 279, 310, 280, 311, 281, 312, 282, 313, 283, 314,
+  284, 315, 285, 316, 286, 317, 287, 318,
+  288, 288, 289, 320, 290, 321, 291, 322, 292, 323, 293, 324,
+  294, 325, 295, 326, 296, 327, 297, 328, 298, 329, 299, 330,
+  300, 331, 301, 332, 302, 333, 303, 334,
+  304, 335, 305, 336, 306, 337, 307, 338, 308, 339, 309, 340,
+  310, 341, 311, 342, 312, 343, 313, 344, 314, 345, 315, 346,
+  316, 347, 317, 348, 318, 349, 319, 350,
+  320, 320, 321, 352, 322, 353, 323, 354, 324, 355, 325, 356,
+  326, 357, 327, 358, 328, 359, 329, 360, 330, 361, 331, 362,
+  332, 363, 333, 364, 334, 365, 335, 366,
+  336, 367, 337, 368, 338, 369, 339, 370, 340, 371, 341, 372,
+  342, 373, 343, 374, 344, 375, 345, 376, 346, 377, 347, 378,
+  348, 379, 349, 380, 350, 381, 351, 382,
+  352, 352, 353, 384, 354, 385, 355, 386, 356, 387, 357, 388,
+  358, 389, 359, 390, 360, 391, 361, 392, 362, 393, 363, 394,
+  364, 395, 365, 396, 366, 397, 367, 398,
+  368, 399, 369, 400, 370, 401, 371, 402, 372, 403, 373, 404,
+  374, 405, 375, 406, 376, 407, 377, 408, 378, 409, 379, 410,
+  380, 411, 381, 412, 382, 413, 383, 414,
+  384, 384, 385, 416, 386, 417, 387, 418, 388, 419, 389, 420,
+  390, 421, 391, 422, 392, 423, 393, 424, 394, 425, 395, 426,
+  396, 427, 397, 428, 398, 429, 399, 430,
+  400, 431, 401, 432, 402, 433, 403, 434, 404, 435, 405, 436,
+  406, 437, 407, 438, 408, 439, 409, 440, 410, 441, 411, 442,
+  412, 443, 413, 444, 414, 445, 415, 446,
+  416, 416, 417, 448, 418, 449, 419, 450, 420, 451, 421, 452,
+  422, 453, 423, 454, 424, 455, 425, 456, 426, 457, 427, 458,
+  428, 459, 429, 460, 430, 461, 431, 462,
+  432, 463, 433, 464, 434, 465, 435, 466, 436, 467, 437, 468,
+  438, 469, 439, 470, 440, 471, 441, 472, 442, 473, 443, 474,
+  444, 475, 445, 476, 446, 477, 447, 478,
+  448, 448, 449, 480, 450, 481, 451, 482, 452, 483, 453, 484,
+  454, 485, 455, 486, 456, 487, 457, 488, 458, 489, 459, 490,
+  460, 491, 461, 492, 462, 493, 463, 494,
+  464, 495, 465, 496, 466, 497, 467, 498, 468, 499, 469, 500,
+  470, 501, 471, 502, 472, 503, 473, 504, 474, 505, 475, 506,
+  476, 507, 477, 508, 478, 509, 479, 510,
+  480, 480, 481, 512, 482, 513, 483, 514, 484, 515, 485, 516,
+  486, 517, 487, 518, 488, 519, 489, 520, 490, 521, 491, 522,
+  492, 523, 493, 524, 494, 525, 495, 526,
+  496, 527, 497, 528, 498, 529, 499, 530, 500, 531, 501, 532,
+  502, 533, 503, 534, 504, 535, 505, 536, 506, 537, 507, 538,
+  508, 539, 509, 540, 510, 541, 511, 542,
+  512, 512, 513, 544, 514, 545, 515, 546, 516, 547, 517, 548,
+  518, 549, 519, 550, 520, 551, 521, 552, 522, 553, 523, 554,
+  524, 555, 525, 556, 526, 557, 527, 558,
+  528, 559, 529, 560, 530, 561, 531, 562, 532, 563, 533, 564,
+  534, 565, 535, 566, 536, 567, 537, 568, 538, 569, 539, 570,
+  540, 571, 541, 572, 542, 573, 543, 574,
+  544, 544, 545, 576, 546, 577, 547, 578, 548, 579, 549, 580,
+  550, 581, 551, 582, 552, 583, 553, 584, 554, 585, 555, 586,
+  556, 587, 557, 588, 558, 589, 559, 590,
+  560, 591, 561, 592, 562, 593, 563, 594, 564, 595, 565, 596,
+  566, 597, 567, 598, 568, 599, 569, 600, 570, 601, 571, 602,
+  572, 603, 573, 604, 574, 605, 575, 606,
+  576, 576, 577, 608, 578, 609, 579, 610, 580, 611, 581, 612,
+  582, 613, 583, 614, 584, 615, 585, 616, 586, 617, 587, 618,
+  588, 619, 589, 620, 590, 621, 591, 622,
+  592, 623, 593, 624, 594, 625, 595, 626, 596, 627, 597, 628,
+  598, 629, 599, 630, 600, 631, 601, 632, 602, 633, 603, 634,
+  604, 635, 605, 636, 606, 637, 607, 638,
+  608, 608, 609, 640, 610, 641, 611, 642, 612, 643, 613, 644,
+  614, 645, 615, 646, 616, 647, 617, 648, 618, 649, 619, 650,
+  620, 651, 621, 652, 622, 653, 623, 654,
+  624, 655, 625, 656, 626, 657, 627, 658, 628, 659, 629, 660,
+  630, 661, 631, 662, 632, 663, 633, 664, 634, 665, 635, 666,
+  636, 667, 637, 668, 638, 669, 639, 670,
+  640, 640, 641, 672, 642, 673, 643, 674, 644, 675, 645, 676,
+  646, 677, 647, 678, 648, 679, 649, 680, 650, 681, 651, 682,
+  652, 683, 653, 684, 654, 685, 655, 686,
+  656, 687, 657, 688, 658, 689, 659, 690, 660, 691, 661, 692,
+  662, 693, 663, 694, 664, 695, 665, 696, 666, 697, 667, 698,
+  668, 699, 669, 700, 670, 701, 671, 702,
+  672, 672, 673, 704, 674, 705, 675, 706, 676, 707, 677, 708,
+  678, 709, 679, 710, 680, 711, 681, 712, 682, 713, 683, 714,
+  684, 715, 685, 716, 686, 717, 687, 718,
+  688, 719, 689, 720, 690, 721, 691, 722, 692, 723, 693, 724,
+  694, 725, 695, 726, 696, 727, 697, 728, 698, 729, 699, 730,
+  700, 731, 701, 732, 702, 733, 703, 734,
+  704, 704, 705, 736, 706, 737, 707, 738, 708, 739, 709, 740,
+  710, 741, 711, 742, 712, 743, 713, 744, 714, 745, 715, 746,
+  716, 747, 717, 748, 718, 749, 719, 750,
+  720, 751, 721, 752, 722, 753, 723, 754, 724, 755, 725, 756,
+  726, 757, 727, 758, 728, 759, 729, 760, 730, 761, 731, 762,
+  732, 763, 733, 764, 734, 765, 735, 766,
+  736, 736, 737, 768, 738, 769, 739, 770, 740, 771, 741, 772,
+  742, 773, 743, 774, 744, 775, 745, 776, 746, 777, 747, 778,
+  748, 779, 749, 780, 750, 781, 751, 782,
+  752, 783, 753, 784, 754, 785, 755, 786, 756, 787, 757, 788,
+  758, 789, 759, 790, 760, 791, 761, 792, 762, 793, 763, 794,
+  764, 795, 765, 796, 766, 797, 767, 798,
+  768, 768, 769, 800, 770, 801, 771, 802, 772, 803, 773, 804,
+  774, 805, 775, 806, 776, 807, 777, 808, 778, 809, 779, 810,
+  780, 811, 781, 812, 782, 813, 783, 814,
+  784, 815, 785, 816, 786, 817, 787, 818, 788, 819, 789, 820,
+  790, 821, 791, 822, 792, 823, 793, 824, 794, 825, 795, 826,
+  796, 827, 797, 828, 798, 829, 799, 830,
+  800, 800, 801, 832, 802, 833, 803, 834, 804, 835, 805, 836,
+  806, 837, 807, 838, 808, 839, 809, 840, 810, 841, 811, 842,
+  812, 843, 813, 844, 814, 845, 815, 846,
+  816, 847, 817, 848, 818, 849, 819, 850, 820, 851, 821, 852,
+  822, 853, 823, 854, 824, 855, 825, 856, 826, 857, 827, 858,
+  828, 859, 829, 860, 830, 861, 831, 862,
+  832, 832, 833, 864, 834, 865, 835, 866, 836, 867, 837, 868,
+  838, 869, 839, 870, 840, 871, 841, 872, 842, 873, 843, 874,
+  844, 875, 845, 876, 846, 877, 847, 878,
+  848, 879, 849, 880, 850, 881, 851, 882, 852, 883, 853, 884,
+  854, 885, 855, 886, 856, 887, 857, 888, 858, 889, 859, 890,
+  860, 891, 861, 892, 862, 893, 863, 894,
+  864, 864, 865, 896, 866, 897, 867, 898, 868, 899, 869, 900,
+  870, 901, 871, 902, 872, 903, 873, 904, 874, 905, 875, 906,
+  876, 907, 877, 908, 878, 909, 879, 910,
+  880, 911, 881, 912, 882, 913, 883, 914, 884, 915, 885, 916,
+  886, 917, 887, 918, 888, 919, 889, 920, 890, 921, 891, 922,
+  892, 923, 893, 924, 894, 925, 895, 926,
+  896, 896, 897, 928, 898, 929, 899, 930, 900, 931, 901, 932,
+  902, 933, 903, 934, 904, 935, 905, 936, 906, 937, 907, 938,
+  908, 939, 909, 940, 910, 941, 911, 942,
+  912, 943, 913, 944, 914, 945, 915, 946, 916, 947, 917, 948,
+  918, 949, 919, 950, 920, 951, 921, 952, 922, 953, 923, 954,
+  924, 955, 925, 956, 926, 957, 927, 958,
+  928, 928, 929, 960, 930, 961, 931, 962, 932, 963, 933, 964,
+  934, 965, 935, 966, 936, 967, 937, 968, 938, 969, 939, 970,
+  940, 971, 941, 972, 942, 973, 943, 974,
+  944, 975, 945, 976, 946, 977, 947, 978, 948, 979, 949, 980,
+  950, 981, 951, 982, 952, 983, 953, 984, 954, 985, 955, 986,
+  956, 987, 957, 988, 958, 989, 959, 990,
+  960, 960, 961, 992, 962, 993, 963, 994, 964, 995, 965, 996,
+  966, 997, 967, 998, 968, 999, 969, 1000, 970, 1001, 971, 1002,
+  972, 1003, 973, 1004, 974, 1005, 975, 1006,
+  976, 1007, 977, 1008, 978, 1009, 979, 1010, 980, 1011, 981,
+  1012, 982, 1013, 983, 1014, 984, 1015, 985, 1016, 986, 1017,
+  987, 1018, 988, 1019, 989, 1020, 990, 1021, 991, 1022,
+  0, 0,
 };
+#endif  // CONFIG_EXT_TX
+
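+/* For reference, these tables are read pairwise by the coefficient-context
+ * helper; a minimal sketch of the consumer, assuming the get_coef_context()
+ * shape used by the vp9 code base:
+ *
+ *   static INLINE int get_coef_context(const int16_t *neighbors,
+ *                                      const uint8_t *token_cache, int c) {
+ *     return (1 + token_cache[neighbors[MAX_NEIGHBORS * c + 0]] +
+ *             token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
+ *   }
+ */
+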
+DECLARE_ALIGNED(16, static const int16_t,
+                default_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+  0,   0,   0,   0,  32,   0,  32,  32,   1,  32,  33,   1,
+  64,  64,  33,  64,   2,  33,  96,  96,  34,   2,  65,  96,
+  34,  65, 128, 128,  97, 128,   3,  34,  66,  97,  35,   3,
+  35,  66,  98, 129, 129, 160, 160, 161,   4,  35,  67,  98,
+  192, 192,  36,   4, 130, 161, 161, 192,  36,  67,  99, 130,
+  5,  36,  68,  99, 193, 224, 162, 193, 224, 225, 131, 162,
+  37,  68, 100, 131,  37,   5, 194, 225, 225, 256, 256, 257,
+  163, 194,  69, 100, 132, 163,   6,  37, 226, 257,  38,   6,
+  195, 226, 257, 288, 101, 132, 288, 289,  38,  69, 164, 195,
+  133, 164, 258, 289, 227, 258, 196, 227,   7,  38, 289, 320,
+  70, 101, 320, 321,  39,   7, 165, 196,  39,  70, 102, 133,
+  290, 321, 259, 290, 228, 259, 321, 352, 352, 353, 197, 228,
+  134, 165,  71, 102,   8,  39, 322, 353, 291, 322, 260, 291,
+  103, 134, 353, 384, 166, 197, 229, 260,  40,  71,  40,   8,
+  384, 385, 135, 166, 354, 385, 323, 354, 198, 229, 292, 323,
+  72, 103, 261, 292,   9,  40, 385, 416, 167, 198, 104, 135,
+  230, 261, 355, 386, 416, 417, 293, 324, 324, 355,  41,   9,
+  41,  72, 386, 417, 199, 230, 136, 167, 417, 448, 262, 293,
+  356, 387,  73, 104, 387, 418, 231, 262,  10,  41, 168, 199,
+  325, 356, 418, 449, 105, 136, 448, 449,  42,  73, 294, 325,
+  200, 231,  42,  10, 357, 388, 137, 168, 263, 294, 388, 419,
+  74, 105, 419, 450, 449, 480, 326, 357, 232, 263, 295, 326,
+  169, 200,  11,  42, 106, 137, 480, 481, 450, 481, 358, 389,
+  264, 295, 201, 232, 138, 169, 389, 420,  43,  74, 420, 451,
+  327, 358,  43,  11, 481, 512, 233, 264, 451, 482, 296, 327,
+  75, 106, 170, 201, 482, 513, 512, 513, 390, 421, 359, 390,
+  421, 452, 107, 138,  12,  43, 202, 233, 452, 483, 265, 296,
+  328, 359, 139, 170,  44,  75, 483, 514, 513, 544, 234, 265,
+  297, 328, 422, 453,  44,  12, 391, 422, 171, 202,  76, 107,
+  514, 545, 453, 484, 544, 545, 266, 297, 203, 234, 108, 139,
+  329, 360, 298, 329, 140, 171, 515, 546,  13,  44, 423, 454,
+  235, 266, 545, 576, 454, 485,  45,  76, 172, 203, 330, 361,
+  576, 577,  45,  13, 267, 298, 546, 577,  77, 108, 204, 235,
+  455, 486, 577, 608, 299, 330, 109, 140, 547, 578,  14,  45,
+  46,  14, 141, 172, 578, 609, 331, 362,  46,  77, 173, 204,
+  15,  15,  78, 109, 205, 236, 579, 610, 110, 141,  15,  46,
+  142, 173,  47,  78, 174, 205,  16,  16,  79, 110, 206, 237,
+  16,  47, 111, 142,  48,  79, 143, 174,  80, 111, 175, 206,
+  17,  48,  49,  17, 207, 238,  49,  80,  81, 112,  18,  18,
+  18,  49,  50,  81,  82, 113,  19,  50,  51,  82,  83, 114,
+  608, 609, 484, 515, 360, 391, 236, 267, 112, 143,  51,  19,
+  640, 640, 609, 640, 516, 547, 485, 516, 392, 423, 361, 392,
+  268, 299, 237, 268, 144, 175, 113, 144,  20,  51,  52,  20,
+  672, 672, 641, 672, 610, 641, 548, 579, 517, 548, 486, 517,
+  424, 455, 393, 424, 362, 393, 300, 331, 269, 300, 238, 269,
+  176, 207, 145, 176, 114, 145,  52,  83,  21,  52,  53,  21,
+  704, 704, 673, 704, 642, 673, 611, 642, 580, 611, 549, 580,
+  518, 549, 487, 518, 456, 487, 425, 456, 394, 425, 363, 394,
+  332, 363, 301, 332, 270, 301, 239, 270, 208, 239, 177, 208,
+  146, 177, 115, 146,  84, 115,  53,  84,  22,  53,  54,  22,
+  705, 736, 674, 705, 643, 674, 581, 612, 550, 581, 519, 550,
+  457, 488, 426, 457, 395, 426, 333, 364, 302, 333, 271, 302,
+  209, 240, 178, 209, 147, 178,  85, 116,  54,  85,  23,  54,
+  706, 737, 675, 706, 582, 613, 551, 582, 458, 489, 427, 458,
+  334, 365, 303, 334, 210, 241, 179, 210,  86, 117,  55,  86,
+  707, 738, 583, 614, 459, 490, 335, 366, 211, 242,  87, 118,
+  736, 737, 612, 643, 488, 519, 364, 395, 240, 271, 116, 147,
+  55,  23, 768, 768, 737, 768, 644, 675, 613, 644, 520, 551,
+  489, 520, 396, 427, 365, 396, 272, 303, 241, 272, 148, 179,
+  117, 148,  24,  55,  56,  24, 800, 800, 769, 800, 738, 769,
+  676, 707, 645, 676, 614, 645, 552, 583, 521, 552, 490, 521,
+  428, 459, 397, 428, 366, 397, 304, 335, 273, 304, 242, 273,
+  180, 211, 149, 180, 118, 149,  56,  87,  25,  56,  57,  25,
+  832, 832, 801, 832, 770, 801, 739, 770, 708, 739, 677, 708,
+  646, 677, 615, 646, 584, 615, 553, 584, 522, 553, 491, 522,
+  460, 491, 429, 460, 398, 429, 367, 398, 336, 367, 305, 336,
+  274, 305, 243, 274, 212, 243, 181, 212, 150, 181, 119, 150,
+  88, 119,  57,  88,  26,  57,  58,  26, 833, 864, 802, 833,
+  771, 802, 709, 740, 678, 709, 647, 678, 585, 616, 554, 585,
+  523, 554, 461, 492, 430, 461, 399, 430, 337, 368, 306, 337,
+  275, 306, 213, 244, 182, 213, 151, 182,  89, 120,  58,  89,
+  27,  58, 834, 865, 803, 834, 710, 741, 679, 710, 586, 617,
+  555, 586, 462, 493, 431, 462, 338, 369, 307, 338, 214, 245,
+  183, 214,  90, 121,  59,  90, 835, 866, 711, 742, 587, 618,
+  463, 494, 339, 370, 215, 246,  91, 122, 864, 865, 740, 771,
+  616, 647, 492, 523, 368, 399, 244, 275, 120, 151,  59,  27,
+  896, 896, 865, 896, 772, 803, 741, 772, 648, 679, 617, 648,
+  524, 555, 493, 524, 400, 431, 369, 400, 276, 307, 245, 276,
+  152, 183, 121, 152,  28,  59,  60,  28, 928, 928, 897, 928,
+  866, 897, 804, 835, 773, 804, 742, 773, 680, 711, 649, 680,
+  618, 649, 556, 587, 525, 556, 494, 525, 432, 463, 401, 432,
+  370, 401, 308, 339, 277, 308, 246, 277, 184, 215, 153, 184,
+  122, 153,  60,  91,  29,  60,  61,  29, 960, 960, 929, 960,
+  898, 929, 867, 898, 836, 867, 805, 836, 774, 805, 743, 774,
+  712, 743, 681, 712, 650, 681, 619, 650, 588, 619, 557, 588,
+  526, 557, 495, 526, 464, 495, 433, 464, 402, 433, 371, 402,
+  340, 371, 309, 340, 278, 309, 247, 278, 216, 247, 185, 216,
+  154, 185, 123, 154,  92, 123,  61,  92,  30,  61,  62,  30,
+  961, 992, 930, 961, 899, 930, 837, 868, 806, 837, 775, 806,
+  713, 744, 682, 713, 651, 682, 589, 620, 558, 589, 527, 558,
+  465, 496, 434, 465, 403, 434, 341, 372, 310, 341, 279, 310,
+  217, 248, 186, 217, 155, 186,  93, 124,  62,  93,  31,  62,
+  962, 993, 931, 962, 838, 869, 807, 838, 714, 745, 683, 714,
+  590, 621, 559, 590, 466, 497, 435, 466, 342, 373, 311, 342,
+  218, 249, 187, 218,  94, 125,  63,  94, 963, 994, 839, 870,
+  715, 746, 591, 622, 467, 498, 343, 374, 219, 250,  95, 126,
+  868, 899, 744, 775, 620, 651, 496, 527, 372, 403, 248, 279,
+  124, 155, 900, 931, 869, 900, 776, 807, 745, 776, 652, 683,
+  621, 652, 528, 559, 497, 528, 404, 435, 373, 404, 280, 311,
+  249, 280, 156, 187, 125, 156, 932, 963, 901, 932, 870, 901,
+  808, 839, 777, 808, 746, 777, 684, 715, 653, 684, 622, 653,
+  560, 591, 529, 560, 498, 529, 436, 467, 405, 436, 374, 405,
+  312, 343, 281, 312, 250, 281, 188, 219, 157, 188, 126, 157,
+  964, 995, 933, 964, 902, 933, 871, 902, 840, 871, 809, 840,
+  778, 809, 747, 778, 716, 747, 685, 716, 654, 685, 623, 654,
+  592, 623, 561, 592, 530, 561, 499, 530, 468, 499, 437, 468,
+  406, 437, 375, 406, 344, 375, 313, 344, 282, 313, 251, 282,
+  220, 251, 189, 220, 158, 189, 127, 158, 965, 996, 934, 965,
+  903, 934, 841, 872, 810, 841, 779, 810, 717, 748, 686, 717,
+  655, 686, 593, 624, 562, 593, 531, 562, 469, 500, 438, 469,
+  407, 438, 345, 376, 314, 345, 283, 314, 221, 252, 190, 221,
+  159, 190, 966, 997, 935, 966, 842, 873, 811, 842, 718, 749,
+  687, 718, 594, 625, 563, 594, 470, 501, 439, 470, 346, 377,
+  315, 346, 222, 253, 191, 222, 967, 998, 843, 874, 719, 750,
+  595, 626, 471, 502, 347, 378, 223, 254, 872, 903, 748, 779,
+  624, 655, 500, 531, 376, 407, 252, 283, 904, 935, 873, 904,
+  780, 811, 749, 780, 656, 687, 625, 656, 532, 563, 501, 532,
+  408, 439, 377, 408, 284, 315, 253, 284, 936, 967, 905, 936,
+  874, 905, 812, 843, 781, 812, 750, 781, 688, 719, 657, 688,
+  626, 657, 564, 595, 533, 564, 502, 533, 440, 471, 409, 440,
+  378, 409, 316, 347, 285, 316, 254, 285, 968, 999, 937, 968,
+  906, 937, 875, 906, 844, 875, 813, 844, 782, 813, 751, 782,
+  720, 751, 689, 720, 658, 689, 627, 658, 596, 627, 565, 596,
+  534, 565, 503, 534, 472, 503, 441, 472, 410, 441, 379, 410,
+  348, 379, 317, 348, 286, 317, 255, 286, 969, 1000, 938, 969,
+  907, 938, 845, 876, 814, 845, 783, 814, 721, 752, 690, 721,
+  659, 690, 597, 628, 566, 597, 535, 566, 473, 504, 442, 473,
+  411, 442, 349, 380, 318, 349, 287, 318, 970, 1001, 939, 970,
+  846, 877, 815, 846, 722, 753, 691, 722, 598, 629, 567, 598,
+  474, 505, 443, 474, 350, 381, 319, 350, 971, 1002, 847, 878,
+  723, 754, 599, 630, 475, 506, 351, 382, 876, 907, 752, 783,
+  628, 659, 504, 535, 380, 411, 908, 939, 877, 908, 784, 815,
+  753, 784, 660, 691, 629, 660, 536, 567, 505, 536, 412, 443,
+  381, 412, 940, 971, 909, 940, 878, 909, 816, 847, 785, 816,
+  754, 785, 692, 723, 661, 692, 630, 661, 568, 599, 537, 568,
+  506, 537, 444, 475, 413, 444, 382, 413, 972, 1003, 941, 972,
+  910, 941, 879, 910, 848, 879, 817, 848, 786, 817, 755, 786,
+  724, 755, 693, 724, 662, 693, 631, 662, 600, 631, 569, 600,
+  538, 569, 507, 538, 476, 507, 445, 476, 414, 445, 383, 414,
+  973, 1004, 942, 973, 911, 942, 849, 880, 818, 849, 787, 818,
+  725, 756, 694, 725, 663, 694, 601, 632, 570, 601, 539, 570,
+  477, 508, 446, 477, 415, 446, 974, 1005, 943, 974, 850, 881,
+  819, 850, 726, 757, 695, 726, 602, 633, 571, 602, 478, 509,
+  447, 478, 975, 1006, 851, 882, 727, 758, 603, 634, 479, 510,
+  880, 911, 756, 787, 632, 663, 508, 539, 912, 943, 881, 912,
+  788, 819, 757, 788, 664, 695, 633, 664, 540, 571, 509, 540,
+  944, 975, 913, 944, 882, 913, 820, 851, 789, 820, 758, 789,
+  696, 727, 665, 696, 634, 665, 572, 603, 541, 572, 510, 541,
+  976, 1007, 945, 976, 914, 945, 883, 914, 852, 883, 821, 852,
+  790, 821, 759, 790, 728, 759, 697, 728, 666, 697, 635, 666,
+  604, 635, 573, 604, 542, 573, 511, 542, 977, 1008, 946, 977,
+  915, 946, 853, 884, 822, 853, 791, 822, 729, 760, 698, 729,
+  667, 698, 605, 636, 574, 605, 543, 574, 978, 1009, 947, 978,
+  854, 885, 823, 854, 730, 761, 699, 730, 606, 637, 575, 606,
+  979, 1010, 855, 886, 731, 762, 607, 638, 884, 915, 760, 791,
+  636, 667, 916, 947, 885, 916, 792, 823, 761, 792, 668, 699,
+  637, 668, 948, 979, 917, 948, 886, 917, 824, 855, 793, 824,
+  762, 793, 700, 731, 669, 700, 638, 669, 980, 1011, 949, 980,
+  918, 949, 887, 918, 856, 887, 825, 856, 794, 825, 763, 794,
+  732, 763, 701, 732, 670, 701, 639, 670, 981, 1012, 950, 981,
+  919, 950, 857, 888, 826, 857, 795, 826, 733, 764, 702, 733,
+  671, 702, 982, 1013, 951, 982, 858, 889, 827, 858, 734, 765,
+  703, 734, 983, 1014, 859, 890, 735, 766, 888, 919, 764, 795,
+  920, 951, 889, 920, 796, 827, 765, 796, 952, 983, 921, 952,
+  890, 921, 828, 859, 797, 828, 766, 797, 984, 1015, 953, 984,
+  922, 953, 891, 922, 860, 891, 829, 860, 798, 829, 767, 798,
+  985, 1016, 954, 985, 923, 954, 861, 892, 830, 861, 799, 830,
+  986, 1017, 955, 986, 862, 893, 831, 862, 987, 1018, 863, 894,
+  892, 923, 924, 955, 893, 924, 956, 987, 925, 956, 894, 925,
+  988, 1019, 957, 988, 926, 957, 895, 926, 989, 1020, 958, 989,
+  927, 958, 990, 1021, 959, 990, 991, 1022,   0,   0,
+};
+
+#if CONFIG_EXT_TX
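+// Each *_scan_32x32_neighbors table below stores, for every position in
+// the corresponding scan order, MAX_NEIGHBORS (= 2) raster indices of
+// coefficients that are always coded before that position; following the
+// vp9-style coefficient coder, the entropy context for a position is
+// derived from the token_cache values at those two indices. Edge
+// positions with a single usable neighbor repeat it, the DC term opens
+// the table with a (0, 0) pair, and a final (0, 0) pair pads the table
+// to 1025 entries.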
+DECLARE_ALIGNED(16, static const int16_t,
+                v2_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+  0,  0,  0,  0,  0,  0,  1, 32,  1,  1, 32, 32,
+  2, 33, 33, 64, 34, 65,  2,  2, 64, 64,  3, 34,
+  65, 96, 35, 66, 66, 97,  3,  3, 96, 96,  4, 35,
+  97, 128,  67,  98,  36,  67,  98, 129,   4,   4,  68,  99,
+  99, 130, 128, 128,   5,  36, 129, 160,  37,  68, 130, 161,
+  100, 131,  69, 100, 131, 162,   5,   5, 160, 160,   6,  37,
+  161, 192,  38,  69, 162, 193, 101, 132, 132, 163,  70, 101,
+  163, 194,   6,   6, 192, 192,   7,  38, 133, 164, 193, 224,
+  102, 133, 164, 195,  39,  70, 194, 225,  71, 102, 195, 226,
+  134, 165, 165, 196,   7,   7, 224, 224,   8,  39, 103, 134,
+  196, 227, 225, 256,  40,  71, 226, 257, 166, 197,  72, 103,
+  227, 258, 135, 166, 197, 228, 104, 135, 228, 259,   8,   8,
+  256, 256,   9,  40, 257, 288,  41,  72, 167, 198, 198, 229,
+  258, 289, 136, 167, 229, 260,  73, 104, 259, 290, 105, 136,
+  260, 291, 199, 230,   9,   9, 168, 199, 230, 261, 288, 288,
+  10,  41, 289, 320,  42,  73, 290, 321, 137, 168, 261, 292,
+  74, 105, 291, 322, 200, 231, 231, 262, 106, 137, 292, 323,
+  169, 200, 262, 293,  10,  10, 320, 320,  11,  42, 321, 352,
+  43,  74, 138, 169, 293, 324, 322, 353, 232, 263,  75, 106,
+  201, 232, 263, 294, 323, 354, 170, 201, 294, 325, 107, 138,
+  324, 355,  11,  11, 352, 352,  12,  43, 233, 264, 264, 295,
+  353, 384, 139, 170, 325, 356,  44,  75, 354, 385, 202, 233,
+  295, 326,  76, 107, 355, 386, 171, 202, 326, 357, 108, 139,
+  356, 387, 265, 296, 234, 265, 296, 327,  12,  12, 140, 171,
+  357, 388, 384, 384,  13,  44, 203, 234, 327, 358, 385, 416,
+  45,  76, 386, 417,  77, 108, 387, 418, 172, 203, 358, 389,
+  266, 297, 297, 328, 109, 140, 235, 266, 328, 359, 388, 419,
+  204, 235, 359, 390, 141, 172, 389, 420,  13,  13, 416, 416,
+  14,  45, 417, 448,  46,  77, 298, 329, 418, 449, 267, 298,
+  329, 360,  78, 109, 173, 204, 390, 421, 419, 450, 236, 267,
+  360, 391, 110, 141, 420, 451, 205, 236, 391, 422, 142, 173,
+  299, 330, 330, 361, 421, 452,  14,  14, 268, 299, 361, 392,
+  448, 448,  15,  46, 449, 480,  47,  78, 450, 481, 174, 205,
+  422, 453, 237, 268, 392, 423,  79, 110, 451, 482, 111, 142,
+  452, 483, 331, 362, 300, 331, 362, 393, 206, 237, 423, 454,
+  143, 174, 269, 300, 393, 424, 453, 484, 480, 480, 481, 512,
+  238, 269, 424, 455, 482, 513, 175, 206, 454, 485, 332, 363,
+  363, 394, 483, 514, 301, 332, 394, 425, 484, 515, 207, 238,
+  455, 486, 270, 301, 425, 456, 485, 516, 364, 395, 239, 270,
+  456, 487, 512, 512, 333, 364, 395, 426, 513, 544, 486, 517,
+  514, 545, 302, 333, 426, 457, 515, 546, 487, 518, 516, 547,
+  271, 302, 457, 488, 365, 396, 396, 427, 517, 548, 334, 365,
+  427, 458, 488, 519, 544, 544, 303, 334, 458, 489, 518, 549,
+  545, 576, 546, 577, 547, 578, 489, 520, 397, 428, 519, 550,
+  366, 397, 428, 459, 548, 579, 335, 366, 459, 490, 549, 580,
+  520, 551, 490, 521, 550, 581, 576, 576, 577, 608, 398, 429,
+  429, 460, 578, 609, 367, 398, 460, 491, 521, 552, 579, 610,
+  551, 582, 491, 522, 580, 611, 581, 612, 552, 583, 522, 553,
+  430, 461, 399, 430, 461, 492, 582, 613, 492, 523, 608, 608,
+  609, 640, 610, 641, 553, 584, 611, 642, 523, 554, 583, 614,
+  612, 643, 431, 462, 462, 493, 554, 585, 493, 524, 584, 615,
+  613, 644, 524, 555, 614, 645, 640, 640, 585, 616, 641, 672,
+  555, 586, 642, 673, 615, 646, 463, 494, 643, 674, 494, 525,
+  644, 675, 525, 556, 586, 617, 616, 647, 645, 676, 556, 587,
+  646, 677, 495, 526, 617, 648, 587, 618, 672, 672, 526, 557,
+  673, 704, 674, 705, 647, 678, 557, 588, 675, 706, 618, 649,
+  676, 707, 588, 619, 648, 679, 677, 708, 527, 558, 558, 589,
+  678, 709, 619, 650, 649, 680, 704, 704, 589, 620, 705, 736,
+  679, 710, 706, 737, 707, 738, 650, 681, 620, 651, 708, 739,
+  680, 711, 559, 590, 709, 740, 590, 621, 651, 682, 681, 712,
+  710, 741, 621, 652, 736, 736, 737, 768, 711, 742, 738, 769,
+  682, 713, 652, 683, 739, 770, 591, 622, 740, 771, 712, 743,
+  622, 653, 741, 772, 683, 714, 653, 684, 713, 744, 742, 773,
+  623, 654, 743, 774, 768, 768, 769, 800, 684, 715, 714, 745,
+  770, 801, 771, 802, 654, 685, 744, 775, 772, 803, 715, 746,
+  773, 804, 685, 716, 745, 776, 774, 805, 655, 686, 716, 747,
+  775, 806, 746, 777, 800, 800, 801, 832, 686, 717, 802, 833,
+  803, 834, 776, 807, 804, 835, 747, 778, 717, 748, 805, 836,
+  777, 808, 687, 718, 806, 837, 748, 779, 718, 749, 778, 809,
+  807, 838, 832, 832, 833, 864, 834, 865, 835, 866, 808, 839,
+  749, 780, 836, 867, 779, 810, 719, 750, 837, 868, 809, 840,
+  838, 869, 780, 811, 750, 781, 810, 841, 839, 870, 864, 864,
+  865, 896, 866, 897, 840, 871, 867, 898, 781, 812, 811, 842,
+  868, 899, 751, 782, 869, 900, 841, 872, 812, 843, 870, 901,
+  782, 813, 842, 873, 871, 902, 896, 896, 897, 928, 813, 844,
+  898, 929, 872, 903, 783, 814, 843, 874, 899, 930, 900, 931,
+  873, 904, 901, 932, 814, 845, 844, 875, 902, 933, 874, 905,
+  903, 934, 845, 876, 928, 928, 815, 846, 929, 960, 930, 961,
+  875, 906, 904, 935, 931, 962, 932, 963, 905, 936, 846, 877,
+  933, 964, 876, 907, 934, 965, 906, 937, 935, 966, 877, 908,
+  847, 878, 960, 960, 907, 938, 961, 992, 936, 967, 962, 993,
+  963, 994, 964, 995, 878, 909, 937, 968, 908, 939, 965, 996,
+  966, 997, 938, 969, 879, 910, 909, 940, 967, 998, 939, 970,
+  968,  999,  910,  941,  969, 1000,  940,  971,  970, 1001,  911,  942,
+  941,  972,  971, 1002,  942,  973,  972, 1003,  943,  974,  973, 1004,
+  974, 1005,  975, 1006,   15,   15,   16,   47,   48,   79,   80,  111,
+  112, 143, 144, 175,  16,  16,  17,  48, 176, 207,  49,  80,
+  81, 112, 113, 144, 208, 239, 145, 176, 240, 271,  17,  17,
+  18,  49, 177, 208,  50,  81,  82, 113, 272, 303, 209, 240,
+  114, 145, 146, 177, 241, 272, 304, 335, 178, 209,  18,  18,
+  19,  50,  51,  82,  83, 114, 273, 304, 210, 241, 115, 146,
+  336, 367, 147, 178, 242, 273, 305, 336, 179, 210,  19,  19,
+  368, 399,  20,  51,  52,  83, 274, 305,  84, 115, 211, 242,
+  337, 368, 116, 147, 306, 337, 148, 179, 243, 274, 400, 431,
+  369, 400, 180, 211,  20,  20,  21,  52, 275, 306,  53,  84,
+  338, 369, 212, 243,  85, 116, 432, 463, 117, 148, 401, 432,
+  307, 338, 244, 275, 149, 180, 370, 401, 181, 212, 276, 307,
+  464, 495, 339, 370,  21,  21,  22,  53, 433, 464,  54,  85,
+  213, 244,  86, 117, 402, 433, 118, 149, 308, 339, 245, 276,
+  371, 402, 150, 181, 496, 527, 465, 496, 182, 213, 434, 465,
+  340, 371, 277, 308,  22,  22,  23,  54, 403, 434,  55,  86,
+  214, 245,  87, 118, 309, 340, 372, 403, 119, 150, 497, 528,
+  528, 559, 246, 277, 466, 497, 151, 182, 435, 466, 341, 372,
+  183, 214, 278, 309, 404, 435,  23,  23,  24,  55, 215, 246,
+  529, 560,  56,  87, 498, 529, 560, 591, 310, 341,  88, 119,
+  373, 404, 467, 498, 120, 151, 247, 278, 436, 467, 152, 183,
+  342, 373, 279, 310, 405, 436, 184, 215, 530, 561, 561, 592,
+  499, 530, 592, 623,  24,  24, 216, 247, 468, 499,  25,  56,
+  374, 405,  57,  88, 311, 342,  89, 120, 437, 468, 248, 279,
+  121, 152, 562, 593, 153, 184, 343, 374, 531, 562, 593, 624,
+  406, 437, 500, 531, 624, 655, 280, 311, 185, 216, 469, 500,
+  375, 406, 217, 248,  25,  25, 312, 343,  26,  57,  58,  89,
+  438, 469,  90, 121, 563, 594, 594, 625, 249, 280, 532, 563,
+  625, 656, 122, 153, 344, 375, 501, 532, 656, 687, 407, 438,
+  154, 185, 281, 312, 470, 501, 186, 217, 376, 407, 595, 626,
+  564, 595, 626, 657, 218, 249, 313, 344, 439, 470,  26,  26,
+  27,  58, 533, 564, 657, 688,  59,  90,  91, 122, 250, 281,
+  502, 533, 688, 719, 123, 154, 408, 439, 345, 376, 155, 186,
+  471, 502, 282, 313, 596, 627, 627, 658, 187, 218, 565, 596,
+  658, 689, 377, 408, 440, 471, 534, 565, 689, 720, 314, 345,
+  219, 250,  27,  27,  28,  59, 503, 534, 720, 751,  60,  91,
+  92, 123, 251, 282, 409, 440, 346, 377, 124, 155, 628, 659,
+  472, 503, 597, 628, 659, 690, 566, 597, 690, 721, 156, 187,
+  283, 314, 535, 566, 721, 752, 188, 219, 378, 409, 441, 472,
+  315, 346, 504, 535, 752, 783, 220, 251,  28,  28, 629, 660,
+  660, 691,  29,  60,  61,  92, 410, 441, 598, 629, 691, 722,
+  252, 283,  93, 124, 347, 378, 473, 504, 567, 598, 722, 753,
+  125, 156, 284, 315, 536, 567, 753, 784, 157, 188, 442, 473,
+  379, 410, 189, 220, 505, 536, 784, 815, 661, 692, 316, 347,
+  630, 661, 692, 723, 221, 252, 599, 630, 723, 754, 411, 442,
+  29,  29, 568, 599, 754, 785,  30,  61, 474, 505,  62,  93,
+  253, 284, 348, 379,  94, 125, 537, 568, 785, 816, 126, 157,
+  285, 316, 158, 189, 443, 474, 662, 693, 693, 724, 380, 411,
+  631, 662, 724, 755, 506, 537, 816, 847, 190, 221, 600, 631,
+  755, 786, 317, 348, 222, 253, 569, 600, 786, 817, 412, 443,
+  475, 506,  30,  30,  31,  62, 349, 380, 254, 285,  63,  94,
+  538, 569, 817, 848, 694, 725,  95, 126, 663, 694, 725, 756,
+  632, 663, 756, 787, 127, 158, 444, 475, 286, 317, 381, 412,
+  507, 538, 848, 879, 159, 190, 601, 632, 787, 818, 191, 222,
+  318, 349, 570, 601, 818, 849, 476, 507, 223, 254, 413, 444,
+  695, 726, 726, 757, 664, 695, 757, 788, 539, 570, 849, 880,
+  350, 381, 255, 286, 633, 664, 788, 819, 445, 476, 602, 633,
+  819, 850, 508, 539, 880, 911, 287, 318, 382, 413, 571, 602,
+  850, 881, 727, 758, 696, 727, 758, 789, 319, 350, 477, 508,
+  665, 696, 789, 820, 414, 445, 540, 571, 881, 912, 634, 665,
+  820, 851, 351, 382, 603, 634, 851, 882, 446, 477, 509, 540,
+  912, 943, 383, 414, 728, 759, 759, 790, 572, 603, 882, 913,
+  697, 728, 790, 821, 666, 697, 821, 852, 478, 509, 635, 666,
+  852, 883, 415, 446, 541, 572, 913, 944, 604, 635, 883, 914,
+  760, 791, 729, 760, 791, 822, 510, 541, 944, 975, 447, 478,
+  698, 729, 822, 853, 573, 604, 914, 945, 667, 698, 853, 884,
+  636, 667, 884, 915, 479, 510, 542, 573, 945, 976, 761, 792,
+  792, 823, 605, 636, 915, 946, 730, 761, 823, 854, 699, 730,
+  854,  885,  511,  542,  976, 1007,  574,  605,  946,  977,  668,  699,
+  885,  916,  637,  668,  916,  947,  543,  574,  793,  824,  977, 1008,
+  762, 793, 824, 855, 731, 762, 855, 886, 606, 637, 947, 978,
+  700,  731,  886,  917,  669,  700,  917,  948,  575,  606,  978, 1009,
+  638, 669, 948, 979, 794, 825, 825, 856, 763, 794, 856, 887,
+  732,  763,  887,  918,  607,  638,  979, 1010,  701,  732,  918,  949,
+  670, 701, 949, 980, 826, 857, 795, 826, 857, 888, 764, 795,
+  888,  919,  639,  670,  980, 1011,  733,  764,  919,  950,  702,  733,
+  950,  981,  671,  702,  981, 1012,  827,  858,  858,  889,  796,  827,
+  889, 920, 765, 796, 920, 951, 734, 765, 951, 982, 703, 734,
+  982, 1013,  859,  890,  828,  859,  890,  921,  797,  828,  921,  952,
+  766,  797,  952,  983,  735,  766,  983, 1014,  860,  891,  891,  922,
+  829,  860,  922,  953,  798,  829,  953,  984,  767,  798,  984, 1015,
+  892, 923, 861, 892, 923, 954, 830, 861, 954, 985, 799, 830,
+  985, 1016,  893,  924,  924,  955,  862,  893,  955,  986,  831,  862,
+  986, 1017,  925,  956,  894,  925,  956,  987,  863,  894,  987, 1018,
+  926,  957,  957,  988,  895,  926,  988, 1019,  958,  989,  927,  958,
+  989, 1020,  959,  990,  990, 1021,  991, 1022,   0,   0,
+};
+
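+// Neighbor pairs for the h2 32x32 scan variant; same layout as the v2
+// table above.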
+DECLARE_ALIGNED(16, static const int16_t,
+                h2_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+  0,  0,  0,  0,  0,  0,  1, 32,  1,  1, 32, 32,
+  2, 33, 33, 64, 34, 65,  2,  2, 64, 64,  3, 34,
+  65, 96, 35, 66, 66, 97,  3,  3, 96, 96,  4, 35,
+  97, 128,  67,  98,  36,  67,  98, 129,   4,   4,  68,  99,
+  99, 130, 128, 128,   5,  36, 129, 160,  37,  68, 130, 161,
+  100, 131,  69, 100, 131, 162,   5,   5, 160, 160,   6,  37,
+  161, 192,  38,  69, 162, 193, 101, 132, 132, 163,  70, 101,
+  163, 194,   6,   6, 192, 192,   7,  38, 133, 164, 193, 224,
+  102, 133, 164, 195,  39,  70, 194, 225,  71, 102, 195, 226,
+  134, 165, 165, 196,   7,   7, 224, 224,   8,  39, 103, 134,
+  196, 227, 225, 256,  40,  71, 226, 257, 166, 197,  72, 103,
+  227, 258, 135, 166, 197, 228, 104, 135, 228, 259,   8,   8,
+  256, 256,   9,  40, 257, 288,  41,  72, 167, 198, 198, 229,
+  258, 289, 136, 167, 229, 260,  73, 104, 259, 290, 105, 136,
+  260, 291, 199, 230,   9,   9, 168, 199, 230, 261, 288, 288,
+  10,  41, 289, 320,  42,  73, 290, 321, 137, 168, 261, 292,
+  74, 105, 291, 322, 200, 231, 231, 262, 106, 137, 292, 323,
+  169, 200, 262, 293,  10,  10, 320, 320,  11,  42, 321, 352,
+  43,  74, 138, 169, 293, 324, 322, 353, 232, 263,  75, 106,
+  201, 232, 263, 294, 323, 354, 170, 201, 294, 325, 107, 138,
+  324, 355,  11,  11, 352, 352,  12,  43, 233, 264, 264, 295,
+  353, 384, 139, 170, 325, 356,  44,  75, 354, 385, 202, 233,
+  295, 326,  76, 107, 355, 386, 171, 202, 326, 357, 108, 139,
+  356, 387, 265, 296, 234, 265, 296, 327,  12,  12, 140, 171,
+  357, 388, 384, 384,  13,  44, 203, 234, 327, 358, 385, 416,
+  45,  76, 386, 417,  77, 108, 387, 418, 172, 203, 358, 389,
+  266, 297, 297, 328, 109, 140, 235, 266, 328, 359, 388, 419,
+  204, 235, 359, 390, 141, 172, 389, 420,  13,  13, 416, 416,
+  14,  45, 417, 448,  46,  77, 298, 329, 418, 449, 267, 298,
+  329, 360,  78, 109, 173, 204, 390, 421, 419, 450, 236, 267,
+  360, 391, 110, 141, 420, 451, 205, 236, 391, 422, 142, 173,
+  299, 330, 330, 361, 421, 452,  14,  14, 268, 299, 361, 392,
+  448, 448,  15,  46, 449, 480,  47,  78, 450, 481, 174, 205,
+  422, 453, 237, 268, 392, 423,  79, 110, 451, 482, 111, 142,
+  452, 483, 331, 362, 300, 331, 362, 393, 206, 237, 423, 454,
+  143, 174, 269, 300, 393, 424, 453, 484,  15,  15,  16,  47,
+  48,  79, 238, 269, 424, 455, 175, 206, 454, 485,  80, 111,
+  332, 363, 363, 394, 301, 332, 394, 425, 112, 143, 207, 238,
+  455, 486, 270, 301, 425, 456, 144, 175, 364, 395,  16,  16,
+  239, 270, 456, 487,  17,  48, 333, 364, 395, 426, 176, 207,
+  49,  80, 302, 333, 426, 457,  81, 112, 113, 144, 208, 239,
+  271, 302, 457, 488, 365, 396, 396, 427, 145, 176, 334, 365,
+  427, 458, 240, 271,  17,  17,  18,  49, 177, 208, 303, 334,
+  458, 489,  50,  81,  82, 113, 272, 303, 209, 240, 397, 428,
+  114, 145, 366, 397, 428, 459, 335, 366, 459, 490, 146, 177,
+  241, 272, 304, 335, 178, 209,  18,  18,  19,  50,  51,  82,
+  398, 429, 429, 460, 367, 398, 460, 491,  83, 114, 273, 304,
+  210, 241, 115, 146, 336, 367, 147, 178, 242, 273, 305, 336,
+  430, 461, 399, 430, 461, 492, 179, 210,  19,  19, 368, 399,
+  20,  51,  52,  83, 274, 305,  84, 115, 211, 242, 337, 368,
+  116, 147, 431, 462, 462, 493, 306, 337, 148, 179, 243, 274,
+  400, 431, 369, 400, 180, 211,  20,  20,  21,  52, 275, 306,
+  53,  84, 338, 369, 212, 243,  85, 116, 463, 494, 432, 463,
+  117, 148, 401, 432, 307, 338, 244, 275, 149, 180, 370, 401,
+  181, 212, 276, 307, 464, 495, 339, 370,  21,  21,  22,  53,
+  433, 464,  54,  85, 213, 244,  86, 117, 402, 433, 118, 149,
+  308, 339, 245, 276, 371, 402, 150, 181, 465, 496, 182, 213,
+  434, 465, 340, 371, 277, 308,  22,  22,  23,  54, 403, 434,
+  55,  86, 214, 245,  87, 118, 309, 340, 372, 403, 119, 150,
+  246, 277, 466, 497, 151, 182, 435, 466, 341, 372, 183, 214,
+  278, 309, 404, 435,  23,  23,  24,  55, 215, 246,  56,  87,
+  310, 341,  88, 119, 373, 404, 467, 498, 120, 151, 247, 278,
+  436, 467, 152, 183, 342, 373, 279, 310, 405, 436, 184, 215,
+  24,  24, 216, 247, 468, 499,  25,  56, 374, 405,  57,  88,
+  311, 342,  89, 120, 437, 468, 248, 279, 121, 152, 153, 184,
+  343, 374, 406, 437, 280, 311, 185, 216, 469, 500, 375, 406,
+  217, 248,  25,  25, 312, 343,  26,  57,  58,  89, 438, 469,
+  90, 121, 249, 280, 122, 153, 344, 375, 407, 438, 154, 185,
+  281, 312, 470, 501, 186, 217, 376, 407, 218, 249, 313, 344,
+  439, 470,  26,  26,  27,  58,  59,  90,  91, 122, 250, 281,
+  123, 154, 408, 439, 345, 376, 155, 186, 471, 502, 282, 313,
+  187, 218, 377, 408, 440, 471, 314, 345, 219, 250,  27,  27,
+  28,  59,  60,  91,  92, 123, 251, 282, 409, 440, 346, 377,
+  124, 155, 472, 503, 156, 187, 283, 314, 188, 219, 378, 409,
+  441, 472, 315, 346, 220, 251,  28,  28,  29,  60,  61,  92,
+  410, 441, 252, 283,  93, 124, 347, 378, 473, 504, 125, 156,
+  284, 315, 157, 188, 442, 473, 379, 410, 189, 220, 316, 347,
+  221, 252, 411, 442,  29,  29,  30,  61, 474, 505,  62,  93,
+  253, 284, 348, 379,  94, 125, 126, 157, 285, 316, 158, 189,
+  443, 474, 380, 411, 190, 221, 317, 348, 222, 253, 412, 443,
+  475, 506,  30,  30,  31,  62, 349, 380, 254, 285,  63,  94,
+  95, 126, 127, 158, 444, 475, 286, 317, 381, 412, 159, 190,
+  191, 222, 318, 349, 476, 507, 223, 254, 413, 444, 350, 381,
+  255, 286, 445, 476, 287, 318, 382, 413, 319, 350, 477, 508,
+  414, 445, 351, 382, 446, 477, 383, 414, 478, 509, 415, 446,
+  447, 478, 479, 510, 480, 480, 481, 512, 482, 513, 483, 514,
+  484, 515, 485, 516, 512, 512, 513, 544, 486, 517, 514, 545,
+  515, 546, 487, 518, 516, 547, 517, 548, 488, 519, 544, 544,
+  518, 549, 545, 576, 546, 577, 547, 578, 489, 520, 519, 550,
+  548, 579, 549, 580, 520, 551, 490, 521, 550, 581, 576, 576,
+  577, 608, 578, 609, 521, 552, 579, 610, 551, 582, 491, 522,
+  580, 611, 581, 612, 552, 583, 522, 553, 582, 613, 492, 523,
+  608, 608, 609, 640, 610, 641, 553, 584, 611, 642, 523, 554,
+  583, 614, 612, 643, 554, 585, 493, 524, 584, 615, 613, 644,
+  524, 555, 614, 645, 640, 640, 585, 616, 641, 672, 555, 586,
+  642, 673, 615, 646, 643, 674, 494, 525, 644, 675, 525, 556,
+  586, 617, 616, 647, 645, 676, 556, 587, 646, 677, 495, 526,
+  617, 648, 587, 618, 672, 672, 526, 557, 673, 704, 674, 705,
+  647, 678, 557, 588, 675, 706, 618, 649, 676, 707, 588, 619,
+  648, 679, 677, 708, 496, 527, 527, 558, 558, 589, 678, 709,
+  619, 650, 649, 680, 704, 704, 589, 620, 705, 736, 679, 710,
+  706, 737, 707, 738, 650, 681, 620, 651, 497, 528, 528, 559,
+  708, 739, 680, 711, 559, 590, 709, 740, 590, 621, 651, 682,
+  681, 712, 710, 741, 621, 652, 736, 736, 737, 768, 529, 560,
+  711, 742, 498, 529, 560, 591, 738, 769, 682, 713, 652, 683,
+  739, 770, 591, 622, 740, 771, 712, 743, 622, 653, 741, 772,
+  683, 714, 653, 684, 713, 744, 742, 773, 530, 561, 561, 592,
+  499, 530, 592, 623, 623, 654, 743, 774, 768, 768, 769, 800,
+  684, 715, 714, 745, 770, 801, 771, 802, 654, 685, 744, 775,
+  772, 803, 562, 593, 531, 562, 593, 624, 715, 746, 773, 804,
+  685, 716, 500, 531, 624, 655, 745, 776, 774, 805, 655, 686,
+  716, 747, 775, 806, 746, 777, 800, 800, 801, 832, 686, 717,
+  802, 833, 563, 594, 594, 625, 803, 834, 532, 563, 625, 656,
+  776, 807, 804, 835, 501, 532, 656, 687, 747, 778, 717, 748,
+  805, 836, 777, 808, 687, 718, 806, 837, 748, 779, 595, 626,
+  564, 595, 626, 657, 718, 749, 778, 809, 807, 838, 832, 832,
+  533, 564, 657, 688, 833, 864, 834, 865, 835, 866, 502, 533,
+  688, 719, 808, 839, 749, 780, 836, 867, 779, 810, 719, 750,
+  837, 868, 809, 840, 596, 627, 627, 658, 565, 596, 658, 689,
+  838, 869, 780, 811, 750, 781, 534, 565, 689, 720, 810, 841,
+  839, 870, 864, 864, 503, 534, 720, 751, 865, 896, 866, 897,
+  840, 871, 867, 898, 781, 812, 811, 842, 628, 659, 868, 899,
+  751, 782, 597, 628, 659, 690, 566, 597, 690, 721, 869, 900,
+  841, 872, 535, 566, 721, 752, 812, 843, 870, 901, 782, 813,
+  842, 873, 504, 535, 752, 783, 871, 902, 629, 660, 660, 691,
+  896, 896, 897, 928, 598, 629, 691, 722, 813, 844, 898, 929,
+  872, 903, 783, 814, 843, 874, 899, 930, 567, 598, 722, 753,
+  900, 931, 536, 567, 753, 784, 873, 904, 901, 932, 814, 845,
+  844, 875, 902, 933, 505, 536, 784, 815, 661, 692, 630, 661,
+  692, 723, 874, 905, 599, 630, 723, 754, 903, 934, 845, 876,
+  568, 599, 754, 785, 928, 928, 815, 846, 929, 960, 930, 961,
+  875, 906, 904, 935, 931, 962, 537, 568, 785, 816, 932, 963,
+  905, 936, 662, 693, 693, 724, 846, 877, 933, 964, 876, 907,
+  631, 662, 724, 755, 506, 537, 816, 847, 934, 965, 600, 631,
+  755, 786, 906, 937, 569, 600, 786, 817, 935, 966, 877, 908,
+  847, 878, 960, 960, 907, 938, 961, 992, 936, 967, 538, 569,
+  817, 848, 962, 993, 694, 725, 663, 694, 725, 756, 963, 994,
+  632, 663, 756, 787, 964, 995, 878, 909, 937, 968, 507, 538,
+  848, 879, 908, 939, 601, 632, 787, 818, 965, 996, 966, 997,
+  570, 601, 818, 849, 938, 969, 879, 910, 909, 940, 967, 998,
+  695, 726, 726, 757, 664, 695, 757, 788, 539, 570, 849, 880,
+  939, 970, 633, 664, 788, 819, 968, 999, 602, 633, 819, 850,
+  910,  941,  508,  539,  880,  911,  969, 1000,  940,  971,  571,  602,
+  850,  881,  727,  758,  696,  727,  758,  789,  970, 1001,  665,  696,
+  789, 820, 911, 942, 941, 972, 540, 571, 881, 912, 634, 665,
+  820,  851,  971, 1002,  603,  634,  851,  882,  942,  973,  509,  540,
+  912,  943,  728,  759,  759,  790,  972, 1003,  572,  603,  882,  913,
+  697, 728, 790, 821, 666, 697, 821, 852, 943, 974, 635, 666,
+  852,  883,  541,  572,  913,  944,  973, 1004,  604,  635,  883,  914,
+  760,  791,  729,  760,  791,  822,  510,  541,  944,  975,  974, 1005,
+  698, 729, 822, 853, 573, 604, 914, 945, 667, 698, 853, 884,
+  636,  667,  884,  915,  975, 1006,  542,  573,  945,  976,  761,  792,
+  792, 823, 605, 636, 915, 946, 730, 761, 823, 854, 699, 730,
+  854,  885,  511,  542,  976, 1007,  574,  605,  946,  977,  668,  699,
+  885,  916,  637,  668,  916,  947,  543,  574,  793,  824,  977, 1008,
+  762, 793, 824, 855, 731, 762, 855, 886, 606, 637, 947, 978,
+  700,  731,  886,  917,  669,  700,  917,  948,  575,  606,  978, 1009,
+  638, 669, 948, 979, 794, 825, 825, 856, 763, 794, 856, 887,
+  732,  763,  887,  918,  607,  638,  979, 1010,  701,  732,  918,  949,
+  670, 701, 949, 980, 826, 857, 795, 826, 857, 888, 764, 795,
+  888,  919,  639,  670,  980, 1011,  733,  764,  919,  950,  702,  733,
+  950,  981,  671,  702,  981, 1012,  827,  858,  858,  889,  796,  827,
+  889, 920, 765, 796, 920, 951, 734, 765, 951, 982, 703, 734,
+  982, 1013,  859,  890,  828,  859,  890,  921,  797,  828,  921,  952,
+  766,  797,  952,  983,  735,  766,  983, 1014,  860,  891,  891,  922,
+  829,  860,  922,  953,  798,  829,  953,  984,  767,  798,  984, 1015,
+  892, 923, 861, 892, 923, 954, 830, 861, 954, 985, 799, 830,
+  985, 1016,  893,  924,  924,  955,  862,  893,  955,  986,  831,  862,
+  986, 1017,  925,  956,  894,  925,  956,  987,  863,  894,  987, 1018,
+  926,  957,  957,  988,  895,  926,  988, 1019,  958,  989,  927,  958,
+  989, 1020,  959,  990,  990, 1021,  991, 1022,   0,   0,
+};
+
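+// Neighbor pairs for the qtr (quarter) 32x32 scan variant; same layout
+// as the v2 and h2 tables above.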
+DECLARE_ALIGNED(16, static const int16_t,
+                qtr_scan_32x32_neighbors[1025 * MAX_NEIGHBORS]) = {
+  0,  0,  0,  0,  0,  0,  1, 32,  1,  1, 32, 32,
+  2, 33, 33, 64, 34, 65,  2,  2, 64, 64,  3, 34,
+  65, 96, 35, 66, 66, 97,  3,  3, 96, 96,  4, 35,
+  97, 128,  67,  98,  36,  67,  98, 129,   4,   4,  68,  99,
+  99, 130, 128, 128,   5,  36, 129, 160,  37,  68, 130, 161,
+  100, 131,  69, 100, 131, 162,   5,   5, 160, 160,   6,  37,
+  161, 192,  38,  69, 162, 193, 101, 132, 132, 163,  70, 101,
+  163, 194,   6,   6, 192, 192,   7,  38, 133, 164, 193, 224,
+  102, 133, 164, 195,  39,  70, 194, 225,  71, 102, 195, 226,
+  134, 165, 165, 196,   7,   7, 224, 224,   8,  39, 103, 134,
+  196, 227, 225, 256,  40,  71, 226, 257, 166, 197,  72, 103,
+  227, 258, 135, 166, 197, 228, 104, 135, 228, 259,   8,   8,
+  256, 256,   9,  40, 257, 288,  41,  72, 167, 198, 198, 229,
+  258, 289, 136, 167, 229, 260,  73, 104, 259, 290, 105, 136,
+  260, 291, 199, 230,   9,   9, 168, 199, 230, 261, 288, 288,
+  10,  41, 289, 320,  42,  73, 290, 321, 137, 168, 261, 292,
+  74, 105, 291, 322, 200, 231, 231, 262, 106, 137, 292, 323,
+  169, 200, 262, 293,  10,  10, 320, 320,  11,  42, 321, 352,
+  43,  74, 138, 169, 293, 324, 322, 353, 232, 263,  75, 106,
+  201, 232, 263, 294, 323, 354, 170, 201, 294, 325, 107, 138,
+  324, 355,  11,  11, 352, 352,  12,  43, 233, 264, 264, 295,
+  353, 384, 139, 170, 325, 356,  44,  75, 354, 385, 202, 233,
+  295, 326,  76, 107, 355, 386, 171, 202, 326, 357, 108, 139,
+  356, 387, 265, 296, 234, 265, 296, 327,  12,  12, 140, 171,
+  357, 388, 384, 384,  13,  44, 203, 234, 327, 358, 385, 416,
+  45,  76, 386, 417,  77, 108, 387, 418, 172, 203, 358, 389,
+  266, 297, 297, 328, 109, 140, 235, 266, 328, 359, 388, 419,
+  204, 235, 359, 390, 141, 172, 389, 420,  13,  13, 416, 416,
+  14,  45, 417, 448,  46,  77, 298, 329, 418, 449, 267, 298,
+  329, 360,  78, 109, 173, 204, 390, 421, 419, 450, 236, 267,
+  360, 391, 110, 141, 420, 451, 205, 236, 391, 422, 142, 173,
+  299, 330, 330, 361, 421, 452,  14,  14, 268, 299, 361, 392,
+  448, 448,  15,  46, 449, 480,  47,  78, 450, 481, 174, 205,
+  422, 453, 237, 268, 392, 423,  79, 110, 451, 482, 111, 142,
+  452, 483, 331, 362, 300, 331, 362, 393, 206, 237, 423, 454,
+  143, 174, 269, 300, 393, 424, 453, 484, 238, 269, 424, 455,
+  175, 206, 454, 485, 332, 363, 363, 394, 301, 332, 394, 425,
+  207, 238, 455, 486, 270, 301, 425, 456, 364, 395, 239, 270,
+  456, 487, 333, 364, 395, 426, 302, 333, 426, 457, 271, 302,
+  457, 488, 365, 396, 396, 427, 334, 365, 427, 458, 303, 334,
+  458, 489, 397, 428, 366, 397, 428, 459, 335, 366, 459, 490,
+  398, 429, 429, 460, 367, 398, 460, 491, 430, 461, 399, 430,
+  461, 492, 431, 462, 462, 493, 463, 494,  15,  15, 480, 480,
+  16,  47, 481, 512,  48,  79, 482, 513,  80, 111, 483, 514,
+  112, 143, 484, 515, 144, 175, 485, 516,  16,  16, 512, 512,
+  17,  48, 513, 544, 176, 207, 486, 517,  49,  80, 514, 545,
+  81, 112, 515, 546, 113, 144, 208, 239, 487, 518, 516, 547,
+  145, 176, 517, 548, 240, 271, 488, 519,  17,  17, 544, 544,
+  18,  49, 177, 208, 518, 549, 545, 576,  50,  81, 546, 577,
+  82, 113, 547, 578, 272, 303, 489, 520, 209, 240, 519, 550,
+  114, 145, 548, 579, 146, 177, 549, 580, 241, 272, 520, 551,
+  304, 335, 490, 521, 178, 209, 550, 581,  18,  18, 576, 576,
+  19,  50, 577, 608,  51,  82, 578, 609,  83, 114, 273, 304,
+  521, 552, 579, 610, 210, 241, 551, 582, 115, 146, 336, 367,
+  491, 522, 580, 611, 147, 178, 581, 612, 242, 273, 552, 583,
+  305, 336, 522, 553, 179, 210, 582, 613,  19,  19, 368, 399,
+  492, 523, 608, 608,  20,  51, 609, 640,  52,  83, 610, 641,
+  274, 305, 553, 584,  84, 115, 611, 642, 211, 242, 337, 368,
+  523, 554, 583, 614, 116, 147, 612, 643, 306, 337, 554, 585,
+  148, 179, 243, 274, 400, 431, 493, 524, 584, 615, 613, 644,
+  369, 400, 524, 555, 180, 211, 614, 645,  20,  20, 640, 640,
+  21,  52, 275, 306, 585, 616, 641, 672,  53,  84, 338, 369,
+  555, 586, 642, 673, 212, 243, 615, 646,  85, 116, 643, 674,
+  432, 463, 494, 525, 117, 148, 644, 675, 401, 432, 525, 556,
+  307, 338, 586, 617, 244, 275, 616, 647, 149, 180, 645, 676,
+  370, 401, 556, 587, 181, 212, 646, 677, 276, 307, 464, 495,
+  495, 526, 617, 648, 339, 370, 587, 618,  21,  21, 672, 672,
+  22,  53, 433, 464, 526, 557, 673, 704,  54,  85, 674, 705,
+  213, 244, 647, 678,  86, 117, 402, 433, 557, 588, 675, 706,
+  118, 149, 308, 339, 618, 649, 676, 707, 245, 276, 371, 402,
+  588, 619, 648, 679, 150, 181, 677, 708, 496, 527, 465, 496,
+  527, 558, 182, 213, 434, 465, 558, 589, 678, 709, 340, 371,
+  619, 650, 277, 308, 649, 680,  22,  22, 704, 704,  23,  54,
+  403, 434, 589, 620, 705, 736,  55,  86, 214, 245, 679, 710,
+  706, 737,  87, 118, 707, 738, 309, 340, 650, 681, 372, 403,
+  620, 651, 119, 150, 497, 528, 528, 559, 708, 739, 246, 277,
+  680, 711, 466, 497, 559, 590, 151, 182, 709, 740, 435, 466,
+  590, 621, 341, 372, 651, 682, 183, 214, 278, 309, 681, 712,
+  710, 741, 404, 435, 621, 652,  23,  23, 736, 736,  24,  55,
+  737, 768, 215, 246, 529, 560, 711, 742,  56,  87, 498, 529,
+  560, 591, 738, 769, 310, 341, 682, 713,  88, 119, 373, 404,
+  652, 683, 739, 770, 467, 498, 591, 622, 120, 151, 740, 771,
+  247, 278, 712, 743, 436, 467, 622, 653, 152, 183, 741, 772,
+  342, 373, 683, 714, 279, 310, 405, 436, 653, 684, 713, 744,
+  184, 215, 742, 773, 530, 561, 561, 592, 499, 530, 592, 623,
+  24,  24, 216, 247, 468, 499, 623, 654, 743, 774, 768, 768,
+  25,  56, 769, 800, 374, 405, 684, 715,  57,  88, 311, 342,
+  714, 745, 770, 801,  89, 120, 771, 802, 437, 468, 654, 685,
+  248, 279, 744, 775, 121, 152, 772, 803, 562, 593, 153, 184,
+  343, 374, 531, 562, 593, 624, 715, 746, 773, 804, 406, 437,
+  685, 716, 500, 531, 624, 655, 280, 311, 745, 776, 185, 216,
+  774, 805, 469, 500, 655, 686, 375, 406, 716, 747, 217, 248,
+  775, 806,  25,  25, 312, 343, 746, 777, 800, 800,  26,  57,
+  801, 832,  58,  89, 438, 469, 686, 717, 802, 833,  90, 121,
+  563, 594, 594, 625, 803, 834, 249, 280, 532, 563, 625, 656,
+  776, 807, 122, 153, 804, 835, 344, 375, 501, 532, 656, 687,
+  747, 778, 407, 438, 717, 748, 154, 185, 805, 836, 281, 312,
+  777, 808, 470, 501, 687, 718, 186, 217, 806, 837, 376, 407,
+  748, 779, 595, 626, 564, 595, 626, 657, 218, 249, 313, 344,
+  439, 470, 718, 749, 778, 809, 807, 838,  26,  26, 832, 832,
+  27,  58, 533, 564, 657, 688, 833, 864,  59,  90, 834, 865,
+  91, 122, 835, 866, 250, 281, 502, 533, 688, 719, 808, 839,
+  123, 154, 408, 439, 749, 780, 836, 867, 345, 376, 779, 810,
+  155, 186, 471, 502, 719, 750, 837, 868, 282, 313, 809, 840,
+  596, 627, 627, 658, 187, 218, 565, 596, 658, 689, 838, 869,
+  377, 408, 780, 811, 440, 471, 750, 781, 534, 565, 689, 720,
+  314, 345, 810, 841, 219, 250, 839, 870,  27,  27, 864, 864,
+  28,  59, 503, 534, 720, 751, 865, 896,  60,  91, 866, 897,
+  92, 123, 251, 282, 840, 871, 867, 898, 409, 440, 781, 812,
+  346, 377, 811, 842, 124, 155, 628, 659, 868, 899, 472, 503,
+  751, 782, 597, 628, 659, 690, 566, 597, 690, 721, 156, 187,
+  869, 900, 283, 314, 841, 872, 535, 566, 721, 752, 188, 219,
+  378, 409, 812, 843, 870, 901, 441, 472, 782, 813, 315, 346,
+  842, 873, 504, 535, 752, 783, 220, 251, 871, 902,  28,  28,
+  629, 660, 660, 691, 896, 896,  29,  60, 897, 928,  61,  92,
+  410, 441, 598, 629, 691, 722, 813, 844, 898, 929, 252, 283,
+  872, 903,  93, 124, 347, 378, 473, 504, 783, 814, 843, 874,
+  899, 930, 567, 598, 722, 753, 125, 156, 900, 931, 284, 315,
+  536, 567, 753, 784, 873, 904, 157, 188, 901, 932, 442, 473,
+  814, 845, 379, 410, 844, 875, 189, 220, 902, 933, 505, 536,
+  784, 815, 661, 692, 316, 347, 630, 661, 692, 723, 874, 905,
+  221, 252, 599, 630, 723, 754, 903, 934, 411, 442, 845, 876,
+  29,  29, 568, 599, 754, 785, 928, 928,  30,  61, 474, 505,
+  815, 846, 929, 960,  62,  93, 930, 961, 253, 284, 348, 379,
+  875, 906, 904, 935,  94, 125, 931, 962, 537, 568, 785, 816,
+  126, 157, 932, 963, 285, 316, 905, 936, 158, 189, 443, 474,
+  662, 693, 693, 724, 846, 877, 933, 964, 380, 411, 876, 907,
+  631, 662, 724, 755, 506, 537, 816, 847, 190, 221, 934, 965,
+  600, 631, 755, 786, 317, 348, 906, 937, 222, 253, 569, 600,
+  786, 817, 935, 966, 412, 443, 877, 908, 475, 506, 847, 878,
+  30,  30, 960, 960,  31,  62, 349, 380, 907, 938, 961, 992,
+  254, 285, 936, 967,  63,  94, 538, 569, 817, 848, 962, 993,
+  694, 725,  95, 126, 663, 694, 725, 756, 963, 994, 632, 663,
+  756, 787, 127, 158, 964, 995, 444, 475, 878, 909, 286, 317,
+  937, 968, 381, 412, 507, 538, 848, 879, 908, 939, 159, 190,
+  601, 632, 787, 818, 965, 996, 191, 222, 966, 997, 318, 349,
+  570, 601, 818, 849, 938, 969, 476, 507, 879, 910, 223, 254,
+  413, 444, 909, 940, 967, 998, 695, 726, 726, 757, 664, 695,
+  757, 788, 539, 570, 849, 880, 350, 381, 939, 970, 255, 286,
+  633, 664, 788, 819, 968, 999, 445, 476, 602, 633, 819, 850,
+  910,  941,  508,  539,  880,  911,  287,  318,  969, 1000,  382,  413,
+  940, 971, 571, 602, 850, 881, 727, 758, 696, 727, 758, 789,
+  319,  350,  970, 1001,  477,  508,  665,  696,  789,  820,  911,  942,
+  414, 445, 941, 972, 540, 571, 881, 912, 634, 665, 820, 851,
+  351,  382,  971, 1002,  603,  634,  851,  882,  446,  477,  942,  973,
+  509,  540,  912,  943,  383,  414,  728,  759,  759,  790,  972, 1003,
+  572, 603, 882, 913, 697, 728, 790, 821, 666, 697, 821, 852,
+  478, 509, 943, 974, 635, 666, 852, 883, 415, 446, 541, 572,
+  913,  944,  973, 1004,  604,  635,  883,  914,  760,  791,  729,  760,
+  791,  822,  510,  541,  944,  975,  447,  478,  974, 1005,  698,  729,
+  822, 853, 573, 604, 914, 945, 667, 698, 853, 884, 636, 667,
+  884,  915,  479,  510,  975, 1006,  542,  573,  945,  976,  761,  792,
+  792, 823, 605, 636, 915, 946, 730, 761, 823, 854, 699, 730,
+  854,  885,  511,  542,  976, 1007,  574,  605,  946,  977,  668,  699,
+  885,  916,  637,  668,  916,  947,  543,  574,  793,  824,  977, 1008,
+  762, 793, 824, 855, 731, 762, 855, 886, 606, 637, 947, 978,
+  700,  731,  886,  917,  669,  700,  917,  948,  575,  606,  978, 1009,
+  638, 669, 948, 979, 794, 825, 825, 856, 763, 794, 856, 887,
+  732,  763,  887,  918,  607,  638,  979, 1010,  701,  732,  918,  949,
+  670, 701, 949, 980, 826, 857, 795, 826, 857, 888, 764, 795,
+  888,  919,  639,  670,  980, 1011,  733,  764,  919,  950,  702,  733,
+  950,  981,  671,  702,  981, 1012,  827,  858,  858,  889,  796,  827,
+  889, 920, 765, 796, 920, 951, 734, 765, 951, 982, 703, 734,
+  982, 1013,  859,  890,  828,  859,  890,  921,  797,  828,  921,  952,
+  766,  797,  952,  983,  735,  766,  983, 1014,  860,  891,  891,  922,
+  829,  860,  922,  953,  798,  829,  953,  984,  767,  798,  984, 1015,
+  892, 923, 861, 892, 923, 954, 830, 861, 954, 985, 799, 830,
+  985, 1016,  893,  924,  924,  955,  862,  893,  955,  986,  831,  862,
+  986, 1017,  925,  956,  894,  925,  956,  987,  863,  894,  987, 1018,
+  926,  957,  957,  988,  895,  926,  988, 1019,  958,  989,  927,  958,
+  989, 1020,  959,  990,  990, 1021,  991, 1022,   0,   0,
+};
+#endif  // CONFIG_EXT_TX
 
 DECLARE_ALIGNED(16, static const int16_t, vp10_default_iscan_4x4[16]) = {
   0, 2, 5, 8, 1, 3, 9, 12, 4, 7, 11, 14, 6, 10, 13, 15,
 };
 
+#if CONFIG_EXT_TX
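+// An iscan table is the inverse permutation of its scan table:
+// iscan[scan[i]] == i, i.e. it maps a coefficient's raster position in
+// the block to that coefficient's index in the scan order. mrow scans
+// row by row, so every mrow_iscan table is the identity; mcol scans
+// column by column, so vp10_mcol_iscan_4x4[r * 4 + c] == c * 4 + r. A
+// minimal sketch of how such a table can be derived from a scan
+// (build_iscan is a hypothetical helper, shown only for illustration):
+//
+//   static void build_iscan(const int16_t *scan, int16_t *iscan, int n) {
+//     for (int i = 0; i < n; ++i) iscan[scan[i]] = i;
+//   }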
+DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_4x4[16]) = {
+  0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_mrow_iscan_4x4[16]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+};
+#endif  // CONFIG_EXT_TX
+
 DECLARE_ALIGNED(16, static const int16_t, vp10_col_iscan_4x4[16]) = {
   0, 3, 7, 11, 1, 5, 9, 12, 2, 6, 10, 14, 4, 8, 13, 15,
 };
@@ -542,6 +2258,22 @@
   0, 1, 3, 5, 2, 4, 6, 9, 7, 8, 11, 13, 10, 12, 14, 15,
 };
 
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_8x8[64]) = {
+  0, 8, 16, 24, 32, 40, 48, 56, 1, 9, 17, 25, 33, 41, 49, 57, 2, 10,
+  18, 26, 34, 42, 50, 58, 3, 11, 19, 27, 35, 43, 51, 59, 4, 12, 20,
+  28, 36, 44, 52, 60, 5, 13, 21, 29, 37, 45, 53, 61, 6, 14, 22, 30,
+  38, 46, 54, 62, 7, 15, 23, 31, 39, 47, 55, 63,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_mrow_iscan_8x8[64]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+  20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
+  37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
+  54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+};
+#endif  // CONFIG_EXT_TX
+
 DECLARE_ALIGNED(16, static const int16_t, vp10_col_iscan_8x8[64]) = {
   0, 3, 8, 15, 22, 32, 40, 47, 1, 5, 11, 18, 26, 34, 44, 51,
   2, 7, 13, 20, 28, 38, 46, 54, 4, 10, 16, 24, 31, 41, 50, 56,
@@ -563,6 +2295,55 @@
   25, 32, 39, 45, 50, 55, 59, 62, 33, 40, 46, 51, 54, 58, 61, 63,
 };
 
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_16x16[256]) = {
+  0, 16, 32, 48, 64, 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240,
+  1, 17, 33, 49, 65, 81, 97, 113, 129, 145, 161, 177, 193, 209, 225, 241,
+  2, 18, 34, 50, 66, 82, 98, 114, 130, 146, 162, 178, 194, 210, 226, 242,
+  3, 19, 35, 51, 67, 83, 99, 115, 131, 147, 163, 179, 195, 211, 227, 243,
+  4, 20, 36, 52, 68, 84, 100, 116, 132, 148, 164, 180, 196, 212, 228, 244,
+  5, 21, 37, 53, 69, 85, 101, 117, 133, 149, 165, 181, 197, 213, 229, 245,
+  6, 22, 38, 54, 70, 86, 102, 118, 134, 150, 166, 182, 198, 214, 230, 246,
+  7, 23, 39, 55, 71, 87, 103, 119, 135, 151, 167, 183, 199, 215, 231, 247,
+  8, 24, 40, 56, 72, 88, 104, 120, 136, 152, 168, 184, 200, 216, 232, 248,
+  9, 25, 41, 57, 73, 89, 105, 121, 137, 153, 169, 185, 201, 217, 233, 249,
+  10, 26, 42, 58, 74, 90, 106, 122, 138, 154, 170, 186, 202, 218, 234, 250,
+  11, 27, 43, 59, 75, 91, 107, 123, 139, 155, 171, 187, 203, 219, 235, 251,
+  12, 28, 44, 60, 76, 92, 108, 124, 140, 156, 172, 188, 204, 220, 236, 252,
+  13, 29, 45, 61, 77, 93, 109, 125, 141, 157, 173, 189, 205, 221, 237, 253,
+  14, 30, 46, 62, 78, 94, 110, 126, 142, 158, 174, 190, 206, 222, 238, 254,
+  15, 31, 47, 63, 79, 95, 111, 127, 143, 159, 175, 191, 207, 223, 239, 255,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_mrow_iscan_16x16[256]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+  112, 113, 114, 115, 116, 117, 118, 119,
+  120, 121, 122, 123, 124, 125, 126, 127,
+  128, 129, 130, 131, 132, 133, 134, 135,
+  136, 137, 138, 139, 140, 141, 142, 143,
+  144, 145, 146, 147, 148, 149, 150, 151,
+  152, 153, 154, 155, 156, 157, 158, 159,
+  160, 161, 162, 163, 164, 165, 166, 167,
+  168, 169, 170, 171, 172, 173, 174, 175,
+  176, 177, 178, 179, 180, 181, 182, 183,
+  184, 185, 186, 187, 188, 189, 190, 191,
+  192, 193, 194, 195, 196, 197, 198, 199,
+  200, 201, 202, 203, 204, 205, 206, 207,
+  208, 209, 210, 211, 212, 213, 214, 215,
+  216, 217, 218, 219, 220, 221, 222, 223,
+  224, 225, 226, 227, 228, 229, 230, 231,
+  232, 233, 234, 235, 236, 237, 238, 239,
+  240, 241, 242, 243, 244, 245, 246, 247,
+  248, 249, 250, 251, 252, 253, 254, 255,
+};
+#endif  // CONFIG_EXT_TX
+
 DECLARE_ALIGNED(16, static const int16_t, vp10_col_iscan_16x16[256]) = {
   0, 4, 11, 20, 31, 43, 59, 75, 85, 109, 130, 150, 165, 181, 195, 198,
   1, 6, 14, 23, 34, 47, 64, 81, 95, 114, 135, 153, 171, 188, 201, 212,
@@ -622,6 +2403,206 @@
   249, 253, 255,
 };
 
+#if CONFIG_EXT_TX
+DECLARE_ALIGNED(16, static const int16_t, vp10_mcol_iscan_32x32[1024]) = {
+  0, 32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416,
+  448, 480, 512, 544, 576, 608, 640, 672, 704, 736, 768, 800, 832,
+  864, 896, 928, 960, 992,
+  1, 33, 65, 97, 129, 161, 193, 225, 257, 289, 321, 353, 385, 417,
+  449, 481, 513, 545, 577, 609, 641, 673, 705, 737, 769, 801, 833,
+  865, 897, 929, 961, 993,
+  2, 34, 66, 98, 130, 162, 194, 226, 258, 290, 322, 354, 386, 418,
+  450, 482, 514, 546, 578, 610, 642, 674, 706, 738, 770, 802, 834,
+  866, 898, 930, 962, 994,
+  3, 35, 67, 99, 131, 163, 195, 227, 259, 291, 323, 355, 387, 419,
+  451, 483, 515, 547, 579, 611, 643, 675, 707, 739, 771, 803, 835,
+  867, 899, 931, 963, 995,
+  4, 36, 68, 100, 132, 164, 196, 228, 260, 292, 324, 356, 388, 420,
+  452, 484, 516, 548, 580, 612, 644, 676, 708, 740, 772, 804, 836,
+  868, 900, 932, 964, 996,
+  5, 37, 69, 101, 133, 165, 197, 229, 261, 293, 325, 357, 389, 421,
+  453, 485, 517, 549, 581, 613, 645, 677, 709, 741, 773, 805, 837,
+  869, 901, 933, 965, 997,
+  6, 38, 70, 102, 134, 166, 198, 230, 262, 294, 326, 358, 390, 422,
+  454, 486, 518, 550, 582, 614, 646, 678, 710, 742, 774, 806, 838,
+  870, 902, 934, 966, 998,
+  7, 39, 71, 103, 135, 167, 199, 231, 263, 295, 327, 359, 391, 423,
+  455, 487, 519, 551, 583, 615, 647, 679, 711, 743, 775, 807, 839,
+  871, 903, 935, 967, 999,
+  8, 40, 72, 104, 136, 168, 200, 232, 264, 296, 328, 360, 392, 424,
+  456, 488, 520, 552, 584, 616, 648, 680, 712, 744, 776, 808, 840,
+  872, 904, 936, 968, 1000,
+  9, 41, 73, 105, 137, 169, 201, 233, 265, 297, 329, 361, 393, 425,
+  457, 489, 521, 553, 585, 617, 649, 681, 713, 745, 777, 809, 841,
+  873, 905, 937, 969, 1001,
+  10, 42, 74, 106, 138, 170, 202, 234, 266, 298, 330, 362, 394,
+  426, 458, 490, 522, 554, 586, 618, 650, 682, 714, 746, 778, 810,
+  842, 874, 906, 938, 970, 1002,
+  11, 43, 75, 107, 139, 171, 203, 235, 267, 299, 331, 363, 395,
+  427, 459, 491, 523, 555, 587, 619, 651, 683, 715, 747, 779, 811,
+  843, 875, 907, 939, 971, 1003,
+  12, 44, 76, 108, 140, 172, 204, 236, 268, 300, 332, 364, 396,
+  428, 460, 492, 524, 556, 588, 620, 652, 684, 716, 748, 780,
+  812, 844, 876, 908, 940, 972, 1004,
+  13, 45, 77, 109, 141, 173, 205, 237, 269, 301, 333, 365, 397,
+  429, 461, 493, 525, 557, 589, 621, 653, 685, 717, 749, 781,
+  813, 845, 877, 909, 941, 973, 1005,
+  14, 46, 78, 110, 142, 174, 206, 238, 270, 302, 334, 366, 398,
+  430, 462, 494, 526, 558, 590, 622, 654, 686, 718, 750, 782,
+  814, 846, 878, 910, 942, 974, 1006,
+  15, 47, 79, 111, 143, 175, 207, 239, 271, 303, 335, 367, 399,
+  431, 463, 495, 527, 559, 591, 623, 655, 687, 719, 751, 783,
+  815, 847, 879, 911, 943, 975, 1007,
+  16, 48, 80, 112, 144, 176, 208, 240, 272, 304, 336, 368, 400,
+  432, 464, 496, 528, 560, 592, 624, 656, 688, 720, 752, 784,
+  816, 848, 880, 912, 944, 976, 1008,
+  17, 49, 81, 113, 145, 177, 209, 241, 273, 305, 337, 369, 401,
+  433, 465, 497, 529, 561, 593, 625, 657, 689, 721, 753, 785,
+  817, 849, 881, 913, 945, 977, 1009,
+  18, 50, 82, 114, 146, 178, 210, 242, 274, 306, 338, 370, 402,
+  434, 466, 498, 530, 562, 594, 626, 658, 690, 722, 754, 786,
+  818, 850, 882, 914, 946, 978, 1010,
+  19, 51, 83, 115, 147, 179, 211, 243, 275, 307, 339, 371, 403,
+  435, 467, 499, 531, 563, 595, 627, 659, 691, 723, 755, 787,
+  819, 851, 883, 915, 947, 979, 1011,
+  20, 52, 84, 116, 148, 180, 212, 244, 276, 308, 340, 372, 404,
+  436, 468, 500, 532, 564, 596, 628, 660, 692, 724, 756, 788,
+  820, 852, 884, 916, 948, 980, 1012,
+  21, 53, 85, 117, 149, 181, 213, 245, 277, 309, 341, 373, 405,
+  437, 469, 501, 533, 565, 597, 629, 661, 693, 725, 757, 789,
+  821, 853, 885, 917, 949, 981, 1013,
+  22, 54, 86, 118, 150, 182, 214, 246, 278, 310, 342, 374, 406,
+  438, 470, 502, 534, 566, 598, 630, 662, 694, 726, 758, 790,
+  822, 854, 886, 918, 950, 982, 1014,
+  23, 55, 87, 119, 151, 183, 215, 247, 279, 311, 343, 375, 407,
+  439, 471, 503, 535, 567, 599, 631, 663, 695, 727, 759, 791,
+  823, 855, 887, 919, 951, 983, 1015,
+  24, 56, 88, 120, 152, 184, 216, 248, 280, 312, 344, 376, 408,
+  440, 472, 504, 536, 568, 600, 632, 664, 696, 728, 760, 792,
+  824, 856, 888, 920, 952, 984, 1016,
+  25, 57, 89, 121, 153, 185, 217, 249, 281, 313, 345, 377, 409,
+  441, 473, 505, 537, 569, 601, 633, 665, 697, 729, 761, 793,
+  825, 857, 889, 921, 953, 985, 1017,
+  26, 58, 90, 122, 154, 186, 218, 250, 282, 314, 346, 378, 410,
+  442, 474, 506, 538, 570, 602, 634, 666, 698, 730, 762, 794,
+  826, 858, 890, 922, 954, 986, 1018,
+  27, 59, 91, 123, 155, 187, 219, 251, 283, 315, 347, 379, 411,
+  443, 475, 507, 539, 571, 603, 635, 667, 699, 731, 763, 795,
+  827, 859, 891, 923, 955, 987, 1019,
+  28, 60, 92, 124, 156, 188, 220, 252, 284, 316, 348, 380, 412,
+  444, 476, 508, 540, 572, 604, 636, 668, 700, 732, 764, 796,
+  828, 860, 892, 924, 956, 988, 1020,
+  29, 61, 93, 125, 157, 189, 221, 253, 285, 317, 349, 381, 413,
+  445, 477, 509, 541, 573, 605, 637, 669, 701, 733, 765, 797,
+  829, 861, 893, 925, 957, 989, 1021,
+  30, 62, 94, 126, 158, 190, 222, 254, 286, 318, 350, 382, 414,
+  446, 478, 510, 542, 574, 606, 638, 670, 702, 734, 766, 798,
+  830, 862, 894, 926, 958, 990, 1022,
+  31, 63, 95, 127, 159, 191, 223, 255, 287, 319, 351, 383, 415,
+  447, 479, 511, 543, 575, 607, 639, 671, 703, 735, 767, 799,
+  831, 863, 895, 927, 959, 991, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_mrow_iscan_32x32[1024]) = {
+  0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+  17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+  46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+  60, 61, 62, 63,
+  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+  78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
+  92, 93, 94, 95,
+  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
+  108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
+  119, 120, 121, 122, 123, 124, 125, 126, 127,
+  128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138,
+  139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149,
+  150, 151, 152, 153, 154, 155, 156, 157, 158, 159,
+  160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170,
+  171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181,
+  182, 183, 184, 185, 186, 187, 188, 189, 190, 191,
+  192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202,
+  203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213,
+  214, 215, 216, 217, 218, 219, 220, 221, 222, 223,
+  224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234,
+  235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245,
+  246, 247, 248, 249, 250, 251, 252, 253, 254, 255,
+  256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266,
+  267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277,
+  278, 279, 280, 281, 282, 283, 284, 285, 286, 287,
+  288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298,
+  299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309,
+  310, 311, 312, 313, 314, 315, 316, 317, 318, 319,
+  320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330,
+  331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341,
+  342, 343, 344, 345, 346, 347, 348, 349, 350, 351,
+  352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362,
+  363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373,
+  374, 375, 376, 377, 378, 379, 380, 381, 382, 383,
+  384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394,
+  395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405,
+  406, 407, 408, 409, 410, 411, 412, 413, 414, 415,
+  416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426,
+  427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437,
+  438, 439, 440, 441, 442, 443, 444, 445, 446, 447,
+  448, 449, 450, 451, 452, 453, 454, 455, 456, 457, 458,
+  459, 460, 461, 462, 463, 464, 465, 466, 467, 468, 469,
+  470, 471, 472, 473, 474, 475, 476, 477, 478, 479,
+  480, 481, 482, 483, 484, 485, 486, 487, 488, 489, 490,
+  491, 492, 493, 494, 495, 496, 497, 498, 499, 500, 501,
+  502, 503, 504, 505, 506, 507, 508, 509, 510, 511,
+  512, 513, 514, 515, 516, 517, 518, 519, 520, 521, 522,
+  523, 524, 525, 526, 527, 528, 529, 530, 531, 532, 533,
+  534, 535, 536, 537, 538, 539, 540, 541, 542, 543,
+  544, 545, 546, 547, 548, 549, 550, 551, 552, 553, 554,
+  555, 556, 557, 558, 559, 560, 561, 562, 563, 564, 565,
+  566, 567, 568, 569, 570, 571, 572, 573, 574, 575,
+  576, 577, 578, 579, 580, 581, 582, 583, 584, 585, 586,
+  587, 588, 589, 590, 591, 592, 593, 594, 595, 596, 597,
+  598, 599, 600, 601, 602, 603, 604, 605, 606, 607,
+  608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618,
+  619, 620, 621, 622, 623, 624, 625, 626, 627, 628, 629,
+  630, 631, 632, 633, 634, 635, 636, 637, 638, 639,
+  640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650,
+  651, 652, 653, 654, 655, 656, 657, 658, 659, 660, 661,
+  662, 663, 664, 665, 666, 667, 668, 669, 670, 671,
+  672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682,
+  683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693,
+  694, 695, 696, 697, 698, 699, 700, 701, 702, 703,
+  704, 705, 706, 707, 708, 709, 710, 711, 712, 713, 714,
+  715, 716, 717, 718, 719, 720, 721, 722, 723, 724, 725,
+  726, 727, 728, 729, 730, 731, 732, 733, 734, 735,
+  736, 737, 738, 739, 740, 741, 742, 743, 744, 745, 746,
+  747, 748, 749, 750, 751, 752, 753, 754, 755, 756, 757,
+  758, 759, 760, 761, 762, 763, 764, 765, 766, 767,
+  768, 769, 770, 771, 772, 773, 774, 775, 776, 777, 778,
+  779, 780, 781, 782, 783, 784, 785, 786, 787, 788, 789,
+  790, 791, 792, 793, 794, 795, 796, 797, 798, 799,
+  800, 801, 802, 803, 804, 805, 806, 807, 808, 809, 810,
+  811, 812, 813, 814, 815, 816, 817, 818, 819, 820, 821,
+  822, 823, 824, 825, 826, 827, 828, 829, 830, 831,
+  832, 833, 834, 835, 836, 837, 838, 839, 840, 841, 842,
+  843, 844, 845, 846, 847, 848, 849, 850, 851, 852, 853,
+  854, 855, 856, 857, 858, 859, 860, 861, 862, 863,
+  864, 865, 866, 867, 868, 869, 870, 871, 872, 873, 874,
+  875, 876, 877, 878, 879, 880, 881, 882, 883, 884, 885,
+  886, 887, 888, 889, 890, 891, 892, 893, 894, 895,
+  896, 897, 898, 899, 900, 901, 902, 903, 904, 905, 906,
+  907, 908, 909, 910, 911, 912, 913, 914, 915, 916, 917,
+  918, 919, 920, 921, 922, 923, 924, 925, 926, 927,
+  928, 929, 930, 931, 932, 933, 934, 935, 936, 937, 938,
+  939, 940, 941, 942, 943, 944, 945, 946, 947, 948, 949,
+  950, 951, 952, 953, 954, 955, 956, 957, 958, 959,
+  960, 961, 962, 963, 964, 965, 966, 967, 968, 969, 970,
+  971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981,
+  982, 983, 984, 985, 986, 987, 988, 989, 990, 991,
+  992, 993, 994, 995, 996, 997, 998, 999, 1000, 1001,
+  1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010,
+  1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019,
+  1020, 1021, 1022, 1023,
+};
+#endif  // CONFIG_EXT_TX
+
 DECLARE_ALIGNED(16, static const int16_t, vp10_default_iscan_32x32[1024]) = {
   0, 2, 5, 10, 17, 25, 38, 47, 62, 83, 101, 121, 145, 170, 193, 204,
   210, 219, 229, 233, 245, 257, 275, 299, 342, 356, 377, 405, 455, 471, 495,
@@ -693,6 +2674,275 @@
   1023,
 };
 
+#if CONFIG_EXT_TX
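+// Inverse scans for the v2, h2 and qtr 32x32 scan variants whose
+// neighbor tables appear above; as with the other iscan tables in this
+// file, vp10_v2_iscan_32x32[raster_pos] gives the position of that
+// coefficient in the v2 scan order.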
+DECLARE_ALIGNED(16, static const int16_t, vp10_v2_iscan_32x32[1024]) = {
+  0,   1,   4,   9,  15,  22,  33,  43,  56,  71,  86, 104,
+  121, 142, 166, 189, 512, 518, 527, 539, 551, 566, 584, 602,
+  621, 644, 668, 695, 721, 748, 780, 811,   2,   3,   6,  11,
+  17,  26,  35,  45,  58,  73,  90, 106, 123, 146, 168, 193,
+  513, 519, 528, 540, 553, 567, 585, 603, 622, 647, 670, 696,
+  722, 751, 783, 812,   5,   7,   8,  13,  20,  28,  37,  50,
+  62,  75,  92, 108, 129, 150, 170, 195, 514, 521, 530, 541,
+  554, 569, 587, 605, 625, 649, 671, 699, 725, 752, 785, 815,
+  10,  12,  14,  19,  23,  31,  41,  52,  65,  81,  96, 113,
+  133, 152, 175, 201, 515, 522, 531, 542, 556, 572, 589, 607,
+  629, 651, 673, 700, 726, 757, 788, 819,  16,  18,  21,  24,
+  30,  39,  48,  59,  69,  83, 100, 119, 137, 158, 181, 203,
+  516, 523, 534, 545, 559, 574, 591, 610, 632, 654, 679, 704,
+  730, 762, 791, 824,  25,  27,  29,  32,  40,  46,  54,  67,
+  79,  94, 109, 127, 143, 164, 185, 210, 517, 525, 535, 547,
+  561, 578, 595, 615, 635, 656, 684, 707, 737, 766, 793, 830,
+  34,  36,  38,  42,  49,  55,  64,  76,  87, 102, 117, 135,
+  154, 176, 197, 219, 520, 529, 538, 550, 565, 580, 598, 618,
+  639, 664, 687, 712, 741, 769, 802, 833,  44,  47,  51,  53,
+  60,  68,  77,  85,  98, 114, 131, 147, 162, 183, 208, 227,
+  524, 533, 544, 557, 571, 588, 606, 623, 645, 667, 692, 720,
+  747, 776, 806, 838,  57,  61,  63,  66,  70,  80,  88,  99,
+  112, 124, 140, 159, 179, 199, 216, 233, 526, 536, 548, 562,
+  577, 593, 613, 633, 653, 676, 701, 727, 756, 786, 814, 847,
+  72,  74,  78,  82,  84,  95, 103, 115, 125, 139, 156, 173,
+  190, 211, 229, 246, 532, 543, 555, 568, 581, 601, 619, 637,
+  663, 685, 709, 738, 763, 792, 826, 855,  89,  91,  93,  97,
+  101, 110, 118, 132, 141, 157, 171, 186, 206, 224, 241, 255,
+  537, 549, 560, 576, 592, 608, 628, 650, 669, 693, 719, 744,
+  773, 805, 834, 862, 105, 107, 111, 116, 120, 128, 136, 148,
+  160, 174, 187, 205, 221, 236, 251, 267, 546, 558, 570, 583,
+  600, 617, 636, 657, 680, 706, 729, 758, 787, 813, 846, 871,
+  122, 126, 130, 134, 138, 144, 155, 163, 180, 191, 207, 222,
+  232, 248, 264, 278, 552, 564, 579, 594, 609, 630, 648, 666,
+  688, 715, 742, 768, 797, 827, 856, 877, 145, 149, 151, 153,
+  161, 165, 177, 184, 200, 212, 225, 237, 249, 262, 275, 289,
+  563, 575, 590, 604, 620, 638, 660, 683, 705, 728, 753, 779,
+  809, 839, 866, 889, 167, 169, 172, 178, 182, 188, 198, 209,
+  217, 230, 242, 252, 265, 276, 288, 301, 573, 586, 599, 616,
+  634, 652, 672, 694, 716, 743, 767, 794, 825, 850, 874, 899,
+  192, 194, 196, 202, 204, 213, 220, 228, 234, 247, 256, 268,
+  279, 290, 302, 315, 582, 597, 614, 631, 646, 665, 686, 708,
+  732, 759, 784, 810, 837, 863, 886, 908, 214, 215, 218, 223,
+  226, 231, 239, 244, 253, 261, 271, 283, 292, 304, 317, 325,
+  596, 611, 626, 642, 661, 681, 702, 723, 745, 770, 800, 828,
+  853, 875, 897, 919, 235, 238, 240, 243, 245, 250, 257, 263,
+  270, 280, 287, 298, 307, 319, 329, 340, 612, 624, 640, 658,
+  677, 697, 717, 739, 764, 789, 816, 844, 867, 890, 909, 927,
+  254, 258, 259, 260, 266, 269, 272, 282, 286, 296, 303, 312,
+  323, 333, 341, 355, 627, 641, 655, 674, 690, 713, 735, 760,
+  781, 807, 835, 857, 880, 902, 921, 940, 273, 274, 277, 281,
+  284, 285, 291, 299, 305, 310, 320, 327, 337, 346, 357, 369,
+  643, 659, 675, 689, 710, 733, 754, 777, 803, 831, 851, 872,
+  892, 913, 934, 950, 293, 294, 295, 297, 300, 306, 308, 314,
+  321, 326, 335, 343, 352, 361, 372, 378, 662, 678, 691, 711,
+  731, 749, 774, 798, 822, 848, 869, 887, 906, 925, 942, 961,
+  309, 311, 313, 316, 318, 322, 324, 332, 338, 344, 351, 358,
+  367, 375, 386, 394, 682, 698, 714, 734, 750, 772, 795, 820,
+  842, 864, 884, 904, 923, 938, 954, 967, 328, 330, 331, 334,
+  336, 339, 342, 348, 354, 359, 366, 374, 382, 391, 400, 409,
+  703, 718, 736, 755, 775, 796, 818, 840, 860, 882, 900, 917,
+  936, 952, 965, 977, 345, 347, 349, 350, 353, 356, 360, 364,
+  371, 376, 383, 389, 395, 406, 412, 423, 724, 740, 761, 778,
+  799, 821, 841, 859, 878, 895, 915, 932, 948, 963, 975, 986,
+  362, 363, 365, 368, 370, 373, 377, 379, 387, 392, 397, 405,
+  411, 420, 428, 439, 746, 765, 782, 804, 823, 843, 861, 879,
+  894, 911, 930, 946, 959, 973, 984, 994, 380, 381, 384, 385,
+  388, 390, 393, 396, 403, 408, 413, 422, 427, 436, 444, 452,
+  771, 790, 808, 832, 849, 865, 883, 896, 912, 928, 944, 957,
+  971,  982,  992, 1001,  398,  399,  401,  402,  404,  407,  410,  414,
+  419, 425, 429, 437, 442, 449, 458, 465, 801, 817, 836, 852,
+  870,  885,  901,  916,  931,  945,  956,  969,  980,  990,  999, 1007,
+  415, 416, 417, 418, 421, 424, 426, 430, 434, 441, 445, 453,
+  459, 463, 473, 480, 829, 845, 858, 873, 888, 905, 918, 933,
+  947,  958,  970,  979,  988,  997, 1005, 1012,  431,  432,  433,  435,
+  438, 440, 443, 446, 451, 456, 461, 468, 475, 479, 488, 494,
+  854, 868, 881, 893, 907, 924, 937, 949, 960, 972, 981, 989,
+  996, 1003, 1010, 1016,  447,  448,  450,  454,  455,  457,  460,  462,
+  469, 472, 477, 482, 490, 495, 499, 503, 876, 891, 903, 914,
+  926,  939,  953,  964,  974,  983,  991,  998, 1004, 1009, 1014, 1019,
+  464, 466, 467, 470, 471, 474, 476, 478, 484, 489, 493, 497,
+  501, 504, 506, 508, 898, 910, 922, 935, 943, 955, 966, 976,
+  985,  993, 1000, 1006, 1011, 1015, 1018, 1021,  481,  483,  485,  486,
+  487, 491, 492, 496, 498, 500, 502, 505, 507, 509, 510, 511,
+  920,  929,  941,  951,  962,  968,  978,  987,  995, 1002, 1008, 1013,
+  1017, 1020, 1022, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_h2_iscan_32x32[1024]) = {
+  0,   1,   4,   9,  15,  22,  33,  43,  56,  71,  86, 104,
+  121, 142, 166, 189, 214, 233, 254, 273, 292, 309, 328, 345,
+  362, 378, 397, 415, 431, 447, 464, 481,   2,   3,   6,  11,
+  17,  26,  35,  45,  58,  73,  90, 106, 123, 146, 168, 193,
+  215, 236, 255, 274, 294, 310, 329, 346, 363, 381, 399, 416,
+  432, 448, 465, 482,   5,   7,   8,  13,  20,  28,  37,  50,
+  62,  75,  92, 108, 129, 150, 170, 195, 216, 240, 259, 275,
+  295, 312, 331, 348, 365, 383, 400, 417, 433, 449, 467, 485,
+  10,  12,  14,  19,  23,  31,  41,  52,  65,  81,  96, 113,
+  133, 152, 175, 201, 221, 243, 260, 280, 297, 315, 333, 350,
+  367, 385, 402, 418, 434, 452, 470, 486,  16,  18,  21,  24,
+  30,  39,  48,  59,  69,  83, 100, 119, 137, 158, 181, 203,
+  226, 244, 264, 283, 300, 318, 335, 353, 370, 388, 404, 420,
+  438, 455, 471, 487,  25,  27,  29,  32,  40,  46,  54,  67,
+  79,  94, 109, 127, 143, 164, 185, 210, 231, 250, 269, 285,
+  304, 322, 339, 356, 373, 389, 407, 423, 440, 457, 473, 491,
+  34,  36,  38,  42,  49,  55,  64,  76,  87, 102, 117, 135,
+  154, 176, 197, 219, 239, 256, 272, 291, 308, 324, 341, 359,
+  377, 393, 410, 426, 442, 460, 476, 492,  44,  47,  51,  53,
+  60,  68,  77,  85,  98, 114, 131, 147, 162, 183, 208, 227,
+  245, 262, 282, 298, 314, 332, 349, 364, 379, 396, 412, 430,
+  446, 462, 478, 495,  57,  61,  63,  66,  70,  80,  88,  99,
+  112, 124, 140, 159, 179, 199, 217, 234, 253, 270, 286, 305,
+  321, 337, 354, 371, 387, 403, 419, 435, 451, 468, 484, 498,
+  72,  74,  78,  82,  84,  95, 103, 115, 125, 139, 156, 173,
+  190, 211, 229, 246, 261, 281, 296, 311, 325, 344, 360, 375,
+  392, 408, 425, 441, 456, 472, 489, 500,  89,  91,  93,  97,
+  101, 110, 118, 132, 141, 157, 171, 186, 206, 224, 241, 257,
+  271, 287, 303, 320, 336, 351, 366, 384, 398, 413, 429, 445,
+  461, 477, 493, 502, 105, 107, 111, 116, 120, 128, 136, 148,
+  160, 174, 187, 205, 222, 237, 251, 267, 284, 299, 313, 327,
+  343, 358, 374, 390, 405, 422, 437, 453, 469, 483, 497, 505,
+  122, 126, 130, 134, 138, 144, 155, 163, 180, 191, 207, 223,
+  232, 248, 265, 278, 293, 307, 323, 338, 352, 368, 382, 395,
+  411, 427, 443, 459, 475, 490, 501, 507, 145, 149, 151, 153,
+  161, 165, 177, 184, 200, 212, 225, 238, 249, 263, 276, 289,
+  306, 319, 334, 347, 361, 376, 391, 406, 421, 436, 450, 463,
+  479, 496, 504, 509, 167, 169, 172, 178, 182, 188, 198, 209,
+  218, 230, 242, 252, 266, 277, 288, 301, 317, 330, 342, 357,
+  372, 386, 401, 414, 428, 444, 458, 474, 488, 499, 506, 510,
+  192, 194, 196, 202, 204, 213, 220, 228, 235, 247, 258, 268,
+  279, 290, 302, 316, 326, 340, 355, 369, 380, 394, 409, 424,
+  439, 454, 466, 480, 494, 503, 508, 511, 512, 513, 514, 515,
+  516, 517, 520, 523, 526, 532, 537, 545, 551, 561, 573, 581,
+  596, 610, 625, 642, 661, 680, 701, 722, 745, 770, 800, 827,
+  853, 875, 897, 919, 518, 519, 521, 522, 524, 525, 528, 533,
+  536, 542, 549, 557, 564, 575, 585, 597, 611, 623, 640, 656,
+  676, 696, 717, 739, 763, 789, 815, 844, 867, 889, 909, 927,
+  527, 529, 530, 531, 534, 535, 538, 544, 548, 555, 560, 569,
+  579, 589, 598, 614, 626, 641, 655, 673, 690, 712, 735, 760,
+  780, 806, 834, 857, 880, 902, 921, 940, 539, 540, 541, 543,
+  546, 547, 550, 558, 562, 567, 576, 583, 593, 603, 616, 631,
+  643, 657, 674, 689, 710, 733, 752, 776, 803, 830, 850, 872,
+  892, 913, 934, 950, 552, 553, 554, 556, 559, 563, 565, 571,
+  577, 582, 591, 600, 609, 620, 634, 644, 662, 677, 691, 711,
+  730, 748, 773, 798, 822, 847, 869, 887, 906, 925, 942, 961,
+  566, 568, 570, 572, 574, 578, 580, 588, 594, 601, 608, 617,
+  629, 637, 652, 665, 681, 697, 713, 734, 749, 772, 793, 819,
+  842, 863, 884, 904, 923, 938, 954, 967, 584, 586, 587, 590,
+  592, 595, 599, 605, 613, 618, 628, 636, 648, 660, 671, 686,
+  702, 718, 736, 753, 774, 794, 818, 840, 860, 882, 900, 917,
+  936, 952, 965, 977, 602, 604, 606, 607, 612, 615, 619, 624,
+  633, 638, 649, 658, 666, 683, 692, 707, 723, 740, 761, 777,
+  799, 820, 841, 859, 877, 895, 915, 932, 948, 963, 975, 986,
+  621, 622, 627, 630, 632, 635, 639, 645, 653, 663, 668, 682,
+  688, 704, 716, 732, 746, 764, 781, 804, 823, 843, 861, 878,
+  894, 911, 930, 946, 959, 973, 984, 994, 646, 647, 650, 651,
+  654, 659, 664, 667, 678, 685, 693, 706, 715, 728, 743, 757,
+  771, 790, 807, 831, 848, 864, 883, 896, 912, 928, 944, 957,
+  971,  982,  992, 1001,  669,  670,  672,  675,  679,  684,  687,  694,
+  703, 709, 719, 729, 741, 754, 767, 783, 801, 816, 835, 851,
+  870,  885,  901,  916,  931,  945,  956,  969,  980,  990,  999, 1007,
+  695, 698, 699, 700, 705, 708, 714, 720, 726, 738, 744, 758,
+  768, 779, 795, 810, 828, 845, 858, 873, 888, 905, 918, 933,
+  947,  958,  970,  979,  988,  997, 1005, 1012,  721,  724,  725,  727,
+  731, 737, 742, 747, 756, 765, 775, 786, 797, 809, 825, 837,
+  854, 868, 881, 893, 907, 924, 937, 949, 960, 972, 981, 989,
+  996, 1003, 1010, 1016,  750,  751,  755,  759,  762,  766,  769,  778,
+  787, 792, 805, 812, 829, 838, 852, 865, 876, 890, 903, 914,
+  926,  939,  953,  964,  974,  983,  991,  998, 1004, 1009, 1014, 1019,
+  782, 784, 785, 788, 791, 796, 802, 808, 814, 826, 836, 846,
+  856, 866, 874, 886, 898, 910, 922, 935, 943, 955, 966, 976,
+  985,  993, 1000, 1006, 1011, 1015, 1018, 1021,  811,  813,  817,  821,
+  824, 832, 833, 839, 849, 855, 862, 871, 879, 891, 899, 908,
+  920,  929,  941,  951,  962,  968,  978,  987,  995, 1002, 1008, 1013,
+  1017, 1020, 1022, 1023,
+};
+
+DECLARE_ALIGNED(16, static const int16_t, vp10_qtr_iscan_32x32[1024]) = {
+  0,   1,   4,   9,  15,  22,  33,  43,  56,  71,  86, 104,
+  121, 142, 166, 189, 256, 268, 286, 310, 334, 364, 400, 435,
+  471, 510, 553, 598, 640, 683, 732, 780,   2,   3,   6,  11,
+  17,  26,  35,  45,  58,  73,  90, 106, 123, 146, 168, 193,
+  258, 270, 288, 312, 338, 366, 402, 437, 473, 516, 557, 600,
+  642, 687, 736, 782,   5,   7,   8,  13,  20,  28,  37,  50,
+  62,  75,  92, 108, 129, 150, 170, 195, 260, 274, 292, 314,
+  340, 370, 406, 441, 478, 520, 559, 604, 646, 689, 740, 788,
+  10,  12,  14,  19,  23,  31,  41,  52,  65,  81,  96, 113,
+  133, 152, 175, 201, 262, 276, 294, 316, 344, 376, 410, 445,
+  484, 524, 563, 606, 648, 697, 746, 793,  16,  18,  21,  24,
+  30,  39,  48,  59,  69,  83, 100, 119, 137, 158, 181, 203,
+  264, 278, 300, 322, 350, 380, 414, 451, 490, 530, 571, 612,
+  656, 705, 750, 799,  25,  27,  29,  32,  40,  46,  54,  67,
+  79,  94, 109, 127, 143, 164, 185, 210, 266, 282, 302, 326,
+  354, 388, 422, 459, 496, 533, 579, 618, 665, 711, 754, 809,
+  34,  36,  38,  42,  49,  55,  64,  76,  87, 102, 117, 135,
+  154, 176, 197, 216, 272, 289, 308, 332, 362, 392, 427, 465,
+  504, 545, 585, 626, 671, 717, 766, 813,  44,  47,  51,  53,
+  60,  68,  77,  85,  98, 114, 131, 147, 162, 183, 208, 222,
+  279, 298, 320, 346, 374, 408, 442, 475, 511, 551, 592, 638,
+  681, 726, 772, 821,  57,  61,  63,  66,  70,  80,  88,  99,
+  112, 124, 140, 159, 179, 199, 214, 227, 284, 304, 328, 355,
+  386, 418, 455, 492, 528, 567, 608, 649, 695, 742, 786, 833,
+  72,  74,  78,  82,  84,  95, 103, 115, 125, 139, 156, 173,
+  190, 211, 224, 233, 296, 317, 342, 367, 394, 433, 466, 500,
+  543, 581, 622, 667, 707, 752, 803, 843,  89,  91,  93,  97,
+  101, 110, 118, 132, 141, 157, 171, 186, 206, 220, 231, 239,
+  306, 330, 352, 384, 415, 447, 482, 521, 554, 593, 636, 677,
+  722, 770, 815, 852, 105, 107, 111, 116, 120, 128, 136, 148,
+  160, 174, 187, 205, 218, 229, 237, 244, 323, 347, 371, 398,
+  431, 463, 498, 534, 573, 616, 654, 698, 743, 783, 831, 864,
+  122, 126, 130, 134, 138, 144, 155, 163, 180, 191, 207, 219,
+  226, 235, 242, 248, 335, 360, 390, 419, 449, 485, 518, 549,
+  587, 630, 672, 715, 760, 805, 845, 872, 145, 149, 151, 153,
+  161, 165, 177, 184, 200, 212, 221, 230, 236, 241, 246, 251,
+  356, 382, 411, 438, 469, 501, 539, 577, 613, 652, 690, 730,
+  776, 822, 858, 886, 167, 169, 172, 178, 182, 188, 198, 209,
+  215, 225, 232, 238, 243, 247, 250, 253, 378, 403, 428, 461,
+  494, 526, 560, 594, 632, 675, 713, 755, 801, 837, 868, 897,
+  192, 194, 196, 202, 204, 213, 217, 223, 228, 234, 240, 245,
+  249, 252, 254, 255, 395, 425, 457, 488, 512, 547, 583, 619,
+  659, 699, 737, 778, 819, 854, 882, 907, 257, 259, 261, 263,
+  265, 267, 273, 280, 285, 297, 307, 324, 336, 357, 379, 396,
+  424, 452, 479, 508, 541, 574, 609, 643, 679, 719, 764, 806,
+  841, 870, 895, 919, 269, 271, 275, 277, 281, 283, 290, 299,
+  305, 318, 331, 348, 361, 383, 404, 426, 453, 476, 506, 535,
+  568, 601, 634, 669, 708, 748, 789, 829, 860, 887, 909, 927,
+  287, 291, 293, 295, 301, 303, 309, 321, 329, 343, 353, 372,
+  391, 412, 429, 458, 480, 507, 532, 564, 590, 627, 663, 703,
+  733, 773, 816, 847, 876, 901, 921, 940, 311, 313, 315, 319,
+  325, 327, 333, 349, 358, 368, 385, 399, 420, 439, 462, 489,
+  509, 536, 565, 589, 624, 661, 691, 727, 768, 810, 838, 866,
+  890, 913, 934, 950, 337, 339, 341, 345, 351, 359, 363, 375,
+  387, 397, 416, 432, 450, 470, 495, 513, 542, 569, 591, 625,
+  657, 684, 723, 762, 797, 834, 862, 884, 905, 925, 942, 961,
+  365, 369, 373, 377, 381, 389, 393, 409, 421, 434, 448, 464,
+  486, 502, 527, 548, 575, 602, 628, 662, 685, 721, 756, 794,
+  827, 855, 880, 903, 923, 938, 954, 967, 401, 405, 407, 413,
+  417, 423, 430, 443, 456, 467, 483, 499, 519, 540, 561, 584,
+  610, 635, 664, 692, 724, 757, 792, 825, 850, 878, 899, 917,
+  936, 952, 965, 977, 436, 440, 444, 446, 454, 460, 468, 477,
+  493, 503, 522, 537, 550, 578, 595, 620, 644, 670, 704, 728,
+  763, 795, 826, 849, 873, 893, 915, 932, 948, 963, 975, 986,
+  472, 474, 481, 487, 491, 497, 505, 514, 529, 544, 555, 576,
+  588, 614, 633, 660, 680, 709, 734, 769, 798, 828, 851, 874,
+  892, 911, 930, 946, 959, 973, 984, 994, 515, 517, 523, 525,
+  531, 538, 546, 552, 570, 582, 596, 617, 631, 653, 676, 700,
+  720, 749, 774, 811, 835, 856, 879, 894, 912, 928, 944, 957,
+  971,  982,  992, 1001,  556,  558,  562,  566,  572,  580,  586,  597,
+  611, 623, 637, 655, 673, 693, 714, 738, 765, 790, 817, 839,
+  863,  881,  900,  916,  931,  945,  956,  969,  980,  990,  999, 1007,
+  599, 603, 605, 607, 615, 621, 629, 639, 650, 668, 678, 701,
+  716, 731, 758, 779, 807, 830, 848, 867, 885, 904, 918, 933,
+  947,  958,  970,  979,  988,  997, 1005, 1012,  641,  645,  647,  651,
+  658, 666, 674, 682, 696, 710, 725, 744, 761, 777, 802, 820,
+  842, 861, 877, 891, 906, 924, 937, 949, 960, 972, 981, 989,
+  996, 1003, 1010, 1016,  686,  688,  694,  702,  706,  712,  718,  729,
+  745, 753, 771, 784, 808, 823, 840, 857, 871, 888, 902, 914,
+  926,  939,  953,  964,  974,  983,  991,  998, 1004, 1009, 1014, 1019,
+  735, 739, 741, 747, 751, 759, 767, 775, 787, 804, 818, 832,
+  846, 859, 869, 883, 896, 910, 922, 935, 943, 955, 966, 976,
+  985,  993, 1000, 1006, 1011, 1015, 1018, 1021,  781,  785,  791,  796,
+  800, 812, 814, 824, 836, 844, 853, 865, 875, 889, 898, 908,
+  920,  929,  941,  951,  962,  968,  978,  987,  995, 1002, 1008, 1013,
+  1017, 1020, 1022, 1023,
+};
+#endif  // CONFIG_EXT_TX
+
 const scan_order vp10_default_scan_orders[TX_SIZES] = {
   {default_scan_4x4,   vp10_default_iscan_4x4,   default_scan_4x4_neighbors},
   {default_scan_8x8,   vp10_default_iscan_8x8,   default_scan_8x8_neighbors},
@@ -700,7 +2950,188 @@
   {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
 };
 
-const scan_order vp10_scan_orders[TX_SIZES][TX_TYPES] = {
+#if CONFIG_EXT_TX
+const scan_order vp10_intra_scan_orders[TX_SIZES][TX_TYPES] = {
+  {  // TX_4X4
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {row_scan_4x4,     vp10_row_iscan_4x4,     row_scan_4x4_neighbors},
+    {col_scan_4x4,     vp10_col_iscan_4x4,     col_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {mrow_scan_4x4,    vp10_mrow_iscan_4x4,    mrow_scan_4x4_neighbors},
+    {row_scan_4x4,     vp10_row_iscan_4x4,     row_scan_4x4_neighbors},
+    {col_scan_4x4,     vp10_col_iscan_4x4,     col_scan_4x4_neighbors},
+    {row_scan_4x4,     vp10_row_iscan_4x4,     row_scan_4x4_neighbors},
+    {col_scan_4x4,     vp10_col_iscan_4x4,     col_scan_4x4_neighbors},
+    {row_scan_4x4,     vp10_row_iscan_4x4,     row_scan_4x4_neighbors},
+    {col_scan_4x4,     vp10_col_iscan_4x4,     col_scan_4x4_neighbors},
+  }, {  // TX_8X8
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {row_scan_8x8,     vp10_row_iscan_8x8,     row_scan_8x8_neighbors},
+    {col_scan_8x8,     vp10_col_iscan_8x8,     col_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {mrow_scan_8x8,    vp10_mrow_iscan_8x8,    mrow_scan_8x8_neighbors},
+    {row_scan_8x8,     vp10_row_iscan_8x8,     row_scan_8x8_neighbors},
+    {col_scan_8x8,     vp10_col_iscan_8x8,     col_scan_8x8_neighbors},
+    {row_scan_8x8,     vp10_row_iscan_8x8,     row_scan_8x8_neighbors},
+    {col_scan_8x8,     vp10_col_iscan_8x8,     col_scan_8x8_neighbors},
+    {row_scan_8x8,     vp10_row_iscan_8x8,     row_scan_8x8_neighbors},
+    {col_scan_8x8,     vp10_col_iscan_8x8,     col_scan_8x8_neighbors},
+  }, {  // TX_16X16
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
+    {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {mrow_scan_16x16,    vp10_mrow_iscan_16x16,    mrow_scan_16x16_neighbors},
+    {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
+    {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
+    {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
+    {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
+    {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
+    {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
+  }, {  // TX_32X32
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {h2_scan_32x32,  vp10_h2_iscan_32x32,  h2_scan_32x32_neighbors},
+    {v2_scan_32x32,  vp10_v2_iscan_32x32,  v2_scan_32x32_neighbors},
+    {qtr_scan_32x32, vp10_qtr_iscan_32x32, qtr_scan_32x32_neighbors},
+    {h2_scan_32x32,  vp10_h2_iscan_32x32,  h2_scan_32x32_neighbors},
+    {v2_scan_32x32,  vp10_v2_iscan_32x32,  v2_scan_32x32_neighbors},
+    {qtr_scan_32x32, vp10_qtr_iscan_32x32, qtr_scan_32x32_neighbors},
+    {qtr_scan_32x32, vp10_qtr_iscan_32x32, qtr_scan_32x32_neighbors},
+    {qtr_scan_32x32, vp10_qtr_iscan_32x32, qtr_scan_32x32_neighbors},
+    {mrow_scan_32x32,  vp10_mrow_iscan_32x32,  mrow_scan_32x32_neighbors},
+    {mrow_scan_32x32,  vp10_mrow_iscan_32x32,  mrow_scan_32x32_neighbors},
+    {mcol_scan_32x32,  vp10_mcol_iscan_32x32,  mcol_scan_32x32_neighbors},
+    {mrow_scan_32x32,  vp10_mrow_iscan_32x32,  mrow_scan_32x32_neighbors},
+    {mcol_scan_32x32,  vp10_mcol_iscan_32x32,  mcol_scan_32x32_neighbors},
+    {mrow_scan_32x32,  vp10_mrow_iscan_32x32,  mrow_scan_32x32_neighbors},
+    {mcol_scan_32x32,  vp10_mcol_iscan_32x32,  mcol_scan_32x32_neighbors},
+  }
+};
+
+const scan_order vp10_inter_scan_orders[TX_SIZES][TX_TYPES] = {
+  {  // TX_4X4
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
+    {mrow_scan_4x4,    vp10_mrow_iscan_4x4,    mrow_scan_4x4_neighbors},
+    {mrow_scan_4x4,    vp10_mrow_iscan_4x4,    mrow_scan_4x4_neighbors},
+    {mcol_scan_4x4,    vp10_mcol_iscan_4x4,    mcol_scan_4x4_neighbors},
+    {mrow_scan_4x4,    vp10_mrow_iscan_4x4,    mrow_scan_4x4_neighbors},
+    {mcol_scan_4x4,    vp10_mcol_iscan_4x4,    mcol_scan_4x4_neighbors},
+    {mrow_scan_4x4,    vp10_mrow_iscan_4x4,    mrow_scan_4x4_neighbors},
+    {mcol_scan_4x4,    vp10_mcol_iscan_4x4,    mcol_scan_4x4_neighbors},
+  }, {  // TX_8X8
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors},
+    {mrow_scan_8x8,    vp10_mrow_iscan_8x8,    mrow_scan_8x8_neighbors},
+    {mrow_scan_8x8,    vp10_mrow_iscan_8x8,    mrow_scan_8x8_neighbors},
+    {mcol_scan_8x8,    vp10_mcol_iscan_8x8,    mcol_scan_8x8_neighbors},
+    {mrow_scan_8x8,    vp10_mrow_iscan_8x8,    mrow_scan_8x8_neighbors},
+    {mcol_scan_8x8,    vp10_mcol_iscan_8x8,    mcol_scan_8x8_neighbors},
+    {mrow_scan_8x8,    vp10_mrow_iscan_8x8,    mrow_scan_8x8_neighbors},
+    {mcol_scan_8x8,    vp10_mcol_iscan_8x8,    mcol_scan_8x8_neighbors},
+  }, {  // TX_16X16
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
+    {mrow_scan_16x16,  vp10_mrow_iscan_16x16,  mrow_scan_16x16_neighbors},
+    {mrow_scan_16x16,  vp10_mrow_iscan_16x16,  mrow_scan_16x16_neighbors},
+    {mcol_scan_16x16,  vp10_mcol_iscan_16x16,  mcol_scan_16x16_neighbors},
+    {mrow_scan_16x16,  vp10_mrow_iscan_16x16,  mrow_scan_16x16_neighbors},
+    {mcol_scan_16x16,  vp10_mcol_iscan_16x16,  mcol_scan_16x16_neighbors},
+    {mrow_scan_16x16,  vp10_mrow_iscan_16x16,  mrow_scan_16x16_neighbors},
+    {mcol_scan_16x16,  vp10_mcol_iscan_16x16,  mcol_scan_16x16_neighbors},
+  }, {  // TX_32X32
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {h2_scan_32x32,  vp10_h2_iscan_32x32,  h2_scan_32x32_neighbors},
+    {v2_scan_32x32,  vp10_v2_iscan_32x32,  v2_scan_32x32_neighbors},
+    {qtr_scan_32x32, vp10_qtr_iscan_32x32, qtr_scan_32x32_neighbors},
+    {h2_scan_32x32,  vp10_h2_iscan_32x32,  h2_scan_32x32_neighbors},
+    {v2_scan_32x32,  vp10_v2_iscan_32x32,  v2_scan_32x32_neighbors},
+    {qtr_scan_32x32, vp10_qtr_iscan_32x32, qtr_scan_32x32_neighbors},
+    {qtr_scan_32x32, vp10_qtr_iscan_32x32, qtr_scan_32x32_neighbors},
+    {qtr_scan_32x32, vp10_qtr_iscan_32x32, qtr_scan_32x32_neighbors},
+    {mrow_scan_32x32,  vp10_mrow_iscan_32x32,  mrow_scan_32x32_neighbors},
+    {mrow_scan_32x32,  vp10_mrow_iscan_32x32,  mrow_scan_32x32_neighbors},
+    {mcol_scan_32x32,  vp10_mcol_iscan_32x32,  mcol_scan_32x32_neighbors},
+    {mrow_scan_32x32,  vp10_mrow_iscan_32x32,  mrow_scan_32x32_neighbors},
+    {mcol_scan_32x32,  vp10_mcol_iscan_32x32,  mcol_scan_32x32_neighbors},
+    {mrow_scan_32x32,  vp10_mrow_iscan_32x32,  mrow_scan_32x32_neighbors},
+    {mcol_scan_32x32,  vp10_mcol_iscan_32x32,  mcol_scan_32x32_neighbors},
+  }
+};
+
+#else   // CONFIG_EXT_TX
+
+const scan_order vp10_intra_scan_orders[TX_SIZES][TX_TYPES] = {
   {  // TX_4X4
     {default_scan_4x4, vp10_default_iscan_4x4, default_scan_4x4_neighbors},
     {row_scan_4x4,     vp10_row_iscan_4x4,     row_scan_4x4_neighbors},
@@ -712,14 +3143,21 @@
     {col_scan_8x8,     vp10_col_iscan_8x8,     col_scan_8x8_neighbors},
     {default_scan_8x8, vp10_default_iscan_8x8, default_scan_8x8_neighbors}
   }, {  // TX_16X16
-    {default_scan_16x16, vp10_default_iscan_16x16, default_scan_16x16_neighbors},
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors},
     {row_scan_16x16,     vp10_row_iscan_16x16,     row_scan_16x16_neighbors},
     {col_scan_16x16,     vp10_col_iscan_16x16,     col_scan_16x16_neighbors},
-    {default_scan_16x16, vp10_default_iscan_16x16, default_scan_16x16_neighbors}
+    {default_scan_16x16, vp10_default_iscan_16x16,
+     default_scan_16x16_neighbors}
   }, {  // TX_32X32
-    {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
-    {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
-    {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
-    {default_scan_32x32, vp10_default_iscan_32x32, default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
+    {default_scan_32x32, vp10_default_iscan_32x32,
+     default_scan_32x32_neighbors},
   }
 };
+#endif  // CONFIG_EXT_TX
diff --git a/vp10/common/scan.h b/vp10/common/scan.h
index f5a020f..aadae40 100644
--- a/vp10/common/scan.h
+++ b/vp10/common/scan.h
@@ -30,7 +30,7 @@
 } scan_order;
 
 extern const scan_order vp10_default_scan_orders[TX_SIZES];
-extern const scan_order vp10_scan_orders[TX_SIZES][TX_TYPES];
+extern const scan_order vp10_intra_scan_orders[TX_SIZES][TX_TYPES];
 
 static INLINE int get_coef_context(const int16_t *neighbors,
                                    const uint8_t *token_cache, int c) {
@@ -38,8 +38,31 @@
           token_cache[neighbors[MAX_NEIGHBORS * c + 1]]) >> 1;
 }
 
-static INLINE const scan_order *get_scan(TX_SIZE tx_size, TX_TYPE tx_type) {
-  return &vp10_scan_orders[tx_size][tx_type];
+static INLINE const scan_order *get_intra_scan(TX_SIZE tx_size,
+                                               TX_TYPE tx_type) {
+  return &vp10_intra_scan_orders[tx_size][tx_type];
+}
+
+#if CONFIG_EXT_TX
+extern const scan_order vp10_inter_scan_orders[TX_SIZES][TX_TYPES];
+
+static INLINE const scan_order *get_inter_scan(TX_SIZE tx_size,
+                                               TX_TYPE tx_type) {
+  return &vp10_inter_scan_orders[tx_size][tx_type];
+}
+#endif  // CONFIG_EXT_TX
+
+static INLINE const scan_order *get_scan(TX_SIZE tx_size,
+                                         TX_TYPE tx_type,
+                                         int is_inter) {
+#if CONFIG_EXT_TX
+  return is_inter ? &vp10_inter_scan_orders[tx_size][tx_type]
+                  : &vp10_intra_scan_orders[tx_size][tx_type];
+#else
+  (void)is_inter;
+  return &vp10_intra_scan_orders[tx_size][tx_type];
+#endif  // CONFIG_EXT_TX
 }
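+
+// Usage sketch (illustrative only, not part of the interface): the new
+// is_inter argument lets call sites pick the table from the prediction
+// mode, e.g. assuming the is_inter_block() helper from blockd.h:
+//   const scan_order *const so =
+//       get_scan(tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+//   // so->scan / so->iscan / so->neighbors then drive coefficient coding.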
 
 #ifdef __cplusplus
diff --git a/vp10/common/thread_common.c b/vp10/common/thread_common.c
index 0c7a1c2..963e577 100644
--- a/vp10/common/thread_common.c
+++ b/vp10/common/thread_common.c
@@ -94,9 +94,11 @@
                              int start, int stop, int y_only,
                              VP9LfSync *const lf_sync) {
   const int num_planes = y_only ? 1 : MAX_MB_PLANE;
-  const int sb_cols = mi_cols_aligned_to_sb(cm->mi_cols) >> MI_BLOCK_SIZE_LOG2;
+  const int sb_cols = mi_cols_aligned_to_sb(cm) >> cm->mib_size_log2;
   int mi_row, mi_col;
+#if !CONFIG_EXT_PARTITION_TYPES
   enum lf_path path;
+  LOOP_FILTER_MASK lfm;
   if (y_only)
     path = LF_PATH_444;
   else if (planes[1].subsampling_y == 1 && planes[1].subsampling_x == 1)
@@ -105,21 +107,32 @@
     path = LF_PATH_444;
   else
     path = LF_PATH_SLOW;
+#endif  // !CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_EXT_PARTITION
+  printf("STOPPING: This code has not been modified to work with the "
+         "extended coding unit size experiment");
+  exit(EXIT_FAILURE);
+#endif  // CONFIG_EXT_PARTITION
 
   for (mi_row = start; mi_row < stop;
-       mi_row += lf_sync->num_workers * MI_BLOCK_SIZE) {
+       mi_row += lf_sync->num_workers * cm->mib_size) {
     MODE_INFO **const mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
 
-    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += MI_BLOCK_SIZE) {
-      const int r = mi_row >> MI_BLOCK_SIZE_LOG2;
-      const int c = mi_col >> MI_BLOCK_SIZE_LOG2;
-      LOOP_FILTER_MASK lfm;
+    for (mi_col = 0; mi_col < cm->mi_cols; mi_col += cm->mib_size) {
+      const int r = mi_row >> cm->mib_size_log2;
+      const int c = mi_col >> cm->mib_size_log2;
       int plane;
 
       sync_read(lf_sync, r, c);
 
       vp10_setup_dst_planes(planes, frame_buffer, mi_row, mi_col);
 
+#if CONFIG_EXT_PARTITION_TYPES
+      for (plane = 0; plane < num_planes; ++plane)
+        vp10_filter_block_plane_non420(cm, &planes[plane], mi + mi_col,
+                                       mi_row, mi_col);
+#else
       // TODO(JBB): Make setup_mask work for non 420.
       vp10_setup_mask(cm, mi_row, mi_col, mi + mi_col, cm->mi_stride,
                      &lfm);
@@ -139,7 +152,7 @@
             break;
         }
       }
-
+#endif  // CONFIG_EXT_PARTITION_TYPES
       sync_write(lf_sync, r, c, sb_cols);
     }
   }
@@ -162,13 +175,19 @@
                                 VP9LfSync *lf_sync) {
   const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
   // Number of superblock rows and cols
-  const int sb_rows = mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2;
+  const int sb_rows = mi_rows_aligned_to_sb(cm) >> cm->mib_size_log2;
   // Decoder may allocate more threads than number of tiles based on user's
   // input.
-  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_cols = cm->tile_cols;
   const int num_workers = VPXMIN(nworkers, tile_cols);
   int i;
 
+#if CONFIG_EXT_PARTITION
+  printf("STOPPING: This code has not been modified to work with the "
+         "extended coding unit size experiment.\n");
+  exit(EXIT_FAILURE);
+#endif  // CONFIG_EXT_PARTITION
+
   if (!lf_sync->sync_range || sb_rows != lf_sync->rows ||
       num_workers > lf_sync->num_workers) {
     vp10_loop_filter_dealloc(lf_sync);
@@ -196,7 +215,7 @@
 
     // Loopfilter data
     vp10_loop_filter_data_reset(lf_data, frame, cm, planes);
-    lf_data->start = start + i * MI_BLOCK_SIZE;
+    lf_data->start = start + i * cm->mib_size;
     lf_data->stop = stop;
     lf_data->y_only = y_only;
 
@@ -317,143 +336,15 @@
   }
 }
 
-// Accumulate frame counts.
-void vp10_accumulate_frame_counts(VP10_COMMON *cm, FRAME_COUNTS *counts,
-                                 int is_dec) {
-  int i, j, k, l, m;
+// Accumulate frame counts. FRAME_COUNTS consists solely of 'unsigned int'
+// members, so we can treat it as a flat array and sum over its whole length.
+void vp10_accumulate_frame_counts(VP10_COMMON *cm, FRAME_COUNTS *counts) {
+  unsigned int *const acc = (unsigned int*)&cm->counts;
+  const unsigned int *const cnt = (unsigned int*)counts;
 
-  for (i = 0; i < BLOCK_SIZE_GROUPS; i++)
-    for (j = 0; j < INTRA_MODES; j++)
-      cm->counts.y_mode[i][j] += counts->y_mode[i][j];
+  const unsigned int n_counts = sizeof(FRAME_COUNTS)/sizeof(unsigned int);
+  unsigned int i;
 
-  for (i = 0; i < INTRA_MODES; i++)
-    for (j = 0; j < INTRA_MODES; j++)
-      cm->counts.uv_mode[i][j] += counts->uv_mode[i][j];
-
-  for (i = 0; i < PARTITION_CONTEXTS; i++)
-    for (j = 0; j < PARTITION_TYPES; j++)
-      cm->counts.partition[i][j] += counts->partition[i][j];
-
-  if (is_dec) {
-    int n;
-    for (i = 0; i < TX_SIZES; i++)
-      for (j = 0; j < PLANE_TYPES; j++)
-        for (k = 0; k < REF_TYPES; k++)
-          for (l = 0; l < COEF_BANDS; l++)
-            for (m = 0; m < COEFF_CONTEXTS; m++) {
-              cm->counts.eob_branch[i][j][k][l][m] +=
-                  counts->eob_branch[i][j][k][l][m];
-              for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
-                cm->counts.coef[i][j][k][l][m][n] +=
-                    counts->coef[i][j][k][l][m][n];
-            }
-  } else {
-    for (i = 0; i < TX_SIZES; i++)
-      for (j = 0; j < PLANE_TYPES; j++)
-        for (k = 0; k < REF_TYPES; k++)
-          for (l = 0; l < COEF_BANDS; l++)
-            for (m = 0; m < COEFF_CONTEXTS; m++)
-              cm->counts.eob_branch[i][j][k][l][m] +=
-                  counts->eob_branch[i][j][k][l][m];
-                // In the encoder, cm->counts.coef is only updated at frame
-                // level, so not need to accumulate it here.
-                // for (n = 0; n < UNCONSTRAINED_NODES + 1; n++)
-                //   cm->counts.coef[i][j][k][l][m][n] +=
-                //       counts->coef[i][j][k][l][m][n];
-  }
-
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-    for (j = 0; j < SWITCHABLE_FILTERS; j++)
-      cm->counts.switchable_interp[i][j] += counts->switchable_interp[i][j];
-
-  for (i = 0; i < INTER_MODE_CONTEXTS; i++)
-    for (j = 0; j < INTER_MODES; j++)
-      cm->counts.inter_mode[i][j] += counts->inter_mode[i][j];
-
-  for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
-    for (j = 0; j < 2; j++)
-      cm->counts.intra_inter[i][j] += counts->intra_inter[i][j];
-
-  for (i = 0; i < COMP_INTER_CONTEXTS; i++)
-    for (j = 0; j < 2; j++)
-      cm->counts.comp_inter[i][j] += counts->comp_inter[i][j];
-
-  for (i = 0; i < REF_CONTEXTS; i++)
-    for (j = 0; j < 2; j++)
-      for (k = 0; k < 2; k++)
-      cm->counts.single_ref[i][j][k] += counts->single_ref[i][j][k];
-
-  for (i = 0; i < REF_CONTEXTS; i++)
-    for (j = 0; j < 2; j++)
-      cm->counts.comp_ref[i][j] += counts->comp_ref[i][j];
-
-  for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-    for (j = 0; j < TX_SIZES; j++)
-      cm->counts.tx.p32x32[i][j] += counts->tx.p32x32[i][j];
-
-    for (j = 0; j < TX_SIZES - 1; j++)
-      cm->counts.tx.p16x16[i][j] += counts->tx.p16x16[i][j];
-
-    for (j = 0; j < TX_SIZES - 2; j++)
-      cm->counts.tx.p8x8[i][j] += counts->tx.p8x8[i][j];
-  }
-
-  for (i = 0; i < TX_SIZES; i++)
-    cm->counts.tx.tx_totals[i] += counts->tx.tx_totals[i];
-
-  for (i = 0; i < SKIP_CONTEXTS; i++)
-    for (j = 0; j < 2; j++)
-      cm->counts.skip[i][j] += counts->skip[i][j];
-
-  for (i = 0; i < MV_JOINTS; i++)
-    cm->counts.mv.joints[i] += counts->mv.joints[i];
-
-  for (k = 0; k < 2; k++) {
-    nmv_component_counts *comps = &cm->counts.mv.comps[k];
-    nmv_component_counts *comps_t = &counts->mv.comps[k];
-
-    for (i = 0; i < 2; i++) {
-      comps->sign[i] += comps_t->sign[i];
-      comps->class0_hp[i] += comps_t->class0_hp[i];
-      comps->hp[i] += comps_t->hp[i];
-    }
-
-    for (i = 0; i < MV_CLASSES; i++)
-      comps->classes[i] += comps_t->classes[i];
-
-    for (i = 0; i < CLASS0_SIZE; i++) {
-      comps->class0[i] += comps_t->class0[i];
-      for (j = 0; j < MV_FP_SIZE; j++)
-        comps->class0_fp[i][j] += comps_t->class0_fp[i][j];
-    }
-
-    for (i = 0; i < MV_OFFSET_BITS; i++)
-      for (j = 0; j < 2; j++)
-        comps->bits[i][j] += comps_t->bits[i][j];
-
-    for (i = 0; i < MV_FP_SIZE; i++)
-      comps->fp[i] += comps_t->fp[i];
-  }
-
-  for (i = 0; i < EXT_TX_SIZES; i++) {
-    int j;
-    for (j = 0; j < TX_TYPES; ++j)
-      for (k = 0; k < TX_TYPES; k++)
-        cm->counts.intra_ext_tx[i][j][k] += counts->intra_ext_tx[i][j][k];
-  }
-  for (i = 0; i < EXT_TX_SIZES; i++) {
-    for (k = 0; k < TX_TYPES; k++)
-      cm->counts.inter_ext_tx[i][k] += counts->inter_ext_tx[i][k];
-  }
-
-#if CONFIG_MISC_FIXES
-  for (i = 0; i < PREDICTION_PROBS; i++)
-    for (j = 0; j < 2; j++)
-      cm->counts.seg.pred[i][j] += counts->seg.pred[i][j];
-
-  for (i = 0; i < MAX_SEGMENTS; i++) {
-    cm->counts.seg.tree_total[i] += counts->seg.tree_total[i];
-    cm->counts.seg.tree_mispred[i] += counts->seg.tree_mispred[i];
-  }
-#endif
+  for (i = 0; i < n_counts; i++)
+    acc[i] += cnt[i];
 }
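+
+// Partial compile-time sketch of the layout assumption above (illustrative;
+// it only catches a stray member when it breaks the size divisibility):
+//   typedef char frame_counts_layout_check[
+//       (sizeof(FRAME_COUNTS) % sizeof(unsigned int)) == 0 ? 1 : -1];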
diff --git a/vp10/common/thread_common.h b/vp10/common/thread_common.h
index a401ddc..a4ac030 100644
--- a/vp10/common/thread_common.h
+++ b/vp10/common/thread_common.h
@@ -56,7 +56,7 @@
                               VP9LfSync *lf_sync);
 
 void vp10_accumulate_frame_counts(struct VP10Common *cm,
-                                 struct FRAME_COUNTS *counts, int is_dec);
+                                 struct FRAME_COUNTS *counts);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/common/tile_common.c b/vp10/common/tile_common.c
index 4d92b4c..04b19eb 100644
--- a/vp10/common/tile_common.c
+++ b/vp10/common/tile_common.c
@@ -12,23 +12,16 @@
 #include "vp10/common/onyxc_int.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 
-#define MIN_TILE_WIDTH_B64 4
-#define MAX_TILE_WIDTH_B64 64
-
-static int get_tile_offset(int idx, int mis, int log2) {
-  const int sb_cols = mi_cols_aligned_to_sb(mis) >> MI_BLOCK_SIZE_LOG2;
-  const int offset = ((idx * sb_cols) >> log2) << MI_BLOCK_SIZE_LOG2;
-  return VPXMIN(offset, mis);
-}
-
 void vp10_tile_set_row(TileInfo *tile, const VP10_COMMON *cm, int row) {
-  tile->mi_row_start = get_tile_offset(row, cm->mi_rows, cm->log2_tile_rows);
-  tile->mi_row_end = get_tile_offset(row + 1, cm->mi_rows, cm->log2_tile_rows);
+  tile->mi_row_start = row * cm->tile_height;
+  tile->mi_row_end   = VPXMIN(tile->mi_row_start + cm->tile_height,
+                              cm->mi_rows);
 }
 
 void vp10_tile_set_col(TileInfo *tile, const VP10_COMMON *cm, int col) {
-  tile->mi_col_start = get_tile_offset(col, cm->mi_cols, cm->log2_tile_cols);
-  tile->mi_col_end = get_tile_offset(col + 1, cm->mi_cols, cm->log2_tile_cols);
+  tile->mi_col_start = col * cm->tile_width;
+  tile->mi_col_end   = VPXMIN(tile->mi_col_start + cm->tile_width,
+                              cm->mi_cols);
 }
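+
+// Worked example (illustrative numbers): with cm->tile_width == 16 and
+// cm->mi_cols == 38, successive column tiles span mi columns [0, 16),
+// [16, 32) and [32, 38); the VPXMIN() clamp only trims the last tile.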
 
 void vp10_tile_init(TileInfo *tile, const VP10_COMMON *cm, int row, int col) {
@@ -36,24 +29,36 @@
   vp10_tile_set_col(tile, cm, col);
 }
 
-static int get_min_log2_tile_cols(const int sb64_cols) {
+#if !CONFIG_EXT_TILE
+
+# if CONFIG_EXT_PARTITION
+#   define MIN_TILE_WIDTH_MAX_SB 2
+#   define MAX_TILE_WIDTH_MAX_SB 32
+# else
+#   define MIN_TILE_WIDTH_MAX_SB 4
+#   define MAX_TILE_WIDTH_MAX_SB 64
+# endif  // CONFIG_EXT_PARTITION
+
+static int get_min_log2_tile_cols(const int max_sb_cols) {
   int min_log2 = 0;
-  while ((MAX_TILE_WIDTH_B64 << min_log2) < sb64_cols)
+  while ((MAX_TILE_WIDTH_MAX_SB << min_log2) < max_sb_cols)
     ++min_log2;
   return min_log2;
 }
 
-static int get_max_log2_tile_cols(const int sb64_cols) {
+static int get_max_log2_tile_cols(const int max_sb_cols) {
   int max_log2 = 1;
-  while ((sb64_cols >> max_log2) >= MIN_TILE_WIDTH_B64)
+  while ((max_sb_cols >> max_log2) >= MIN_TILE_WIDTH_MAX_SB)
     ++max_log2;
   return max_log2 - 1;
 }
 
-void vp10_get_tile_n_bits(int mi_cols,
-                         int *min_log2_tile_cols, int *max_log2_tile_cols) {
-  const int sb64_cols = mi_cols_aligned_to_sb(mi_cols) >> MI_BLOCK_SIZE_LOG2;
-  *min_log2_tile_cols = get_min_log2_tile_cols(sb64_cols);
-  *max_log2_tile_cols = get_max_log2_tile_cols(sb64_cols);
+void vp10_get_tile_n_bits(const int mi_cols,
+                          int *min_log2_tile_cols, int *max_log2_tile_cols) {
+  const int max_sb_cols =
+      ALIGN_POWER_OF_TWO(mi_cols, MAX_MIB_SIZE_LOG2) >> MAX_MIB_SIZE_LOG2;
+  *min_log2_tile_cols = get_min_log2_tile_cols(max_sb_cols);
+  *max_log2_tile_cols = get_max_log2_tile_cols(max_sb_cols);
   assert(*min_log2_tile_cols <= *max_log2_tile_cols);
 }
+#endif  // !CONFIG_EXT_TILE
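+
+// Worked example (illustrative; assumes MAX_MIB_SIZE_LOG2 == 3, i.e. 64x64
+// superblocks, and CONFIG_EXT_PARTITION off): 1080p gives
+// mi_cols == 1920 / 8 == 240, so max_sb_cols == 240 >> 3 == 30. Then
+// get_min_log2_tile_cols(30) == 0, since a single tile column already
+// satisfies the 64-superblock width cap, and get_max_log2_tile_cols(30) == 2,
+// since 30 >> 3 == 3 would fall below the 4-superblock minimum; the frame
+// may therefore use 1, 2 or 4 tile columns.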
diff --git a/vp10/common/tile_common.h b/vp10/common/tile_common.h
index 09cf060..2babc89 100644
--- a/vp10/common/tile_common.h
+++ b/vp10/common/tile_common.h
@@ -30,8 +30,8 @@
 void vp10_tile_set_row(TileInfo *tile, const struct VP10Common *cm, int row);
 void vp10_tile_set_col(TileInfo *tile, const struct VP10Common *cm, int col);
 
-void vp10_get_tile_n_bits(int mi_cols,
-                         int *min_log2_tile_cols, int *max_log2_tile_cols);
+void vp10_get_tile_n_bits(const int mi_cols,
+                          int *min_log2_tile_cols, int *max_log2_tile_cols);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/common/vp10_convolve.c b/vp10/common/vp10_convolve.c
new file mode 100644
index 0000000..5b1d921
--- /dev/null
+++ b/vp10/common/vp10_convolve.c
@@ -0,0 +1,358 @@
+#include <assert.h>
+#include <string.h>
+
+#include "./vp10_rtcd.h"
+#include "vp10/common/vp10_convolve.h"
+#include "vp10/common/filter.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_ports/mem.h"
+
+#define MAX_BLOCK_WIDTH (MAX_SB_SIZE)
+#define MAX_BLOCK_HEIGHT (MAX_SB_SIZE)
+#define MAX_STEP (32)
+#define MAX_FILTER_TAP (12)
+
+void vp10_convolve_horiz_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                           int dst_stride, int w, int h,
+                           const InterpFilterParams filter_params,
+                           const int subpel_x_q4, int x_step_q4, int avg) {
+  int x, y;
+  int filter_size = filter_params.taps;
+  src -= filter_size / 2 - 1;
+  for (y = 0; y < h; ++y) {
+    int x_q4 = subpel_x_q4;
+    for (x = 0; x < w; ++x) {
+      const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *x_filter =
+          vp10_get_interp_filter_subpel_kernel(
+              filter_params, x_q4 & SUBPEL_MASK);
+      int k, sum = 0;
+      for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
+      if (avg) {
+        dst[x] = ROUND_POWER_OF_TWO(
+            dst[x] + clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)), 1);
+      } else {
+        dst[x] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      }
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
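+
+// Stepping example (illustrative): with x_step_q4 == 24 (24 / 16 == 1.5
+// source pixels per output, i.e. a 1.5x downscale) and subpel_x_q4 == 0,
+// x_q4 takes 0, 24, 48, ..., so the integer tap position
+// (x_q4 >> SUBPEL_BITS) advances 0, 1, 3, ... while (x_q4 & SUBPEL_MASK)
+// cycles 0, 8, 0, ... to select the subpel kernel.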
+
+void vp10_convolve_vert_c(const uint8_t *src, int src_stride, uint8_t *dst,
+                          int dst_stride, int w, int h,
+                          const InterpFilterParams filter_params,
+                          const int subpel_y_q4, int y_step_q4, int avg) {
+  int x, y;
+  int filter_size = filter_params.taps;
+  src -= src_stride * (filter_size / 2 - 1);
+
+  for (x = 0; x < w; ++x) {
+    int y_q4 = subpel_y_q4;
+    for (y = 0; y < h; ++y) {
+      const uint8_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *y_filter =
+          vp10_get_interp_filter_subpel_kernel(
+              filter_params, y_q4 & SUBPEL_MASK);
+      int k, sum = 0;
+      for (k = 0; k < filter_size; ++k)
+        sum += src_y[k * src_stride] * y_filter[k];
+      if (avg) {
+        dst[y * dst_stride] = ROUND_POWER_OF_TWO(
+            dst[y * dst_stride] +
+                clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS)),
+            1);
+      } else {
+        dst[y * dst_stride] = clip_pixel(ROUND_POWER_OF_TWO(sum, FILTER_BITS));
+      }
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static void convolve_copy(const uint8_t *src, int src_stride, uint8_t *dst,
+                          int dst_stride, int w, int h, int avg) {
+  if (avg == 0) {
+    int r;
+    for (r = 0; r < h; ++r) {
+      memcpy(dst, src, w);
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else {
+    int r, c;
+    for (r = 0; r < h; ++r) {
+      for (c = 0; c < w; ++c) {
+        dst[c] = clip_pixel(ROUND_POWER_OF_TWO(dst[c] + src[c], 1));
+      }
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+}
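+
+// e.g. averaging dst == 10 with src == 13 yields (10 + 13 + 1) >> 1 == 12;
+// the + 1 makes the blend round to nearest instead of truncating.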
+
+void vp10_convolve(const uint8_t *src, int src_stride, uint8_t *dst,
+                   int dst_stride, int w, int h,
+#if CONFIG_DUAL_FILTER
+                   const INTERP_FILTER *interp_filter,
+#else
+                   const INTERP_FILTER interp_filter,
+#endif
+                   const int subpel_x_q4, int x_step_q4, const int subpel_y_q4,
+                   int y_step_q4, int ref_idx) {
+  int ignore_horiz = x_step_q4 == 16 && subpel_x_q4 == 0;
+  int ignore_vert = y_step_q4 == 16 && subpel_y_q4 == 0;
+
+  assert(w <= MAX_BLOCK_WIDTH);
+  assert(h <= MAX_BLOCK_HEIGHT);
+  assert(y_step_q4 <= MAX_STEP);
+  assert(x_step_q4 <= MAX_STEP);
+
+  if (ignore_horiz && ignore_vert) {
+    convolve_copy(src, src_stride, dst, dst_stride, w, h, ref_idx);
+  } else if (ignore_vert) {
+#if CONFIG_DUAL_FILTER
+    InterpFilterParams filter_params =
+        vp10_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
+#else
+    InterpFilterParams filter_params =
+        vp10_get_interp_filter_params(interp_filter);
+#endif
+    assert(filter_params.taps <= MAX_FILTER_TAP);
+    vp10_convolve_horiz(src, src_stride, dst, dst_stride, w, h, filter_params,
+                        subpel_x_q4, x_step_q4, ref_idx);
+  } else if (ignore_horiz) {
+#if CONFIG_DUAL_FILTER
+    InterpFilterParams filter_params =
+        vp10_get_interp_filter_params(interp_filter[2 * ref_idx]);
+#else
+    InterpFilterParams filter_params =
+        vp10_get_interp_filter_params(interp_filter);
+#endif
+    assert(filter_params.taps <= MAX_FILTER_TAP);
+    vp10_convolve_vert(src, src_stride, dst, dst_stride, w, h, filter_params,
+                       subpel_y_q4, y_step_q4, ref_idx);
+  } else {
+    // temp's size is set to (maximum possible intermediate_height) *
+    // MAX_BLOCK_WIDTH
+    uint8_t temp[((((MAX_BLOCK_HEIGHT - 1) * MAX_STEP + 15) >> SUBPEL_BITS) +
+                  MAX_FILTER_TAP) *
+                 MAX_BLOCK_WIDTH];
+    int temp_stride = MAX_BLOCK_WIDTH;
+#if CONFIG_DUAL_FILTER
+    InterpFilterParams filter_params_x =
+        vp10_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
+    InterpFilterParams filter_params_y =
+        vp10_get_interp_filter_params(interp_filter[0 + 2 * ref_idx]);
+    InterpFilterParams filter_params = filter_params_x;
+
+    // The filter size implies the required number of reference pixels for
+    // the second stage filtering. It is possible that the two directions
+    // require different filter sizes.
+    int filter_size = filter_params_y.taps;
+#else
+    InterpFilterParams filter_params =
+        vp10_get_interp_filter_params(interp_filter);
+    int filter_size = filter_params.taps;
+#endif
+    int intermediate_height =
+        (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
+
+    assert(filter_params.taps <= MAX_FILTER_TAP);
+
+    vp10_convolve_horiz(src - src_stride * (filter_size / 2 - 1), src_stride,
+                        temp, temp_stride, w, intermediate_height,
+                        filter_params, subpel_x_q4, x_step_q4, 0);
+
+#if CONFIG_DUAL_FILTER
+    filter_params = filter_params_y;
+#else
+    filter_params = vp10_get_interp_filter_params(interp_filter);
+#endif
+    filter_size = filter_params.taps;
+    assert(filter_params.taps <= MAX_FILTER_TAP);
+
+    vp10_convolve_vert(temp + temp_stride * (filter_size / 2 - 1), temp_stride,
+                       dst, dst_stride, w, h, filter_params,
+                       subpel_y_q4, y_step_q4, ref_idx);
+  }
+}
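+
+// Worked sizing example (illustrative): for h == 16 at unit step
+// (y_step_q4 == 16) with subpel_y_q4 == 8 and an 8-tap vertical filter,
+// intermediate_height == ((15 * 16 + 8) >> 4) + 8 == 23, i.e. the
+// horizontal pass emits 7 extra rows so every vertical tap has support.
+// Note the CONFIG_DUAL_FILTER indexing above implies the layout
+// {vert ref0, horiz ref0, vert ref1, horiz ref1} for interp_filter[].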
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_convolve_horiz_c(const uint16_t *src, int src_stride,
+                                  uint16_t *dst, int dst_stride, int w, int h,
+                                  const InterpFilterParams filter_params,
+                                  const int subpel_x_q4, int x_step_q4, int avg,
+                                  int bd) {
+  int x, y;
+  int filter_size = filter_params.taps;
+  src -= filter_size / 2 - 1;
+  for (y = 0; y < h; ++y) {
+    int x_q4 = subpel_x_q4;
+    for (x = 0; x < w; ++x) {
+      const uint16_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
+      const int16_t *x_filter =
+          vp10_get_interp_filter_subpel_kernel(
+              filter_params, x_q4 & SUBPEL_MASK);
+      int k, sum = 0;
+      for (k = 0; k < filter_size; ++k) sum += src_x[k] * x_filter[k];
+      if (avg)
+        dst[x] = ROUND_POWER_OF_TWO(
+            dst[x] +
+                clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
+            1);
+      else
+        dst[x] = clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+      x_q4 += x_step_q4;
+    }
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
+void vp10_highbd_convolve_vert_c(const uint16_t *src, int src_stride,
+                                 uint16_t *dst, int dst_stride, int w, int h,
+                                 const InterpFilterParams filter_params,
+                                 const int subpel_y_q4, int y_step_q4, int avg,
+                                 int bd) {
+  int x, y;
+  int filter_size = filter_params.taps;
+  src -= src_stride * (filter_size / 2 - 1);
+
+  for (x = 0; x < w; ++x) {
+    int y_q4 = subpel_y_q4;
+    for (y = 0; y < h; ++y) {
+      const uint16_t *const src_y = &src[(y_q4 >> SUBPEL_BITS) * src_stride];
+      const int16_t *y_filter =
+          vp10_get_interp_filter_subpel_kernel(
+              filter_params, y_q4 & SUBPEL_MASK);
+      int k, sum = 0;
+      for (k = 0; k < filter_size; ++k)
+        sum += src_y[k * src_stride] * y_filter[k];
+      if (avg) {
+        dst[y * dst_stride] = ROUND_POWER_OF_TWO(
+            dst[y * dst_stride] +
+                clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd),
+            1);
+      } else {
+        dst[y * dst_stride] =
+            clip_pixel_highbd(ROUND_POWER_OF_TWO(sum, FILTER_BITS), bd);
+      }
+      y_q4 += y_step_q4;
+    }
+    ++src;
+    ++dst;
+  }
+}
+
+static void highbd_convolve_copy(const uint16_t *src, int src_stride,
+                                 uint16_t *dst, int dst_stride, int w, int h,
+                                 int avg, int bd) {
+  if (avg == 0) {
+    int r;
+    for (r = 0; r < h; ++r) {
+      memcpy(dst, src, w * sizeof(*src));
+      src += src_stride;
+      dst += dst_stride;
+    }
+  } else {
+    int r, c;
+    for (r = 0; r < h; ++r) {
+      for (c = 0; c < w; ++c) {
+        dst[c] = clip_pixel_highbd(ROUND_POWER_OF_TWO(dst[c] + src[c], 1), bd);
+      }
+      src += src_stride;
+      dst += dst_stride;
+    }
+  }
+}
+
+void vp10_highbd_convolve(const uint8_t *src8, int src_stride, uint8_t *dst8,
+                          int dst_stride, int w, int h,
+#if CONFIG_DUAL_FILTER
+                          const INTERP_FILTER *interp_filter,
+#else
+                          const INTERP_FILTER interp_filter,
+#endif
+                          const int subpel_x_q4, int x_step_q4,
+                          const int subpel_y_q4, int y_step_q4, int ref_idx,
+                          int bd) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
+  int ignore_horiz = x_step_q4 == 16 && subpel_x_q4 == 0;
+  int ignore_vert = y_step_q4 == 16 && subpel_y_q4 == 0;
+
+  assert(w <= MAX_BLOCK_WIDTH);
+  assert(h <= MAX_BLOCK_HEIGHT);
+  assert(y_step_q4 <= MAX_STEP);
+  assert(x_step_q4 <= MAX_STEP);
+
+  if (ignore_horiz && ignore_vert) {
+    highbd_convolve_copy(src, src_stride, dst, dst_stride, w, h, ref_idx, bd);
+  } else if (ignore_vert) {
+#if CONFIG_DUAL_FILTER
+    InterpFilterParams filter_params =
+        vp10_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
+#else
+    InterpFilterParams filter_params =
+        vp10_get_interp_filter_params(interp_filter);
+#endif
+    vp10_highbd_convolve_horiz(src, src_stride, dst, dst_stride, w, h,
+                               filter_params, subpel_x_q4, x_step_q4, ref_idx,
+                               bd);
+  } else if (ignore_horiz) {
+#if CONFIG_DUAL_FILTER
+    InterpFilterParams filter_params =
+        vp10_get_interp_filter_params(interp_filter[0 + 2 * ref_idx]);
+#else
+    InterpFilterParams filter_params =
+        vp10_get_interp_filter_params(interp_filter);
+#endif
+    vp10_highbd_convolve_vert(src, src_stride, dst, dst_stride, w, h,
+                              filter_params, subpel_y_q4, y_step_q4, ref_idx,
+                              bd);
+  } else {
+    // temp's size is set to (maximum possible intermediate_height) *
+    // MAX_BLOCK_WIDTH
+    uint16_t temp[((((MAX_BLOCK_HEIGHT - 1) * MAX_STEP + 15) >> SUBPEL_BITS) +
+                   MAX_FILTER_TAP) *
+                  MAX_BLOCK_WIDTH];
+    int temp_stride = MAX_BLOCK_WIDTH;
+
+#if CONFIG_DUAL_FILTER
+    InterpFilterParams filter_params_x =
+        vp10_get_interp_filter_params(interp_filter[1 + 2 * ref_idx]);
+    InterpFilterParams filter_params_y =
+        vp10_get_interp_filter_params(interp_filter[0 + 2 * ref_idx]);
+    InterpFilterParams filter_params = filter_params_x;
+    int filter_size = filter_params_y.taps;
+#else
+    InterpFilterParams filter_params =
+        vp10_get_interp_filter_params(interp_filter);
+    int filter_size = filter_params.taps;
+#endif
+
+    int intermediate_height =
+        (((h - 1) * y_step_q4 + subpel_y_q4) >> SUBPEL_BITS) + filter_size;
+
+    vp10_highbd_convolve_horiz(src - src_stride * (filter_size / 2 - 1),
+                               src_stride, temp, temp_stride, w,
+                               intermediate_height, filter_params, subpel_x_q4,
+                               x_step_q4, 0, bd);
+
+#if CONFIG_DUAL_FILTER
+    filter_params = filter_params_y;
+#endif
+    filter_size = filter_params.taps;
+    assert(filter_params.taps <= MAX_FILTER_TAP);
+
+    vp10_highbd_convolve_vert(temp + temp_stride * (filter_size / 2 - 1),
+                              temp_stride, dst, dst_stride, w, h, filter_params,
+                              subpel_y_q4, y_step_q4, ref_idx, bd);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp10/common/vp10_convolve.h b/vp10/common/vp10_convolve.h
new file mode 100644
index 0000000..13f87fc
--- /dev/null
+++ b/vp10/common/vp10_convolve.h
@@ -0,0 +1,39 @@
+#ifndef VP10_COMMON_VP10_CONVOLVE_H_
+#define VP10_COMMON_VP10_CONVOLVE_H_
+#include "vp10/common/filter.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_convolve(const uint8_t *src, int src_stride,
+                   uint8_t *dst, int dst_stride,
+                   int w, int h,
+#if CONFIG_DUAL_FILTER
+                   const INTERP_FILTER *interp_filter,
+#else
+                   const INTERP_FILTER interp_filter,
+#endif
+                   const int subpel_x_q4, int x_step_q4,
+                   const int subpel_y_q4, int y_step_q4,
+                   int ref_idx);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_convolve(const uint8_t *src, int src_stride,
+                   uint8_t *dst, int dst_stride,
+                   int w, int h,
+#if CONFIG_DUAL_FILTER
+                   const INTERP_FILTER *interp_filter,
+#else
+                   const INTERP_FILTER interp_filter,
+#endif
+                   const int subpel_x_q4, int x_step_q4,
+                   const int subpel_y_q4, int y_step_q4,
+                   int ref_idx, int bd);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
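+
+/* Usage sketch (illustrative; non-CONFIG_DUAL_FILTER signature, and EIGHTTAP
+ * is assumed to be a valid INTERP_FILTER from filter.h): filter a 16x16
+ * block at a half-pel horizontal offset, unit steps, no averaging:
+ *   vp10_convolve(src, src_stride, dst, dst_stride, 16, 16, EIGHTTAP,
+ *                 8, 16, 0, 16, 0);
+ */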
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_COMMON_VP10_CONVOLVE_H_
diff --git a/vp10/common/vp10_fwd_txfm.c b/vp10/common/vp10_fwd_txfm.c
index 3211cd0..17935c5 100644
--- a/vp10/common/vp10_fwd_txfm.c
+++ b/vp10/common/vp10_fwd_txfm.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vp10_rtcd.h"
 #include "vp10/common/vp10_fwd_txfm.h"
 
 void vp10_fdct4x4_c(const int16_t *input, tran_low_t *output, int stride) {
diff --git a/vp10/common/vp10_fwd_txfm1d.c b/vp10/common/vp10_fwd_txfm1d.c
new file mode 100644
index 0000000..ef24362
--- /dev/null
+++ b/vp10/common/vp10_fwd_txfm1d.c
@@ -0,0 +1,2323 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "vp10/common/vp10_fwd_txfm1d.h"
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#define range_check(stage, input, buf, size, bit)                         \
+  {                                                                       \
+    int i, j;                                                             \
+    for (i = 0; i < size; ++i) {                                          \
+      int buf_bit = get_max_bit(abs(buf[i])) + 1;                         \
+      if (buf_bit > bit) {                                                \
+        printf("======== %s overflow ========\n", __func__);              \
+        printf("stage: %d node: %d\n", stage, i);                         \
+        printf("bit: %d buf_bit: %d buf[i]: %d\n", bit, buf_bit, buf[i]); \
+        printf("input:\n");                                               \
+        for (j = 0; j < size; j++) {                                      \
+          printf("%d,", input[j]);                                        \
+        }                                                                 \
+        printf("\n");                                                     \
+        assert(0);                                                        \
+      }                                                                   \
+    }                                                                     \
+  }
+#else
+#define range_check(stage, input, buf, size, bit) \
+  {                                               \
+    (void)stage;                                  \
+    (void)input;                                  \
+    (void)buf;                                    \
+    (void)size;                                   \
+    (void)bit;                                    \
+  }
+#endif
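+
+// With CONFIG_COEFFICIENT_RANGE_CHECKING enabled, range_check() verifies
+// that every node of a stage fits within stage_range[stage] bits; on
+// overflow it prints the stage, the node index, and the full input vector,
+// then asserts. Otherwise it reduces to a no-op that only silences
+// unused-argument warnings.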
+
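+// 4-point forward DCT as a 3-stage butterfly graph. Stage 1 forms the
+// sums/differences of mirrored inputs, stage 2 applies the cosine
+// rotations, and stage 3 permutes the result into coefficient order.
+// half_btf(w0, in0, w1, in1, bit) is round_shift(w0 * in0 + w1 * in1, bit),
+// with cospi[] the fixed-point cosine table from vp10_txfm.h (e.g.
+// cospi[32] is cos(32 * PI / 128) = cos(PI / 4) at cos_bit fractional
+// bits). output[] and step[] alternate as source/destination per stage.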
+void vp10_fdct4_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 4;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[4];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0] + input[3];
+  bf1[1] = input[1] + input[2];
+  bf1[2] = -input[2] + input[1];
+  bf1[3] = -input[3] + input[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[2];
+  bf1[2] = bf0[1];
+  bf1[3] = bf0[3];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
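+// 8-point forward DCT (5 stages): the mirrored sums from stage 1 follow
+// the 4-point recursion while the differences are resolved by further
+// butterflies and rotations; stage 5 is the bit-reversal reorder into
+// natural coefficient order.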
+void vp10_fdct8_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 8;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[8];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0] + input[7];
+  bf1[1] = input[1] + input[6];
+  bf1[2] = input[2] + input[5];
+  bf1[3] = input[3] + input[4];
+  bf1[4] = -input[4] + input[3];
+  bf1[5] = -input[5] + input[2];
+  bf1[6] = -input[6] + input[1];
+  bf1[7] = -input[7] + input[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = -bf0[2] + bf0[1];
+  bf1[3] = -bf0[3] + bf0[0];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = -bf0[5] + bf0[4];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[7] + bf0[6];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[4];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[6];
+  bf1[4] = bf0[1];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[3];
+  bf1[7] = bf0[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
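+// 16-point forward DCT (7 stages); same decimation-in-frequency pattern,
+// ending with the 4-bit bit-reversal reorder in stage 7.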
+void vp10_fdct16_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 16;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[16];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0] + input[15];
+  bf1[1] = input[1] + input[14];
+  bf1[2] = input[2] + input[13];
+  bf1[3] = input[3] + input[12];
+  bf1[4] = input[4] + input[11];
+  bf1[5] = input[5] + input[10];
+  bf1[6] = input[6] + input[9];
+  bf1[7] = input[7] + input[8];
+  bf1[8] = -input[8] + input[7];
+  bf1[9] = -input[9] + input[6];
+  bf1[10] = -input[10] + input[5];
+  bf1[11] = -input[11] + input[4];
+  bf1[12] = -input[12] + input[3];
+  bf1[13] = -input[13] + input[2];
+  bf1[14] = -input[14] + input[1];
+  bf1[15] = -input[15] + input[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[7];
+  bf1[1] = bf0[1] + bf0[6];
+  bf1[2] = bf0[2] + bf0[5];
+  bf1[3] = bf0[3] + bf0[4];
+  bf1[4] = -bf0[4] + bf0[3];
+  bf1[5] = -bf0[5] + bf0[2];
+  bf1[6] = -bf0[6] + bf0[1];
+  bf1[7] = -bf0[7] + bf0[0];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = -bf0[2] + bf0[1];
+  bf1[3] = -bf0[3] + bf0[0];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8] + bf0[11];
+  bf1[9] = bf0[9] + bf0[10];
+  bf1[10] = -bf0[10] + bf0[9];
+  bf1[11] = -bf0[11] + bf0[8];
+  bf1[12] = -bf0[12] + bf0[15];
+  bf1[13] = -bf0[13] + bf0[14];
+  bf1[14] = bf0[14] + bf0[13];
+  bf1[15] = bf0[15] + bf0[12];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = -bf0[5] + bf0[4];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[7] + bf0[6];
+  bf1[8] = bf0[8];
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
+  bf1[15] = bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+  bf1[8] = bf0[8] + bf0[9];
+  bf1[9] = -bf0[9] + bf0[8];
+  bf1[10] = -bf0[10] + bf0[11];
+  bf1[11] = bf0[11] + bf0[10];
+  bf1[12] = bf0[12] + bf0[13];
+  bf1[13] = -bf0[13] + bf0[12];
+  bf1[14] = -bf0[14] + bf0[15];
+  bf1[15] = bf0[15] + bf0[14];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[8];
+  bf1[2] = bf0[4];
+  bf1[3] = bf0[12];
+  bf1[4] = bf0[2];
+  bf1[5] = bf0[10];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[14];
+  bf1[8] = bf0[1];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[5];
+  bf1[11] = bf0[13];
+  bf1[12] = bf0[3];
+  bf1[13] = bf0[11];
+  bf1[14] = bf0[7];
+  bf1[15] = bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
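+// 32-point forward DCT (9 stages), ending with the 5-bit bit-reversal
+// reorder in stage 9.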
+void vp10_fdct32_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 32;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[32];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0] + input[31];
+  bf1[1] = input[1] + input[30];
+  bf1[2] = input[2] + input[29];
+  bf1[3] = input[3] + input[28];
+  bf1[4] = input[4] + input[27];
+  bf1[5] = input[5] + input[26];
+  bf1[6] = input[6] + input[25];
+  bf1[7] = input[7] + input[24];
+  bf1[8] = input[8] + input[23];
+  bf1[9] = input[9] + input[22];
+  bf1[10] = input[10] + input[21];
+  bf1[11] = input[11] + input[20];
+  bf1[12] = input[12] + input[19];
+  bf1[13] = input[13] + input[18];
+  bf1[14] = input[14] + input[17];
+  bf1[15] = input[15] + input[16];
+  bf1[16] = -input[16] + input[15];
+  bf1[17] = -input[17] + input[14];
+  bf1[18] = -input[18] + input[13];
+  bf1[19] = -input[19] + input[12];
+  bf1[20] = -input[20] + input[11];
+  bf1[21] = -input[21] + input[10];
+  bf1[22] = -input[22] + input[9];
+  bf1[23] = -input[23] + input[8];
+  bf1[24] = -input[24] + input[7];
+  bf1[25] = -input[25] + input[6];
+  bf1[26] = -input[26] + input[5];
+  bf1[27] = -input[27] + input[4];
+  bf1[28] = -input[28] + input[3];
+  bf1[29] = -input[29] + input[2];
+  bf1[30] = -input[30] + input[1];
+  bf1[31] = -input[31] + input[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[15];
+  bf1[1] = bf0[1] + bf0[14];
+  bf1[2] = bf0[2] + bf0[13];
+  bf1[3] = bf0[3] + bf0[12];
+  bf1[4] = bf0[4] + bf0[11];
+  bf1[5] = bf0[5] + bf0[10];
+  bf1[6] = bf0[6] + bf0[9];
+  bf1[7] = bf0[7] + bf0[8];
+  bf1[8] = -bf0[8] + bf0[7];
+  bf1[9] = -bf0[9] + bf0[6];
+  bf1[10] = -bf0[10] + bf0[5];
+  bf1[11] = -bf0[11] + bf0[4];
+  bf1[12] = -bf0[12] + bf0[3];
+  bf1[13] = -bf0[13] + bf0[2];
+  bf1[14] = -bf0[14] + bf0[1];
+  bf1[15] = -bf0[15] + bf0[0];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]);
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[7];
+  bf1[1] = bf0[1] + bf0[6];
+  bf1[2] = bf0[2] + bf0[5];
+  bf1[3] = bf0[3] + bf0[4];
+  bf1[4] = -bf0[4] + bf0[3];
+  bf1[5] = -bf0[5] + bf0[2];
+  bf1[6] = -bf0[6] + bf0[1];
+  bf1[7] = -bf0[7] + bf0[0];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[23];
+  bf1[17] = bf0[17] + bf0[22];
+  bf1[18] = bf0[18] + bf0[21];
+  bf1[19] = bf0[19] + bf0[20];
+  bf1[20] = -bf0[20] + bf0[19];
+  bf1[21] = -bf0[21] + bf0[18];
+  bf1[22] = -bf0[22] + bf0[17];
+  bf1[23] = -bf0[23] + bf0[16];
+  bf1[24] = -bf0[24] + bf0[31];
+  bf1[25] = -bf0[25] + bf0[30];
+  bf1[26] = -bf0[26] + bf0[29];
+  bf1[27] = -bf0[27] + bf0[28];
+  bf1[28] = bf0[28] + bf0[27];
+  bf1[29] = bf0[29] + bf0[26];
+  bf1[30] = bf0[30] + bf0[25];
+  bf1[31] = bf0[31] + bf0[24];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = -bf0[2] + bf0[1];
+  bf1[3] = -bf0[3] + bf0[0];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8] + bf0[11];
+  bf1[9] = bf0[9] + bf0[10];
+  bf1[10] = -bf0[10] + bf0[9];
+  bf1[11] = -bf0[11] + bf0[8];
+  bf1[12] = -bf0[12] + bf0[15];
+  bf1[13] = -bf0[13] + bf0[14];
+  bf1[14] = bf0[14] + bf0[13];
+  bf1[15] = bf0[15] + bf0[12];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]);
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = -bf0[5] + bf0[4];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[7] + bf0[6];
+  bf1[8] = bf0[8];
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[19];
+  bf1[17] = bf0[17] + bf0[18];
+  bf1[18] = -bf0[18] + bf0[17];
+  bf1[19] = -bf0[19] + bf0[16];
+  bf1[20] = -bf0[20] + bf0[23];
+  bf1[21] = -bf0[21] + bf0[22];
+  bf1[22] = bf0[22] + bf0[21];
+  bf1[23] = bf0[23] + bf0[20];
+  bf1[24] = bf0[24] + bf0[27];
+  bf1[25] = bf0[25] + bf0[26];
+  bf1[26] = -bf0[26] + bf0[25];
+  bf1[27] = -bf0[27] + bf0[24];
+  bf1[28] = -bf0[28] + bf0[31];
+  bf1[29] = -bf0[29] + bf0[30];
+  bf1[30] = bf0[30] + bf0[29];
+  bf1[31] = bf0[31] + bf0[28];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+  bf1[8] = bf0[8] + bf0[9];
+  bf1[9] = -bf0[9] + bf0[8];
+  bf1[10] = -bf0[10] + bf0[11];
+  bf1[11] = bf0[11] + bf0[10];
+  bf1[12] = bf0[12] + bf0[13];
+  bf1[13] = -bf0[13] + bf0[12];
+  bf1[14] = -bf0[14] + bf0[15];
+  bf1[15] = bf0[15] + bf0[14];
+  bf1[16] = bf0[16];
+  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]);
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[28];
+  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]);
+  bf1[31] = bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
+  bf1[16] = bf0[16] + bf0[17];
+  bf1[17] = -bf0[17] + bf0[16];
+  bf1[18] = -bf0[18] + bf0[19];
+  bf1[19] = bf0[19] + bf0[18];
+  bf1[20] = bf0[20] + bf0[21];
+  bf1[21] = -bf0[21] + bf0[20];
+  bf1[22] = -bf0[22] + bf0[23];
+  bf1[23] = bf0[23] + bf0[22];
+  bf1[24] = bf0[24] + bf0[25];
+  bf1[25] = -bf0[25] + bf0[24];
+  bf1[26] = -bf0[26] + bf0[27];
+  bf1[27] = bf0[27] + bf0[26];
+  bf1[28] = bf0[28] + bf0[29];
+  bf1[29] = -bf0[29] + bf0[28];
+  bf1[30] = -bf0[30] + bf0[31];
+  bf1[31] = bf0[31] + bf0[30];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]);
+  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[16];
+  bf1[2] = bf0[8];
+  bf1[3] = bf0[24];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[20];
+  bf1[6] = bf0[12];
+  bf1[7] = bf0[28];
+  bf1[8] = bf0[2];
+  bf1[9] = bf0[18];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[26];
+  bf1[12] = bf0[6];
+  bf1[13] = bf0[22];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[30];
+  bf1[16] = bf0[1];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[9];
+  bf1[19] = bf0[25];
+  bf1[20] = bf0[5];
+  bf1[21] = bf0[21];
+  bf1[22] = bf0[13];
+  bf1[23] = bf0[29];
+  bf1[24] = bf0[3];
+  bf1[25] = bf0[19];
+  bf1[26] = bf0[11];
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[7];
+  bf1[29] = bf0[23];
+  bf1[30] = bf0[15];
+  bf1[31] = bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
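+// 4-point forward ADST: stage 1 permutes the input, stages 2-4 mix
+// rotations with butterflies, and stage 5 writes the outputs back out
+// with alternating sign flips.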
+void vp10_fadst4_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 4;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[4];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[3];
+  bf1[1] = input[0];
+  bf1[2] = input[1];
+  bf1[3] = input[2];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[8], bf0[1], cospi[56], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(-cospi[40], bf0[3], cospi[24], bf0[2], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = -bf0[2] + bf0[0];
+  bf1[3] = -bf0[3] + bf0[1];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = -bf0[2];
+  bf1[2] = bf0[3];
+  bf1[3] = -bf0[1];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
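+// 8-point forward ADST (7 stages): input permutation, an initial rotation
+// stage, then alternating butterfly and rotation stages, finishing with
+// the sign-alternating output reorder.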
+void vp10_fadst8_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 8;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[8];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[7];
+  bf1[1] = input[0];
+  bf1[2] = input[5];
+  bf1[3] = input[2];
+  bf1[4] = input[3];
+  bf1[5] = input[4];
+  bf1[6] = input[1];
+  bf1[7] = input[6];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[4], bf0[1], cospi[60], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(-cospi[20], bf0[3], cospi[44], bf0[2], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[36], bf0[5], cospi[28], bf0[4], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(-cospi[52], bf0[7], cospi[12], bf0[6], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[4];
+  bf1[1] = bf0[1] + bf0[5];
+  bf1[2] = bf0[2] + bf0[6];
+  bf1[3] = bf0[3] + bf0[7];
+  bf1[4] = -bf0[4] + bf0[0];
+  bf1[5] = -bf0[5] + bf0[1];
+  bf1[6] = -bf0[6] + bf0[2];
+  bf1[7] = -bf0[7] + bf0[3];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
+  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = -bf0[2] + bf0[0];
+  bf1[3] = -bf0[3] + bf0[1];
+  bf1[4] = bf0[4] + bf0[6];
+  bf1[5] = bf0[5] + bf0[7];
+  bf1[6] = -bf0[6] + bf0[4];
+  bf1[7] = -bf0[7] + bf0[5];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = -bf0[4];
+  bf1[2] = bf0[6];
+  bf1[3] = -bf0[2];
+  bf1[4] = bf0[3];
+  bf1[5] = -bf0[7];
+  bf1[6] = bf0[5];
+  bf1[7] = -bf0[1];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
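+// 16-point forward ADST (9 stages); same alternating rotation/butterfly
+// structure as the 8-point version, one level deeper.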
+void vp10_fadst16_new(const int32_t *input, int32_t *output,
+                      const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 16;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[16];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[15];
+  bf1[1] = input[0];
+  bf1[2] = input[13];
+  bf1[3] = input[2];
+  bf1[4] = input[11];
+  bf1[5] = input[4];
+  bf1[6] = input[9];
+  bf1[7] = input[6];
+  bf1[8] = input[7];
+  bf1[9] = input[8];
+  bf1[10] = input[5];
+  bf1[11] = input[10];
+  bf1[12] = input[3];
+  bf1[13] = input[12];
+  bf1[14] = input[1];
+  bf1[15] = input[14];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[2], bf0[1], cospi[62], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(-cospi[10], bf0[3], cospi[54], bf0[2], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[18], bf0[5], cospi[46], bf0[4], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(-cospi[26], bf0[7], cospi[38], bf0[6], cos_bit[stage]);
+  bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit[stage]);
+  bf1[9] = half_btf(-cospi[34], bf0[9], cospi[30], bf0[8], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[42], bf0[11], cospi[22], bf0[10], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(-cospi[50], bf0[13], cospi[14], bf0[12], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(-cospi[58], bf0[15], cospi[6], bf0[14], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[8];
+  bf1[1] = bf0[1] + bf0[9];
+  bf1[2] = bf0[2] + bf0[10];
+  bf1[3] = bf0[3] + bf0[11];
+  bf1[4] = bf0[4] + bf0[12];
+  bf1[5] = bf0[5] + bf0[13];
+  bf1[6] = bf0[6] + bf0[14];
+  bf1[7] = bf0[7] + bf0[15];
+  bf1[8] = -bf0[8] + bf0[0];
+  bf1[9] = -bf0[9] + bf0[1];
+  bf1[10] = -bf0[10] + bf0[2];
+  bf1[11] = -bf0[11] + bf0[3];
+  bf1[12] = -bf0[12] + bf0[4];
+  bf1[13] = -bf0[13] + bf0[5];
+  bf1[14] = -bf0[14] + bf0[6];
+  bf1[15] = -bf0[15] + bf0[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
+  bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]);
+  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]);
+  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[4];
+  bf1[1] = bf0[1] + bf0[5];
+  bf1[2] = bf0[2] + bf0[6];
+  bf1[3] = bf0[3] + bf0[7];
+  bf1[4] = -bf0[4] + bf0[0];
+  bf1[5] = -bf0[5] + bf0[1];
+  bf1[6] = -bf0[6] + bf0[2];
+  bf1[7] = -bf0[7] + bf0[3];
+  bf1[8] = bf0[8] + bf0[12];
+  bf1[9] = bf0[9] + bf0[13];
+  bf1[10] = bf0[10] + bf0[14];
+  bf1[11] = bf0[11] + bf0[15];
+  bf1[12] = -bf0[12] + bf0[8];
+  bf1[13] = -bf0[13] + bf0[9];
+  bf1[14] = -bf0[14] + bf0[10];
+  bf1[15] = -bf0[15] + bf0[11];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
+  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]);
+  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = -bf0[2] + bf0[0];
+  bf1[3] = -bf0[3] + bf0[1];
+  bf1[4] = bf0[4] + bf0[6];
+  bf1[5] = bf0[5] + bf0[7];
+  bf1[6] = -bf0[6] + bf0[4];
+  bf1[7] = -bf0[7] + bf0[5];
+  bf1[8] = bf0[8] + bf0[10];
+  bf1[9] = bf0[9] + bf0[11];
+  bf1[10] = -bf0[10] + bf0[8];
+  bf1[11] = -bf0[11] + bf0[9];
+  bf1[12] = bf0[12] + bf0[14];
+  bf1[13] = bf0[13] + bf0[15];
+  bf1[14] = -bf0[14] + bf0[12];
+  bf1[15] = -bf0[15] + bf0[13];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]);
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = -bf0[8];
+  bf1[2] = bf0[12];
+  bf1[3] = -bf0[4];
+  bf1[4] = bf0[6];
+  bf1[5] = -bf0[14];
+  bf1[6] = bf0[10];
+  bf1[7] = -bf0[2];
+  bf1[8] = bf0[3];
+  bf1[9] = -bf0[11];
+  bf1[10] = bf0[15];
+  bf1[11] = -bf0[7];
+  bf1[12] = bf0[5];
+  bf1[13] = -bf0[13];
+  bf1[14] = bf0[9];
+  bf1[15] = -bf0[1];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
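+// 32-point forward ADST (11 stages).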
+void vp10_fadst32_new(const int32_t *input, int32_t *output,
+                      const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 32;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[32];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[31];
+  bf1[1] = input[0];
+  bf1[2] = input[29];
+  bf1[3] = input[2];
+  bf1[4] = input[27];
+  bf1[5] = input[4];
+  bf1[6] = input[25];
+  bf1[7] = input[6];
+  bf1[8] = input[23];
+  bf1[9] = input[8];
+  bf1[10] = input[21];
+  bf1[11] = input[10];
+  bf1[12] = input[19];
+  bf1[13] = input[12];
+  bf1[14] = input[17];
+  bf1[15] = input[14];
+  bf1[16] = input[15];
+  bf1[17] = input[16];
+  bf1[18] = input[13];
+  bf1[19] = input[18];
+  bf1[20] = input[11];
+  bf1[21] = input[20];
+  bf1[22] = input[9];
+  bf1[23] = input[22];
+  bf1[24] = input[7];
+  bf1[25] = input[24];
+  bf1[26] = input[5];
+  bf1[27] = input[26];
+  bf1[28] = input[3];
+  bf1[29] = input[28];
+  bf1[30] = input[1];
+  bf1[31] = input[30];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[1], bf0[1], cospi[63], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(-cospi[5], bf0[3], cospi[59], bf0[2], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[9], bf0[5], cospi[55], bf0[4], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(-cospi[13], bf0[7], cospi[51], bf0[6], cos_bit[stage]);
+  bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit[stage]);
+  bf1[9] = half_btf(-cospi[17], bf0[9], cospi[47], bf0[8], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[21], bf0[11], cospi[43], bf0[10], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(-cospi[25], bf0[13], cospi[39], bf0[12], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(-cospi[29], bf0[15], cospi[35], bf0[14], cos_bit[stage]);
+  bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit[stage]);
+  bf1[17] = half_btf(-cospi[33], bf0[17], cospi[31], bf0[16], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit[stage]);
+  bf1[19] = half_btf(-cospi[37], bf0[19], cospi[27], bf0[18], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[41], bf0[21], cospi[23], bf0[20], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit[stage]);
+  bf1[23] = half_btf(-cospi[45], bf0[23], cospi[19], bf0[22], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit[stage]);
+  bf1[25] = half_btf(-cospi[49], bf0[25], cospi[15], bf0[24], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit[stage]);
+  bf1[27] = half_btf(-cospi[53], bf0[27], cospi[11], bf0[26], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit[stage]);
+  bf1[29] = half_btf(-cospi[57], bf0[29], cospi[7], bf0[28], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(-cospi[61], bf0[31], cospi[3], bf0[30], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[16];
+  bf1[1] = bf0[1] + bf0[17];
+  bf1[2] = bf0[2] + bf0[18];
+  bf1[3] = bf0[3] + bf0[19];
+  bf1[4] = bf0[4] + bf0[20];
+  bf1[5] = bf0[5] + bf0[21];
+  bf1[6] = bf0[6] + bf0[22];
+  bf1[7] = bf0[7] + bf0[23];
+  bf1[8] = bf0[8] + bf0[24];
+  bf1[9] = bf0[9] + bf0[25];
+  bf1[10] = bf0[10] + bf0[26];
+  bf1[11] = bf0[11] + bf0[27];
+  bf1[12] = bf0[12] + bf0[28];
+  bf1[13] = bf0[13] + bf0[29];
+  bf1[14] = bf0[14] + bf0[30];
+  bf1[15] = bf0[15] + bf0[31];
+  bf1[16] = -bf0[16] + bf0[0];
+  bf1[17] = -bf0[17] + bf0[1];
+  bf1[18] = -bf0[18] + bf0[2];
+  bf1[19] = -bf0[19] + bf0[3];
+  bf1[20] = -bf0[20] + bf0[4];
+  bf1[21] = -bf0[21] + bf0[5];
+  bf1[22] = -bf0[22] + bf0[6];
+  bf1[23] = -bf0[23] + bf0[7];
+  bf1[24] = -bf0[24] + bf0[8];
+  bf1[25] = -bf0[25] + bf0[9];
+  bf1[26] = -bf0[26] + bf0[10];
+  bf1[27] = -bf0[27] + bf0[11];
+  bf1[28] = -bf0[28] + bf0[12];
+  bf1[29] = -bf0[29] + bf0[13];
+  bf1[30] = -bf0[30] + bf0[14];
+  bf1[31] = -bf0[31] + bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit[stage]);
+  bf1[17] = half_btf(-cospi[4], bf0[17], cospi[60], bf0[16], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit[stage]);
+  bf1[19] = half_btf(-cospi[20], bf0[19], cospi[44], bf0[18], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[36], bf0[21], cospi[28], bf0[20], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit[stage]);
+  bf1[23] = half_btf(-cospi[52], bf0[23], cospi[12], bf0[22], cos_bit[stage]);
+  bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[60], bf0[25], cospi[4], bf0[24], cos_bit[stage]);
+  bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[44], bf0[27], cospi[20], bf0[26], cos_bit[stage]);
+  bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[28], bf0[29], cospi[36], bf0[28], cos_bit[stage]);
+  bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[12], bf0[31], cospi[52], bf0[30], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[8];
+  bf1[1] = bf0[1] + bf0[9];
+  bf1[2] = bf0[2] + bf0[10];
+  bf1[3] = bf0[3] + bf0[11];
+  bf1[4] = bf0[4] + bf0[12];
+  bf1[5] = bf0[5] + bf0[13];
+  bf1[6] = bf0[6] + bf0[14];
+  bf1[7] = bf0[7] + bf0[15];
+  bf1[8] = -bf0[8] + bf0[0];
+  bf1[9] = -bf0[9] + bf0[1];
+  bf1[10] = -bf0[10] + bf0[2];
+  bf1[11] = -bf0[11] + bf0[3];
+  bf1[12] = -bf0[12] + bf0[4];
+  bf1[13] = -bf0[13] + bf0[5];
+  bf1[14] = -bf0[14] + bf0[6];
+  bf1[15] = -bf0[15] + bf0[7];
+  bf1[16] = bf0[16] + bf0[24];
+  bf1[17] = bf0[17] + bf0[25];
+  bf1[18] = bf0[18] + bf0[26];
+  bf1[19] = bf0[19] + bf0[27];
+  bf1[20] = bf0[20] + bf0[28];
+  bf1[21] = bf0[21] + bf0[29];
+  bf1[22] = bf0[22] + bf0[30];
+  bf1[23] = bf0[23] + bf0[31];
+  bf1[24] = -bf0[24] + bf0[16];
+  bf1[25] = -bf0[25] + bf0[17];
+  bf1[26] = -bf0[26] + bf0[18];
+  bf1[27] = -bf0[27] + bf0[19];
+  bf1[28] = -bf0[28] + bf0[20];
+  bf1[29] = -bf0[29] + bf0[21];
+  bf1[30] = -bf0[30] + bf0[22];
+  bf1[31] = -bf0[31] + bf0[23];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
+  bf1[9] = half_btf(-cospi[8], bf0[9], cospi[56], bf0[8], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[40], bf0[11], cospi[24], bf0[10], cos_bit[stage]);
+  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[56], bf0[13], cospi[8], bf0[12], cos_bit[stage]);
+  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[24], bf0[15], cospi[40], bf0[14], cos_bit[stage]);
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = bf0[21];
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit[stage]);
+  bf1[25] = half_btf(-cospi[8], bf0[25], cospi[56], bf0[24], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit[stage]);
+  bf1[27] = half_btf(-cospi[40], bf0[27], cospi[24], bf0[26], cos_bit[stage]);
+  bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[56], bf0[29], cospi[8], bf0[28], cos_bit[stage]);
+  bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[24], bf0[31], cospi[40], bf0[30], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[4];
+  bf1[1] = bf0[1] + bf0[5];
+  bf1[2] = bf0[2] + bf0[6];
+  bf1[3] = bf0[3] + bf0[7];
+  bf1[4] = -bf0[4] + bf0[0];
+  bf1[5] = -bf0[5] + bf0[1];
+  bf1[6] = -bf0[6] + bf0[2];
+  bf1[7] = -bf0[7] + bf0[3];
+  bf1[8] = bf0[8] + bf0[12];
+  bf1[9] = bf0[9] + bf0[13];
+  bf1[10] = bf0[10] + bf0[14];
+  bf1[11] = bf0[11] + bf0[15];
+  bf1[12] = -bf0[12] + bf0[8];
+  bf1[13] = -bf0[13] + bf0[9];
+  bf1[14] = -bf0[14] + bf0[10];
+  bf1[15] = -bf0[15] + bf0[11];
+  bf1[16] = bf0[16] + bf0[20];
+  bf1[17] = bf0[17] + bf0[21];
+  bf1[18] = bf0[18] + bf0[22];
+  bf1[19] = bf0[19] + bf0[23];
+  bf1[20] = -bf0[20] + bf0[16];
+  bf1[21] = -bf0[21] + bf0[17];
+  bf1[22] = -bf0[22] + bf0[18];
+  bf1[23] = -bf0[23] + bf0[19];
+  bf1[24] = bf0[24] + bf0[28];
+  bf1[25] = bf0[25] + bf0[29];
+  bf1[26] = bf0[26] + bf0[30];
+  bf1[27] = bf0[27] + bf0[31];
+  bf1[28] = -bf0[28] + bf0[24];
+  bf1[29] = -bf0[29] + bf0[25];
+  bf1[30] = -bf0[30] + bf0[26];
+  bf1[31] = -bf0[31] + bf0[27];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(-cospi[16], bf0[5], cospi[48], bf0[4], cos_bit[stage]);
+  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[48], bf0[7], cospi[16], bf0[6], cos_bit[stage]);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(-cospi[16], bf0[13], cospi[48], bf0[12], cos_bit[stage]);
+  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[48], bf0[15], cospi[16], bf0[14], cos_bit[stage]);
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[20], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[48], bf0[23], cospi[16], bf0[22], cos_bit[stage]);
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = bf0[26];
+  bf1[27] = bf0[27];
+  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit[stage]);
+  bf1[29] = half_btf(-cospi[16], bf0[29], cospi[48], bf0[28], cos_bit[stage]);
+  bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[48], bf0[31], cospi[16], bf0[30], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = -bf0[2] + bf0[0];
+  bf1[3] = -bf0[3] + bf0[1];
+  bf1[4] = bf0[4] + bf0[6];
+  bf1[5] = bf0[5] + bf0[7];
+  bf1[6] = -bf0[6] + bf0[4];
+  bf1[7] = -bf0[7] + bf0[5];
+  bf1[8] = bf0[8] + bf0[10];
+  bf1[9] = bf0[9] + bf0[11];
+  bf1[10] = -bf0[10] + bf0[8];
+  bf1[11] = -bf0[11] + bf0[9];
+  bf1[12] = bf0[12] + bf0[14];
+  bf1[13] = bf0[13] + bf0[15];
+  bf1[14] = -bf0[14] + bf0[12];
+  bf1[15] = -bf0[15] + bf0[13];
+  bf1[16] = bf0[16] + bf0[18];
+  bf1[17] = bf0[17] + bf0[19];
+  bf1[18] = -bf0[18] + bf0[16];
+  bf1[19] = -bf0[19] + bf0[17];
+  bf1[20] = bf0[20] + bf0[22];
+  bf1[21] = bf0[21] + bf0[23];
+  bf1[22] = -bf0[22] + bf0[20];
+  bf1[23] = -bf0[23] + bf0[21];
+  bf1[24] = bf0[24] + bf0[26];
+  bf1[25] = bf0[25] + bf0[27];
+  bf1[26] = -bf0[26] + bf0[24];
+  bf1[27] = -bf0[27] + bf0[25];
+  bf1[28] = bf0[28] + bf0[30];
+  bf1[29] = bf0[29] + bf0[31];
+  bf1[30] = -bf0[30] + bf0[28];
+  bf1[31] = -bf0[31] + bf0[29];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 10
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(-cospi[32], bf0[3], cospi[32], bf0[2], cos_bit[stage]);
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(-cospi[32], bf0[7], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[10], cos_bit[stage]);
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(-cospi[32], bf0[15], cospi[32], bf0[14], cos_bit[stage]);
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit[stage]);
+  bf1[19] = half_btf(-cospi[32], bf0[19], cospi[32], bf0[18], cos_bit[stage]);
+  bf1[20] = bf0[20];
+  bf1[21] = bf0[21];
+  bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit[stage]);
+  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[22], cos_bit[stage]);
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[27] = half_btf(-cospi[32], bf0[27], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(-cospi[32], bf0[31], cospi[32], bf0[30], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 11
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = -bf0[16];
+  bf1[2] = bf0[24];
+  bf1[3] = -bf0[8];
+  bf1[4] = bf0[12];
+  bf1[5] = -bf0[28];
+  bf1[6] = bf0[20];
+  bf1[7] = -bf0[4];
+  bf1[8] = bf0[6];
+  bf1[9] = -bf0[22];
+  bf1[10] = bf0[30];
+  bf1[11] = -bf0[14];
+  bf1[12] = bf0[10];
+  bf1[13] = -bf0[26];
+  bf1[14] = bf0[18];
+  bf1[15] = -bf0[2];
+  bf1[16] = bf0[3];
+  bf1[17] = -bf0[19];
+  bf1[18] = bf0[27];
+  bf1[19] = -bf0[11];
+  bf1[20] = bf0[15];
+  bf1[21] = -bf0[31];
+  bf1[22] = bf0[23];
+  bf1[23] = -bf0[7];
+  bf1[24] = bf0[5];
+  bf1[25] = -bf0[21];
+  bf1[26] = bf0[29];
+  bf1[27] = -bf0[13];
+  bf1[28] = bf0[9];
+  bf1[29] = -bf0[25];
+  bf1[30] = bf0[17];
+  bf1[31] = -bf0[1];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
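+// 64-point forward DCT, extending the same decimation-in-frequency
+// recursion one level beyond the 32-point transform.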
+void vp10_fdct64_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 64;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[64];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf1 = output;
+  bf1[0] = input[0] + input[63];
+  bf1[1] = input[1] + input[62];
+  bf1[2] = input[2] + input[61];
+  bf1[3] = input[3] + input[60];
+  bf1[4] = input[4] + input[59];
+  bf1[5] = input[5] + input[58];
+  bf1[6] = input[6] + input[57];
+  bf1[7] = input[7] + input[56];
+  bf1[8] = input[8] + input[55];
+  bf1[9] = input[9] + input[54];
+  bf1[10] = input[10] + input[53];
+  bf1[11] = input[11] + input[52];
+  bf1[12] = input[12] + input[51];
+  bf1[13] = input[13] + input[50];
+  bf1[14] = input[14] + input[49];
+  bf1[15] = input[15] + input[48];
+  bf1[16] = input[16] + input[47];
+  bf1[17] = input[17] + input[46];
+  bf1[18] = input[18] + input[45];
+  bf1[19] = input[19] + input[44];
+  bf1[20] = input[20] + input[43];
+  bf1[21] = input[21] + input[42];
+  bf1[22] = input[22] + input[41];
+  bf1[23] = input[23] + input[40];
+  bf1[24] = input[24] + input[39];
+  bf1[25] = input[25] + input[38];
+  bf1[26] = input[26] + input[37];
+  bf1[27] = input[27] + input[36];
+  bf1[28] = input[28] + input[35];
+  bf1[29] = input[29] + input[34];
+  bf1[30] = input[30] + input[33];
+  bf1[31] = input[31] + input[32];
+  bf1[32] = -input[32] + input[31];
+  bf1[33] = -input[33] + input[30];
+  bf1[34] = -input[34] + input[29];
+  bf1[35] = -input[35] + input[28];
+  bf1[36] = -input[36] + input[27];
+  bf1[37] = -input[37] + input[26];
+  bf1[38] = -input[38] + input[25];
+  bf1[39] = -input[39] + input[24];
+  bf1[40] = -input[40] + input[23];
+  bf1[41] = -input[41] + input[22];
+  bf1[42] = -input[42] + input[21];
+  bf1[43] = -input[43] + input[20];
+  bf1[44] = -input[44] + input[19];
+  bf1[45] = -input[45] + input[18];
+  bf1[46] = -input[46] + input[17];
+  bf1[47] = -input[47] + input[16];
+  bf1[48] = -input[48] + input[15];
+  bf1[49] = -input[49] + input[14];
+  bf1[50] = -input[50] + input[13];
+  bf1[51] = -input[51] + input[12];
+  bf1[52] = -input[52] + input[11];
+  bf1[53] = -input[53] + input[10];
+  bf1[54] = -input[54] + input[9];
+  bf1[55] = -input[55] + input[8];
+  bf1[56] = -input[56] + input[7];
+  bf1[57] = -input[57] + input[6];
+  bf1[58] = -input[58] + input[5];
+  bf1[59] = -input[59] + input[4];
+  bf1[60] = -input[60] + input[3];
+  bf1[61] = -input[61] + input[2];
+  bf1[62] = -input[62] + input[1];
+  bf1[63] = -input[63] + input[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[31];
+  bf1[1] = bf0[1] + bf0[30];
+  bf1[2] = bf0[2] + bf0[29];
+  bf1[3] = bf0[3] + bf0[28];
+  bf1[4] = bf0[4] + bf0[27];
+  bf1[5] = bf0[5] + bf0[26];
+  bf1[6] = bf0[6] + bf0[25];
+  bf1[7] = bf0[7] + bf0[24];
+  bf1[8] = bf0[8] + bf0[23];
+  bf1[9] = bf0[9] + bf0[22];
+  bf1[10] = bf0[10] + bf0[21];
+  bf1[11] = bf0[11] + bf0[20];
+  bf1[12] = bf0[12] + bf0[19];
+  bf1[13] = bf0[13] + bf0[18];
+  bf1[14] = bf0[14] + bf0[17];
+  bf1[15] = bf0[15] + bf0[16];
+  bf1[16] = -bf0[16] + bf0[15];
+  bf1[17] = -bf0[17] + bf0[14];
+  bf1[18] = -bf0[18] + bf0[13];
+  bf1[19] = -bf0[19] + bf0[12];
+  bf1[20] = -bf0[20] + bf0[11];
+  bf1[21] = -bf0[21] + bf0[10];
+  bf1[22] = -bf0[22] + bf0[9];
+  bf1[23] = -bf0[23] + bf0[8];
+  bf1[24] = -bf0[24] + bf0[7];
+  bf1[25] = -bf0[25] + bf0[6];
+  bf1[26] = -bf0[26] + bf0[5];
+  bf1[27] = -bf0[27] + bf0[4];
+  bf1[28] = -bf0[28] + bf0[3];
+  bf1[29] = -bf0[29] + bf0[2];
+  bf1[30] = -bf0[30] + bf0[1];
+  bf1[31] = -bf0[31] + bf0[0];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = bf0[34];
+  bf1[35] = bf0[35];
+  bf1[36] = bf0[36];
+  bf1[37] = bf0[37];
+  bf1[38] = bf0[38];
+  bf1[39] = bf0[39];
+  bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
+  bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
+  bf1[48] = half_btf(cospi[32], bf0[48], cospi[32], bf0[47], cos_bit[stage]);
+  bf1[49] = half_btf(cospi[32], bf0[49], cospi[32], bf0[46], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[32], bf0[50], cospi[32], bf0[45], cos_bit[stage]);
+  bf1[51] = half_btf(cospi[32], bf0[51], cospi[32], bf0[44], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[32], bf0[52], cospi[32], bf0[43], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[32], bf0[53], cospi[32], bf0[42], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[32], bf0[54], cospi[32], bf0[41], cos_bit[stage]);
+  bf1[55] = half_btf(cospi[32], bf0[55], cospi[32], bf0[40], cos_bit[stage]);
+  bf1[56] = bf0[56];
+  bf1[57] = bf0[57];
+  bf1[58] = bf0[58];
+  bf1[59] = bf0[59];
+  bf1[60] = bf0[60];
+  bf1[61] = bf0[61];
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[15];
+  bf1[1] = bf0[1] + bf0[14];
+  bf1[2] = bf0[2] + bf0[13];
+  bf1[3] = bf0[3] + bf0[12];
+  bf1[4] = bf0[4] + bf0[11];
+  bf1[5] = bf0[5] + bf0[10];
+  bf1[6] = bf0[6] + bf0[9];
+  bf1[7] = bf0[7] + bf0[8];
+  bf1[8] = -bf0[8] + bf0[7];
+  bf1[9] = -bf0[9] + bf0[6];
+  bf1[10] = -bf0[10] + bf0[5];
+  bf1[11] = -bf0[11] + bf0[4];
+  bf1[12] = -bf0[12] + bf0[3];
+  bf1[13] = -bf0[13] + bf0[2];
+  bf1[14] = -bf0[14] + bf0[1];
+  bf1[15] = -bf0[15] + bf0[0];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[32], bf0[24], cospi[32], bf0[23], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[32], bf0[25], cospi[32], bf0[22], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[21], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[32], bf0[27], cospi[32], bf0[20], cos_bit[stage]);
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[47];
+  bf1[33] = bf0[33] + bf0[46];
+  bf1[34] = bf0[34] + bf0[45];
+  bf1[35] = bf0[35] + bf0[44];
+  bf1[36] = bf0[36] + bf0[43];
+  bf1[37] = bf0[37] + bf0[42];
+  bf1[38] = bf0[38] + bf0[41];
+  bf1[39] = bf0[39] + bf0[40];
+  bf1[40] = -bf0[40] + bf0[39];
+  bf1[41] = -bf0[41] + bf0[38];
+  bf1[42] = -bf0[42] + bf0[37];
+  bf1[43] = -bf0[43] + bf0[36];
+  bf1[44] = -bf0[44] + bf0[35];
+  bf1[45] = -bf0[45] + bf0[34];
+  bf1[46] = -bf0[46] + bf0[33];
+  bf1[47] = -bf0[47] + bf0[32];
+  bf1[48] = -bf0[48] + bf0[63];
+  bf1[49] = -bf0[49] + bf0[62];
+  bf1[50] = -bf0[50] + bf0[61];
+  bf1[51] = -bf0[51] + bf0[60];
+  bf1[52] = -bf0[52] + bf0[59];
+  bf1[53] = -bf0[53] + bf0[58];
+  bf1[54] = -bf0[54] + bf0[57];
+  bf1[55] = -bf0[55] + bf0[56];
+  bf1[56] = bf0[56] + bf0[55];
+  bf1[57] = bf0[57] + bf0[54];
+  bf1[58] = bf0[58] + bf0[53];
+  bf1[59] = bf0[59] + bf0[52];
+  bf1[60] = bf0[60] + bf0[51];
+  bf1[61] = bf0[61] + bf0[50];
+  bf1[62] = bf0[62] + bf0[49];
+  bf1[63] = bf0[63] + bf0[48];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[7];
+  bf1[1] = bf0[1] + bf0[6];
+  bf1[2] = bf0[2] + bf0[5];
+  bf1[3] = bf0[3] + bf0[4];
+  bf1[4] = -bf0[4] + bf0[3];
+  bf1[5] = -bf0[5] + bf0[2];
+  bf1[6] = -bf0[6] + bf0[1];
+  bf1[7] = -bf0[7] + bf0[0];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[32], bf0[12], cospi[32], bf0[11], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[32], bf0[13], cospi[32], bf0[10], cos_bit[stage]);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[23];
+  bf1[17] = bf0[17] + bf0[22];
+  bf1[18] = bf0[18] + bf0[21];
+  bf1[19] = bf0[19] + bf0[20];
+  bf1[20] = -bf0[20] + bf0[19];
+  bf1[21] = -bf0[21] + bf0[18];
+  bf1[22] = -bf0[22] + bf0[17];
+  bf1[23] = -bf0[23] + bf0[16];
+  bf1[24] = -bf0[24] + bf0[31];
+  bf1[25] = -bf0[25] + bf0[30];
+  bf1[26] = -bf0[26] + bf0[29];
+  bf1[27] = -bf0[27] + bf0[28];
+  bf1[28] = bf0[28] + bf0[27];
+  bf1[29] = bf0[29] + bf0[26];
+  bf1[30] = bf0[30] + bf0[25];
+  bf1[31] = bf0[31] + bf0[24];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = bf0[34];
+  bf1[35] = bf0[35];
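+  // cospi[16] = cos(PI / 8) and cospi[48] = cos(3 * PI / 8) = sin(PI / 8)
+  // under the same convention, so each pair below is a Givens rotation of
+  // lanes k and 95 - k.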
+  bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]);
+  bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]);
+  bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]);
+  bf1[44] = bf0[44];
+  bf1[45] = bf0[45];
+  bf1[46] = bf0[46];
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = bf0[49];
+  bf1[50] = bf0[50];
+  bf1[51] = bf0[51];
+  bf1[52] = half_btf(cospi[48], bf0[52], -cospi[16], bf0[43], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[48], bf0[53], -cospi[16], bf0[42], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[48], bf0[54], -cospi[16], bf0[41], cos_bit[stage]);
+  bf1[55] = half_btf(cospi[48], bf0[55], -cospi[16], bf0[40], cos_bit[stage]);
+  bf1[56] = half_btf(cospi[16], bf0[56], cospi[48], bf0[39], cos_bit[stage]);
+  bf1[57] = half_btf(cospi[16], bf0[57], cospi[48], bf0[38], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[16], bf0[58], cospi[48], bf0[37], cos_bit[stage]);
+  bf1[59] = half_btf(cospi[16], bf0[59], cospi[48], bf0[36], cos_bit[stage]);
+  bf1[60] = bf0[60];
+  bf1[61] = bf0[61];
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = -bf0[2] + bf0[1];
+  bf1[3] = -bf0[3] + bf0[0];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[5], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8] + bf0[11];
+  bf1[9] = bf0[9] + bf0[10];
+  bf1[10] = -bf0[10] + bf0[9];
+  bf1[11] = -bf0[11] + bf0[8];
+  bf1[12] = -bf0[12] + bf0[15];
+  bf1[13] = -bf0[13] + bf0[14];
+  bf1[14] = bf0[14] + bf0[13];
+  bf1[15] = bf0[15] + bf0[12];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = half_btf(cospi[48], bf0[26], -cospi[16], bf0[21], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[48], bf0[27], -cospi[16], bf0[20], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[19], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[16], bf0[29], cospi[48], bf0[18], cos_bit[stage]);
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[39];
+  bf1[33] = bf0[33] + bf0[38];
+  bf1[34] = bf0[34] + bf0[37];
+  bf1[35] = bf0[35] + bf0[36];
+  bf1[36] = -bf0[36] + bf0[35];
+  bf1[37] = -bf0[37] + bf0[34];
+  bf1[38] = -bf0[38] + bf0[33];
+  bf1[39] = -bf0[39] + bf0[32];
+  bf1[40] = -bf0[40] + bf0[47];
+  bf1[41] = -bf0[41] + bf0[46];
+  bf1[42] = -bf0[42] + bf0[45];
+  bf1[43] = -bf0[43] + bf0[44];
+  bf1[44] = bf0[44] + bf0[43];
+  bf1[45] = bf0[45] + bf0[42];
+  bf1[46] = bf0[46] + bf0[41];
+  bf1[47] = bf0[47] + bf0[40];
+  bf1[48] = bf0[48] + bf0[55];
+  bf1[49] = bf0[49] + bf0[54];
+  bf1[50] = bf0[50] + bf0[53];
+  bf1[51] = bf0[51] + bf0[52];
+  bf1[52] = -bf0[52] + bf0[51];
+  bf1[53] = -bf0[53] + bf0[50];
+  bf1[54] = -bf0[54] + bf0[49];
+  bf1[55] = -bf0[55] + bf0[48];
+  bf1[56] = -bf0[56] + bf0[63];
+  bf1[57] = -bf0[57] + bf0[62];
+  bf1[58] = -bf0[58] + bf0[61];
+  bf1[59] = -bf0[59] + bf0[60];
+  bf1[60] = bf0[60] + bf0[59];
+  bf1[61] = bf0[61] + bf0[58];
+  bf1[62] = bf0[62] + bf0[57];
+  bf1[63] = bf0[63] + bf0[56];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(-cospi[32], bf0[1], cospi[32], bf0[0], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[48], bf0[3], -cospi[16], bf0[2], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = -bf0[5] + bf0[4];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[7] + bf0[6];
+  bf1[8] = bf0[8];
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = half_btf(cospi[48], bf0[13], -cospi[16], bf0[10], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[16], bf0[14], cospi[48], bf0[9], cos_bit[stage]);
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[19];
+  bf1[17] = bf0[17] + bf0[18];
+  bf1[18] = -bf0[18] + bf0[17];
+  bf1[19] = -bf0[19] + bf0[16];
+  bf1[20] = -bf0[20] + bf0[23];
+  bf1[21] = -bf0[21] + bf0[22];
+  bf1[22] = bf0[22] + bf0[21];
+  bf1[23] = bf0[23] + bf0[20];
+  bf1[24] = bf0[24] + bf0[27];
+  bf1[25] = bf0[25] + bf0[26];
+  bf1[26] = -bf0[26] + bf0[25];
+  bf1[27] = -bf0[27] + bf0[24];
+  bf1[28] = -bf0[28] + bf0[31];
+  bf1[29] = -bf0[29] + bf0[30];
+  bf1[30] = bf0[30] + bf0[29];
+  bf1[31] = bf0[31] + bf0[28];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]);
+  bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]);
+  bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]);
+  bf1[38] = bf0[38];
+  bf1[39] = bf0[39];
+  bf1[40] = bf0[40];
+  bf1[41] = bf0[41];
+  bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]);
+  bf1[46] = bf0[46];
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = bf0[49];
+  bf1[50] = half_btf(cospi[24], bf0[50], -cospi[40], bf0[45], cos_bit[stage]);
+  bf1[51] = half_btf(cospi[24], bf0[51], -cospi[40], bf0[44], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[40], bf0[52], cospi[24], bf0[43], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[40], bf0[53], cospi[24], bf0[42], cos_bit[stage]);
+  bf1[54] = bf0[54];
+  bf1[55] = bf0[55];
+  bf1[56] = bf0[56];
+  bf1[57] = bf0[57];
+  bf1[58] = half_btf(cospi[56], bf0[58], -cospi[8], bf0[37], cos_bit[stage]);
+  bf1[59] = half_btf(cospi[56], bf0[59], -cospi[8], bf0[36], cos_bit[stage]);
+  bf1[60] = half_btf(cospi[8], bf0[60], cospi[56], bf0[35], cos_bit[stage]);
+  bf1[61] = half_btf(cospi[8], bf0[61], cospi[56], bf0[34], cos_bit[stage]);
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[24], bf0[6], -cospi[40], bf0[5], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[56], bf0[7], -cospi[8], bf0[4], cos_bit[stage]);
+  bf1[8] = bf0[8] + bf0[9];
+  bf1[9] = -bf0[9] + bf0[8];
+  bf1[10] = -bf0[10] + bf0[11];
+  bf1[11] = bf0[11] + bf0[10];
+  bf1[12] = bf0[12] + bf0[13];
+  bf1[13] = -bf0[13] + bf0[12];
+  bf1[14] = -bf0[14] + bf0[15];
+  bf1[15] = bf0[15] + bf0[14];
+  bf1[16] = bf0[16];
+  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = half_btf(cospi[24], bf0[25], -cospi[40], bf0[22], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[21], cos_bit[stage]);
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[28];
+  bf1[29] = half_btf(cospi[56], bf0[29], -cospi[8], bf0[18], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[8], bf0[30], cospi[56], bf0[17], cos_bit[stage]);
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[35];
+  bf1[33] = bf0[33] + bf0[34];
+  bf1[34] = -bf0[34] + bf0[33];
+  bf1[35] = -bf0[35] + bf0[32];
+  bf1[36] = -bf0[36] + bf0[39];
+  bf1[37] = -bf0[37] + bf0[38];
+  bf1[38] = bf0[38] + bf0[37];
+  bf1[39] = bf0[39] + bf0[36];
+  bf1[40] = bf0[40] + bf0[43];
+  bf1[41] = bf0[41] + bf0[42];
+  bf1[42] = -bf0[42] + bf0[41];
+  bf1[43] = -bf0[43] + bf0[40];
+  bf1[44] = -bf0[44] + bf0[47];
+  bf1[45] = -bf0[45] + bf0[46];
+  bf1[46] = bf0[46] + bf0[45];
+  bf1[47] = bf0[47] + bf0[44];
+  bf1[48] = bf0[48] + bf0[51];
+  bf1[49] = bf0[49] + bf0[50];
+  bf1[50] = -bf0[50] + bf0[49];
+  bf1[51] = -bf0[51] + bf0[48];
+  bf1[52] = -bf0[52] + bf0[55];
+  bf1[53] = -bf0[53] + bf0[54];
+  bf1[54] = bf0[54] + bf0[53];
+  bf1[55] = bf0[55] + bf0[52];
+  bf1[56] = bf0[56] + bf0[59];
+  bf1[57] = bf0[57] + bf0[58];
+  bf1[58] = -bf0[58] + bf0[57];
+  bf1[59] = -bf0[59] + bf0[56];
+  bf1[60] = -bf0[60] + bf0[63];
+  bf1[61] = -bf0[61] + bf0[62];
+  bf1[62] = bf0[62] + bf0[61];
+  bf1[63] = bf0[63] + bf0[60];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[60], bf0[8], cospi[4], bf0[15], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[28], bf0[9], cospi[36], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[44], bf0[10], cospi[20], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[12], bf0[11], cospi[52], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[12], bf0[12], -cospi[52], bf0[11], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[44], bf0[13], -cospi[20], bf0[10], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[28], bf0[14], -cospi[36], bf0[9], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[60], bf0[15], -cospi[4], bf0[8], cos_bit[stage]);
+  bf1[16] = bf0[16] + bf0[17];
+  bf1[17] = -bf0[17] + bf0[16];
+  bf1[18] = -bf0[18] + bf0[19];
+  bf1[19] = bf0[19] + bf0[18];
+  bf1[20] = bf0[20] + bf0[21];
+  bf1[21] = -bf0[21] + bf0[20];
+  bf1[22] = -bf0[22] + bf0[23];
+  bf1[23] = bf0[23] + bf0[22];
+  bf1[24] = bf0[24] + bf0[25];
+  bf1[25] = -bf0[25] + bf0[24];
+  bf1[26] = -bf0[26] + bf0[27];
+  bf1[27] = bf0[27] + bf0[26];
+  bf1[28] = bf0[28] + bf0[29];
+  bf1[29] = -bf0[29] + bf0[28];
+  bf1[30] = -bf0[30] + bf0[31];
+  bf1[31] = bf0[31] + bf0[30];
+  bf1[32] = bf0[32];
+  bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]);
+  bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]);
+  bf1[35] = bf0[35];
+  bf1[36] = bf0[36];
+  bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]);
+  bf1[39] = bf0[39];
+  bf1[40] = bf0[40];
+  bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]);
+  bf1[43] = bf0[43];
+  bf1[44] = bf0[44];
+  bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]);
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = half_btf(cospi[12], bf0[49], -cospi[52], bf0[46], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[52], bf0[50], cospi[12], bf0[45], cos_bit[stage]);
+  bf1[51] = bf0[51];
+  bf1[52] = bf0[52];
+  bf1[53] = half_btf(cospi[44], bf0[53], -cospi[20], bf0[42], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[20], bf0[54], cospi[44], bf0[41], cos_bit[stage]);
+  bf1[55] = bf0[55];
+  bf1[56] = bf0[56];
+  bf1[57] = half_btf(cospi[28], bf0[57], -cospi[36], bf0[38], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[36], bf0[58], cospi[28], bf0[37], cos_bit[stage]);
+  bf1[59] = bf0[59];
+  bf1[60] = bf0[60];
+  bf1[61] = half_btf(cospi[60], bf0[61], -cospi[4], bf0[34], cos_bit[stage]);
+  bf1[62] = half_btf(cospi[4], bf0[62], cospi[60], bf0[33], cos_bit[stage]);
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = half_btf(cospi[62], bf0[16], cospi[2], bf0[31], cos_bit[stage]);
+  bf1[17] = half_btf(cospi[30], bf0[17], cospi[34], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[46], bf0[18], cospi[18], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(cospi[14], bf0[19], cospi[50], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[54], bf0[20], cospi[10], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(cospi[22], bf0[21], cospi[42], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[38], bf0[22], cospi[26], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[6], bf0[23], cospi[58], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[6], bf0[24], -cospi[58], bf0[23], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[38], bf0[25], -cospi[26], bf0[22], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[22], bf0[26], -cospi[42], bf0[21], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[54], bf0[27], -cospi[10], bf0[20], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[14], bf0[28], -cospi[50], bf0[19], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[46], bf0[29], -cospi[18], bf0[18], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[30], bf0[30], -cospi[34], bf0[17], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[62], bf0[31], -cospi[2], bf0[16], cos_bit[stage]);
+  bf1[32] = bf0[32] + bf0[33];
+  bf1[33] = -bf0[33] + bf0[32];
+  bf1[34] = -bf0[34] + bf0[35];
+  bf1[35] = bf0[35] + bf0[34];
+  bf1[36] = bf0[36] + bf0[37];
+  bf1[37] = -bf0[37] + bf0[36];
+  bf1[38] = -bf0[38] + bf0[39];
+  bf1[39] = bf0[39] + bf0[38];
+  bf1[40] = bf0[40] + bf0[41];
+  bf1[41] = -bf0[41] + bf0[40];
+  bf1[42] = -bf0[42] + bf0[43];
+  bf1[43] = bf0[43] + bf0[42];
+  bf1[44] = bf0[44] + bf0[45];
+  bf1[45] = -bf0[45] + bf0[44];
+  bf1[46] = -bf0[46] + bf0[47];
+  bf1[47] = bf0[47] + bf0[46];
+  bf1[48] = bf0[48] + bf0[49];
+  bf1[49] = -bf0[49] + bf0[48];
+  bf1[50] = -bf0[50] + bf0[51];
+  bf1[51] = bf0[51] + bf0[50];
+  bf1[52] = bf0[52] + bf0[53];
+  bf1[53] = -bf0[53] + bf0[52];
+  bf1[54] = -bf0[54] + bf0[55];
+  bf1[55] = bf0[55] + bf0[54];
+  bf1[56] = bf0[56] + bf0[57];
+  bf1[57] = -bf0[57] + bf0[56];
+  bf1[58] = -bf0[58] + bf0[59];
+  bf1[59] = bf0[59] + bf0[58];
+  bf1[60] = bf0[60] + bf0[61];
+  bf1[61] = -bf0[61] + bf0[60];
+  bf1[62] = -bf0[62] + bf0[63];
+  bf1[63] = bf0[63] + bf0[62];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 10
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
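+  // Stage 10 applies the final Givens rotation to each remaining pair
+  // (k, 95 - k), using odd multiples of PI / 128, producing the 32
+  // odd-frequency outputs.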
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = bf0[21];
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = bf0[26];
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = half_btf(cospi[63], bf0[32], cospi[1], bf0[63], cos_bit[stage]);
+  bf1[33] = half_btf(cospi[31], bf0[33], cospi[33], bf0[62], cos_bit[stage]);
+  bf1[34] = half_btf(cospi[47], bf0[34], cospi[17], bf0[61], cos_bit[stage]);
+  bf1[35] = half_btf(cospi[15], bf0[35], cospi[49], bf0[60], cos_bit[stage]);
+  bf1[36] = half_btf(cospi[55], bf0[36], cospi[9], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(cospi[23], bf0[37], cospi[41], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(cospi[39], bf0[38], cospi[25], bf0[57], cos_bit[stage]);
+  bf1[39] = half_btf(cospi[7], bf0[39], cospi[57], bf0[56], cos_bit[stage]);
+  bf1[40] = half_btf(cospi[59], bf0[40], cospi[5], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(cospi[27], bf0[41], cospi[37], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(cospi[43], bf0[42], cospi[21], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(cospi[11], bf0[43], cospi[53], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(cospi[51], bf0[44], cospi[13], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(cospi[19], bf0[45], cospi[45], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(cospi[35], bf0[46], cospi[29], bf0[49], cos_bit[stage]);
+  bf1[47] = half_btf(cospi[3], bf0[47], cospi[61], bf0[48], cos_bit[stage]);
+  bf1[48] = half_btf(cospi[3], bf0[48], -cospi[61], bf0[47], cos_bit[stage]);
+  bf1[49] = half_btf(cospi[35], bf0[49], -cospi[29], bf0[46], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[19], bf0[50], -cospi[45], bf0[45], cos_bit[stage]);
+  bf1[51] = half_btf(cospi[51], bf0[51], -cospi[13], bf0[44], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[11], bf0[52], -cospi[53], bf0[43], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[43], bf0[53], -cospi[21], bf0[42], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[27], bf0[54], -cospi[37], bf0[41], cos_bit[stage]);
+  bf1[55] = half_btf(cospi[59], bf0[55], -cospi[5], bf0[40], cos_bit[stage]);
+  bf1[56] = half_btf(cospi[7], bf0[56], -cospi[57], bf0[39], cos_bit[stage]);
+  bf1[57] = half_btf(cospi[39], bf0[57], -cospi[25], bf0[38], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[23], bf0[58], -cospi[41], bf0[37], cos_bit[stage]);
+  bf1[59] = half_btf(cospi[55], bf0[59], -cospi[9], bf0[36], cos_bit[stage]);
+  bf1[60] = half_btf(cospi[15], bf0[60], -cospi[49], bf0[35], cos_bit[stage]);
+  bf1[61] = half_btf(cospi[47], bf0[61], -cospi[17], bf0[34], cos_bit[stage]);
+  bf1[62] = half_btf(cospi[31], bf0[62], -cospi[33], bf0[33], cos_bit[stage]);
+  bf1[63] = half_btf(cospi[63], bf0[63], -cospi[1], bf0[32], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 11
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
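+  // The final stage is a pure permutation: output index n takes the lane at
+  // the 6-bit bit-reversal of n (e.g. 1 -> 32, 2 -> 16, 3 -> 48), mapping
+  // the butterfly ordering back to natural frequency order.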
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[32];
+  bf1[2] = bf0[16];
+  bf1[3] = bf0[48];
+  bf1[4] = bf0[8];
+  bf1[5] = bf0[40];
+  bf1[6] = bf0[24];
+  bf1[7] = bf0[56];
+  bf1[8] = bf0[4];
+  bf1[9] = bf0[36];
+  bf1[10] = bf0[20];
+  bf1[11] = bf0[52];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[44];
+  bf1[14] = bf0[28];
+  bf1[15] = bf0[60];
+  bf1[16] = bf0[2];
+  bf1[17] = bf0[34];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[50];
+  bf1[20] = bf0[10];
+  bf1[21] = bf0[42];
+  bf1[22] = bf0[26];
+  bf1[23] = bf0[58];
+  bf1[24] = bf0[6];
+  bf1[25] = bf0[38];
+  bf1[26] = bf0[22];
+  bf1[27] = bf0[54];
+  bf1[28] = bf0[14];
+  bf1[29] = bf0[46];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[62];
+  bf1[32] = bf0[1];
+  bf1[33] = bf0[33];
+  bf1[34] = bf0[17];
+  bf1[35] = bf0[49];
+  bf1[36] = bf0[9];
+  bf1[37] = bf0[41];
+  bf1[38] = bf0[25];
+  bf1[39] = bf0[57];
+  bf1[40] = bf0[5];
+  bf1[41] = bf0[37];
+  bf1[42] = bf0[21];
+  bf1[43] = bf0[53];
+  bf1[44] = bf0[13];
+  bf1[45] = bf0[45];
+  bf1[46] = bf0[29];
+  bf1[47] = bf0[61];
+  bf1[48] = bf0[3];
+  bf1[49] = bf0[35];
+  bf1[50] = bf0[19];
+  bf1[51] = bf0[51];
+  bf1[52] = bf0[11];
+  bf1[53] = bf0[43];
+  bf1[54] = bf0[27];
+  bf1[55] = bf0[59];
+  bf1[56] = bf0[7];
+  bf1[57] = bf0[39];
+  bf1[58] = bf0[23];
+  bf1[59] = bf0[55];
+  bf1[60] = bf0[15];
+  bf1[61] = bf0[47];
+  bf1[62] = bf0[31];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
diff --git a/vp10/common/vp10_fwd_txfm1d.h b/vp10/common/vp10_fwd_txfm1d.h
new file mode 100644
index 0000000..d06e305
--- /dev/null
+++ b/vp10/common/vp10_fwd_txfm1d.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_FWD_TXFM1D_H_
+#define VP10_FWD_TXFM1D_H_
+
+#include "vp10/common/vp10_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_fdct4_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_fdct8_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_fdct16_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_fdct32_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_fdct64_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+
+void vp10_fadst4_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_fadst8_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_fadst16_new(const int32_t *input, int32_t *output,
+                      const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_fadst32_new(const int32_t *input, int32_t *output,
+                      const int8_t *cos_bit, const int8_t *stage_range);
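+
+// All of these kernels share the TxfmFunc signature used by the 2-D
+// wrappers; the per-stage cos_bit precisions and stage_range bit-width
+// bounds come from the TXFM_2D_CFG tables in vp10_fwd_txfm2d_cfg.h.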
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // VP10_FWD_TXFM1D_H_
diff --git a/vp10/common/vp10_fwd_txfm2d.c b/vp10/common/vp10_fwd_txfm2d.c
new file mode 100644
index 0000000..cddd7dc
--- /dev/null
+++ b/vp10/common/vp10_fwd_txfm2d.c
@@ -0,0 +1,196 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vp10_rtcd.h"
+#include "vp10/common/enums.h"
+#include "vp10/common/vp10_fwd_txfm1d.h"
+#include "vp10/common/vp10_fwd_txfm2d_cfg.h"
+#include "vp10/common/vp10_txfm.h"
+
+static INLINE TxfmFunc fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+  switch (txfm_type) {
+    case TXFM_TYPE_DCT4:
+      return vp10_fdct4_new;
+    case TXFM_TYPE_DCT8:
+      return vp10_fdct8_new;
+    case TXFM_TYPE_DCT16:
+      return vp10_fdct16_new;
+    case TXFM_TYPE_DCT32:
+      return vp10_fdct32_new;
+    case TXFM_TYPE_DCT64:
+      return vp10_fdct64_new;
+    case TXFM_TYPE_ADST4:
+      return vp10_fadst4_new;
+    case TXFM_TYPE_ADST8:
+      return vp10_fadst8_new;
+    case TXFM_TYPE_ADST16:
+      return vp10_fadst16_new;
+    case TXFM_TYPE_ADST32:
+      return vp10_fadst32_new;
+    default:
+      assert(0);
+      return NULL;
+  }
+}
+
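+// The 2-D transform is separable: each column is read (optionally flipped
+// upside down), shifted by -shift[0], transformed, shifted by -shift[1], and
+// written into buf (optionally mirrored left/right); each row of buf is then
+// transformed and shifted by -shift[2] into output.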
+static INLINE void fwd_txfm2d_c(const int16_t *input, int32_t *output,
+                                const int stride, const TXFM_2D_FLIP_CFG *cfg,
+                                int32_t *buf) {
+  int c, r;
+  const int txfm_size = cfg->cfg->txfm_size;
+  const int8_t *shift = cfg->cfg->shift;
+  const int8_t *stage_range_col = cfg->cfg->stage_range_col;
+  const int8_t *stage_range_row = cfg->cfg->stage_range_row;
+  const int8_t *cos_bit_col = cfg->cfg->cos_bit_col;
+  const int8_t *cos_bit_row = cfg->cfg->cos_bit_row;
+  const TxfmFunc txfm_func_col = fwd_txfm_type_to_func(cfg->cfg->txfm_type_col);
+  const TxfmFunc txfm_func_row = fwd_txfm_type_to_func(cfg->cfg->txfm_type_row);
+
+  // use output buffer as temp buffer
+  int32_t *temp_in = output;
+  int32_t *temp_out = output + txfm_size;
+
+  // Columns
+  for (c = 0; c < txfm_size; ++c) {
+    if (cfg->ud_flip == 0) {
+      for (r = 0; r < txfm_size; ++r)
+        temp_in[r] = input[r * stride + c];
+    } else {
+      for (r = 0; r < txfm_size; ++r)
+        // flip upside down
+        temp_in[r] = input[(txfm_size - r - 1) * stride + c];
+    }
+    round_shift_array(temp_in, txfm_size, -shift[0]);
+    txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
+    round_shift_array(temp_out, txfm_size, -shift[1]);
+    if (cfg->lr_flip == 0) {
+      for (r = 0; r < txfm_size; ++r)
+        buf[r * txfm_size + c] = temp_out[r];
+    } else {
+      for (r = 0; r < txfm_size; ++r)
+        // flip from left to right
+        buf[r * txfm_size + (txfm_size - c - 1)] = temp_out[r];
+    }
+  }
+
+  // Rows
+  for (r = 0; r < txfm_size; ++r) {
+    txfm_func_row(buf + r * txfm_size, output + r * txfm_size, cos_bit_row,
+                  stage_range_row);
+    round_shift_array(output + r * txfm_size, txfm_size, -shift[2]);
+  }
+}
+
+void vp10_fwd_txfm2d_4x4_c(const int16_t *input, int32_t *output,
+                           const int stride, int tx_type,
+                           const int bd) {
+  int32_t txfm_buf[4 * 4];
+  TXFM_2D_FLIP_CFG cfg = vp10_get_fwd_txfm_cfg(tx_type, TX_4X4);
+  (void)bd;
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
+}
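+
+// Illustrative usage (hypothetical names): for an 8-bit 4x4 residual block
+// `src` with row stride `src_stride`,
+//   int32_t coeff[4 * 4];
+//   vp10_fwd_txfm2d_4x4_c(src, coeff, src_stride, DCT_DCT, 8);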
+
+void vp10_fwd_txfm2d_8x8_c(const int16_t *input, int32_t *output,
+                           const int stride, int tx_type,
+                           const int bd) {
+  int32_t txfm_buf[8 * 8];
+  TXFM_2D_FLIP_CFG cfg = vp10_get_fwd_txfm_cfg(tx_type, TX_8X8);
+  (void)bd;
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
+}
+
+void vp10_fwd_txfm2d_16x16_c(const int16_t *input, int32_t *output,
+                             const int stride, int tx_type,
+                             const int bd) {
+  int32_t txfm_buf[16 * 16];
+  TXFM_2D_FLIP_CFG cfg = vp10_get_fwd_txfm_cfg(tx_type, TX_16X16);
+  (void)bd;
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
+}
+
+void vp10_fwd_txfm2d_32x32_c(const int16_t *input, int32_t *output,
+                             const int stride, int tx_type,
+                             const int bd) {
+  int32_t txfm_buf[32 * 32];
+  TXFM_2D_FLIP_CFG cfg = vp10_get_fwd_txfm_cfg(tx_type, TX_32X32);
+  (void)bd;
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
+}
+
+void vp10_fwd_txfm2d_64x64_c(const int16_t *input, int32_t *output,
+                             const int stride, int tx_type,
+                             const int bd) {
+  int32_t txfm_buf[64 * 64];
+  TXFM_2D_FLIP_CFG cfg = vp10_get_fwd_txfm_64x64_cfg(tx_type);
+  (void)bd;
+  fwd_txfm2d_c(input, output, stride, &cfg, txfm_buf);
+}
+
+#if CONFIG_EXT_TX
+static const TXFM_2D_CFG* fwd_txfm_cfg_ls[FLIPADST_ADST + 1][TX_SIZES] = {
+    {&fwd_txfm_2d_cfg_dct_dct_4  , &fwd_txfm_2d_cfg_dct_dct_8,
+     &fwd_txfm_2d_cfg_dct_dct_16  , &fwd_txfm_2d_cfg_dct_dct_32},
+    {&fwd_txfm_2d_cfg_adst_dct_4 , &fwd_txfm_2d_cfg_adst_dct_8,
+     &fwd_txfm_2d_cfg_adst_dct_16 , &fwd_txfm_2d_cfg_adst_dct_32},
+    {&fwd_txfm_2d_cfg_dct_adst_4 , &fwd_txfm_2d_cfg_dct_adst_8,
+     &fwd_txfm_2d_cfg_dct_adst_16 , &fwd_txfm_2d_cfg_dct_adst_32},
+    {&fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+     &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32},
+    {&fwd_txfm_2d_cfg_adst_dct_4 , &fwd_txfm_2d_cfg_adst_dct_8,
+     &fwd_txfm_2d_cfg_adst_dct_16 , &fwd_txfm_2d_cfg_adst_dct_32},
+    {&fwd_txfm_2d_cfg_dct_adst_4 , &fwd_txfm_2d_cfg_dct_adst_8,
+     &fwd_txfm_2d_cfg_dct_adst_16 , &fwd_txfm_2d_cfg_dct_adst_32},
+    {&fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+     &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32},
+    {&fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+     &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32},
+    {&fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+     &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32},
+};
+#else  // CONFIG_EXT_TX
+static const TXFM_2D_CFG* fwd_txfm_cfg_ls[TX_TYPES][TX_SIZES] = {
+    {&fwd_txfm_2d_cfg_dct_dct_4  , &fwd_txfm_2d_cfg_dct_dct_8,
+     &fwd_txfm_2d_cfg_dct_dct_16  , &fwd_txfm_2d_cfg_dct_dct_32},
+    {&fwd_txfm_2d_cfg_adst_dct_4 , &fwd_txfm_2d_cfg_adst_dct_8,
+     &fwd_txfm_2d_cfg_adst_dct_16 , &fwd_txfm_2d_cfg_adst_dct_32},
+    {&fwd_txfm_2d_cfg_dct_adst_4 , &fwd_txfm_2d_cfg_dct_adst_8,
+     &fwd_txfm_2d_cfg_dct_adst_16 , &fwd_txfm_2d_cfg_dct_adst_32},
+    {&fwd_txfm_2d_cfg_adst_adst_4, &fwd_txfm_2d_cfg_adst_adst_8,
+     &fwd_txfm_2d_cfg_adst_adst_16, &fwd_txfm_2d_cfg_adst_adst_32},
+};
+#endif  // CONFIG_EXT_TX
+
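+// set_flip_cfg() derives ud_flip/lr_flip from tx_type; the FLIPADST rows of
+// the EXT_TX table above reuse the plain ADST configs because a flipped ADST
+// differs only in its flip flags, not in its 1-D kernels. 64x64, which is
+// not part of TX_SIZES, is configured separately below.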
+TXFM_2D_FLIP_CFG vp10_get_fwd_txfm_cfg(int tx_type, int tx_size) {
+  TXFM_2D_FLIP_CFG cfg;
+  set_flip_cfg(tx_type, &cfg);
+  cfg.cfg = fwd_txfm_cfg_ls[tx_type][tx_size];
+  return cfg;
+}
+
+TXFM_2D_FLIP_CFG vp10_get_fwd_txfm_64x64_cfg(int tx_type) {
+  TXFM_2D_FLIP_CFG cfg;
+  switch (tx_type) {
+    case DCT_DCT:
+      cfg.cfg = &fwd_txfm_2d_cfg_dct_dct_64;
+      cfg.ud_flip = 0;
+      cfg.lr_flip = 0;
+      break;
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    default:
+      // Only DCT_DCT is supported at 64x64; leave no field uninitialized on
+      // the (asserting) fallthrough path.
+      cfg.cfg = NULL;
+      cfg.ud_flip = 0;
+      cfg.lr_flip = 0;
+      assert(0);
+  }
+  return cfg;
+}
diff --git a/vp10/common/vp10_fwd_txfm2d_cfg.h b/vp10/common/vp10_fwd_txfm2d_cfg.h
new file mode 100644
index 0000000..e15e4ba
--- /dev/null
+++ b/vp10/common/vp10_fwd_txfm2d_cfg.h
@@ -0,0 +1,402 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_FWD_TXFM2D_CFG_H_
+#define VP10_FWD_TXFM2D_CFG_H_
+#include "vp10/common/enums.h"
+#include "vp10/common/vp10_fwd_txfm1d.h"
+//  ---------------- config fwd_dct_dct_4 ----------------
+static const int8_t fwd_shift_dct_dct_4[3] = {2, 0, 0};
+static const int8_t fwd_stage_range_col_dct_dct_4[4] = {15, 16, 17, 17};
+static const int8_t fwd_stage_range_row_dct_dct_4[4] = {17, 18, 18, 18};
+static const int8_t fwd_cos_bit_col_dct_dct_4[4] = {13, 13, 13, 13};
+static const int8_t fwd_cos_bit_row_dct_dct_4[4] = {13, 13, 13, 13};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_4 = {
+    4,  // .txfm_size
+    4,  // .stage_num_col
+    4,  // .stage_num_row
+    // 0,  // .log_scale
+    fwd_shift_dct_dct_4,            // .shift
+    fwd_stage_range_col_dct_dct_4,  // .stage_range_col
+    fwd_stage_range_row_dct_dct_4,  // .stage_range_row
+    fwd_cos_bit_col_dct_dct_4,      // .cos_bit_col
+    fwd_cos_bit_row_dct_dct_4,      // .cos_bit_row
+    TXFM_TYPE_DCT4,                 // .txfm_type_col
+    TXFM_TYPE_DCT4};                // .txfm_type_row
+
+//  ---------------- config fwd_dct_dct_8 ----------------
+static const int8_t fwd_shift_dct_dct_8[3] = {2, -1, 0};
+static const int8_t fwd_stage_range_col_dct_dct_8[6] = {15, 16, 17, 18, 18, 18};
+static const int8_t fwd_stage_range_row_dct_dct_8[6] = {17, 18, 19, 19, 19, 19};
+static const int8_t fwd_cos_bit_col_dct_dct_8[6] = {13, 13, 13, 13, 13, 13};
+static const int8_t fwd_cos_bit_row_dct_dct_8[6] = {13, 13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_8 = {
+    8,  // .txfm_size
+    6,  // .stage_num_col
+    6,  // .stage_num_row
+    // 0,  // .log_scale
+    fwd_shift_dct_dct_8,            // .shift
+    fwd_stage_range_col_dct_dct_8,  // .stage_range_col
+    fwd_stage_range_row_dct_dct_8,  // .stage_range_row
+    fwd_cos_bit_col_dct_dct_8,      // .cos_bit_col
+    fwd_cos_bit_row_dct_dct_8,      // .cos_bit_row
+    TXFM_TYPE_DCT8,                 // .txfm_type_col
+    TXFM_TYPE_DCT8};                // .txfm_type_row
+
+//  ---------------- config fwd_dct_dct_16 ----------------
+static const int8_t fwd_shift_dct_dct_16[3] = {2, -2, 0};
+static const int8_t fwd_stage_range_col_dct_dct_16[8] = {15, 16, 17, 18,
+                                                         19, 19, 19, 19};
+static const int8_t fwd_stage_range_row_dct_dct_16[8] = {17, 18, 19, 20,
+                                                         20, 20, 20, 20};
+static const int8_t fwd_cos_bit_col_dct_dct_16[8] = {13, 13, 13, 13,
+                                                     13, 13, 13, 13};
+static const int8_t fwd_cos_bit_row_dct_dct_16[8] = {12, 12, 12, 12,
+                                                     12, 12, 12, 12};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_16 = {
+    16,  // .txfm_size
+    8,   // .stage_num_col
+    8,   // .stage_num_row
+    // 0,  // .log_scale
+    fwd_shift_dct_dct_16,            // .shift
+    fwd_stage_range_col_dct_dct_16,  // .stage_range_col
+    fwd_stage_range_row_dct_dct_16,  // .stage_range_row
+    fwd_cos_bit_col_dct_dct_16,      // .cos_bit_col
+    fwd_cos_bit_row_dct_dct_16,      // .cos_bit_row
+    TXFM_TYPE_DCT16,                 // .txfm_type_col
+    TXFM_TYPE_DCT16};                // .txfm_type_row
+
+//  ---------------- config fwd_dct_dct_32 ----------------
+static const int8_t fwd_shift_dct_dct_32[3] = {2, -4, 0};
+static const int8_t fwd_stage_range_col_dct_dct_32[10] = {15, 16, 17, 18, 19,
+                                                          20, 20, 20, 20, 20};
+static const int8_t fwd_stage_range_row_dct_dct_32[10] = {16, 17, 18, 19, 20,
+                                                          20, 20, 20, 20, 20};
+static const int8_t fwd_cos_bit_col_dct_dct_32[10] = {12, 12, 12, 12, 12,
+                                                      12, 12, 12, 12, 12};
+static const int8_t fwd_cos_bit_row_dct_dct_32[10] = {12, 12, 12, 12, 12,
+                                                      12, 12, 12, 12, 12};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_32 = {
+    32,  // .txfm_size
+    10,  // .stage_num_col
+    10,  // .stage_num_row
+    // 1,  // .log_scale
+    fwd_shift_dct_dct_32,            // .shift
+    fwd_stage_range_col_dct_dct_32,  // .stage_range_col
+    fwd_stage_range_row_dct_dct_32,  // .stage_range_row
+    fwd_cos_bit_col_dct_dct_32,      // .cos_bit_col
+    fwd_cos_bit_row_dct_dct_32,      // .cos_bit_row
+    TXFM_TYPE_DCT32,                 // .txfm_type_col
+    TXFM_TYPE_DCT32};                // .txfm_type_row
+
+//  ---------------- config fwd_dct_dct_64 ----------------
+static const int8_t fwd_shift_dct_dct_64[3] = {2, -2, -2};
+static const int8_t fwd_stage_range_col_dct_dct_64[12] = {
+    13, 14, 15, 16, 17, 18, 19, 19, 19, 19, 19, 19};
+static const int8_t fwd_stage_range_row_dct_dct_64[12] = {
+    17, 18, 19, 20, 21, 22, 22, 22, 22, 22, 22, 22};
+static const int8_t fwd_cos_bit_col_dct_dct_64[12] = {15, 15, 15, 15, 15, 14,
+                                                      13, 13, 13, 13, 13, 13};
+static const int8_t fwd_cos_bit_row_dct_dct_64[12] = {15, 14, 13, 12, 11, 10,
+                                                      10, 10, 10, 10, 10, 10};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_dct_64 = {
+    64,                              // .txfm_size
+    12,                              // .stage_num_col
+    12,                              // .stage_num_row
+    fwd_shift_dct_dct_64,            // .shift
+    fwd_stage_range_col_dct_dct_64,  // .stage_range_col
+    fwd_stage_range_row_dct_dct_64,  // .stage_range_row
+    fwd_cos_bit_col_dct_dct_64,      // .cos_bit_col
+    fwd_cos_bit_row_dct_dct_64,      // .cos_bit_row
+    TXFM_TYPE_DCT64,                 // .txfm_type_col
+    TXFM_TYPE_DCT64};                // .txfm_type_row
+
+//  ---------------- config fwd_dct_adst_4 ----------------
+static const int8_t fwd_shift_dct_adst_4[3] = {2, 0, 0};
+static const int8_t fwd_stage_range_col_dct_adst_4[4] = {15, 16, 17, 17};
+static const int8_t fwd_stage_range_row_dct_adst_4[6] = {17, 17, 17,
+                                                         18, 18, 18};
+static const int8_t fwd_cos_bit_col_dct_adst_4[4] = {13, 13, 13, 13};
+static const int8_t fwd_cos_bit_row_dct_adst_4[6] = {13, 13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_adst_4 = {
+    4,  // .txfm_size
+    4,  // .stage_num_col
+    6,  // .stage_num_row
+    // 0,  // .log_scale
+    fwd_shift_dct_adst_4,            // .shift
+    fwd_stage_range_col_dct_adst_4,  // .stage_range_col
+    fwd_stage_range_row_dct_adst_4,  // .stage_range_row
+    fwd_cos_bit_col_dct_adst_4,      // .cos_bit_col
+    fwd_cos_bit_row_dct_adst_4,      // .cos_bit_row
+    TXFM_TYPE_DCT4,                  // .txfm_type_col
+    TXFM_TYPE_ADST4};                // .txfm_type_row
+
+//  ---------------- config fwd_dct_adst_8 ----------------
+static const int8_t fwd_shift_dct_adst_8[3] = {2, -1, 0};
+static const int8_t fwd_stage_range_col_dct_adst_8[6] = {15, 16, 17,
+                                                         18, 18, 18};
+static const int8_t fwd_stage_range_row_dct_adst_8[8] = {17, 17, 17, 18,
+                                                         18, 19, 19, 19};
+static const int8_t fwd_cos_bit_col_dct_adst_8[6] = {13, 13, 13, 13, 13, 13};
+static const int8_t fwd_cos_bit_row_dct_adst_8[8] = {13, 13, 13, 13,
+                                                     13, 13, 13, 13};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_adst_8 = {
+    8,  // .txfm_size
+    6,  // .stage_num_col
+    8,  // .stage_num_row
+    // 0,  // .log_scale
+    fwd_shift_dct_adst_8,            // .shift
+    fwd_stage_range_col_dct_adst_8,  // .stage_range_col
+    fwd_stage_range_row_dct_adst_8,  // .stage_range_row
+    fwd_cos_bit_col_dct_adst_8,      // .cos_bit_col
+    fwd_cos_bit_row_dct_adst_8,      // .cos_bit_row
+    TXFM_TYPE_DCT8,                  // .txfm_type_col
+    TXFM_TYPE_ADST8};                // .txfm_type_row
+
+//  ---------------- config fwd_dct_adst_16 ----------------
+static const int8_t fwd_shift_dct_adst_16[3] = {2, -2, 0};
+static const int8_t fwd_stage_range_col_dct_adst_16[8] = {15, 16, 17, 18,
+                                                          19, 19, 19, 19};
+static const int8_t fwd_stage_range_row_dct_adst_16[10] = {17, 17, 17, 18, 18,
+                                                           19, 19, 20, 20, 20};
+static const int8_t fwd_cos_bit_col_dct_adst_16[8] = {13, 13, 13, 13,
+                                                      13, 13, 13, 13};
+static const int8_t fwd_cos_bit_row_dct_adst_16[10] = {12, 12, 12, 12, 12,
+                                                       12, 12, 12, 12, 12};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_adst_16 = {
+    16,  // .txfm_size
+    8,   // .stage_num_col
+    10,  // .stage_num_row
+    // 0,  // .log_scale
+    fwd_shift_dct_adst_16,            // .shift
+    fwd_stage_range_col_dct_adst_16,  // .stage_range_col
+    fwd_stage_range_row_dct_adst_16,  // .stage_range_row
+    fwd_cos_bit_col_dct_adst_16,      // .cos_bit_col
+    fwd_cos_bit_row_dct_adst_16,      // .cos_bit_row
+    TXFM_TYPE_DCT16,                  // .txfm_type_col
+    TXFM_TYPE_ADST16};                // .txfm_type_row
+
+//  ---------------- config fwd_dct_adst_32 ----------------
+static const int8_t fwd_shift_dct_adst_32[3] = {2, -4, 0};
+static const int8_t fwd_stage_range_col_dct_adst_32[10] = {15, 16, 17, 18, 19,
+                                                           20, 20, 20, 20, 20};
+static const int8_t fwd_stage_range_row_dct_adst_32[12] = {
+    16, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20};
+static const int8_t fwd_cos_bit_col_dct_adst_32[10] = {12, 12, 12, 12, 12,
+                                                       12, 12, 12, 12, 12};
+static const int8_t fwd_cos_bit_row_dct_adst_32[12] = {12, 12, 12, 12, 12, 12,
+                                                       12, 12, 12, 12, 12, 12};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_dct_adst_32 = {
+    32,  // .txfm_size
+    10,  // .stage_num_col
+    12,  // .stage_num_row
+    // 1,  // .log_scale
+    fwd_shift_dct_adst_32,            // .shift
+    fwd_stage_range_col_dct_adst_32,  // .stage_range_col
+    fwd_stage_range_row_dct_adst_32,  // .stage_range_row
+    fwd_cos_bit_col_dct_adst_32,      // .cos_bit_col
+    fwd_cos_bit_row_dct_adst_32,      // .cos_bit_row
+    TXFM_TYPE_DCT32,                  // .txfm_type_col
+    TXFM_TYPE_ADST32};                // .txfm_type_row
+//  ---------------- config fwd_adst_adst_4 ----------------
+static const int8_t fwd_shift_adst_adst_4[3] = {2, 0, 0};
+static const int8_t fwd_stage_range_col_adst_adst_4[6] = {15, 15, 16,
+                                                          17, 17, 17};
+static const int8_t fwd_stage_range_row_adst_adst_4[6] = {17, 17, 17,
+                                                          18, 18, 18};
+static const int8_t fwd_cos_bit_col_adst_adst_4[6] = {13, 13, 13, 13, 13, 13};
+static const int8_t fwd_cos_bit_row_adst_adst_4[6] = {13, 13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_adst_4 = {
+    4,  // .txfm_size
+    6,  // .stage_num_col
+    6,  // .stage_num_row
+    // 0,  // .log_scale
+    fwd_shift_adst_adst_4,            // .shift
+    fwd_stage_range_col_adst_adst_4,  // .stage_range_col
+    fwd_stage_range_row_adst_adst_4,  // .stage_range_row
+    fwd_cos_bit_col_adst_adst_4,      // .cos_bit_col
+    fwd_cos_bit_row_adst_adst_4,      // .cos_bit_row
+    TXFM_TYPE_ADST4,                  // .txfm_type_col
+    TXFM_TYPE_ADST4};                 // .txfm_type_row
+
+//  ---------------- config fwd_adst_adst_8 ----------------
+static const int8_t fwd_shift_adst_adst_8[3] = {2, -1, 0};
+static const int8_t fwd_stage_range_col_adst_adst_8[8] = {15, 15, 16, 17,
+                                                          17, 18, 18, 18};
+static const int8_t fwd_stage_range_row_adst_adst_8[8] = {17, 17, 17, 18,
+                                                          18, 19, 19, 19};
+static const int8_t fwd_cos_bit_col_adst_adst_8[8] = {13, 13, 13, 13,
+                                                      13, 13, 13, 13};
+static const int8_t fwd_cos_bit_row_adst_adst_8[8] = {13, 13, 13, 13,
+                                                      13, 13, 13, 13};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_adst_8 = {
+    8,  // .txfm_size
+    8,  // .stage_num_col
+    8,  // .stage_num_row
+    // 0,  // .log_scale
+    fwd_shift_adst_adst_8,            // .shift
+    fwd_stage_range_col_adst_adst_8,  // .stage_range_col
+    fwd_stage_range_row_adst_adst_8,  // .stage_range_row
+    fwd_cos_bit_col_adst_adst_8,      // .cos_bit_col
+    fwd_cos_bit_row_adst_adst_8,      // .cos_bit_row
+    TXFM_TYPE_ADST8,                  // .txfm_type_col
+    TXFM_TYPE_ADST8};                 // .txfm_type_row
+
+//  ---------------- config fwd_adst_adst_16 ----------------
+static const int8_t fwd_shift_adst_adst_16[3] = {2, -2, 0};
+static const int8_t fwd_stage_range_col_adst_adst_16[10] = {15, 15, 16, 17, 17,
+                                                            18, 18, 19, 19, 19};
+static const int8_t fwd_stage_range_row_adst_adst_16[10] = {17, 17, 17, 18, 18,
+                                                            19, 19, 20, 20, 20};
+static const int8_t fwd_cos_bit_col_adst_adst_16[10] = {13, 13, 13, 13, 13,
+                                                        13, 13, 13, 13, 13};
+static const int8_t fwd_cos_bit_row_adst_adst_16[10] = {12, 12, 12, 12, 12,
+                                                        12, 12, 12, 12, 12};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_adst_16 = {
+    16,  // .txfm_size
+    10,  // .stage_num_col
+    10,  // .stage_num_row
+    // 0,  // .log_scale
+    fwd_shift_adst_adst_16,            // .shift
+    fwd_stage_range_col_adst_adst_16,  // .stage_range_col
+    fwd_stage_range_row_adst_adst_16,  // .stage_range_row
+    fwd_cos_bit_col_adst_adst_16,      // .cos_bit_col
+    fwd_cos_bit_row_adst_adst_16,      // .cos_bit_row
+    TXFM_TYPE_ADST16,                  // .txfm_type_col
+    TXFM_TYPE_ADST16};                 // .txfm_type_row
+
+//  ---------------- config fwd_adst_adst_32 ----------------
+static const int8_t fwd_shift_adst_adst_32[3] = {2, -4, 0};
+static const int8_t fwd_stage_range_col_adst_adst_32[12] = {
+    15, 15, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20};
+static const int8_t fwd_stage_range_row_adst_adst_32[12] = {
+    16, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20};
+static const int8_t fwd_cos_bit_col_adst_adst_32[12] = {12, 12, 12, 12, 12, 12,
+                                                        12, 12, 12, 12, 12, 12};
+static const int8_t fwd_cos_bit_row_adst_adst_32[12] = {12, 12, 12, 12, 12, 12,
+                                                        12, 12, 12, 12, 12, 12};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_adst_32 = {
+    32,  // .txfm_size
+    12,  // .stage_num_col
+    12,  // .stage_num_row
+    // 1,  // .log_scale
+    fwd_shift_adst_adst_32,            // .shift
+    fwd_stage_range_col_adst_adst_32,  // .stage_range_col
+    fwd_stage_range_row_adst_adst_32,  // .stage_range_row
+    fwd_cos_bit_col_adst_adst_32,      // .cos_bit_col
+    fwd_cos_bit_row_adst_adst_32,      // .cos_bit_row
+    TXFM_TYPE_ADST32,                  // .txfm_type_col
+    TXFM_TYPE_ADST32};                 // .txfm_type_row
+
+//  ---------------- config fwd_adst_dct_4 ----------------
+static const int8_t fwd_shift_adst_dct_4[3] = {2, 0, 0};
+static const int8_t fwd_stage_range_col_adst_dct_4[6] = {15, 15, 16,
+                                                         17, 17, 17};
+static const int8_t fwd_stage_range_row_adst_dct_4[4] = {17, 18, 18, 18};
+static const int8_t fwd_cos_bit_col_adst_dct_4[6] = {13, 13, 13, 13, 13, 13};
+static const int8_t fwd_cos_bit_row_adst_dct_4[4] = {13, 13, 13, 13};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_dct_4 = {
+    4,  // .txfm_size
+    6,  // .stage_num_col
+    4,  // .stage_num_row
+    // 0,  // .log_scale
+    fwd_shift_adst_dct_4,            // .shift
+    fwd_stage_range_col_adst_dct_4,  // .stage_range_col
+    fwd_stage_range_row_adst_dct_4,  // .stage_range_row
+    fwd_cos_bit_col_adst_dct_4,      // .cos_bit_col
+    fwd_cos_bit_row_adst_dct_4,      // .cos_bit_row
+    TXFM_TYPE_ADST4,                 // .txfm_type_col
+    TXFM_TYPE_DCT4};                 // .txfm_type_row
+
+//  ---------------- config fwd_adst_dct_8 ----------------
+static const int8_t fwd_shift_adst_dct_8[3] = {2, -1, 0};
+static const int8_t fwd_stage_range_col_adst_dct_8[8] = {15, 15, 16, 17,
+                                                         17, 18, 18, 18};
+static const int8_t fwd_stage_range_row_adst_dct_8[6] = {17, 18, 19,
+                                                         19, 19, 19};
+static const int8_t fwd_cos_bit_col_adst_dct_8[8] = {13, 13, 13, 13,
+                                                     13, 13, 13, 13};
+static const int8_t fwd_cos_bit_row_adst_dct_8[6] = {13, 13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_dct_8 = {
+    8,  // .txfm_size
+    8,  // .stage_num_col
+    6,  // .stage_num_row
+    // 0,  // .log_scale
+    fwd_shift_adst_dct_8,            // .shift
+    fwd_stage_range_col_adst_dct_8,  // .stage_range_col
+    fwd_stage_range_row_adst_dct_8,  // .stage_range_row
+    fwd_cos_bit_col_adst_dct_8,      // .cos_bit_col
+    fwd_cos_bit_row_adst_dct_8,      // .cos_bit_row
+    TXFM_TYPE_ADST8,                 // .txfm_type_col
+    TXFM_TYPE_DCT8};                 // .txfm_type_row
+
+//  ---------------- config fwd_adst_dct_16 ----------------
+static const int8_t fwd_shift_adst_dct_16[3] = {2, -2, 0};
+static const int8_t fwd_stage_range_col_adst_dct_16[10] = {15, 15, 16, 17, 17,
+                                                           18, 18, 19, 19, 19};
+static const int8_t fwd_stage_range_row_adst_dct_16[8] = {17, 18, 19, 20,
+                                                          20, 20, 20, 20};
+static const int8_t fwd_cos_bit_col_adst_dct_16[10] = {13, 13, 13, 13, 13,
+                                                       13, 13, 13, 13, 13};
+static const int8_t fwd_cos_bit_row_adst_dct_16[8] = {12, 12, 12, 12,
+                                                      12, 12, 12, 12};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_dct_16 = {
+    16,  // .txfm_size
+    10,  // .stage_num_col
+    8,   // .stage_num_row
+    // 0,  // .log_scale
+    fwd_shift_adst_dct_16,            // .shift
+    fwd_stage_range_col_adst_dct_16,  // .stage_range_col
+    fwd_stage_range_row_adst_dct_16,  // .stage_range_row
+    fwd_cos_bit_col_adst_dct_16,      // .cos_bit_col
+    fwd_cos_bit_row_adst_dct_16,      // .cos_bit_row
+    TXFM_TYPE_ADST16,                 // .txfm_type_col
+    TXFM_TYPE_DCT16};                 // .txfm_type_row
+
+//  ---------------- config fwd_adst_dct_32 ----------------
+static const int8_t fwd_shift_adst_dct_32[3] = {2, -4, 0};
+static const int8_t fwd_stage_range_col_adst_dct_32[12] = {
+    15, 15, 16, 17, 17, 18, 18, 19, 19, 20, 20, 20};
+static const int8_t fwd_stage_range_row_adst_dct_32[10] = {16, 17, 18, 19, 20,
+                                                           20, 20, 20, 20, 20};
+static const int8_t fwd_cos_bit_col_adst_dct_32[12] = {12, 12, 12, 12, 12, 12,
+                                                       12, 12, 12, 12, 12, 12};
+static const int8_t fwd_cos_bit_row_adst_dct_32[10] = {12, 12, 12, 12, 12,
+                                                       12, 12, 12, 12, 12};
+
+static const TXFM_2D_CFG fwd_txfm_2d_cfg_adst_dct_32 = {
+    32,  // .txfm_size
+    12,  // .stage_num_col
+    10,  // .stage_num_row
+    // 1,  // .log_scale
+    fwd_shift_adst_dct_32,            // .shift
+    fwd_stage_range_col_adst_dct_32,  // .stage_range_col
+    fwd_stage_range_row_adst_dct_32,  // .stage_range_row
+    fwd_cos_bit_col_adst_dct_32,      // .cos_bit_col
+    fwd_cos_bit_row_adst_dct_32,      // .cos_bit_row
+    TXFM_TYPE_ADST32,                 // .txfm_type_col
+    TXFM_TYPE_DCT32};                 // .txfm_type_row
+#endif  // VP10_FWD_TXFM2D_CFG_H_
diff --git a/vp10/common/vp10_inv_txfm.c b/vp10/common/vp10_inv_txfm.c
index c31275e..d41f389 100644
--- a/vp10/common/vp10_inv_txfm.c
+++ b/vp10/common/vp10_inv_txfm.c
@@ -12,6 +12,7 @@
 #include <math.h>
 #include <string.h>
 
+#include "./vp10_rtcd.h"
 #include "vp10/common/vp10_inv_txfm.h"
 
 void vp10_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride) {
diff --git a/vp10/common/vp10_inv_txfm1d.c b/vp10/common/vp10_inv_txfm1d.c
new file mode 100644
index 0000000..494000f
--- /dev/null
+++ b/vp10/common/vp10_inv_txfm1d.c
@@ -0,0 +1,2330 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>  // assert() used by range_check below
+#include <stdio.h>   // printf() used by range_check below
+#include <stdlib.h>  // abs()
+#include "vp10/common/vp10_inv_txfm1d.h"
+
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+// Wrapped in do { ... } while (0) so each range_check(...); expands to a
+// single statement, even inside unbraced if/else.
+#define range_check(stage, input, buf, size, bit)                         \
+  do {                                                                    \
+    int i, j;                                                             \
+    for (i = 0; i < size; ++i) {                                          \
+      int buf_bit = get_max_bit(abs(buf[i])) + 1;                         \
+      if (buf_bit > bit) {                                                \
+        printf("======== %s overflow ========\n", __func__);              \
+        printf("stage: %d node: %d\n", stage, i);                         \
+        printf("bit: %d buf_bit: %d buf[i]: %d\n", bit, buf_bit, buf[i]); \
+        printf("input:\n");                                               \
+        for (j = 0; j < size; j++) {                                      \
+          printf("%d,", input[j]);                                        \
+        }                                                                 \
+        printf("\n");                                                     \
+        assert(0);                                                        \
+      }                                                                   \
+    }                                                                     \
+  } while (0)
+#else
+#define range_check(stage, input, buf, size, bit) \
+  do {                                            \
+    (void)stage;                                  \
+    (void)input;                                  \
+    (void)buf;                                    \
+    (void)size;                                   \
+    (void)bit;                                    \
+  } while (0)
+#endif
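
The checker above leans on a get_max_bit() helper that is assumed (from
vp10/common/vp10_txfm.h, outside this hunk) to return the index of the
highest set bit, so that get_max_bit(abs(v)) + 1 is the magnitude's bit
width. A minimal sketch of that contract:

    // Assumed behavior: index of the highest set bit, -1 for zero, e.g.
    // get_max_bit(1) == 0, get_max_bit(255) == 7. A value v then fits a
    // stage's budget iff get_max_bit(abs(v)) + 1 <= stage_range[stage].
    static INLINE int get_max_bit(int x) {
      int max_bit = -1;
      while (x) {
        x >>= 1;
        ++max_bit;
      }
      return max_bit;
    }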
+
+void vp10_idct4_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 4;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[4];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = input[2];
+  bf1[2] = input[1];
+  bf1[3] = input[3];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = bf0[1] - bf0[2];
+  bf1[3] = bf0[0] - bf0[3];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
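
Every butterfly stage here is built on half_btf(), which is assumed (again
per vp10/common/vp10_txfm.h) to compute a two-tap weighted sum followed by a
rounded right-shift by the stage's cos_bit. A sketch of that contract:

    // Assumed primitive: (w0 * in0 + w1 * in1 + rounding) >> bit, with the
    // products widened to 64 bits in this sketch to sidestep overflow.
    static INLINE int32_t round_shift(int64_t value, int bit) {
      return (int32_t)((value + (1LL << (bit - 1))) >> bit);
    }

    static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1,
                                   int32_t in1, int bit) {
      return round_shift((int64_t)w0 * in0 + (int64_t)w1 * in1, bit);
    }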
+
+void vp10_idct8_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 8;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[8];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = input[4];
+  bf1[2] = input[2];
+  bf1[3] = input[6];
+  bf1[4] = input[1];
+  bf1[5] = input[5];
+  bf1[6] = input[3];
+  bf1[7] = input[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = bf0[4] - bf0[5];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[6] + bf0[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = bf0[1] - bf0[2];
+  bf1[3] = bf0[0] - bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[7];
+  bf1[1] = bf0[1] + bf0[6];
+  bf1[2] = bf0[2] + bf0[5];
+  bf1[3] = bf0[3] + bf0[4];
+  bf1[4] = bf0[3] - bf0[4];
+  bf1[5] = bf0[2] - bf0[5];
+  bf1[6] = bf0[1] - bf0[6];
+  bf1[7] = bf0[0] - bf0[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
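
The cospi table indexed at each stage is assumed to hold fixed-point
cosines, one row per supported precision: cospi_arr[b - cos_bit_min][i]
holding round(cos(i * PI / 128) * 2^b) for i = 0..63, which would make
cospi[32] the fixed-point cos(PI/4) = 1/sqrt(2) seen in the DC butterflies.
Under that assumption a row could be generated as:

    #include <math.h>

    // Hedged generator for one precision row of the assumed table layout.
    static void gen_cospi_row(int32_t *row, int bit) {
      int i;
      for (i = 0; i < 64; ++i)
        row[i] = (int32_t)round(cos(i * 3.14159265358979323846 / 128) *
                                (1 << bit));
    }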
+
+void vp10_idct16_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 16;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[16];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = input[8];
+  bf1[2] = input[4];
+  bf1[3] = input[12];
+  bf1[4] = input[2];
+  bf1[5] = input[10];
+  bf1[6] = input[6];
+  bf1[7] = input[14];
+  bf1[8] = input[1];
+  bf1[9] = input[9];
+  bf1[10] = input[5];
+  bf1[11] = input[13];
+  bf1[12] = input[3];
+  bf1[13] = input[11];
+  bf1[14] = input[7];
+  bf1[15] = input[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+  bf1[8] = bf0[8] + bf0[9];
+  bf1[9] = bf0[8] - bf0[9];
+  bf1[10] = -bf0[10] + bf0[11];
+  bf1[11] = bf0[10] + bf0[11];
+  bf1[12] = bf0[12] + bf0[13];
+  bf1[13] = bf0[12] - bf0[13];
+  bf1[14] = -bf0[14] + bf0[15];
+  bf1[15] = bf0[14] + bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = bf0[4] - bf0[5];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[6] + bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
+  bf1[15] = bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = bf0[1] - bf0[2];
+  bf1[3] = bf0[0] - bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8] + bf0[11];
+  bf1[9] = bf0[9] + bf0[10];
+  bf1[10] = bf0[9] - bf0[10];
+  bf1[11] = bf0[8] - bf0[11];
+  bf1[12] = -bf0[12] + bf0[15];
+  bf1[13] = -bf0[13] + bf0[14];
+  bf1[14] = bf0[13] + bf0[14];
+  bf1[15] = bf0[12] + bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[7];
+  bf1[1] = bf0[1] + bf0[6];
+  bf1[2] = bf0[2] + bf0[5];
+  bf1[3] = bf0[3] + bf0[4];
+  bf1[4] = bf0[3] - bf0[4];
+  bf1[5] = bf0[2] - bf0[5];
+  bf1[6] = bf0[1] - bf0[6];
+  bf1[7] = bf0[0] - bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[15];
+  bf1[1] = bf0[1] + bf0[14];
+  bf1[2] = bf0[2] + bf0[13];
+  bf1[3] = bf0[3] + bf0[12];
+  bf1[4] = bf0[4] + bf0[11];
+  bf1[5] = bf0[5] + bf0[10];
+  bf1[6] = bf0[6] + bf0[9];
+  bf1[7] = bf0[7] + bf0[8];
+  bf1[8] = bf0[7] - bf0[8];
+  bf1[9] = bf0[6] - bf0[9];
+  bf1[10] = bf0[5] - bf0[10];
+  bf1[11] = bf0[4] - bf0[11];
+  bf1[12] = bf0[3] - bf0[12];
+  bf1[13] = bf0[2] - bf0[13];
+  bf1[14] = bf0[1] - bf0[14];
+  bf1[15] = bf0[0] - bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_idct32_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 32;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[32];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = input[16];
+  bf1[2] = input[8];
+  bf1[3] = input[24];
+  bf1[4] = input[4];
+  bf1[5] = input[20];
+  bf1[6] = input[12];
+  bf1[7] = input[28];
+  bf1[8] = input[2];
+  bf1[9] = input[18];
+  bf1[10] = input[10];
+  bf1[11] = input[26];
+  bf1[12] = input[6];
+  bf1[13] = input[22];
+  bf1[14] = input[14];
+  bf1[15] = input[30];
+  bf1[16] = input[1];
+  bf1[17] = input[17];
+  bf1[18] = input[9];
+  bf1[19] = input[25];
+  bf1[20] = input[5];
+  bf1[21] = input[21];
+  bf1[22] = input[13];
+  bf1[23] = input[29];
+  bf1[24] = input[3];
+  bf1[25] = input[19];
+  bf1[26] = input[11];
+  bf1[27] = input[27];
+  bf1[28] = input[7];
+  bf1[29] = input[23];
+  bf1[30] = input[15];
+  bf1[31] = input[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit[stage]);
+  bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
+  bf1[16] = bf0[16] + bf0[17];
+  bf1[17] = bf0[16] - bf0[17];
+  bf1[18] = -bf0[18] + bf0[19];
+  bf1[19] = bf0[18] + bf0[19];
+  bf1[20] = bf0[20] + bf0[21];
+  bf1[21] = bf0[20] - bf0[21];
+  bf1[22] = -bf0[22] + bf0[23];
+  bf1[23] = bf0[22] + bf0[23];
+  bf1[24] = bf0[24] + bf0[25];
+  bf1[25] = bf0[24] - bf0[25];
+  bf1[26] = -bf0[26] + bf0[27];
+  bf1[27] = bf0[26] + bf0[27];
+  bf1[28] = bf0[28] + bf0[29];
+  bf1[29] = bf0[28] - bf0[29];
+  bf1[30] = -bf0[30] + bf0[31];
+  bf1[31] = bf0[30] + bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+  bf1[8] = bf0[8] + bf0[9];
+  bf1[9] = bf0[8] - bf0[9];
+  bf1[10] = -bf0[10] + bf0[11];
+  bf1[11] = bf0[10] + bf0[11];
+  bf1[12] = bf0[12] + bf0[13];
+  bf1[13] = bf0[12] - bf0[13];
+  bf1[14] = -bf0[14] + bf0[15];
+  bf1[15] = bf0[14] + bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit[stage]);
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[28];
+  bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit[stage]);
+  bf1[31] = bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = bf0[4] - bf0[5];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[6] + bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[19];
+  bf1[17] = bf0[17] + bf0[18];
+  bf1[18] = bf0[17] - bf0[18];
+  bf1[19] = bf0[16] - bf0[19];
+  bf1[20] = -bf0[20] + bf0[23];
+  bf1[21] = -bf0[21] + bf0[22];
+  bf1[22] = bf0[21] + bf0[22];
+  bf1[23] = bf0[20] + bf0[23];
+  bf1[24] = bf0[24] + bf0[27];
+  bf1[25] = bf0[25] + bf0[26];
+  bf1[26] = bf0[25] - bf0[26];
+  bf1[27] = bf0[24] - bf0[27];
+  bf1[28] = -bf0[28] + bf0[31];
+  bf1[29] = -bf0[29] + bf0[30];
+  bf1[30] = bf0[29] + bf0[30];
+  bf1[31] = bf0[28] + bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = bf0[1] - bf0[2];
+  bf1[3] = bf0[0] - bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8] + bf0[11];
+  bf1[9] = bf0[9] + bf0[10];
+  bf1[10] = bf0[9] - bf0[10];
+  bf1[11] = bf0[8] - bf0[11];
+  bf1[12] = -bf0[12] + bf0[15];
+  bf1[13] = -bf0[13] + bf0[14];
+  bf1[14] = bf0[13] + bf0[14];
+  bf1[15] = bf0[12] + bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit[stage]);
+  bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit[stage]);
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[7];
+  bf1[1] = bf0[1] + bf0[6];
+  bf1[2] = bf0[2] + bf0[5];
+  bf1[3] = bf0[3] + bf0[4];
+  bf1[4] = bf0[3] - bf0[4];
+  bf1[5] = bf0[2] - bf0[5];
+  bf1[6] = bf0[1] - bf0[6];
+  bf1[7] = bf0[0] - bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[23];
+  bf1[17] = bf0[17] + bf0[22];
+  bf1[18] = bf0[18] + bf0[21];
+  bf1[19] = bf0[19] + bf0[20];
+  bf1[20] = bf0[19] - bf0[20];
+  bf1[21] = bf0[18] - bf0[21];
+  bf1[22] = bf0[17] - bf0[22];
+  bf1[23] = bf0[16] - bf0[23];
+  bf1[24] = -bf0[24] + bf0[31];
+  bf1[25] = -bf0[25] + bf0[30];
+  bf1[26] = -bf0[26] + bf0[29];
+  bf1[27] = -bf0[27] + bf0[28];
+  bf1[28] = bf0[27] + bf0[28];
+  bf1[29] = bf0[26] + bf0[29];
+  bf1[30] = bf0[25] + bf0[30];
+  bf1[31] = bf0[24] + bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[15];
+  bf1[1] = bf0[1] + bf0[14];
+  bf1[2] = bf0[2] + bf0[13];
+  bf1[3] = bf0[3] + bf0[12];
+  bf1[4] = bf0[4] + bf0[11];
+  bf1[5] = bf0[5] + bf0[10];
+  bf1[6] = bf0[6] + bf0[9];
+  bf1[7] = bf0[7] + bf0[8];
+  bf1[8] = bf0[7] - bf0[8];
+  bf1[9] = bf0[6] - bf0[9];
+  bf1[10] = bf0[5] - bf0[10];
+  bf1[11] = bf0[4] - bf0[11];
+  bf1[12] = bf0[3] - bf0[12];
+  bf1[13] = bf0[2] - bf0[13];
+  bf1[14] = bf0[1] - bf0[14];
+  bf1[15] = bf0[0] - bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[31];
+  bf1[1] = bf0[1] + bf0[30];
+  bf1[2] = bf0[2] + bf0[29];
+  bf1[3] = bf0[3] + bf0[28];
+  bf1[4] = bf0[4] + bf0[27];
+  bf1[5] = bf0[5] + bf0[26];
+  bf1[6] = bf0[6] + bf0[25];
+  bf1[7] = bf0[7] + bf0[24];
+  bf1[8] = bf0[8] + bf0[23];
+  bf1[9] = bf0[9] + bf0[22];
+  bf1[10] = bf0[10] + bf0[21];
+  bf1[11] = bf0[11] + bf0[20];
+  bf1[12] = bf0[12] + bf0[19];
+  bf1[13] = bf0[13] + bf0[18];
+  bf1[14] = bf0[14] + bf0[17];
+  bf1[15] = bf0[15] + bf0[16];
+  bf1[16] = bf0[15] - bf0[16];
+  bf1[17] = bf0[14] - bf0[17];
+  bf1[18] = bf0[13] - bf0[18];
+  bf1[19] = bf0[12] - bf0[19];
+  bf1[20] = bf0[11] - bf0[20];
+  bf1[21] = bf0[10] - bf0[21];
+  bf1[22] = bf0[9] - bf0[22];
+  bf1[23] = bf0[8] - bf0[23];
+  bf1[24] = bf0[7] - bf0[24];
+  bf1[25] = bf0[6] - bf0[25];
+  bf1[26] = bf0[5] - bf0[26];
+  bf1[27] = bf0[4] - bf0[27];
+  bf1[28] = bf0[3] - bf0[28];
+  bf1[29] = bf0[2] - bf0[29];
+  bf1[30] = bf0[1] - bf0[30];
+  bf1[31] = bf0[0] - bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
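
These 1D kernels are intended to be applied once per dimension, with the
cfg's cos_bit/stage_range tables selecting the per-pass precision. A heavily
hedged skeleton of that wiring (the real driver is added elsewhere in this
patch; every name below other than the kernel signature is illustrative):

    typedef void (*TxfmFunc)(const int32_t *input, int32_t *output,
                             const int8_t *cos_bit, const int8_t *stage_range);

    // Illustrative wiring only: row pass, transpose, column pass. The
    // cfg->shift[] round-shifts (pre/mid/post) are omitted for brevity.
    static void inv_txfm2d_sketch(const int32_t *coeff, int32_t *tmp,
                                  int32_t *buf, const TXFM_2D_CFG *cfg,
                                  TxfmFunc row_fn, TxfmFunc col_fn) {
      const int n = cfg->txfm_size;
      int r, c;
      for (r = 0; r < n; ++r)  // row pass uses the row-side bit tables
        row_fn(coeff + r * n, tmp + r * n, cfg->cos_bit_row,
               cfg->stage_range_row);
      for (r = 0; r < n; ++r)  // transpose so columns become contiguous
        for (c = 0; c < n; ++c) buf[c * n + r] = tmp[r * n + c];
      for (c = 0; c < n; ++c)  // column pass uses the column-side tables
        col_fn(buf + c * n, tmp + c * n, cfg->cos_bit_col,
               cfg->stage_range_col);
    }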
+
+void vp10_iadst4_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 4;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[4];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = -input[3];
+  bf1[2] = -input[1];
+  bf1[3] = input[2];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = bf0[0] - bf0[2];
+  bf1[3] = bf0[1] - bf0[3];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[8], bf0[0], cospi[56], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[56], bf0[0], -cospi[8], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[40], bf0[2], cospi[24], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[24], bf0[2], -cospi[40], bf0[3], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[1];
+  bf1[1] = bf0[2];
+  bf1[2] = bf0[3];
+  bf1[3] = bf0[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_iadst8_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 8;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[8];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = -input[7];
+  bf1[2] = -input[3];
+  bf1[3] = input[4];
+  bf1[4] = -input[1];
+  bf1[5] = input[6];
+  bf1[6] = input[2];
+  bf1[7] = -input[5];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = bf0[0] - bf0[2];
+  bf1[3] = bf0[1] - bf0[3];
+  bf1[4] = bf0[4] + bf0[6];
+  bf1[5] = bf0[5] + bf0[7];
+  bf1[6] = bf0[4] - bf0[6];
+  bf1[7] = bf0[5] - bf0[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
+  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[4];
+  bf1[1] = bf0[1] + bf0[5];
+  bf1[2] = bf0[2] + bf0[6];
+  bf1[3] = bf0[3] + bf0[7];
+  bf1[4] = bf0[0] - bf0[4];
+  bf1[5] = bf0[1] - bf0[5];
+  bf1[6] = bf0[2] - bf0[6];
+  bf1[7] = bf0[3] - bf0[7];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[4], bf0[0], cospi[60], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[60], bf0[0], -cospi[4], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[20], bf0[2], cospi[44], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[44], bf0[2], -cospi[20], bf0[3], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[36], bf0[4], cospi[28], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[28], bf0[4], -cospi[36], bf0[5], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[52], bf0[6], cospi[12], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[12], bf0[6], -cospi[52], bf0[7], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[1];
+  bf1[1] = bf0[6];
+  bf1[2] = bf0[3];
+  bf1[3] = bf0[4];
+  bf1[4] = bf0[5];
+  bf1[5] = bf0[2];
+  bf1[6] = bf0[7];
+  bf1[7] = bf0[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_iadst16_new(const int32_t *input, int32_t *output,
+                      const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 16;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[16];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = -input[15];
+  bf1[2] = -input[7];
+  bf1[3] = input[8];
+  bf1[4] = -input[3];
+  bf1[5] = input[12];
+  bf1[6] = input[4];
+  bf1[7] = -input[11];
+  bf1[8] = -input[1];
+  bf1[9] = input[14];
+  bf1[10] = input[6];
+  bf1[11] = -input[9];
+  bf1[12] = input[2];
+  bf1[13] = -input[13];
+  bf1[14] = -input[5];
+  bf1[15] = input[10];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit[stage]);
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = bf0[0] - bf0[2];
+  bf1[3] = bf0[1] - bf0[3];
+  bf1[4] = bf0[4] + bf0[6];
+  bf1[5] = bf0[5] + bf0[7];
+  bf1[6] = bf0[4] - bf0[6];
+  bf1[7] = bf0[5] - bf0[7];
+  bf1[8] = bf0[8] + bf0[10];
+  bf1[9] = bf0[9] + bf0[11];
+  bf1[10] = bf0[8] - bf0[10];
+  bf1[11] = bf0[9] - bf0[11];
+  bf1[12] = bf0[12] + bf0[14];
+  bf1[13] = bf0[13] + bf0[15];
+  bf1[14] = bf0[12] - bf0[14];
+  bf1[15] = bf0[13] - bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
+  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[4];
+  bf1[1] = bf0[1] + bf0[5];
+  bf1[2] = bf0[2] + bf0[6];
+  bf1[3] = bf0[3] + bf0[7];
+  bf1[4] = bf0[0] - bf0[4];
+  bf1[5] = bf0[1] - bf0[5];
+  bf1[6] = bf0[2] - bf0[6];
+  bf1[7] = bf0[3] - bf0[7];
+  bf1[8] = bf0[8] + bf0[12];
+  bf1[9] = bf0[9] + bf0[13];
+  bf1[10] = bf0[10] + bf0[14];
+  bf1[11] = bf0[11] + bf0[15];
+  bf1[12] = bf0[8] - bf0[12];
+  bf1[13] = bf0[9] - bf0[13];
+  bf1[14] = bf0[10] - bf0[14];
+  bf1[15] = bf0[11] - bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit[stage]);
+  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[8];
+  bf1[1] = bf0[1] + bf0[9];
+  bf1[2] = bf0[2] + bf0[10];
+  bf1[3] = bf0[3] + bf0[11];
+  bf1[4] = bf0[4] + bf0[12];
+  bf1[5] = bf0[5] + bf0[13];
+  bf1[6] = bf0[6] + bf0[14];
+  bf1[7] = bf0[7] + bf0[15];
+  bf1[8] = bf0[0] - bf0[8];
+  bf1[9] = bf0[1] - bf0[9];
+  bf1[10] = bf0[2] - bf0[10];
+  bf1[11] = bf0[3] - bf0[11];
+  bf1[12] = bf0[4] - bf0[12];
+  bf1[13] = bf0[5] - bf0[13];
+  bf1[14] = bf0[6] - bf0[14];
+  bf1[15] = bf0[7] - bf0[15];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[2], bf0[0], cospi[62], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[62], bf0[0], -cospi[2], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[10], bf0[2], cospi[54], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[54], bf0[2], -cospi[10], bf0[3], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[18], bf0[4], cospi[46], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[46], bf0[4], -cospi[18], bf0[5], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[26], bf0[6], cospi[38], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[38], bf0[6], -cospi[26], bf0[7], cos_bit[stage]);
+  bf1[8] = half_btf(cospi[34], bf0[8], cospi[30], bf0[9], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[30], bf0[8], -cospi[34], bf0[9], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[42], bf0[10], cospi[22], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[22], bf0[10], -cospi[42], bf0[11], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[50], bf0[12], cospi[14], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[14], bf0[12], -cospi[50], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[58], bf0[14], cospi[6], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[6], bf0[14], -cospi[58], bf0[15], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[1];
+  bf1[1] = bf0[14];
+  bf1[2] = bf0[3];
+  bf1[3] = bf0[12];
+  bf1[4] = bf0[5];
+  bf1[5] = bf0[10];
+  bf1[6] = bf0[7];
+  bf1[7] = bf0[8];
+  bf1[8] = bf0[9];
+  bf1[9] = bf0[6];
+  bf1[10] = bf0[11];
+  bf1[11] = bf0[4];
+  bf1[12] = bf0[13];
+  bf1[13] = bf0[2];
+  bf1[14] = bf0[15];
+  bf1[15] = bf0[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
+
+void vp10_iadst32_new(const int32_t *input, int32_t *output,
+                      const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 32;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[32];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = -input[31];
+  bf1[2] = -input[15];
+  bf1[3] = input[16];
+  bf1[4] = -input[7];
+  bf1[5] = input[24];
+  bf1[6] = input[8];
+  bf1[7] = -input[23];
+  bf1[8] = -input[3];
+  bf1[9] = input[28];
+  bf1[10] = input[12];
+  bf1[11] = -input[19];
+  bf1[12] = input[4];
+  bf1[13] = -input[27];
+  bf1[14] = -input[11];
+  bf1[15] = input[20];
+  bf1[16] = -input[1];
+  bf1[17] = input[30];
+  bf1[18] = input[14];
+  bf1[19] = -input[17];
+  bf1[20] = input[6];
+  bf1[21] = -input[25];
+  bf1[22] = -input[9];
+  bf1[23] = input[22];
+  bf1[24] = input[2];
+  bf1[25] = -input[29];
+  bf1[26] = -input[13];
+  bf1[27] = input[18];
+  bf1[28] = -input[5];
+  bf1[29] = input[26];
+  bf1[30] = input[10];
+  bf1[31] = -input[21];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = half_btf(cospi[32], bf0[2], cospi[32], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[32], bf0[2], -cospi[32], bf0[3], cos_bit[stage]);
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = half_btf(cospi[32], bf0[6], cospi[32], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[32], bf0[6], -cospi[32], bf0[7], cos_bit[stage]);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(cospi[32], bf0[10], cospi[32], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[32], bf0[10], -cospi[32], bf0[11], cos_bit[stage]);
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = half_btf(cospi[32], bf0[14], cospi[32], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[32], bf0[14], -cospi[32], bf0[15], cos_bit[stage]);
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = half_btf(cospi[32], bf0[18], cospi[32], bf0[19], cos_bit[stage]);
+  bf1[19] = half_btf(cospi[32], bf0[18], -cospi[32], bf0[19], cos_bit[stage]);
+  bf1[20] = bf0[20];
+  bf1[21] = bf0[21];
+  bf1[22] = half_btf(cospi[32], bf0[22], cospi[32], bf0[23], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[32], bf0[22], -cospi[32], bf0[23], cos_bit[stage]);
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = half_btf(cospi[32], bf0[26], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[32], bf0[26], -cospi[32], bf0[27], cos_bit[stage]);
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = half_btf(cospi[32], bf0[30], cospi[32], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[32], bf0[30], -cospi[32], bf0[31], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[2];
+  bf1[1] = bf0[1] + bf0[3];
+  bf1[2] = bf0[0] - bf0[2];
+  bf1[3] = bf0[1] - bf0[3];
+  bf1[4] = bf0[4] + bf0[6];
+  bf1[5] = bf0[5] + bf0[7];
+  bf1[6] = bf0[4] - bf0[6];
+  bf1[7] = bf0[5] - bf0[7];
+  bf1[8] = bf0[8] + bf0[10];
+  bf1[9] = bf0[9] + bf0[11];
+  bf1[10] = bf0[8] - bf0[10];
+  bf1[11] = bf0[9] - bf0[11];
+  bf1[12] = bf0[12] + bf0[14];
+  bf1[13] = bf0[13] + bf0[15];
+  bf1[14] = bf0[12] - bf0[14];
+  bf1[15] = bf0[13] - bf0[15];
+  bf1[16] = bf0[16] + bf0[18];
+  bf1[17] = bf0[17] + bf0[19];
+  bf1[18] = bf0[16] - bf0[18];
+  bf1[19] = bf0[17] - bf0[19];
+  bf1[20] = bf0[20] + bf0[22];
+  bf1[21] = bf0[21] + bf0[23];
+  bf1[22] = bf0[20] - bf0[22];
+  bf1[23] = bf0[21] - bf0[23];
+  bf1[24] = bf0[24] + bf0[26];
+  bf1[25] = bf0[25] + bf0[27];
+  bf1[26] = bf0[24] - bf0[26];
+  bf1[27] = bf0[25] - bf0[27];
+  bf1[28] = bf0[28] + bf0[30];
+  bf1[29] = bf0[29] + bf0[31];
+  bf1[30] = bf0[28] - bf0[30];
+  bf1[31] = bf0[29] - bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[16], bf0[4], cospi[48], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[48], bf0[4], -cospi[16], bf0[5], cos_bit[stage]);
+  bf1[6] = half_btf(-cospi[48], bf0[6], cospi[16], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[16], bf0[6], cospi[48], bf0[7], cos_bit[stage]);
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = half_btf(cospi[16], bf0[12], cospi[48], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[48], bf0[12], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(-cospi[48], bf0[14], cospi[16], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[16], bf0[14], cospi[48], bf0[15], cos_bit[stage]);
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = half_btf(cospi[16], bf0[20], cospi[48], bf0[21], cos_bit[stage]);
+  bf1[21] = half_btf(cospi[48], bf0[20], -cospi[16], bf0[21], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[48], bf0[22], cospi[16], bf0[23], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[16], bf0[22], cospi[48], bf0[23], cos_bit[stage]);
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = bf0[26];
+  bf1[27] = bf0[27];
+  bf1[28] = half_btf(cospi[16], bf0[28], cospi[48], bf0[29], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[48], bf0[28], -cospi[16], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(-cospi[48], bf0[30], cospi[16], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[16], bf0[30], cospi[48], bf0[31], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[4];
+  bf1[1] = bf0[1] + bf0[5];
+  bf1[2] = bf0[2] + bf0[6];
+  bf1[3] = bf0[3] + bf0[7];
+  bf1[4] = bf0[0] - bf0[4];
+  bf1[5] = bf0[1] - bf0[5];
+  bf1[6] = bf0[2] - bf0[6];
+  bf1[7] = bf0[3] - bf0[7];
+  bf1[8] = bf0[8] + bf0[12];
+  bf1[9] = bf0[9] + bf0[13];
+  bf1[10] = bf0[10] + bf0[14];
+  bf1[11] = bf0[11] + bf0[15];
+  bf1[12] = bf0[8] - bf0[12];
+  bf1[13] = bf0[9] - bf0[13];
+  bf1[14] = bf0[10] - bf0[14];
+  bf1[15] = bf0[11] - bf0[15];
+  bf1[16] = bf0[16] + bf0[20];
+  bf1[17] = bf0[17] + bf0[21];
+  bf1[18] = bf0[18] + bf0[22];
+  bf1[19] = bf0[19] + bf0[23];
+  bf1[20] = bf0[16] - bf0[20];
+  bf1[21] = bf0[17] - bf0[21];
+  bf1[22] = bf0[18] - bf0[22];
+  bf1[23] = bf0[19] - bf0[23];
+  bf1[24] = bf0[24] + bf0[28];
+  bf1[25] = bf0[25] + bf0[29];
+  bf1[26] = bf0[26] + bf0[30];
+  bf1[27] = bf0[27] + bf0[31];
+  bf1[28] = bf0[24] - bf0[28];
+  bf1[29] = bf0[25] - bf0[29];
+  bf1[30] = bf0[26] - bf0[30];
+  bf1[31] = bf0[27] - bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[8], bf0[8], cospi[56], bf0[9], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[56], bf0[8], -cospi[8], bf0[9], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[40], bf0[10], cospi[24], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[24], bf0[10], -cospi[40], bf0[11], cos_bit[stage]);
+  bf1[12] = half_btf(-cospi[56], bf0[12], cospi[8], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[8], bf0[12], cospi[56], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(-cospi[24], bf0[14], cospi[40], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[40], bf0[14], cospi[24], bf0[15], cos_bit[stage]);
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = bf0[21];
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = half_btf(cospi[8], bf0[24], cospi[56], bf0[25], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[56], bf0[24], -cospi[8], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[40], bf0[26], cospi[24], bf0[27], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[24], bf0[26], -cospi[40], bf0[27], cos_bit[stage]);
+  bf1[28] = half_btf(-cospi[56], bf0[28], cospi[8], bf0[29], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[8], bf0[28], cospi[56], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(-cospi[24], bf0[30], cospi[40], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[40], bf0[30], cospi[24], bf0[31], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[8];
+  bf1[1] = bf0[1] + bf0[9];
+  bf1[2] = bf0[2] + bf0[10];
+  bf1[3] = bf0[3] + bf0[11];
+  bf1[4] = bf0[4] + bf0[12];
+  bf1[5] = bf0[5] + bf0[13];
+  bf1[6] = bf0[6] + bf0[14];
+  bf1[7] = bf0[7] + bf0[15];
+  bf1[8] = bf0[0] - bf0[8];
+  bf1[9] = bf0[1] - bf0[9];
+  bf1[10] = bf0[2] - bf0[10];
+  bf1[11] = bf0[3] - bf0[11];
+  bf1[12] = bf0[4] - bf0[12];
+  bf1[13] = bf0[5] - bf0[13];
+  bf1[14] = bf0[6] - bf0[14];
+  bf1[15] = bf0[7] - bf0[15];
+  bf1[16] = bf0[16] + bf0[24];
+  bf1[17] = bf0[17] + bf0[25];
+  bf1[18] = bf0[18] + bf0[26];
+  bf1[19] = bf0[19] + bf0[27];
+  bf1[20] = bf0[20] + bf0[28];
+  bf1[21] = bf0[21] + bf0[29];
+  bf1[22] = bf0[22] + bf0[30];
+  bf1[23] = bf0[23] + bf0[31];
+  bf1[24] = bf0[16] - bf0[24];
+  bf1[25] = bf0[17] - bf0[25];
+  bf1[26] = bf0[18] - bf0[26];
+  bf1[27] = bf0[19] - bf0[27];
+  bf1[28] = bf0[20] - bf0[28];
+  bf1[29] = bf0[21] - bf0[29];
+  bf1[30] = bf0[22] - bf0[30];
+  bf1[31] = bf0[23] - bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = half_btf(cospi[4], bf0[16], cospi[60], bf0[17], cos_bit[stage]);
+  bf1[17] = half_btf(cospi[60], bf0[16], -cospi[4], bf0[17], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[20], bf0[18], cospi[44], bf0[19], cos_bit[stage]);
+  bf1[19] = half_btf(cospi[44], bf0[18], -cospi[20], bf0[19], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[36], bf0[20], cospi[28], bf0[21], cos_bit[stage]);
+  bf1[21] = half_btf(cospi[28], bf0[20], -cospi[36], bf0[21], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[52], bf0[22], cospi[12], bf0[23], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[12], bf0[22], -cospi[52], bf0[23], cos_bit[stage]);
+  bf1[24] = half_btf(-cospi[60], bf0[24], cospi[4], bf0[25], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[4], bf0[24], cospi[60], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(-cospi[44], bf0[26], cospi[20], bf0[27], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[20], bf0[26], cospi[44], bf0[27], cos_bit[stage]);
+  bf1[28] = half_btf(-cospi[28], bf0[28], cospi[36], bf0[29], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[36], bf0[28], cospi[28], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(-cospi[12], bf0[30], cospi[52], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[52], bf0[30], cospi[12], bf0[31], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[16];
+  bf1[1] = bf0[1] + bf0[17];
+  bf1[2] = bf0[2] + bf0[18];
+  bf1[3] = bf0[3] + bf0[19];
+  bf1[4] = bf0[4] + bf0[20];
+  bf1[5] = bf0[5] + bf0[21];
+  bf1[6] = bf0[6] + bf0[22];
+  bf1[7] = bf0[7] + bf0[23];
+  bf1[8] = bf0[8] + bf0[24];
+  bf1[9] = bf0[9] + bf0[25];
+  bf1[10] = bf0[10] + bf0[26];
+  bf1[11] = bf0[11] + bf0[27];
+  bf1[12] = bf0[12] + bf0[28];
+  bf1[13] = bf0[13] + bf0[29];
+  bf1[14] = bf0[14] + bf0[30];
+  bf1[15] = bf0[15] + bf0[31];
+  bf1[16] = bf0[0] - bf0[16];
+  bf1[17] = bf0[1] - bf0[17];
+  bf1[18] = bf0[2] - bf0[18];
+  bf1[19] = bf0[3] - bf0[19];
+  bf1[20] = bf0[4] - bf0[20];
+  bf1[21] = bf0[5] - bf0[21];
+  bf1[22] = bf0[6] - bf0[22];
+  bf1[23] = bf0[7] - bf0[23];
+  bf1[24] = bf0[8] - bf0[24];
+  bf1[25] = bf0[9] - bf0[25];
+  bf1[26] = bf0[10] - bf0[26];
+  bf1[27] = bf0[11] - bf0[27];
+  bf1[28] = bf0[12] - bf0[28];
+  bf1[29] = bf0[13] - bf0[29];
+  bf1[30] = bf0[14] - bf0[30];
+  bf1[31] = bf0[15] - bf0[31];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 10
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[1], bf0[0], cospi[63], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[63], bf0[0], -cospi[1], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[5], bf0[2], cospi[59], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[59], bf0[2], -cospi[5], bf0[3], cos_bit[stage]);
+  bf1[4] = half_btf(cospi[9], bf0[4], cospi[55], bf0[5], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[55], bf0[4], -cospi[9], bf0[5], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[13], bf0[6], cospi[51], bf0[7], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[51], bf0[6], -cospi[13], bf0[7], cos_bit[stage]);
+  bf1[8] = half_btf(cospi[17], bf0[8], cospi[47], bf0[9], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[47], bf0[8], -cospi[17], bf0[9], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[21], bf0[10], cospi[43], bf0[11], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[43], bf0[10], -cospi[21], bf0[11], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[25], bf0[12], cospi[39], bf0[13], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[39], bf0[12], -cospi[25], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[29], bf0[14], cospi[35], bf0[15], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[35], bf0[14], -cospi[29], bf0[15], cos_bit[stage]);
+  bf1[16] = half_btf(cospi[33], bf0[16], cospi[31], bf0[17], cos_bit[stage]);
+  bf1[17] = half_btf(cospi[31], bf0[16], -cospi[33], bf0[17], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[37], bf0[18], cospi[27], bf0[19], cos_bit[stage]);
+  bf1[19] = half_btf(cospi[27], bf0[18], -cospi[37], bf0[19], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[41], bf0[20], cospi[23], bf0[21], cos_bit[stage]);
+  bf1[21] = half_btf(cospi[23], bf0[20], -cospi[41], bf0[21], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[45], bf0[22], cospi[19], bf0[23], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[19], bf0[22], -cospi[45], bf0[23], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[49], bf0[24], cospi[15], bf0[25], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[15], bf0[24], -cospi[49], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[53], bf0[26], cospi[11], bf0[27], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[11], bf0[26], -cospi[53], bf0[27], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[57], bf0[28], cospi[7], bf0[29], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[7], bf0[28], -cospi[57], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[61], bf0[30], cospi[3], bf0[31], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[3], bf0[30], -cospi[61], bf0[31], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 11
+  stage++;
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[1];
+  bf1[1] = bf0[30];
+  bf1[2] = bf0[3];
+  bf1[3] = bf0[28];
+  bf1[4] = bf0[5];
+  bf1[5] = bf0[26];
+  bf1[6] = bf0[7];
+  bf1[7] = bf0[24];
+  bf1[8] = bf0[9];
+  bf1[9] = bf0[22];
+  bf1[10] = bf0[11];
+  bf1[11] = bf0[20];
+  bf1[12] = bf0[13];
+  bf1[13] = bf0[18];
+  bf1[14] = bf0[15];
+  bf1[15] = bf0[16];
+  bf1[16] = bf0[17];
+  bf1[17] = bf0[14];
+  bf1[18] = bf0[19];
+  bf1[19] = bf0[12];
+  bf1[20] = bf0[21];
+  bf1[21] = bf0[10];
+  bf1[22] = bf0[23];
+  bf1[23] = bf0[8];
+  bf1[24] = bf0[25];
+  bf1[25] = bf0[6];
+  bf1[26] = bf0[27];
+  bf1[27] = bf0[4];
+  bf1[28] = bf0[29];
+  bf1[29] = bf0[2];
+  bf1[30] = bf0[31];
+  bf1[31] = bf0[0];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
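
With the eight 4..32-point kernels now in place, a driver can map the cfg's
TXFM_TYPE tags onto them. A hedged dispatch sketch, reusing the TxfmFunc
typedef sketched earlier; the patch's real lookup table lives elsewhere, and
the TXFM_TYPE_DCT4/ADST4 names are assumed by analogy with the 8/16/32
enumerators visible in the cfg header above:

    static TxfmFunc inv_txfm_func(TXFM_TYPE t) {
      switch (t) {
        case TXFM_TYPE_DCT4: return vp10_idct4_new;
        case TXFM_TYPE_DCT8: return vp10_idct8_new;
        case TXFM_TYPE_DCT16: return vp10_idct16_new;
        case TXFM_TYPE_DCT32: return vp10_idct32_new;
        case TXFM_TYPE_ADST4: return vp10_iadst4_new;
        case TXFM_TYPE_ADST8: return vp10_iadst8_new;
        case TXFM_TYPE_ADST16: return vp10_iadst16_new;
        case TXFM_TYPE_ADST32: return vp10_iadst32_new;
        default: return NULL;  // e.g. the 64-point DCT below, handled separately
      }
    }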
+
+void vp10_idct64_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range) {
+  const int32_t size = 64;
+  const int32_t *cospi;
+
+  int32_t stage = 0;
+  int32_t *bf0, *bf1;
+  int32_t step[64];
+
+  // stage 0
+  range_check(stage, input, input, size, stage_range[stage]);
+
+  // stage 1
+  stage++;
+  bf1 = output;
+  bf1[0] = input[0];
+  bf1[1] = input[32];
+  bf1[2] = input[16];
+  bf1[3] = input[48];
+  bf1[4] = input[8];
+  bf1[5] = input[40];
+  bf1[6] = input[24];
+  bf1[7] = input[56];
+  bf1[8] = input[4];
+  bf1[9] = input[36];
+  bf1[10] = input[20];
+  bf1[11] = input[52];
+  bf1[12] = input[12];
+  bf1[13] = input[44];
+  bf1[14] = input[28];
+  bf1[15] = input[60];
+  bf1[16] = input[2];
+  bf1[17] = input[34];
+  bf1[18] = input[18];
+  bf1[19] = input[50];
+  bf1[20] = input[10];
+  bf1[21] = input[42];
+  bf1[22] = input[26];
+  bf1[23] = input[58];
+  bf1[24] = input[6];
+  bf1[25] = input[38];
+  bf1[26] = input[22];
+  bf1[27] = input[54];
+  bf1[28] = input[14];
+  bf1[29] = input[46];
+  bf1[30] = input[30];
+  bf1[31] = input[62];
+  bf1[32] = input[1];
+  bf1[33] = input[33];
+  bf1[34] = input[17];
+  bf1[35] = input[49];
+  bf1[36] = input[9];
+  bf1[37] = input[41];
+  bf1[38] = input[25];
+  bf1[39] = input[57];
+  bf1[40] = input[5];
+  bf1[41] = input[37];
+  bf1[42] = input[21];
+  bf1[43] = input[53];
+  bf1[44] = input[13];
+  bf1[45] = input[45];
+  bf1[46] = input[29];
+  bf1[47] = input[61];
+  bf1[48] = input[3];
+  bf1[49] = input[35];
+  bf1[50] = input[19];
+  bf1[51] = input[51];
+  bf1[52] = input[11];
+  bf1[53] = input[43];
+  bf1[54] = input[27];
+  bf1[55] = input[59];
+  bf1[56] = input[7];
+  bf1[57] = input[39];
+  bf1[58] = input[23];
+  bf1[59] = input[55];
+  bf1[60] = input[15];
+  bf1[61] = input[47];
+  bf1[62] = input[31];
+  bf1[63] = input[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 2
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = bf0[21];
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = bf0[26];
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = half_btf(cospi[63], bf0[32], -cospi[1], bf0[63], cos_bit[stage]);
+  bf1[33] = half_btf(cospi[31], bf0[33], -cospi[33], bf0[62], cos_bit[stage]);
+  bf1[34] = half_btf(cospi[47], bf0[34], -cospi[17], bf0[61], cos_bit[stage]);
+  bf1[35] = half_btf(cospi[15], bf0[35], -cospi[49], bf0[60], cos_bit[stage]);
+  bf1[36] = half_btf(cospi[55], bf0[36], -cospi[9], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(cospi[23], bf0[37], -cospi[41], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(cospi[39], bf0[38], -cospi[25], bf0[57], cos_bit[stage]);
+  bf1[39] = half_btf(cospi[7], bf0[39], -cospi[57], bf0[56], cos_bit[stage]);
+  bf1[40] = half_btf(cospi[59], bf0[40], -cospi[5], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(cospi[27], bf0[41], -cospi[37], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(cospi[43], bf0[42], -cospi[21], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(cospi[11], bf0[43], -cospi[53], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(cospi[51], bf0[44], -cospi[13], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(cospi[19], bf0[45], -cospi[45], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(cospi[35], bf0[46], -cospi[29], bf0[49], cos_bit[stage]);
+  bf1[47] = half_btf(cospi[3], bf0[47], -cospi[61], bf0[48], cos_bit[stage]);
+  bf1[48] = half_btf(cospi[61], bf0[47], cospi[3], bf0[48], cos_bit[stage]);
+  bf1[49] = half_btf(cospi[29], bf0[46], cospi[35], bf0[49], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[45], bf0[45], cospi[19], bf0[50], cos_bit[stage]);
+  bf1[51] = half_btf(cospi[13], bf0[44], cospi[51], bf0[51], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[53], bf0[43], cospi[11], bf0[52], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[21], bf0[42], cospi[43], bf0[53], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[37], bf0[41], cospi[27], bf0[54], cos_bit[stage]);
+  bf1[55] = half_btf(cospi[5], bf0[40], cospi[59], bf0[55], cos_bit[stage]);
+  bf1[56] = half_btf(cospi[57], bf0[39], cospi[7], bf0[56], cos_bit[stage]);
+  bf1[57] = half_btf(cospi[25], bf0[38], cospi[39], bf0[57], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[41], bf0[37], cospi[23], bf0[58], cos_bit[stage]);
+  bf1[59] = half_btf(cospi[9], bf0[36], cospi[55], bf0[59], cos_bit[stage]);
+  bf1[60] = half_btf(cospi[49], bf0[35], cospi[15], bf0[60], cos_bit[stage]);
+  bf1[61] = half_btf(cospi[17], bf0[34], cospi[47], bf0[61], cos_bit[stage]);
+  bf1[62] = half_btf(cospi[33], bf0[33], cospi[31], bf0[62], cos_bit[stage]);
+  bf1[63] = half_btf(cospi[1], bf0[32], cospi[63], bf0[63], cos_bit[stage]);
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 3
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = bf0[10];
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = bf0[13];
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = half_btf(cospi[62], bf0[16], -cospi[2], bf0[31], cos_bit[stage]);
+  bf1[17] = half_btf(cospi[30], bf0[17], -cospi[34], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(cospi[46], bf0[18], -cospi[18], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(cospi[14], bf0[19], -cospi[50], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(cospi[54], bf0[20], -cospi[10], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(cospi[22], bf0[21], -cospi[42], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(cospi[38], bf0[22], -cospi[26], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(cospi[6], bf0[23], -cospi[58], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[58], bf0[23], cospi[6], bf0[24], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[26], bf0[22], cospi[38], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[42], bf0[21], cospi[22], bf0[26], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[10], bf0[20], cospi[54], bf0[27], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[50], bf0[19], cospi[14], bf0[28], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[18], bf0[18], cospi[46], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[34], bf0[17], cospi[30], bf0[30], cos_bit[stage]);
+  bf1[31] = half_btf(cospi[2], bf0[16], cospi[62], bf0[31], cos_bit[stage]);
+  bf1[32] = bf0[32] + bf0[33];
+  bf1[33] = bf0[32] - bf0[33];
+  bf1[34] = -bf0[34] + bf0[35];
+  bf1[35] = bf0[34] + bf0[35];
+  bf1[36] = bf0[36] + bf0[37];
+  bf1[37] = bf0[36] - bf0[37];
+  bf1[38] = -bf0[38] + bf0[39];
+  bf1[39] = bf0[38] + bf0[39];
+  bf1[40] = bf0[40] + bf0[41];
+  bf1[41] = bf0[40] - bf0[41];
+  bf1[42] = -bf0[42] + bf0[43];
+  bf1[43] = bf0[42] + bf0[43];
+  bf1[44] = bf0[44] + bf0[45];
+  bf1[45] = bf0[44] - bf0[45];
+  bf1[46] = -bf0[46] + bf0[47];
+  bf1[47] = bf0[46] + bf0[47];
+  bf1[48] = bf0[48] + bf0[49];
+  bf1[49] = bf0[48] - bf0[49];
+  bf1[50] = -bf0[50] + bf0[51];
+  bf1[51] = bf0[50] + bf0[51];
+  bf1[52] = bf0[52] + bf0[53];
+  bf1[53] = bf0[52] - bf0[53];
+  bf1[54] = -bf0[54] + bf0[55];
+  bf1[55] = bf0[54] + bf0[55];
+  bf1[56] = bf0[56] + bf0[57];
+  bf1[57] = bf0[56] - bf0[57];
+  bf1[58] = -bf0[58] + bf0[59];
+  bf1[59] = bf0[58] + bf0[59];
+  bf1[60] = bf0[60] + bf0[61];
+  bf1[61] = bf0[60] - bf0[61];
+  bf1[62] = -bf0[62] + bf0[63];
+  bf1[63] = bf0[62] + bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 4
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = bf0[5];
+  bf1[6] = bf0[6];
+  bf1[7] = bf0[7];
+  bf1[8] = half_btf(cospi[60], bf0[8], -cospi[4], bf0[15], cos_bit[stage]);
+  bf1[9] = half_btf(cospi[28], bf0[9], -cospi[36], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(cospi[44], bf0[10], -cospi[20], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(cospi[12], bf0[11], -cospi[52], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[52], bf0[11], cospi[12], bf0[12], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[20], bf0[10], cospi[44], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[36], bf0[9], cospi[28], bf0[14], cos_bit[stage]);
+  bf1[15] = half_btf(cospi[4], bf0[8], cospi[60], bf0[15], cos_bit[stage]);
+  bf1[16] = bf0[16] + bf0[17];
+  bf1[17] = bf0[16] - bf0[17];
+  bf1[18] = -bf0[18] + bf0[19];
+  bf1[19] = bf0[18] + bf0[19];
+  bf1[20] = bf0[20] + bf0[21];
+  bf1[21] = bf0[20] - bf0[21];
+  bf1[22] = -bf0[22] + bf0[23];
+  bf1[23] = bf0[22] + bf0[23];
+  bf1[24] = bf0[24] + bf0[25];
+  bf1[25] = bf0[24] - bf0[25];
+  bf1[26] = -bf0[26] + bf0[27];
+  bf1[27] = bf0[26] + bf0[27];
+  bf1[28] = bf0[28] + bf0[29];
+  bf1[29] = bf0[28] - bf0[29];
+  bf1[30] = -bf0[30] + bf0[31];
+  bf1[31] = bf0[30] + bf0[31];
+  bf1[32] = bf0[32];
+  bf1[33] = half_btf(-cospi[4], bf0[33], cospi[60], bf0[62], cos_bit[stage]);
+  bf1[34] = half_btf(-cospi[60], bf0[34], -cospi[4], bf0[61], cos_bit[stage]);
+  bf1[35] = bf0[35];
+  bf1[36] = bf0[36];
+  bf1[37] = half_btf(-cospi[36], bf0[37], cospi[28], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(-cospi[28], bf0[38], -cospi[36], bf0[57], cos_bit[stage]);
+  bf1[39] = bf0[39];
+  bf1[40] = bf0[40];
+  bf1[41] = half_btf(-cospi[20], bf0[41], cospi[44], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[44], bf0[42], -cospi[20], bf0[53], cos_bit[stage]);
+  bf1[43] = bf0[43];
+  bf1[44] = bf0[44];
+  bf1[45] = half_btf(-cospi[52], bf0[45], cospi[12], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(-cospi[12], bf0[46], -cospi[52], bf0[49], cos_bit[stage]);
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = half_btf(-cospi[52], bf0[46], cospi[12], bf0[49], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[12], bf0[45], cospi[52], bf0[50], cos_bit[stage]);
+  bf1[51] = bf0[51];
+  bf1[52] = bf0[52];
+  bf1[53] = half_btf(-cospi[20], bf0[42], cospi[44], bf0[53], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[44], bf0[41], cospi[20], bf0[54], cos_bit[stage]);
+  bf1[55] = bf0[55];
+  bf1[56] = bf0[56];
+  bf1[57] = half_btf(-cospi[36], bf0[38], cospi[28], bf0[57], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[28], bf0[37], cospi[36], bf0[58], cos_bit[stage]);
+  bf1[59] = bf0[59];
+  bf1[60] = bf0[60];
+  bf1[61] = half_btf(-cospi[4], bf0[34], cospi[60], bf0[61], cos_bit[stage]);
+  bf1[62] = half_btf(cospi[60], bf0[33], cospi[4], bf0[62], cos_bit[stage]);
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 5
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0];
+  bf1[1] = bf0[1];
+  bf1[2] = bf0[2];
+  bf1[3] = bf0[3];
+  bf1[4] = half_btf(cospi[56], bf0[4], -cospi[8], bf0[7], cos_bit[stage]);
+  bf1[5] = half_btf(cospi[24], bf0[5], -cospi[40], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[40], bf0[5], cospi[24], bf0[6], cos_bit[stage]);
+  bf1[7] = half_btf(cospi[8], bf0[4], cospi[56], bf0[7], cos_bit[stage]);
+  bf1[8] = bf0[8] + bf0[9];
+  bf1[9] = bf0[8] - bf0[9];
+  bf1[10] = -bf0[10] + bf0[11];
+  bf1[11] = bf0[10] + bf0[11];
+  bf1[12] = bf0[12] + bf0[13];
+  bf1[13] = bf0[12] - bf0[13];
+  bf1[14] = -bf0[14] + bf0[15];
+  bf1[15] = bf0[14] + bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = half_btf(-cospi[8], bf0[17], cospi[56], bf0[30], cos_bit[stage]);
+  bf1[18] = half_btf(-cospi[56], bf0[18], -cospi[8], bf0[29], cos_bit[stage]);
+  bf1[19] = bf0[19];
+  bf1[20] = bf0[20];
+  bf1[21] = half_btf(-cospi[40], bf0[21], cospi[24], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[24], bf0[22], -cospi[40], bf0[25], cos_bit[stage]);
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = half_btf(-cospi[40], bf0[22], cospi[24], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[24], bf0[21], cospi[40], bf0[26], cos_bit[stage]);
+  bf1[27] = bf0[27];
+  bf1[28] = bf0[28];
+  bf1[29] = half_btf(-cospi[8], bf0[18], cospi[56], bf0[29], cos_bit[stage]);
+  bf1[30] = half_btf(cospi[56], bf0[17], cospi[8], bf0[30], cos_bit[stage]);
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[35];
+  bf1[33] = bf0[33] + bf0[34];
+  bf1[34] = bf0[33] - bf0[34];
+  bf1[35] = bf0[32] - bf0[35];
+  bf1[36] = -bf0[36] + bf0[39];
+  bf1[37] = -bf0[37] + bf0[38];
+  bf1[38] = bf0[37] + bf0[38];
+  bf1[39] = bf0[36] + bf0[39];
+  bf1[40] = bf0[40] + bf0[43];
+  bf1[41] = bf0[41] + bf0[42];
+  bf1[42] = bf0[41] - bf0[42];
+  bf1[43] = bf0[40] - bf0[43];
+  bf1[44] = -bf0[44] + bf0[47];
+  bf1[45] = -bf0[45] + bf0[46];
+  bf1[46] = bf0[45] + bf0[46];
+  bf1[47] = bf0[44] + bf0[47];
+  bf1[48] = bf0[48] + bf0[51];
+  bf1[49] = bf0[49] + bf0[50];
+  bf1[50] = bf0[49] - bf0[50];
+  bf1[51] = bf0[48] - bf0[51];
+  bf1[52] = -bf0[52] + bf0[55];
+  bf1[53] = -bf0[53] + bf0[54];
+  bf1[54] = bf0[53] + bf0[54];
+  bf1[55] = bf0[52] + bf0[55];
+  bf1[56] = bf0[56] + bf0[59];
+  bf1[57] = bf0[57] + bf0[58];
+  bf1[58] = bf0[57] - bf0[58];
+  bf1[59] = bf0[56] - bf0[59];
+  bf1[60] = -bf0[60] + bf0[63];
+  bf1[61] = -bf0[61] + bf0[62];
+  bf1[62] = bf0[61] + bf0[62];
+  bf1[63] = bf0[60] + bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 6
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = half_btf(cospi[32], bf0[0], cospi[32], bf0[1], cos_bit[stage]);
+  bf1[1] = half_btf(cospi[32], bf0[0], -cospi[32], bf0[1], cos_bit[stage]);
+  bf1[2] = half_btf(cospi[48], bf0[2], -cospi[16], bf0[3], cos_bit[stage]);
+  bf1[3] = half_btf(cospi[16], bf0[2], cospi[48], bf0[3], cos_bit[stage]);
+  bf1[4] = bf0[4] + bf0[5];
+  bf1[5] = bf0[4] - bf0[5];
+  bf1[6] = -bf0[6] + bf0[7];
+  bf1[7] = bf0[6] + bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = half_btf(-cospi[16], bf0[9], cospi[48], bf0[14], cos_bit[stage]);
+  bf1[10] = half_btf(-cospi[48], bf0[10], -cospi[16], bf0[13], cos_bit[stage]);
+  bf1[11] = bf0[11];
+  bf1[12] = bf0[12];
+  bf1[13] = half_btf(-cospi[16], bf0[10], cospi[48], bf0[13], cos_bit[stage]);
+  bf1[14] = half_btf(cospi[48], bf0[9], cospi[16], bf0[14], cos_bit[stage]);
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[19];
+  bf1[17] = bf0[17] + bf0[18];
+  bf1[18] = bf0[17] - bf0[18];
+  bf1[19] = bf0[16] - bf0[19];
+  bf1[20] = -bf0[20] + bf0[23];
+  bf1[21] = -bf0[21] + bf0[22];
+  bf1[22] = bf0[21] + bf0[22];
+  bf1[23] = bf0[20] + bf0[23];
+  bf1[24] = bf0[24] + bf0[27];
+  bf1[25] = bf0[25] + bf0[26];
+  bf1[26] = bf0[25] - bf0[26];
+  bf1[27] = bf0[24] - bf0[27];
+  bf1[28] = -bf0[28] + bf0[31];
+  bf1[29] = -bf0[29] + bf0[30];
+  bf1[30] = bf0[29] + bf0[30];
+  bf1[31] = bf0[28] + bf0[31];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = half_btf(-cospi[8], bf0[34], cospi[56], bf0[61], cos_bit[stage]);
+  bf1[35] = half_btf(-cospi[8], bf0[35], cospi[56], bf0[60], cos_bit[stage]);
+  bf1[36] = half_btf(-cospi[56], bf0[36], -cospi[8], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(-cospi[56], bf0[37], -cospi[8], bf0[58], cos_bit[stage]);
+  bf1[38] = bf0[38];
+  bf1[39] = bf0[39];
+  bf1[40] = bf0[40];
+  bf1[41] = bf0[41];
+  bf1[42] = half_btf(-cospi[40], bf0[42], cospi[24], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[40], bf0[43], cospi[24], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(-cospi[24], bf0[44], -cospi[40], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(-cospi[24], bf0[45], -cospi[40], bf0[50], cos_bit[stage]);
+  bf1[46] = bf0[46];
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = bf0[49];
+  bf1[50] = half_btf(-cospi[40], bf0[45], cospi[24], bf0[50], cos_bit[stage]);
+  bf1[51] = half_btf(-cospi[40], bf0[44], cospi[24], bf0[51], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[24], bf0[43], cospi[40], bf0[52], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[24], bf0[42], cospi[40], bf0[53], cos_bit[stage]);
+  bf1[54] = bf0[54];
+  bf1[55] = bf0[55];
+  bf1[56] = bf0[56];
+  bf1[57] = bf0[57];
+  bf1[58] = half_btf(-cospi[8], bf0[37], cospi[56], bf0[58], cos_bit[stage]);
+  bf1[59] = half_btf(-cospi[8], bf0[36], cospi[56], bf0[59], cos_bit[stage]);
+  bf1[60] = half_btf(cospi[56], bf0[35], cospi[8], bf0[60], cos_bit[stage]);
+  bf1[61] = half_btf(cospi[56], bf0[34], cospi[8], bf0[61], cos_bit[stage]);
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 7
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[3];
+  bf1[1] = bf0[1] + bf0[2];
+  bf1[2] = bf0[1] - bf0[2];
+  bf1[3] = bf0[0] - bf0[3];
+  bf1[4] = bf0[4];
+  bf1[5] = half_btf(-cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[6] = half_btf(cospi[32], bf0[5], cospi[32], bf0[6], cos_bit[stage]);
+  bf1[7] = bf0[7];
+  bf1[8] = bf0[8] + bf0[11];
+  bf1[9] = bf0[9] + bf0[10];
+  bf1[10] = bf0[9] - bf0[10];
+  bf1[11] = bf0[8] - bf0[11];
+  bf1[12] = -bf0[12] + bf0[15];
+  bf1[13] = -bf0[13] + bf0[14];
+  bf1[14] = bf0[13] + bf0[14];
+  bf1[15] = bf0[12] + bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = half_btf(-cospi[16], bf0[18], cospi[48], bf0[29], cos_bit[stage]);
+  bf1[19] = half_btf(-cospi[16], bf0[19], cospi[48], bf0[28], cos_bit[stage]);
+  bf1[20] = half_btf(-cospi[48], bf0[20], -cospi[16], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[48], bf0[21], -cospi[16], bf0[26], cos_bit[stage]);
+  bf1[22] = bf0[22];
+  bf1[23] = bf0[23];
+  bf1[24] = bf0[24];
+  bf1[25] = bf0[25];
+  bf1[26] = half_btf(-cospi[16], bf0[21], cospi[48], bf0[26], cos_bit[stage]);
+  bf1[27] = half_btf(-cospi[16], bf0[20], cospi[48], bf0[27], cos_bit[stage]);
+  bf1[28] = half_btf(cospi[48], bf0[19], cospi[16], bf0[28], cos_bit[stage]);
+  bf1[29] = half_btf(cospi[48], bf0[18], cospi[16], bf0[29], cos_bit[stage]);
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[39];
+  bf1[33] = bf0[33] + bf0[38];
+  bf1[34] = bf0[34] + bf0[37];
+  bf1[35] = bf0[35] + bf0[36];
+  bf1[36] = bf0[35] - bf0[36];
+  bf1[37] = bf0[34] - bf0[37];
+  bf1[38] = bf0[33] - bf0[38];
+  bf1[39] = bf0[32] - bf0[39];
+  bf1[40] = -bf0[40] + bf0[47];
+  bf1[41] = -bf0[41] + bf0[46];
+  bf1[42] = -bf0[42] + bf0[45];
+  bf1[43] = -bf0[43] + bf0[44];
+  bf1[44] = bf0[43] + bf0[44];
+  bf1[45] = bf0[42] + bf0[45];
+  bf1[46] = bf0[41] + bf0[46];
+  bf1[47] = bf0[40] + bf0[47];
+  bf1[48] = bf0[48] + bf0[55];
+  bf1[49] = bf0[49] + bf0[54];
+  bf1[50] = bf0[50] + bf0[53];
+  bf1[51] = bf0[51] + bf0[52];
+  bf1[52] = bf0[51] - bf0[52];
+  bf1[53] = bf0[50] - bf0[53];
+  bf1[54] = bf0[49] - bf0[54];
+  bf1[55] = bf0[48] - bf0[55];
+  bf1[56] = -bf0[56] + bf0[63];
+  bf1[57] = -bf0[57] + bf0[62];
+  bf1[58] = -bf0[58] + bf0[61];
+  bf1[59] = -bf0[59] + bf0[60];
+  bf1[60] = bf0[59] + bf0[60];
+  bf1[61] = bf0[58] + bf0[61];
+  bf1[62] = bf0[57] + bf0[62];
+  bf1[63] = bf0[56] + bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 8
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[7];
+  bf1[1] = bf0[1] + bf0[6];
+  bf1[2] = bf0[2] + bf0[5];
+  bf1[3] = bf0[3] + bf0[4];
+  bf1[4] = bf0[3] - bf0[4];
+  bf1[5] = bf0[2] - bf0[5];
+  bf1[6] = bf0[1] - bf0[6];
+  bf1[7] = bf0[0] - bf0[7];
+  bf1[8] = bf0[8];
+  bf1[9] = bf0[9];
+  bf1[10] = half_btf(-cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[11] = half_btf(-cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[12] = half_btf(cospi[32], bf0[11], cospi[32], bf0[12], cos_bit[stage]);
+  bf1[13] = half_btf(cospi[32], bf0[10], cospi[32], bf0[13], cos_bit[stage]);
+  bf1[14] = bf0[14];
+  bf1[15] = bf0[15];
+  bf1[16] = bf0[16] + bf0[23];
+  bf1[17] = bf0[17] + bf0[22];
+  bf1[18] = bf0[18] + bf0[21];
+  bf1[19] = bf0[19] + bf0[20];
+  bf1[20] = bf0[19] - bf0[20];
+  bf1[21] = bf0[18] - bf0[21];
+  bf1[22] = bf0[17] - bf0[22];
+  bf1[23] = bf0[16] - bf0[23];
+  bf1[24] = -bf0[24] + bf0[31];
+  bf1[25] = -bf0[25] + bf0[30];
+  bf1[26] = -bf0[26] + bf0[29];
+  bf1[27] = -bf0[27] + bf0[28];
+  bf1[28] = bf0[27] + bf0[28];
+  bf1[29] = bf0[26] + bf0[29];
+  bf1[30] = bf0[25] + bf0[30];
+  bf1[31] = bf0[24] + bf0[31];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = bf0[34];
+  bf1[35] = bf0[35];
+  bf1[36] = half_btf(-cospi[16], bf0[36], cospi[48], bf0[59], cos_bit[stage]);
+  bf1[37] = half_btf(-cospi[16], bf0[37], cospi[48], bf0[58], cos_bit[stage]);
+  bf1[38] = half_btf(-cospi[16], bf0[38], cospi[48], bf0[57], cos_bit[stage]);
+  bf1[39] = half_btf(-cospi[16], bf0[39], cospi[48], bf0[56], cos_bit[stage]);
+  bf1[40] = half_btf(-cospi[48], bf0[40], -cospi[16], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(-cospi[48], bf0[41], -cospi[16], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[48], bf0[42], -cospi[16], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[48], bf0[43], -cospi[16], bf0[52], cos_bit[stage]);
+  bf1[44] = bf0[44];
+  bf1[45] = bf0[45];
+  bf1[46] = bf0[46];
+  bf1[47] = bf0[47];
+  bf1[48] = bf0[48];
+  bf1[49] = bf0[49];
+  bf1[50] = bf0[50];
+  bf1[51] = bf0[51];
+  bf1[52] = half_btf(-cospi[16], bf0[43], cospi[48], bf0[52], cos_bit[stage]);
+  bf1[53] = half_btf(-cospi[16], bf0[42], cospi[48], bf0[53], cos_bit[stage]);
+  bf1[54] = half_btf(-cospi[16], bf0[41], cospi[48], bf0[54], cos_bit[stage]);
+  bf1[55] = half_btf(-cospi[16], bf0[40], cospi[48], bf0[55], cos_bit[stage]);
+  bf1[56] = half_btf(cospi[48], bf0[39], cospi[16], bf0[56], cos_bit[stage]);
+  bf1[57] = half_btf(cospi[48], bf0[38], cospi[16], bf0[57], cos_bit[stage]);
+  bf1[58] = half_btf(cospi[48], bf0[37], cospi[16], bf0[58], cos_bit[stage]);
+  bf1[59] = half_btf(cospi[48], bf0[36], cospi[16], bf0[59], cos_bit[stage]);
+  bf1[60] = bf0[60];
+  bf1[61] = bf0[61];
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 9
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[15];
+  bf1[1] = bf0[1] + bf0[14];
+  bf1[2] = bf0[2] + bf0[13];
+  bf1[3] = bf0[3] + bf0[12];
+  bf1[4] = bf0[4] + bf0[11];
+  bf1[5] = bf0[5] + bf0[10];
+  bf1[6] = bf0[6] + bf0[9];
+  bf1[7] = bf0[7] + bf0[8];
+  bf1[8] = bf0[7] - bf0[8];
+  bf1[9] = bf0[6] - bf0[9];
+  bf1[10] = bf0[5] - bf0[10];
+  bf1[11] = bf0[4] - bf0[11];
+  bf1[12] = bf0[3] - bf0[12];
+  bf1[13] = bf0[2] - bf0[13];
+  bf1[14] = bf0[1] - bf0[14];
+  bf1[15] = bf0[0] - bf0[15];
+  bf1[16] = bf0[16];
+  bf1[17] = bf0[17];
+  bf1[18] = bf0[18];
+  bf1[19] = bf0[19];
+  bf1[20] = half_btf(-cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[21] = half_btf(-cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[22] = half_btf(-cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+  bf1[23] = half_btf(-cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+  bf1[24] = half_btf(cospi[32], bf0[23], cospi[32], bf0[24], cos_bit[stage]);
+  bf1[25] = half_btf(cospi[32], bf0[22], cospi[32], bf0[25], cos_bit[stage]);
+  bf1[26] = half_btf(cospi[32], bf0[21], cospi[32], bf0[26], cos_bit[stage]);
+  bf1[27] = half_btf(cospi[32], bf0[20], cospi[32], bf0[27], cos_bit[stage]);
+  bf1[28] = bf0[28];
+  bf1[29] = bf0[29];
+  bf1[30] = bf0[30];
+  bf1[31] = bf0[31];
+  bf1[32] = bf0[32] + bf0[47];
+  bf1[33] = bf0[33] + bf0[46];
+  bf1[34] = bf0[34] + bf0[45];
+  bf1[35] = bf0[35] + bf0[44];
+  bf1[36] = bf0[36] + bf0[43];
+  bf1[37] = bf0[37] + bf0[42];
+  bf1[38] = bf0[38] + bf0[41];
+  bf1[39] = bf0[39] + bf0[40];
+  bf1[40] = bf0[39] - bf0[40];
+  bf1[41] = bf0[38] - bf0[41];
+  bf1[42] = bf0[37] - bf0[42];
+  bf1[43] = bf0[36] - bf0[43];
+  bf1[44] = bf0[35] - bf0[44];
+  bf1[45] = bf0[34] - bf0[45];
+  bf1[46] = bf0[33] - bf0[46];
+  bf1[47] = bf0[32] - bf0[47];
+  bf1[48] = -bf0[48] + bf0[63];
+  bf1[49] = -bf0[49] + bf0[62];
+  bf1[50] = -bf0[50] + bf0[61];
+  bf1[51] = -bf0[51] + bf0[60];
+  bf1[52] = -bf0[52] + bf0[59];
+  bf1[53] = -bf0[53] + bf0[58];
+  bf1[54] = -bf0[54] + bf0[57];
+  bf1[55] = -bf0[55] + bf0[56];
+  bf1[56] = bf0[55] + bf0[56];
+  bf1[57] = bf0[54] + bf0[57];
+  bf1[58] = bf0[53] + bf0[58];
+  bf1[59] = bf0[52] + bf0[59];
+  bf1[60] = bf0[51] + bf0[60];
+  bf1[61] = bf0[50] + bf0[61];
+  bf1[62] = bf0[49] + bf0[62];
+  bf1[63] = bf0[48] + bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 10
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = output;
+  bf1 = step;
+  bf1[0] = bf0[0] + bf0[31];
+  bf1[1] = bf0[1] + bf0[30];
+  bf1[2] = bf0[2] + bf0[29];
+  bf1[3] = bf0[3] + bf0[28];
+  bf1[4] = bf0[4] + bf0[27];
+  bf1[5] = bf0[5] + bf0[26];
+  bf1[6] = bf0[6] + bf0[25];
+  bf1[7] = bf0[7] + bf0[24];
+  bf1[8] = bf0[8] + bf0[23];
+  bf1[9] = bf0[9] + bf0[22];
+  bf1[10] = bf0[10] + bf0[21];
+  bf1[11] = bf0[11] + bf0[20];
+  bf1[12] = bf0[12] + bf0[19];
+  bf1[13] = bf0[13] + bf0[18];
+  bf1[14] = bf0[14] + bf0[17];
+  bf1[15] = bf0[15] + bf0[16];
+  bf1[16] = bf0[15] - bf0[16];
+  bf1[17] = bf0[14] - bf0[17];
+  bf1[18] = bf0[13] - bf0[18];
+  bf1[19] = bf0[12] - bf0[19];
+  bf1[20] = bf0[11] - bf0[20];
+  bf1[21] = bf0[10] - bf0[21];
+  bf1[22] = bf0[9] - bf0[22];
+  bf1[23] = bf0[8] - bf0[23];
+  bf1[24] = bf0[7] - bf0[24];
+  bf1[25] = bf0[6] - bf0[25];
+  bf1[26] = bf0[5] - bf0[26];
+  bf1[27] = bf0[4] - bf0[27];
+  bf1[28] = bf0[3] - bf0[28];
+  bf1[29] = bf0[2] - bf0[29];
+  bf1[30] = bf0[1] - bf0[30];
+  bf1[31] = bf0[0] - bf0[31];
+  bf1[32] = bf0[32];
+  bf1[33] = bf0[33];
+  bf1[34] = bf0[34];
+  bf1[35] = bf0[35];
+  bf1[36] = bf0[36];
+  bf1[37] = bf0[37];
+  bf1[38] = bf0[38];
+  bf1[39] = bf0[39];
+  bf1[40] = half_btf(-cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
+  bf1[41] = half_btf(-cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
+  bf1[42] = half_btf(-cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
+  bf1[43] = half_btf(-cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
+  bf1[44] = half_btf(-cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
+  bf1[45] = half_btf(-cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
+  bf1[46] = half_btf(-cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
+  bf1[47] = half_btf(-cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
+  bf1[48] = half_btf(cospi[32], bf0[47], cospi[32], bf0[48], cos_bit[stage]);
+  bf1[49] = half_btf(cospi[32], bf0[46], cospi[32], bf0[49], cos_bit[stage]);
+  bf1[50] = half_btf(cospi[32], bf0[45], cospi[32], bf0[50], cos_bit[stage]);
+  bf1[51] = half_btf(cospi[32], bf0[44], cospi[32], bf0[51], cos_bit[stage]);
+  bf1[52] = half_btf(cospi[32], bf0[43], cospi[32], bf0[52], cos_bit[stage]);
+  bf1[53] = half_btf(cospi[32], bf0[42], cospi[32], bf0[53], cos_bit[stage]);
+  bf1[54] = half_btf(cospi[32], bf0[41], cospi[32], bf0[54], cos_bit[stage]);
+  bf1[55] = half_btf(cospi[32], bf0[40], cospi[32], bf0[55], cos_bit[stage]);
+  bf1[56] = bf0[56];
+  bf1[57] = bf0[57];
+  bf1[58] = bf0[58];
+  bf1[59] = bf0[59];
+  bf1[60] = bf0[60];
+  bf1[61] = bf0[61];
+  bf1[62] = bf0[62];
+  bf1[63] = bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+
+  // stage 11
+  stage++;
+  cospi = cospi_arr[cos_bit[stage] - cos_bit_min];
+  bf0 = step;
+  bf1 = output;
+  bf1[0] = bf0[0] + bf0[63];
+  bf1[1] = bf0[1] + bf0[62];
+  bf1[2] = bf0[2] + bf0[61];
+  bf1[3] = bf0[3] + bf0[60];
+  bf1[4] = bf0[4] + bf0[59];
+  bf1[5] = bf0[5] + bf0[58];
+  bf1[6] = bf0[6] + bf0[57];
+  bf1[7] = bf0[7] + bf0[56];
+  bf1[8] = bf0[8] + bf0[55];
+  bf1[9] = bf0[9] + bf0[54];
+  bf1[10] = bf0[10] + bf0[53];
+  bf1[11] = bf0[11] + bf0[52];
+  bf1[12] = bf0[12] + bf0[51];
+  bf1[13] = bf0[13] + bf0[50];
+  bf1[14] = bf0[14] + bf0[49];
+  bf1[15] = bf0[15] + bf0[48];
+  bf1[16] = bf0[16] + bf0[47];
+  bf1[17] = bf0[17] + bf0[46];
+  bf1[18] = bf0[18] + bf0[45];
+  bf1[19] = bf0[19] + bf0[44];
+  bf1[20] = bf0[20] + bf0[43];
+  bf1[21] = bf0[21] + bf0[42];
+  bf1[22] = bf0[22] + bf0[41];
+  bf1[23] = bf0[23] + bf0[40];
+  bf1[24] = bf0[24] + bf0[39];
+  bf1[25] = bf0[25] + bf0[38];
+  bf1[26] = bf0[26] + bf0[37];
+  bf1[27] = bf0[27] + bf0[36];
+  bf1[28] = bf0[28] + bf0[35];
+  bf1[29] = bf0[29] + bf0[34];
+  bf1[30] = bf0[30] + bf0[33];
+  bf1[31] = bf0[31] + bf0[32];
+  bf1[32] = bf0[31] - bf0[32];
+  bf1[33] = bf0[30] - bf0[33];
+  bf1[34] = bf0[29] - bf0[34];
+  bf1[35] = bf0[28] - bf0[35];
+  bf1[36] = bf0[27] - bf0[36];
+  bf1[37] = bf0[26] - bf0[37];
+  bf1[38] = bf0[25] - bf0[38];
+  bf1[39] = bf0[24] - bf0[39];
+  bf1[40] = bf0[23] - bf0[40];
+  bf1[41] = bf0[22] - bf0[41];
+  bf1[42] = bf0[21] - bf0[42];
+  bf1[43] = bf0[20] - bf0[43];
+  bf1[44] = bf0[19] - bf0[44];
+  bf1[45] = bf0[18] - bf0[45];
+  bf1[46] = bf0[17] - bf0[46];
+  bf1[47] = bf0[16] - bf0[47];
+  bf1[48] = bf0[15] - bf0[48];
+  bf1[49] = bf0[14] - bf0[49];
+  bf1[50] = bf0[13] - bf0[50];
+  bf1[51] = bf0[12] - bf0[51];
+  bf1[52] = bf0[11] - bf0[52];
+  bf1[53] = bf0[10] - bf0[53];
+  bf1[54] = bf0[9] - bf0[54];
+  bf1[55] = bf0[8] - bf0[55];
+  bf1[56] = bf0[7] - bf0[56];
+  bf1[57] = bf0[6] - bf0[57];
+  bf1[58] = bf0[5] - bf0[58];
+  bf1[59] = bf0[4] - bf0[59];
+  bf1[60] = bf0[3] - bf0[60];
+  bf1[61] = bf0[2] - bf0[61];
+  bf1[62] = bf0[1] - bf0[62];
+  bf1[63] = bf0[0] - bf0[63];
+  range_check(stage, input, bf1, size, stage_range[stage]);
+}
diff --git a/vp10/common/vp10_inv_txfm1d.h b/vp10/common/vp10_inv_txfm1d.h
new file mode 100644
index 0000000..fd547a6
--- /dev/null
+++ b/vp10/common/vp10_inv_txfm1d.h
@@ -0,0 +1,44 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_INV_TXFM1D_H_
+#define VP10_INV_TXFM1D_H_
+
+#include "vp10/common/vp10_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_idct4_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_idct8_new(const int32_t *input, int32_t *output,
+                    const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_idct16_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_idct32_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_idct64_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+
+void vp10_iadst4_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_iadst8_new(const int32_t *input, int32_t *output,
+                     const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_iadst16_new(const int32_t *input, int32_t *output,
+                      const int8_t *cos_bit, const int8_t *stage_range);
+void vp10_iadst32_new(const int32_t *input, int32_t *output,
+                      const int8_t *cos_bit, const int8_t *stage_range);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // VP10_INV_TXFM1D_H_
diff --git a/vp10/common/vp10_inv_txfm2d.c b/vp10/common/vp10_inv_txfm2d.c
new file mode 100644
index 0000000..85a33ba
--- /dev/null
+++ b/vp10/common/vp10_inv_txfm2d.c
@@ -0,0 +1,211 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp10_rtcd.h"
+#include "vp10/common/enums.h"
+#include "vp10/common/vp10_txfm.h"
+#include "vp10/common/vp10_inv_txfm1d.h"
+#include "vp10/common/vp10_inv_txfm2d_cfg.h"
+
+static INLINE TxfmFunc inv_txfm_type_to_func(TXFM_TYPE txfm_type) {
+  switch (txfm_type) {
+    case TXFM_TYPE_DCT4:
+      return vp10_idct4_new;
+    case TXFM_TYPE_DCT8:
+      return vp10_idct8_new;
+    case TXFM_TYPE_DCT16:
+      return vp10_idct16_new;
+    case TXFM_TYPE_DCT32:
+      return vp10_idct32_new;
+    case TXFM_TYPE_DCT64:
+      return vp10_idct64_new;
+    case TXFM_TYPE_ADST4:
+      return vp10_iadst4_new;
+    case TXFM_TYPE_ADST8:
+      return vp10_iadst8_new;
+    case TXFM_TYPE_ADST16:
+      return vp10_iadst16_new;
+    case TXFM_TYPE_ADST32:
+      return vp10_iadst32_new;
+    default:
+      assert(0);
+      return NULL;
+  }
+}
+
+#if CONFIG_EXT_TX
+static const TXFM_2D_CFG* inv_txfm_cfg_ls[FLIPADST_ADST + 1][TX_SIZES] = {
+    {&inv_txfm_2d_cfg_dct_dct_4  , &inv_txfm_2d_cfg_dct_dct_8,
+     &inv_txfm_2d_cfg_dct_dct_16  , &inv_txfm_2d_cfg_dct_dct_32},
+    {&inv_txfm_2d_cfg_adst_dct_4 , &inv_txfm_2d_cfg_adst_dct_8,
+     &inv_txfm_2d_cfg_adst_dct_16 , &inv_txfm_2d_cfg_adst_dct_32},
+    {&inv_txfm_2d_cfg_dct_adst_4 , &inv_txfm_2d_cfg_dct_adst_8,
+     &inv_txfm_2d_cfg_dct_adst_16 , &inv_txfm_2d_cfg_dct_adst_32},
+    {&inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+     &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32},
+    {&inv_txfm_2d_cfg_adst_dct_4 , &inv_txfm_2d_cfg_adst_dct_8,
+     &inv_txfm_2d_cfg_adst_dct_16 , &inv_txfm_2d_cfg_adst_dct_32},
+    {&inv_txfm_2d_cfg_dct_adst_4 , &inv_txfm_2d_cfg_dct_adst_8,
+     &inv_txfm_2d_cfg_dct_adst_16 , &inv_txfm_2d_cfg_dct_adst_32},
+    {&inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+     &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32},
+    {&inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+     &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32},
+    {&inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+     &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32},
+};
+#else
+static const TXFM_2D_CFG* inv_txfm_cfg_ls[TX_TYPES][TX_SIZES] = {
+    {&inv_txfm_2d_cfg_dct_dct_4  , &inv_txfm_2d_cfg_dct_dct_8,
+     &inv_txfm_2d_cfg_dct_dct_16  , &inv_txfm_2d_cfg_dct_dct_32},
+    {&inv_txfm_2d_cfg_adst_dct_4 , &inv_txfm_2d_cfg_adst_dct_8,
+     &inv_txfm_2d_cfg_adst_dct_16 , &inv_txfm_2d_cfg_adst_dct_32},
+    {&inv_txfm_2d_cfg_dct_adst_4 , &inv_txfm_2d_cfg_dct_adst_8,
+     &inv_txfm_2d_cfg_dct_adst_16 , &inv_txfm_2d_cfg_dct_adst_32},
+    {&inv_txfm_2d_cfg_adst_adst_4, &inv_txfm_2d_cfg_adst_adst_8,
+     &inv_txfm_2d_cfg_adst_adst_16, &inv_txfm_2d_cfg_adst_adst_32},
+};
+#endif  // CONFIG_EXT_TX
+
+TXFM_2D_FLIP_CFG vp10_get_inv_txfm_cfg(int tx_type, int tx_size) {
+  TXFM_2D_FLIP_CFG cfg;
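+  // set_flip_cfg() is assumed to derive cfg.ud_flip/cfg.lr_flip from
+  // tx_type; the FLIPADST variants reuse the ADST configs and differ only
+  // in these flip flags.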
+  set_flip_cfg(tx_type, &cfg);
+  cfg.cfg = inv_txfm_cfg_ls[tx_type][tx_size];
+  return cfg;
+}
+
+TXFM_2D_FLIP_CFG vp10_get_inv_txfm_64x64_cfg(int tx_type) {
+  TXFM_2D_FLIP_CFG cfg;
+  switch (tx_type) {
+    case DCT_DCT:
+      cfg.cfg = &inv_txfm_2d_cfg_dct_dct_64;
+      set_flip_cfg(tx_type, &cfg);
+      break;
+    default:
+      assert(0);
+  }
+  return cfg;
+}
+
+static INLINE void inv_txfm2d_add_c(const int32_t *input, int16_t *output,
+                                    int stride, TXFM_2D_FLIP_CFG *cfg,
+                                    int32_t *txfm_buf) {
+  const int txfm_size = cfg->cfg->txfm_size;
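+  // Note: the shift entries are non-positive (see vp10_inv_txfm2d_cfg.h);
+  // negating them at the round_shift_array() calls below is assumed to yield
+  // a rounding right shift by |shift[i]| bits.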
+  const int8_t *shift = cfg->cfg->shift;
+  const int8_t *stage_range_col = cfg->cfg->stage_range_col;
+  const int8_t *stage_range_row = cfg->cfg->stage_range_row;
+  const int8_t *cos_bit_col = cfg->cfg->cos_bit_col;
+  const int8_t *cos_bit_row = cfg->cfg->cos_bit_row;
+  const TxfmFunc txfm_func_col = inv_txfm_type_to_func(cfg->cfg->txfm_type_col);
+  const TxfmFunc txfm_func_row = inv_txfm_type_to_func(cfg->cfg->txfm_type_row);
+
+  // txfm_buf's length is txfm_size * txfm_size + 2 * txfm_size;
+  // it is used for buffering intermediate data
+  int32_t *temp_in = txfm_buf;
+  int32_t *temp_out = temp_in + txfm_size;
+  int32_t *buf = temp_out + txfm_size;
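+  // txfm_buf layout: [temp_in | temp_out | buf] -- one input column, one
+  // output column, and a txfm_size x txfm_size block of row-transform
+  // results.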
+  int32_t *buf_ptr = buf;
+  int c, r;
+
+  // Rows
+  for (r = 0; r < txfm_size; ++r) {
+    txfm_func_row(input, buf_ptr, cos_bit_row, stage_range_row);
+    round_shift_array(buf_ptr, txfm_size, -shift[0]);
+    input += txfm_size;
+    buf_ptr += txfm_size;
+  }
+
+  // Columns
+  for (c = 0; c < txfm_size; ++c) {
+    if (cfg->lr_flip == 0) {
+      for (r = 0; r < txfm_size; ++r)
+        temp_in[r] = buf[r * txfm_size + c];
+    } else {
+      // flip left right
+      for (r = 0; r < txfm_size; ++r)
+        temp_in[r] = buf[r * txfm_size + (txfm_size - c - 1)];
+    }
+    txfm_func_col(temp_in, temp_out, cos_bit_col, stage_range_col);
+    round_shift_array(temp_out, txfm_size, -shift[1]);
+    if (cfg->ud_flip == 0) {
+      for (r = 0; r < txfm_size; ++r)
+        output[r * stride + c] += temp_out[r];
+    } else {
+      // flip upside down
+      for (r = 0; r < txfm_size; ++r)
+        output[r * stride + c] += temp_out[txfm_size - r - 1];
+    }
+  }
+}
+
+void vp10_inv_txfm2d_add_4x4_c(const int32_t *input, uint16_t *output,
+                               int stride, int tx_type,
+                               int bd) {
+  int txfm_buf[4 * 4 + 4 + 4];
+  // output contains the prediction signal, which is always non-negative and
+  // at most (1 << bd) - 1; since bd < 16 - 1, these values also fit in an
+  // int16_t, so we can treat the uint16_t* output buffer as an int16_t*
+  TXFM_2D_FLIP_CFG cfg = vp10_get_inv_txfm_cfg(tx_type, TX_4X4);
+  inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
+  clamp_block((int16_t *)output, 4, stride, 0, (1 << bd) - 1);
+}
+
+void vp10_inv_txfm2d_add_8x8_c(const int32_t *input, uint16_t *output,
+                               int stride, int tx_type,
+                               int bd) {
+  int txfm_buf[8 * 8 + 8 + 8];
+  // output contains the prediction signal, which is always non-negative and
+  // at most (1 << bd) - 1; since bd < 16 - 1, these values also fit in an
+  // int16_t, so we can treat the uint16_t* output buffer as an int16_t*
+  TXFM_2D_FLIP_CFG cfg = vp10_get_inv_txfm_cfg(tx_type, TX_8X8);
+  inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
+  clamp_block((int16_t *)output, 8, stride, 0, (1 << bd) - 1);
+}
+
+void vp10_inv_txfm2d_add_16x16_c(const int32_t *input, uint16_t *output,
+                                 int stride, int tx_type,
+                                 int bd) {
+  int txfm_buf[16 * 16 + 16 + 16];
+  // output contains the prediction signal, which is always non-negative and
+  // at most (1 << bd) - 1; since bd < 16 - 1, these values also fit in an
+  // int16_t, so we can treat the uint16_t* output buffer as an int16_t*
+  TXFM_2D_FLIP_CFG cfg = vp10_get_inv_txfm_cfg(tx_type, TX_16X16);
+  inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
+  clamp_block((int16_t *)output, 16, stride, 0, (1 << bd) - 1);
+}
+
+void vp10_inv_txfm2d_add_32x32_c(const int32_t *input, uint16_t *output,
+                                 int stride, int tx_type,
+                                 int bd) {
+  int txfm_buf[32 * 32 + 32 + 32];
+  // output contains the prediction signal, which is always non-negative and
+  // at most (1 << bd) - 1; since bd < 16 - 1, these values also fit in an
+  // int16_t, so we can treat the uint16_t* output buffer as an int16_t*
+  TXFM_2D_FLIP_CFG cfg = vp10_get_inv_txfm_cfg(tx_type, TX_32X32);
+  inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
+  clamp_block((int16_t *)output, 32, stride, 0, (1 << bd) - 1);
+}
+
+void vp10_inv_txfm2d_add_64x64_c(const int32_t *input, uint16_t *output,
+                                 int stride, int tx_type,
+                                 int bd) {
+  int txfm_buf[64 * 64 + 64 + 64];
+  // output contains the prediction signal, which is always non-negative and
+  // at most (1 << bd) - 1; since bd < 16 - 1, these values also fit in an
+  // int16_t, so we can treat the uint16_t* output buffer as an int16_t*
+  TXFM_2D_FLIP_CFG cfg = vp10_get_inv_txfm_64x64_cfg(tx_type);
+  inv_txfm2d_add_c(input, (int16_t *)output, stride, &cfg, txfm_buf);
+  clamp_block((int16_t *)output, 64, stride, 0, (1 << bd) - 1);
+}
diff --git a/vp10/common/vp10_inv_txfm2d_cfg.h b/vp10/common/vp10_inv_txfm2d_cfg.h
new file mode 100644
index 0000000..9199068
--- /dev/null
+++ b/vp10/common/vp10_inv_txfm2d_cfg.h
@@ -0,0 +1,403 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_INV_TXFM2D_CFG_H_
+#define VP10_INV_TXFM2D_CFG_H_
+#include "vp10/common/vp10_inv_txfm1d.h"
+//  ---------------- config inv_dct_dct_4 ----------------
+static const int8_t inv_shift_dct_dct_4[2] = {0, -4};
+static const int8_t inv_stage_range_col_dct_dct_4[4] = {18, 18, 17, 17};
+static const int8_t inv_stage_range_row_dct_dct_4[4] = {18, 18, 18, 18};
+static const int8_t inv_cos_bit_col_dct_dct_4[4] = {13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_dct_dct_4[4] = {13, 13, 13, 13};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_4 = {
+    4,  // .txfm_size
+    4,  // .stage_num_col
+    4,  // .stage_num_row
+    // 0,  // .log_scale
+    inv_shift_dct_dct_4,            // .shift
+    inv_stage_range_col_dct_dct_4,  // .stage_range_col
+    inv_stage_range_row_dct_dct_4,  // .stage_range_row
+    inv_cos_bit_col_dct_dct_4,      // .cos_bit_col
+    inv_cos_bit_row_dct_dct_4,      // .cos_bit_row
+    TXFM_TYPE_DCT4,                 // .txfm_type_col
+    TXFM_TYPE_DCT4};                // .txfm_type_row
+
+//  ---------------- config inv_dct_dct_8 ----------------
+static const int8_t inv_shift_dct_dct_8[2] = {0, -5};
+static const int8_t inv_stage_range_col_dct_dct_8[6] = {19, 19, 19, 19, 18, 18};
+static const int8_t inv_stage_range_row_dct_dct_8[6] = {19, 19, 19, 19, 19, 19};
+static const int8_t inv_cos_bit_col_dct_dct_8[6] = {13, 13, 13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_dct_dct_8[6] = {13, 13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_8 = {
+    8,  // .txfm_size
+    6,  // .stage_num_col
+    6,  // .stage_num_row
+    // 0,  // .log_scale
+    inv_shift_dct_dct_8,            // .shift
+    inv_stage_range_col_dct_dct_8,  // .stage_range_col
+    inv_stage_range_row_dct_dct_8,  // .stage_range_row
+    inv_cos_bit_col_dct_dct_8,      // .cos_bit_col
+    inv_cos_bit_row_dct_dct_8,      // .cos_bit_row
+    TXFM_TYPE_DCT8,                 // .txfm_type_col
+    TXFM_TYPE_DCT8};                // .txfm_type_row
+
+//  ---------------- config inv_dct_dct_16 ----------------
+static const int8_t inv_shift_dct_dct_16[2] = {-1, -5};
+static const int8_t inv_stage_range_col_dct_dct_16[8] = {19, 19, 19, 19,
+                                                         19, 19, 18, 18};
+static const int8_t inv_stage_range_row_dct_dct_16[8] = {20, 20, 20, 20,
+                                                         20, 20, 20, 20};
+static const int8_t inv_cos_bit_col_dct_dct_16[8] = {13, 13, 13, 13,
+                                                     13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_dct_dct_16[8] = {12, 12, 12, 12,
+                                                     12, 12, 12, 12};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_16 = {
+    16,  // .txfm_size
+    8,   // .stage_num_col
+    8,   // .stage_num_row
+    // 0,  // .log_scale
+    inv_shift_dct_dct_16,            // .shift
+    inv_stage_range_col_dct_dct_16,  // .stage_range_col
+    inv_stage_range_row_dct_dct_16,  // .stage_range_row
+    inv_cos_bit_col_dct_dct_16,      // .cos_bit_col
+    inv_cos_bit_row_dct_dct_16,      // .cos_bit_row
+    TXFM_TYPE_DCT16,                 // .txfm_type_col
+    TXFM_TYPE_DCT16};                // .txfm_type_row
+
+//  ---------------- config inv_dct_dct_32 ----------------
+static const int8_t inv_shift_dct_dct_32[2] = {-1, -5};
+static const int8_t inv_stage_range_col_dct_dct_32[10] = {19, 19, 19, 19, 19,
+                                                          19, 19, 19, 18, 18};
+static const int8_t inv_stage_range_row_dct_dct_32[10] = {20, 20, 20, 20, 20,
+                                                          20, 20, 20, 20, 20};
+static const int8_t inv_cos_bit_col_dct_dct_32[10] = {13, 13, 13, 13, 13,
+                                                      13, 13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_dct_dct_32[10] = {12, 12, 12, 12, 12,
+                                                      12, 12, 12, 12, 12};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_32 = {
+    32,  // .txfm_size
+    10,  // .stage_num_col
+    10,  // .stage_num_row
+    // 1,  // .log_scale
+    inv_shift_dct_dct_32,            // .shift
+    inv_stage_range_col_dct_dct_32,  // .stage_range_col
+    inv_stage_range_row_dct_dct_32,  // .stage_range_row
+    inv_cos_bit_col_dct_dct_32,      // .cos_bit_col
+    inv_cos_bit_row_dct_dct_32,      // .cos_bit_row
+    TXFM_TYPE_DCT32,                 // .txfm_type_col
+    TXFM_TYPE_DCT32};                // .txfm_type_row
+
+//  ---------------- config inv_dct_dct_64 ----------------
+static const int8_t inv_shift_dct_dct_64[2] = {-1, -7};
+static const int8_t inv_stage_range_col_dct_dct_64[12] = {
+    19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18};
+static const int8_t inv_stage_range_row_dct_dct_64[12] = {
+    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
+static const int8_t inv_cos_bit_col_dct_dct_64[12] = {13, 13, 13, 13, 13, 13,
+                                                      13, 13, 13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_dct_dct_64[12] = {12, 12, 12, 12, 12, 12,
+                                                      12, 12, 12, 12, 12, 12};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_dct_64 = {
+    64,                              // .txfm_size
+    12,                              // .stage_num_col
+    12,                              // .stage_num_row
+    inv_shift_dct_dct_64,            // .shift
+    inv_stage_range_col_dct_dct_64,  // .stage_range_col
+    inv_stage_range_row_dct_dct_64,  // .stage_range_row
+    inv_cos_bit_col_dct_dct_64,      // .cos_bit_col
+    inv_cos_bit_row_dct_dct_64,      // .cos_bit_row
+    TXFM_TYPE_DCT64,                 // .txfm_type_col
+    TXFM_TYPE_DCT64};                // .txfm_type_row
+
+//  ---------------- config inv_dct_adst_4 ----------------
+static const int8_t inv_shift_dct_adst_4[2] = {0, -4};
+static const int8_t inv_stage_range_col_dct_adst_4[4] = {18, 18, 17, 17};
+static const int8_t inv_stage_range_row_dct_adst_4[6] = {18, 18, 18,
+                                                         18, 18, 18};
+static const int8_t inv_cos_bit_col_dct_adst_4[4] = {13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_dct_adst_4[6] = {13, 13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_4 = {
+    4,  // .txfm_size
+    4,  // .stage_num_col
+    6,  // .stage_num_row
+    // 0,  // .log_scale
+    inv_shift_dct_adst_4,            // .shift
+    inv_stage_range_col_dct_adst_4,  // .stage_range_col
+    inv_stage_range_row_dct_adst_4,  // .stage_range_row
+    inv_cos_bit_col_dct_adst_4,      // .cos_bit_col
+    inv_cos_bit_row_dct_adst_4,      // .cos_bit_row
+    TXFM_TYPE_DCT4,                  // .txfm_type_col
+    TXFM_TYPE_ADST4};                // .txfm_type_row
+
+//  ---------------- config inv_dct_adst_8 ----------------
+static const int8_t inv_shift_dct_adst_8[2] = {0, -5};
+static const int8_t inv_stage_range_col_dct_adst_8[6] = {19, 19, 19,
+                                                         19, 18, 18};
+static const int8_t inv_stage_range_row_dct_adst_8[8] = {19, 19, 19, 19,
+                                                         19, 19, 19, 19};
+static const int8_t inv_cos_bit_col_dct_adst_8[6] = {13, 13, 13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_dct_adst_8[8] = {13, 13, 13, 13,
+                                                     13, 13, 13, 13};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_8 = {
+    8,  // .txfm_size
+    6,  // .stage_num_col
+    8,  // .stage_num_row
+    // 0,  // .log_scale
+    inv_shift_dct_adst_8,            // .shift
+    inv_stage_range_col_dct_adst_8,  // .stage_range_col
+    inv_stage_range_row_dct_adst_8,  // .stage_range_row
+    inv_cos_bit_col_dct_adst_8,      // .cos_bit_col
+    inv_cos_bit_row_dct_adst_8,      // .cos_bit_row
+    TXFM_TYPE_DCT8,                  // .txfm_type_col
+    TXFM_TYPE_ADST8};                // .txfm_type_row
+
+//  ---------------- config inv_dct_adst_16 ----------------
+static const int8_t inv_shift_dct_adst_16[2] = {-1, -5};
+static const int8_t inv_stage_range_col_dct_adst_16[8] = {19, 19, 19, 19,
+                                                          19, 19, 18, 18};
+static const int8_t inv_stage_range_row_dct_adst_16[10] = {20, 20, 20, 20, 20,
+                                                           20, 20, 20, 20, 20};
+static const int8_t inv_cos_bit_col_dct_adst_16[8] = {13, 13, 13, 13,
+                                                      13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_dct_adst_16[10] = {12, 12, 12, 12, 12,
+                                                       12, 12, 12, 12, 12};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_16 = {
+    16,  // .txfm_size
+    8,   // .stage_num_col
+    10,  // .stage_num_row
+    // 0,  // .log_scale
+    inv_shift_dct_adst_16,            // .shift
+    inv_stage_range_col_dct_adst_16,  // .stage_range_col
+    inv_stage_range_row_dct_adst_16,  // .stage_range_row
+    inv_cos_bit_col_dct_adst_16,      // .cos_bit_col
+    inv_cos_bit_row_dct_adst_16,      // .cos_bit_row
+    TXFM_TYPE_DCT16,                  // .txfm_type_col
+    TXFM_TYPE_ADST16};                // .txfm_type_row
+
+//  ---------------- config inv_dct_adst_32 ----------------
+static const int8_t inv_shift_dct_adst_32[2] = {-1, -5};
+static const int8_t inv_stage_range_col_dct_adst_32[10] = {19, 19, 19, 19, 19,
+                                                           19, 19, 19, 18, 18};
+static const int8_t inv_stage_range_row_dct_adst_32[12] = {
+    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
+static const int8_t inv_cos_bit_col_dct_adst_32[10] = {13, 13, 13, 13, 13,
+                                                       13, 13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_dct_adst_32[12] = {12, 12, 12, 12, 12, 12,
+                                                       12, 12, 12, 12, 12, 12};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_dct_adst_32 = {
+    32,  // .txfm_size
+    10,  // .stage_num_col
+    12,  // .stage_num_row
+    // 1,  // .log_scale
+    inv_shift_dct_adst_32,            // .shift
+    inv_stage_range_col_dct_adst_32,  // .stage_range_col
+    inv_stage_range_row_dct_adst_32,  // .stage_range_row
+    inv_cos_bit_col_dct_adst_32,      // .cos_bit_col
+    inv_cos_bit_row_dct_adst_32,      // .cos_bit_row
+    TXFM_TYPE_DCT32,                  // .txfm_type_col
+    TXFM_TYPE_ADST32};                // .txfm_type_row
+
+//  ---------------- config inv_adst_adst_4 ----------------
+static const int8_t inv_shift_adst_adst_4[2] = {0, -4};
+static const int8_t inv_stage_range_col_adst_adst_4[6] = {18, 18, 18,
+                                                          18, 17, 17};
+static const int8_t inv_stage_range_row_adst_adst_4[6] = {18, 18, 18,
+                                                          18, 18, 18};
+static const int8_t inv_cos_bit_col_adst_adst_4[6] = {13, 13, 13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_adst_adst_4[6] = {13, 13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_4 = {
+    4,  // .txfm_size
+    6,  // .stage_num_col
+    6,  // .stage_num_row
+    // 0,  // .log_scale
+    inv_shift_adst_adst_4,            // .shift
+    inv_stage_range_col_adst_adst_4,  // .stage_range_col
+    inv_stage_range_row_adst_adst_4,  // .stage_range_row
+    inv_cos_bit_col_adst_adst_4,      // .cos_bit_col
+    inv_cos_bit_row_adst_adst_4,      // .cos_bit_row
+    TXFM_TYPE_ADST4,                  // .txfm_type_col
+    TXFM_TYPE_ADST4};                 // .txfm_type_row
+
+//  ---------------- config inv_adst_adst_8 ----------------
+static const int8_t inv_shift_adst_adst_8[2] = {0, -5};
+static const int8_t inv_stage_range_col_adst_adst_8[8] = {19, 19, 19, 19,
+                                                          19, 19, 18, 18};
+static const int8_t inv_stage_range_row_adst_adst_8[8] = {19, 19, 19, 19,
+                                                          19, 19, 19, 19};
+static const int8_t inv_cos_bit_col_adst_adst_8[8] = {13, 13, 13, 13,
+                                                      13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_adst_adst_8[8] = {13, 13, 13, 13,
+                                                      13, 13, 13, 13};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_8 = {
+    8,  // .txfm_size
+    8,  // .stage_num_col
+    8,  // .stage_num_row
+    // 0,  // .log_scale
+    inv_shift_adst_adst_8,            // .shift
+    inv_stage_range_col_adst_adst_8,  // .stage_range_col
+    inv_stage_range_row_adst_adst_8,  // .stage_range_row
+    inv_cos_bit_col_adst_adst_8,      // .cos_bit_col
+    inv_cos_bit_row_adst_adst_8,      // .cos_bit_row
+    TXFM_TYPE_ADST8,                  // .txfm_type_col
+    TXFM_TYPE_ADST8};                 // .txfm_type_row
+
+//  ---------------- config inv_adst_adst_16 ----------------
+static const int8_t inv_shift_adst_adst_16[2] = {-1, -5};
+static const int8_t inv_stage_range_col_adst_adst_16[10] = {19, 19, 19, 19, 19,
+                                                            19, 19, 19, 18, 18};
+static const int8_t inv_stage_range_row_adst_adst_16[10] = {20, 20, 20, 20, 20,
+                                                            20, 20, 20, 20, 20};
+static const int8_t inv_cos_bit_col_adst_adst_16[10] = {13, 13, 13, 13, 13,
+                                                        13, 13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_adst_adst_16[10] = {12, 12, 12, 12, 12,
+                                                        12, 12, 12, 12, 12};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_16 = {
+    16,  // .txfm_size
+    10,  // .stage_num_col
+    10,  // .stage_num_row
+    // 0,  // .log_scale
+    inv_shift_adst_adst_16,            // .shift
+    inv_stage_range_col_adst_adst_16,  // .stage_range_col
+    inv_stage_range_row_adst_adst_16,  // .stage_range_row
+    inv_cos_bit_col_adst_adst_16,      // .cos_bit_col
+    inv_cos_bit_row_adst_adst_16,      // .cos_bit_row
+    TXFM_TYPE_ADST16,                  // .txfm_type_col
+    TXFM_TYPE_ADST16};                 // .txfm_type_row
+
+//  ---------------- config inv_adst_adst_32 ----------------
+static const int8_t inv_shift_adst_adst_32[2] = {-1, -5};
+static const int8_t inv_stage_range_col_adst_adst_32[12] = {
+    19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18};
+static const int8_t inv_stage_range_row_adst_adst_32[12] = {
+    20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20};
+static const int8_t inv_cos_bit_col_adst_adst_32[12] = {13, 13, 13, 13, 13, 13,
+                                                        13, 13, 13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_adst_adst_32[12] = {12, 12, 12, 12, 12, 12,
+                                                        12, 12, 12, 12, 12, 12};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_adst_32 = {
+    32,  // .txfm_size
+    12,  // .stage_num_col
+    12,  // .stage_num_row
+    // 1,  // .log_scale
+    inv_shift_adst_adst_32,            // .shift
+    inv_stage_range_col_adst_adst_32,  // .stage_range_col
+    inv_stage_range_row_adst_adst_32,  // .stage_range_row
+    inv_cos_bit_col_adst_adst_32,      // .cos_bit_col
+    inv_cos_bit_row_adst_adst_32,      // .cos_bit_row
+    TXFM_TYPE_ADST32,                  // .txfm_type_col
+    TXFM_TYPE_ADST32};                 // .txfm_type_row
+
+//  ---------------- config inv_adst_dct_4 ----------------
+static const int8_t inv_shift_adst_dct_4[2] = {0, -4};
+static const int8_t inv_stage_range_col_adst_dct_4[6] = {18, 18, 18,
+                                                         18, 17, 17};
+static const int8_t inv_stage_range_row_adst_dct_4[4] = {18, 18, 18, 18};
+static const int8_t inv_cos_bit_col_adst_dct_4[6] = {13, 13, 13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_adst_dct_4[4] = {13, 13, 13, 13};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_4 = {
+    4,  // .txfm_size
+    6,  // .stage_num_col
+    4,  // .stage_num_row
+    // 0,  // .log_scale
+    inv_shift_adst_dct_4,            // .shift
+    inv_stage_range_col_adst_dct_4,  // .stage_range_col
+    inv_stage_range_row_adst_dct_4,  // .stage_range_row
+    inv_cos_bit_col_adst_dct_4,      // .cos_bit_col
+    inv_cos_bit_row_adst_dct_4,      // .cos_bit_row
+    TXFM_TYPE_ADST4,                 // .txfm_type_col
+    TXFM_TYPE_DCT4};                 // .txfm_type_row
+
+//  ---------------- config inv_adst_dct_8 ----------------
+static const int8_t inv_shift_adst_dct_8[2] = {0, -5};
+static const int8_t inv_stage_range_col_adst_dct_8[8] = {19, 19, 19, 19,
+                                                         19, 19, 18, 18};
+static const int8_t inv_stage_range_row_adst_dct_8[6] = {19, 19, 19,
+                                                         19, 19, 19};
+static const int8_t inv_cos_bit_col_adst_dct_8[8] = {13, 13, 13, 13,
+                                                     13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_adst_dct_8[6] = {13, 13, 13, 13, 13, 13};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_8 = {
+    8,  // .txfm_size
+    8,  // .stage_num_col
+    6,  // .stage_num_row
+    // 0,  // .log_scale
+    inv_shift_adst_dct_8,            // .shift
+    inv_stage_range_col_adst_dct_8,  // .stage_range_col
+    inv_stage_range_row_adst_dct_8,  // .stage_range_row
+    inv_cos_bit_col_adst_dct_8,      // .cos_bit_col
+    inv_cos_bit_row_adst_dct_8,      // .cos_bit_row
+    TXFM_TYPE_ADST8,                 // .txfm_type_col
+    TXFM_TYPE_DCT8};                 // .txfm_type_row
+
+//  ---------------- config inv_adst_dct_16 ----------------
+static const int8_t inv_shift_adst_dct_16[2] = {-1, -5};
+static const int8_t inv_stage_range_col_adst_dct_16[10] = {19, 19, 19, 19, 19,
+                                                           19, 19, 19, 18, 18};
+static const int8_t inv_stage_range_row_adst_dct_16[8] = {20, 20, 20, 20,
+                                                          20, 20, 20, 20};
+static const int8_t inv_cos_bit_col_adst_dct_16[10] = {13, 13, 13, 13, 13,
+                                                       13, 13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_adst_dct_16[8] = {12, 12, 12, 12,
+                                                      12, 12, 12, 12};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_16 = {
+    16,  // .txfm_size
+    10,  // .stage_num_col
+    8,   // .stage_num_row
+    // 0,  // .log_scale
+    inv_shift_adst_dct_16,            // .shift
+    inv_stage_range_col_adst_dct_16,  // .stage_range_col
+    inv_stage_range_row_adst_dct_16,  // .stage_range_row
+    inv_cos_bit_col_adst_dct_16,      // .cos_bit_col
+    inv_cos_bit_row_adst_dct_16,      // .cos_bit_row
+    TXFM_TYPE_ADST16,                 // .txfm_type_col
+    TXFM_TYPE_DCT16};                 // .txfm_type_row
+
+//  ---------------- config inv_adst_dct_32 ----------------
+static const int8_t inv_shift_adst_dct_32[2] = {-1, -5};
+static const int8_t inv_stage_range_col_adst_dct_32[12] = {
+    19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 18, 18};
+static const int8_t inv_stage_range_row_adst_dct_32[10] = {20, 20, 20, 20, 20,
+                                                           20, 20, 20, 20, 20};
+static const int8_t inv_cos_bit_col_adst_dct_32[12] = {13, 13, 13, 13, 13, 13,
+                                                       13, 13, 13, 13, 13, 13};
+static const int8_t inv_cos_bit_row_adst_dct_32[10] = {12, 12, 12, 12, 12,
+                                                       12, 12, 12, 12, 12};
+
+static const TXFM_2D_CFG inv_txfm_2d_cfg_adst_dct_32 = {
+    32,  // .txfm_size
+    12,  // .stage_num_col
+    10,  // .stage_num_row
+    // 1,  // .log_scale
+    inv_shift_adst_dct_32,            // .shift
+    inv_stage_range_col_adst_dct_32,  // .stage_range_col
+    inv_stage_range_row_adst_dct_32,  // .stage_range_row
+    inv_cos_bit_col_adst_dct_32,      // .cos_bit_col
+    inv_cos_bit_row_adst_dct_32,      // .cos_bit_row
+    TXFM_TYPE_ADST32,                 // .txfm_type_col
+    TXFM_TYPE_DCT32};                 // .txfm_type_row
+
+#endif  // VP10_INV_TXFM2D_CFG_H_
diff --git a/vp10/common/vp10_rtcd_defs.pl b/vp10/common/vp10_rtcd_defs.pl
index c8a10e5..51b674b 100644
--- a/vp10/common/vp10_rtcd_defs.pl
+++ b/vp10/common/vp10_rtcd_defs.pl
@@ -7,12 +7,15 @@
 #include "vpx/vpx_integer.h"
 #include "vp10/common/common.h"
 #include "vp10/common/enums.h"
+#include "vp10/common/quant_common.h"
+#include "vp10/common/filter.h"
+#include "vp10/common/vp10_txfm.h"
 
 struct macroblockd;
 
 /* Encoder forward decls */
 struct macroblock;
-struct vp9_variance_vtable;
+struct vp10_variance_vtable;
 struct search_site_config;
 struct mv;
 union int_mv;
@@ -78,6 +81,22 @@
 }
 
 #
+# 10/12-tap convolution filters
+#
+add_proto qw/void vp10_convolve_horiz/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg";
+specialize qw/vp10_convolve_horiz ssse3/;
+
+add_proto qw/void vp10_convolve_vert/, "const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_y_q4, int y_step_q4, int avg";
+specialize qw/vp10_convolve_vert ssse3/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vp10_highbd_convolve_horiz/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_x_q4, int x_step_q4, int avg, int bd";
+  specialize qw/vp10_highbd_convolve_horiz sse4_1/;
+  add_proto qw/void vp10_highbd_convolve_vert/, "const uint16_t *src, int src_stride, uint16_t *dst, int dst_stride, int w, int h, const InterpFilterParams fp, const int subpel_y_q4, int y_step_q4, int avg, int bd";
+  specialize qw/vp10_highbd_convolve_vert sse4_1/;
+}
+
+#
 # dct
 #
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
@@ -151,7 +170,7 @@
     specialize qw/vp10_iht8x8_64_add sse2/;
 
     add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-    specialize qw/vp10_iht16x16_256_add/;
+    specialize qw/vp10_iht16x16_256_add sse2/;
 
     add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
     specialize qw/vp10_fdct4x4 sse2/;
@@ -244,13 +263,19 @@
     specialize qw/vp10_fdct32x32_1/;
   } else {
     add_proto qw/void vp10_iht4x4_16_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-    specialize qw/vp10_iht4x4_16_add sse2 neon dspr2 msa/;
+    specialize qw/vp10_iht4x4_16_add sse2 neon dspr2/;
 
     add_proto qw/void vp10_iht8x8_64_add/, "const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type";
-    specialize qw/vp10_iht8x8_64_add sse2 neon dspr2 msa/;
+    specialize qw/vp10_iht8x8_64_add sse2 neon dspr2/;
 
     add_proto qw/void vp10_iht16x16_256_add/, "const tran_low_t *input, uint8_t *output, int pitch, int tx_type";
-    specialize qw/vp10_iht16x16_256_add sse2 dspr2 msa/;
+    specialize qw/vp10_iht16x16_256_add sse2 dspr2/;
+
+    if (vpx_config("CONFIG_EXT_TX") ne "yes") {
+      specialize qw/vp10_iht4x4_16_add msa/;
+      specialize qw/vp10_iht8x8_64_add msa/;
+      specialize qw/vp10_iht16x16_256_add msa/;
+    }
 
     add_proto qw/void vp10_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
     specialize qw/vp10_fdct4x4 sse2/;
@@ -281,6 +306,20 @@
   }
 }
 
+if (vpx_config("CONFIG_NEW_QUANT") eq "yes") {
+  add_proto qw/void quantize_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+  specialize qw/quantize_nuq/;
+
+  add_proto qw/void quantize_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+  specialize qw/quantize_fp_nuq/;
+
+  add_proto qw/void quantize_32x32_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+  specialize qw/quantize_32x32_nuq/;
+
+  add_proto qw/void quantize_32x32_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+  specialize qw/quantize_32x32_fp_nuq/;
+}
+
 # High bitdepth functions
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   #
@@ -397,22 +436,37 @@
   add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_fht16x16 sse2/;
 
+  add_proto qw/void vp10_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht32x32/;
+
   add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp10_fwht4x4/, "$sse2_x86inc";
 } else {
   add_proto qw/void vp10_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp10_fht4x4 sse2 msa/;
+  specialize qw/vp10_fht4x4 sse2/;
 
   add_proto qw/void vp10_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp10_fht8x8 sse2 msa/;
+  specialize qw/vp10_fht8x8 sse2/;
 
   add_proto qw/void vp10_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp10_fht16x16 sse2 msa/;
+  specialize qw/vp10_fht16x16 sse2/;
+
+  if (vpx_config("CONFIG_EXT_TX") ne "yes") {
+    specialize qw/vp10_fht4x4 msa/;
+    specialize qw/vp10_fht8x8 msa/;
+    specialize qw/vp10_fht16x16 msa/;
+  }
+
+  add_proto qw/void vp10_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_fht32x32/;
 
   add_proto qw/void vp10_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp10_fwht4x4 msa/, "$sse2_x86inc";
 }
 
+add_proto qw/void vp10_fwd_idtx/, "const int16_t *src_diff, tran_low_t *coeff, int stride, int bs, int tx_type";
+specialize qw/vp10_fwd_idtx/;
+
 # Inverse transform
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   # Note as optimized versions of these functions are added we need to add a check to ensure
@@ -595,18 +649,44 @@
   }  # CONFIG_EMULATE_HARDWARE
 }  # CONFIG_VP9_HIGHBITDEPTH
 
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  #fwd txfm
+  add_proto qw/void vp10_fwd_txfm2d_4x4/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+  specialize qw/vp10_fwd_txfm2d_4x4 sse4_1/;
+  add_proto qw/void vp10_fwd_txfm2d_8x8/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+  specialize qw/vp10_fwd_txfm2d_8x8 sse4_1/;
+  add_proto qw/void vp10_fwd_txfm2d_16x16/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+  specialize qw/vp10_fwd_txfm2d_16x16 sse4_1/;
+  add_proto qw/void vp10_fwd_txfm2d_32x32/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+  specialize qw/vp10_fwd_txfm2d_32x32 sse4_1/;
+  add_proto qw/void vp10_fwd_txfm2d_64x64/, "const int16_t *input, int32_t *output, int stride, int tx_type, int bd";
+  specialize qw/vp10_fwd_txfm2d_64x64 sse4_1/;
+
+  #inv txfm
+  add_proto qw/void vp10_inv_txfm2d_add_4x4/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
+  specialize qw/vp10_inv_txfm2d_add_4x4 sse4_1/;
+  add_proto qw/void vp10_inv_txfm2d_add_8x8/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
+  specialize qw/vp10_inv_txfm2d_add_8x8 sse4_1/;
+  add_proto qw/void vp10_inv_txfm2d_add_16x16/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
+  specialize qw/vp10_inv_txfm2d_add_16x16 sse4_1/;
+  add_proto qw/void vp10_inv_txfm2d_add_32x32/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
+  specialize qw/vp10_inv_txfm2d_add_32x32/;
+  add_proto qw/void vp10_inv_txfm2d_add_64x64/, "const int32_t *input, uint16_t *output, int stride, int tx_type, int bd";
+  specialize qw/vp10_inv_txfm2d_add_64x64/;
+}
+
 #
 # Motion search
 #
-add_proto qw/int vp10_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv";
+add_proto qw/int vp10_full_search_sad/, "const struct macroblock *x, const struct mv *ref_mv, int sad_per_bit, int distance, const struct vp10_variance_vtable *fn_ptr, const struct mv *center_mv, struct mv *best_mv";
 specialize qw/vp10_full_search_sad sse3 sse4_1/;
 $vp10_full_search_sad_sse3=vp10_full_search_sadx3;
 $vp10_full_search_sad_sse4_1=vp10_full_search_sadx8;
 
-add_proto qw/int vp10_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
+add_proto qw/int vp10_diamond_search_sad/, "const struct macroblock *x, const struct search_site_config *cfg,  struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp10_variance_vtable *fn_ptr, const struct mv *center_mv";
 specialize qw/vp10_diamond_search_sad/;
 
-add_proto qw/int vp10_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp9_variance_vtable *fn_ptr, const struct mv *center_mv";
+add_proto qw/int vp10_full_range_search/, "const struct macroblock *x, const struct search_site_config *cfg, struct mv *ref_mv, struct mv *best_mv, int search_param, int sad_per_bit, int *num00, const struct vp10_variance_vtable *fn_ptr, const struct mv *center_mv";
 specialize qw/vp10_full_range_search/;
 
 add_proto qw/void vp10_temporal_filter_apply/, "uint8_t *frame1, unsigned int stride, uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int filter_weight, unsigned int *accumulator, uint16_t *count";
@@ -615,19 +695,32 @@
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
 
   # ENCODEMB INVOKE
+  if (vpx_config("CONFIG_NEW_QUANT") eq "yes") {
+    add_proto qw/void highbd_quantize_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+    specialize qw/highbd_quantize_nuq/;
+
+    add_proto qw/void highbd_quantize_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+    specialize qw/highbd_quantize_fp_nuq/;
+
+    add_proto qw/void highbd_quantize_32x32_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+    specialize qw/highbd_quantize_32x32_nuq/;
+
+    add_proto qw/void highbd_quantize_32x32_fp_nuq/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *quant_ptr, const int16_t *dequant_ptr, const cuml_bins_type_nuq *cuml_bins_ptr, const dequant_val_type_nuq *dequant_val, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const int16_t *scan, const uint8_t *band";
+    specialize qw/highbd_quantize_32x32_fp_nuq/;
+  }
 
   add_proto qw/int64_t vp10_highbd_block_error/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, intptr_t block_size, int64_t *ssz, int bd";
   specialize qw/vp10_highbd_block_error sse2/;
 
-  add_proto qw/void vp10_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp10_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
   specialize qw/vp10_highbd_quantize_fp/;
 
-  add_proto qw/void vp10_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vp10_highbd_quantize_fp_32x32/;
+  add_proto qw/void vp10_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan, int log_scale";
+  specialize qw/vp10_highbd_quantize_b/;
 
   # fdct functions
   add_proto qw/void vp10_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
-  specialize qw/vp10_highbd_fht4x4/;
+  specialize qw/vp10_highbd_fht4x4 sse4_1/;
 
   add_proto qw/void vp10_highbd_fht8x8/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_highbd_fht8x8/;
@@ -635,6 +728,9 @@
   add_proto qw/void vp10_highbd_fht16x16/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
   specialize qw/vp10_highbd_fht16x16/;
 
+  add_proto qw/void vp10_highbd_fht32x32/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
+  specialize qw/vp10_highbd_fht32x32/;
+
   add_proto qw/void vp10_highbd_fwht4x4/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vp10_highbd_fwht4x4/;
 
@@ -644,6 +740,15 @@
 }
 # End vp10_high encoder functions
 
+if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
+  add_proto qw/uint64_t vp10_wedge_sse_from_residuals/, "const int16_t *r1, const int16_t *d, const uint8_t *m, int N";
+  specialize qw/vp10_wedge_sse_from_residuals sse2/;
+  add_proto qw/int vp10_wedge_sign_from_residuals/, "const int16_t *ds, const uint8_t *m, int N, int64_t limit";
+  specialize qw/vp10_wedge_sign_from_residuals sse2/;
+  add_proto qw/void vp10_wedge_compute_delta_squares/, "int16_t *d, const int16_t *a, const int16_t *b, int N";
+  specialize qw/vp10_wedge_compute_delta_squares sse2/;
+}
+
 }
 # end encoder functions
 1;
diff --git a/vp10/common/vp10_txfm.h b/vp10/common/vp10_txfm.h
new file mode 100644
index 0000000..2ac8f81
--- /dev/null
+++ b/vp10/common/vp10_txfm.h
@@ -0,0 +1,222 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#ifndef VP10_TXFM_H_
+#define VP10_TXFM_H_
+
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+
+#include "vp10/common/enums.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+static const int cos_bit_min = 10;
+static const int cos_bit_max = 16;
+
+// cospi_arr[i][j] = (int)round(cos(M_PI*j/128) * (1<<(cos_bit_min+i)));
+static const int32_t cospi_arr[7][64] =
+  {{ 1024,  1024,  1023,  1021,  1019,  1016,  1013,  1009,
+     1004,   999,   993,   987,   980,   972,   964,   955,
+      946,   936,   926,   915,   903,   891,   878,   865,
+      851,   837,   822,   807,   792,   775,   759,   742,
+      724,   706,   688,   669,   650,   630,   610,   590,
+      569,   548,   526,   505,   483,   460,   438,   415,
+      392,   369,   345,   321,   297,   273,   249,   224,
+      200,   175,   150,   125,   100,    75,    50,    25},
+  {  2048,  2047,  2046,  2042,  2038,  2033,  2026,  2018,
+     2009,  1998,  1987,  1974,  1960,  1945,  1928,  1911,
+     1892,  1872,  1851,  1829,  1806,  1782,  1757,  1730,
+     1703,  1674,  1645,  1615,  1583,  1551,  1517,  1483,
+     1448,  1412,  1375,  1338,  1299,  1260,  1220,  1179,
+     1138,  1096,  1053,  1009,   965,   921,   876,   830,
+      784,   737,   690,   642,   595,   546,   498,   449,
+      400,   350,   301,   251,   201,   151,   100,    50},
+  {  4096,  4095,  4091,  4085,  4076,  4065,  4052,  4036,
+     4017,  3996,  3973,  3948,  3920,  3889,  3857,  3822,
+     3784,  3745,  3703,  3659,  3612,  3564,  3513,  3461,
+     3406,  3349,  3290,  3229,  3166,  3102,  3035,  2967,
+     2896,  2824,  2751,  2675,  2598,  2520,  2440,  2359,
+     2276,  2191,  2106,  2019,  1931,  1842,  1751,  1660,
+     1567,  1474,  1380,  1285,  1189,  1092,   995,   897,
+      799,   700,   601,   501,   401,   301,   201,   101},
+  {  8192,  8190,  8182,  8170,  8153,  8130,  8103,  8071,
+     8035,  7993,  7946,  7895,  7839,  7779,  7713,  7643,
+     7568,  7489,  7405,  7317,  7225,  7128,  7027,  6921,
+     6811,  6698,  6580,  6458,  6333,  6203,  6070,  5933,
+     5793,  5649,  5501,  5351,  5197,  5040,  4880,  4717,
+     4551,  4383,  4212,  4038,  3862,  3683,  3503,  3320,
+     3135,  2948,  2760,  2570,  2378,  2185,  1990,  1795,
+     1598,  1401,  1202,  1003,   803,   603,   402,   201},
+  { 16384, 16379, 16364, 16340, 16305, 16261, 16207, 16143,
+    16069, 15986, 15893, 15791, 15679, 15557, 15426, 15286,
+    15137, 14978, 14811, 14635, 14449, 14256, 14053, 13842,
+    13623, 13395, 13160, 12916, 12665, 12406, 12140, 11866,
+    11585, 11297, 11003, 10702, 10394, 10080,  9760,  9434,
+     9102,  8765,  8423,  8076,  7723,  7366,  7005,  6639,
+     6270,  5897,  5520,  5139,  4756,  4370,  3981,  3590,
+     3196,  2801,  2404,  2006,  1606,  1205,   804,   402},
+  { 32768, 32758, 32729, 32679, 32610, 32522, 32413, 32286,
+    32138, 31972, 31786, 31581, 31357, 31114, 30853, 30572,
+    30274, 29957, 29622, 29269, 28899, 28511, 28106, 27684,
+    27246, 26791, 26320, 25833, 25330, 24812, 24279, 23732,
+    23170, 22595, 22006, 21403, 20788, 20160, 19520, 18868,
+    18205, 17531, 16846, 16151, 15447, 14733, 14010, 13279,
+    12540, 11793, 11039, 10279,  9512,  8740,  7962,  7180,
+     6393,  5602,  4808,  4011,  3212,  2411,  1608,   804},
+  { 65536, 65516, 65457, 65358, 65220, 65043, 64827, 64571,
+    64277, 63944, 63572, 63162, 62714, 62228, 61705, 61145,
+    60547, 59914, 59244, 58538, 57798, 57022, 56212, 55368,
+    54491, 53581, 52639, 51665, 50660, 49624, 48559, 47464,
+    46341, 45190, 44011, 42806, 41576, 40320, 39040, 37736,
+    36410, 35062, 33692, 32303, 30893, 29466, 28020, 26558,
+    25080, 23586, 22078, 20557, 19024, 17479, 15924, 14359,
+    12785, 11204,  9616,  8022,  6424,  4821,  3216,  1608}};
+
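+// Divide by 2^bit, rounding the result to the nearest integer.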
+static INLINE int32_t round_shift(int32_t value, int bit) {
+  return (value + (1 << (bit - 1))) >> bit;
+}
+
+static INLINE void round_shift_array(int32_t *arr, int size, int bit) {
+  int i;
+  if (bit == 0) return;
+  if (bit > 0) {
+    for (i = 0; i < size; i++) {
+      arr[i] = round_shift(arr[i], bit);
+    }
+  } else {
+    for (i = 0; i < size; i++) {
+      arr[i] = arr[i] << (-bit);
+    }
+  }
+}
+
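+// Half-butterfly: returns round_shift(w0 * in0 + w1 * in1, bit), with an
+// optional 32-bit overflow check under CONFIG_COEFFICIENT_RANGE_CHECKING.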
+static INLINE int32_t half_btf(int32_t w0, int32_t in0, int32_t w1, int32_t in1,
+                               int bit) {
+  int32_t result_32 = w0 * in0 + w1 * in1;
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+  int64_t result_64 = (int64_t)w0 * (int64_t)in0 + (int64_t)w1 * (int64_t)in1;
+  if (result_32 != result_64) {
+    printf(
+        "%s overflow result_32: %d result_64: %lld w0: %d in0: %d w1: %d in1: "
+        "%d\n",
+        __func__, result_32, (long long int)result_64, w0, in0, w1, in1);
+    assert(0 && "half_btf overflow");
+  }
+#endif
+  return round_shift(result_32, bit);
+}
+
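+// Returns the index of the most significant set bit of x, or -1 for x == 0.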
+static INLINE int get_max_bit(int x) {
+  int max_bit = -1;
+  while (x) {
+    x = x >> 1;
+    max_bit++;
+  }
+  return max_bit;
+}
+
+// TODO(angiebird): implement SSE
+static INLINE void clamp_block(int16_t *block, int block_size, int stride,
+                               int low, int high) {
+  int i, j;
+  for (i = 0; i < block_size; ++i) {
+    for (j = 0; j < block_size; ++j) {
+      block[i * stride + j] = clamp(block[i * stride + j], low, high);
+    }
+  }
+}
+
+typedef void (*TxfmFunc)(const int32_t *input, int32_t *output,
+                         const int8_t *cos_bit, const int8_t *stage_range);
+
+typedef enum TXFM_TYPE {
+  TXFM_TYPE_DCT4,
+  TXFM_TYPE_DCT8,
+  TXFM_TYPE_DCT16,
+  TXFM_TYPE_DCT32,
+  TXFM_TYPE_DCT64,
+  TXFM_TYPE_ADST4,
+  TXFM_TYPE_ADST8,
+  TXFM_TYPE_ADST16,
+  TXFM_TYPE_ADST32,
+} TXFM_TYPE;
+
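+// Configuration of one separable 2-D transform: a 1-D transform type for the
+// columns and one for the rows, per-stage bit ranges and cosine bit-depths,
+// and the scaling shift applied after each of the two 1-D passes.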
+typedef struct TXFM_2D_CFG {
+  const int txfm_size;
+  const int stage_num_col;
+  const int stage_num_row;
+
+  const int8_t *shift;
+  const int8_t *stage_range_col;
+  const int8_t *stage_range_row;
+  const int8_t *cos_bit_col;
+  const int8_t *cos_bit_row;
+  const TXFM_TYPE txfm_type_col;
+  const TXFM_TYPE txfm_type_row;
+} TXFM_2D_CFG;
+
+typedef struct TXFM_2D_FLIP_CFG {
+  int ud_flip;  // flip upside down
+  int lr_flip;  // flip left to right
+  const TXFM_2D_CFG* cfg;
+} TXFM_2D_FLIP_CFG;
+
+static INLINE void set_flip_cfg(int tx_type, TXFM_2D_FLIP_CFG* cfg) {
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      cfg->ud_flip = 0;
+      cfg->lr_flip = 0;
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg->ud_flip = 1;
+      cfg->lr_flip = 0;
+      break;
+    case DCT_FLIPADST:
+      cfg->ud_flip = 0;
+      cfg->lr_flip = 1;
+      break;
+    case FLIPADST_FLIPADST:
+      cfg->ud_flip = 1;
+      cfg->lr_flip = 1;
+      break;
+    case ADST_FLIPADST:
+      cfg->ud_flip = 0;
+      cfg->lr_flip = 1;
+      break;
+    case FLIPADST_ADST:
+      cfg->ud_flip = 1;
+      cfg->lr_flip = 0;
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      cfg->ud_flip = 0;
+      cfg->lr_flip = 0;
+      assert(0);
+  }
+}
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+TXFM_2D_FLIP_CFG vp10_get_fwd_txfm_cfg(int tx_type, int tx_size);
+TXFM_2D_FLIP_CFG vp10_get_fwd_txfm_64x64_cfg(int tx_type);
+#ifdef __cplusplus
+}
+#endif  // __cplusplus
+
+#endif  // VP10_TXFM_H_
diff --git a/vp10/common/warped_motion.c b/vp10/common/warped_motion.c
new file mode 100644
index 0000000..3b924ea
--- /dev/null
+++ b/vp10/common/warped_motion.c
@@ -0,0 +1,692 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be
+ *  found  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "vp10/common/warped_motion.h"
+
+
+typedef void (*projectPointsType)(int *mat,
+                                  int *points,
+                                  int *proj,
+                                  const int n,
+                                  const int stride_points,
+                                  const int stride_proj,
+                                  const int subsampling_x,
+                                  const int subsampling_y);
+
+static void projectPointsHomography(int *mat,
+                                    int *points,
+                                    int *proj,
+                                    const int n,
+                                    const int stride_points,
+                                    const int stride_proj,
+                                    const int subsampling_x,
+                                    const int subsampling_y);
+static void projectPointsAffine(int *mat,
+                                int *points,
+                                int *proj,
+                                const int n,
+                                const int stride_points,
+                                const int stride_proj,
+                                const int subsampling_x,
+                                const int subsampling_y);
+static void projectPointsRotZoom(int *mat,
+                                 int *points,
+                                 int *proj,
+                                 const int n,
+                                 const int stride_points,
+                                 const int stride_proj,
+                                 const int subsampling_x,
+                                 const int subsampling_y);
+static void projectPointsTranslation(int *mat,
+                                     int *points,
+                                     int *proj,
+                                     const int n,
+                                     const int stride_points,
+                                     const int stride_proj,
+                                     const int subsampling_x,
+                                     const int subsampling_y);
+
+static projectPointsType get_projectPointsType(TransformationType type) {
+  switch (type) {
+    case HOMOGRAPHY:
+      return projectPointsHomography;
+    case AFFINE:
+      return projectPointsAffine;
+    case ROTZOOM:
+      return projectPointsRotZoom;
+    case TRANSLATION:
+      return projectPointsTranslation;
+    default:
+      assert(0);
+      return NULL;
+  }
+}
+
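+// Each projectPoints* function maps n (x, y) positions through the warp model
+// in mat[], writing subpel coordinates with WARPEDPIXEL_PREC_BITS of
+// fractional precision to proj[]; subsampling_x/subsampling_y adjust for
+// subsampled (chroma) planes.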
+static void projectPointsTranslation(int *mat, int *points, int *proj,
+                                     const int n,
+                                     const int stride_points,
+                                     const int stride_proj,
+                                     const int subsampling_x,
+                                     const int subsampling_y) {
+  int i;
+  for (i = 0; i < n; ++i) {
+    const int x = *(points++), y = *(points++);
+    if (subsampling_x)
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+          ((x << (WARPEDMODEL_PREC_BITS + 1)) + mat[0]),
+          WARPEDPIXEL_PREC_BITS + 1);
+    else
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+          ((x << WARPEDMODEL_PREC_BITS)) + mat[0],
+          WARPEDPIXEL_PREC_BITS);
+    if (subsampling_y)
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+          ((y << (WARPEDMODEL_PREC_BITS + 1)) + mat[1]),
+          WARPEDPIXEL_PREC_BITS + 1);
+    else
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+          ((y << WARPEDMODEL_PREC_BITS)) + mat[1],
+          WARPEDPIXEL_PREC_BITS);
+    points += stride_points - 2;
+    proj += stride_proj - 2;
+  }
+}
+
+static void projectPointsRotZoom(int *mat, int *points, int *proj,
+                                 const int n,
+                                 const int stride_points,
+                                 const int stride_proj,
+                                 const int subsampling_x,
+                                 const int subsampling_y) {
+  int i;
+  for (i = 0; i < n; ++i) {
+    const int x = *(points++), y = *(points++);
+    if (subsampling_x)
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+          mat[0] * 2 * x + mat[1] * 2 * y + mat[2] +
+          (mat[0] + mat[1] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+          WARPEDDIFF_PREC_BITS + 1);
+    else
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[0] * x + mat[1] * y + mat[2],
+                                            WARPEDDIFF_PREC_BITS);
+    if (subsampling_y)
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+          -mat[1] * 2 * x + mat[0] * 2 * y + mat[3] +
+          (-mat[1] + mat[0] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+          WARPEDDIFF_PREC_BITS + 1);
+    else
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(-mat[1] * x + mat[0] * y + mat[3],
+                                            WARPEDDIFF_PREC_BITS);
+    points += stride_points - 2;
+    proj += stride_proj - 2;
+  }
+}
+
+static void projectPointsAffine(int *mat, int *points, int *proj,
+                                const int n,
+                                const int stride_points,
+                                const int stride_proj,
+                                const int subsampling_x,
+                                const int subsampling_y) {
+  int i;
+  for (i = 0; i < n; ++i) {
+    const int x = *(points++), y = *(points++);
+    if (subsampling_x)
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+          mat[0] * 2 * x + mat[1] * 2 * y + mat[4] +
+          (mat[0] + mat[1] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+          WARPEDDIFF_PREC_BITS + 1);
+    else
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[0] * x + mat[1] * y + mat[4],
+                                            WARPEDDIFF_PREC_BITS);
+    if (subsampling_y)
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(
+          mat[2] * 2 * x + mat[3] * 2 * y + mat[5] +
+          (mat[2] + mat[3] - (1 << WARPEDMODEL_PREC_BITS)) / 2,
+          WARPEDDIFF_PREC_BITS + 1);
+    else
+      *(proj++) = ROUND_POWER_OF_TWO_SIGNED(mat[2] * x + mat[3] * y + mat[5],
+                                            WARPEDDIFF_PREC_BITS);
+    points += stride_points - 2;
+    proj += stride_proj - 2;
+  }
+}
+
+static void projectPointsHomography(int *mat, int *points, int *proj,
+                                    const int n,
+                                    const int stride_points,
+                                    const int stride_proj,
+                                    const int subsampling_x,
+                                    const int subsampling_y) {
+  int i;
+  int64_t x, y, Z;
+  int64_t xp, yp;
+  for (i = 0; i < n; ++i) {
+    x = *(points++), y = *(points++);
+    x = (subsampling_x ? 4 * x + 1 : 2 * x);
+    y = (subsampling_y ? 4 * y + 1 : 2 * y);
+
+    Z = (mat[6] * x + mat[7] * y + (1 << (WARPEDMODEL_ROW3HOMO_PREC_BITS + 1)));
+    xp = (mat[0] * x + mat[1] * y + 2 * mat[2])
+        << (WARPEDPIXEL_PREC_BITS +
+        WARPEDMODEL_ROW3HOMO_PREC_BITS - WARPEDMODEL_PREC_BITS);
+    yp = (mat[3] * x + mat[4] * y + 2 * mat[5])
+        << (WARPEDPIXEL_PREC_BITS +
+        WARPEDMODEL_ROW3HOMO_PREC_BITS - WARPEDMODEL_PREC_BITS);
+
+    xp = xp > 0 ? (xp + Z / 2) / Z : (xp - Z / 2) / Z;
+    yp = yp > 0 ? (yp + Z / 2) / Z : (yp - Z / 2) / Z;
+
+    if (subsampling_x)
+      xp = (xp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
+    if (subsampling_y)
+      yp = (yp - (1 << (WARPEDPIXEL_PREC_BITS - 1))) / 2;
+    *(proj++) = xp;
+    *(proj++) = yp;
+
+    points += stride_points - 2;
+    proj += stride_proj - 2;
+  }
+}
+
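+// 4-tap subpel kernel, one row per 1/64-pel phase (WARPEDPIXEL_PREC_SHIFTS
+// rows); each row of taps sums to 128 (1 << WARPEDPIXEL_FILTER_BITS).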
+static const int16_t
+filter_4tap[WARPEDPIXEL_PREC_SHIFTS][4] = {
+  {0, 128,   0, 0},
+  {-1, 127,   2, 0},
+  {-2, 127,   4, -1},
+  {-3, 126,   6, -1},
+  {-3, 125,   8, -2},
+  {-4, 124,  11, -3},
+  {-5, 123,  13, -3},
+  {-5, 121,  15, -3},
+  {-6, 120,  18, -4},
+  {-7, 119,  20, -4},
+  {-7, 118,  22, -5},
+  {-8, 116,  25, -5},
+  {-8, 115,  27, -6},
+  {-9, 113,  30, -6},
+  {-9, 112,  32, -7},
+  {-9, 110,  34, -7},
+  {-10, 108,  37, -7},
+  {-10, 107,  39, -8},
+  {-10, 105,  41, -8},
+  {-11, 103,  44, -8},
+  {-11, 101,  47, -9},
+  {-11,  99,  49, -9},
+  {-11,  97,  51, -9},
+  {-11,  95,  54, -10},
+  {-11,  93,  56, -10},
+  {-12,  91,  59, -10},
+  {-12,  89,  61, -10},
+  {-12,  87,  64, -11},
+  {-12,  85,  66, -11},
+  {-12,  82,  69, -11},
+  {-12,  80,  71, -11},
+  {-12,  78,  73, -11},
+  {-11,  75,  75, -11},
+  {-11,  73,  78, -12},
+  {-11,  71,  80, -12},
+  {-11,  69,  82, -12},
+  {-11,  66,  85, -12},
+  {-11,  64,  87, -12},
+  {-10,  61,  89, -12},
+  {-10,  59,  91, -12},
+  {-10,  56,  93, -11},
+  {-10,  54,  95, -11},
+  {-9, 51, 97, -11},
+  {-9, 49, 99, -11},
+  {-9,  47, 101, -11},
+  {-8,  44, 103, -11},
+  {-8,  41, 105, -10},
+  {-8,  39, 107, -10},
+  {-7,  37, 108, -10},
+  {-7,  34, 110, -9},
+  {-7,  32, 112, -9},
+  {-6,  30, 113, -9},
+  {-6,  27, 115, -8},
+  {-5,  25, 116, -8},
+  {-5,  22, 118, -7},
+  {-4,  20, 119, -7},
+  {-4,  18, 120, -6},
+  {-3,  15, 121, -5},
+  {-3,  13, 123, -5},
+  {-3,  11, 124, -4},
+  {-2,   8, 125, -3},
+  {-1,   6, 126, -3},
+  {-1,   4, 127, -2},
+  {0,   2, 127, -1},
+};
+
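+// 6-tap (WARPEDPIXEL_FILTER_TAPS) subpel kernel with the same layout and
+// normalization as filter_4tap above.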
+static const int16_t
+filter_ntap[WARPEDPIXEL_PREC_SHIFTS][WARPEDPIXEL_FILTER_TAPS] = {
+  {0,   0, 128,   0,   0, 0},
+  {0,  -1, 128,   2,  -1, 0},
+  {1,  -3, 127,   4,  -1, 0},
+  {1,  -4, 126,   6,  -2, 1},
+  {1,  -5, 126,   8,  -3, 1},
+  {1,  -6, 125,  11,  -4, 1},
+  {1,  -7, 124,  13,  -4, 1},
+  {2,  -8, 123,  15,  -5, 1},
+  {2,  -9, 122,  18,  -6, 1},
+  {2, -10, 121,  20,  -6, 1},
+  {2, -11, 120,  22,  -7, 2},
+  {2, -12, 119,  25,  -8, 2},
+  {3, -13, 117,  27,  -8, 2},
+  {3, -13, 116,  29,  -9, 2},
+  {3, -14, 114,  32, -10, 3},
+  {3, -15, 113,  35, -10, 2},
+  {3, -15, 111,  37, -11, 3},
+  {3, -16, 109,  40, -11, 3},
+  {3, -16, 108,  42, -12, 3},
+  {4, -17, 106,  45, -13, 3},
+  {4, -17, 104,  47, -13, 3},
+  {4, -17, 102,  50, -14, 3},
+  {4, -17, 100,  52, -14, 3},
+  {4, -18,  98,  55, -15, 4},
+  {4, -18,  96,  58, -15, 3},
+  {4, -18,  94,  60, -16, 4},
+  {4, -18,  91,  63, -16, 4},
+  {4, -18,  89,  65, -16, 4},
+  {4, -18,  87,  68, -17, 4},
+  {4, -18,  85,  70, -17, 4},
+  {4, -18,  82,  73, -17, 4},
+  {4, -18,  80,  75, -17, 4},
+  {4, -18,  78,  78, -18, 4},
+  {4, -17,  75,  80, -18, 4},
+  {4, -17,  73,  82, -18, 4},
+  {4, -17,  70,  85, -18, 4},
+  {4, -17,  68,  87, -18, 4},
+  {4, -16,  65,  89, -18, 4},
+  {4, -16,  63,  91, -18, 4},
+  {4, -16,  60,  94, -18, 4},
+  {3, -15,  58,  96, -18, 4},
+  {4, -15,  55,  98, -18, 4},
+  {3, -14,  52, 100, -17, 4},
+  {3, -14,  50, 102, -17, 4},
+  {3, -13,  47, 104, -17, 4},
+  {3, -13,  45, 106, -17, 4},
+  {3, -12,  42, 108, -16, 3},
+  {3, -11,  40, 109, -16, 3},
+  {3, -11,  37, 111, -15, 3},
+  {2, -10,  35, 113, -15, 3},
+  {3, -10,  32, 114, -14, 3},
+  {2,  -9,  29, 116, -13, 3},
+  {2,  -8,  27, 117, -13, 3},
+  {2,  -8,  25, 119, -12, 2},
+  {2,  -7,  22, 120, -11, 2},
+  {1,  -6,  20, 121, -10, 2},
+  {1,  -6,  18, 122,  -9, 2},
+  {1,  -5,  15, 123,  -8, 2},
+  {1,  -4,  13, 124,  -7, 1},
+  {1,  -4,  11, 125,  -6, 1},
+  {1,  -3,   8, 126,  -5, 1},
+  {1,  -2,   6, 126,  -4, 1},
+  {0,  -1,   4, 127,  -3, 1},
+  {0,  -1,   2, 128,  -1, 0},
+};
+
+static int32_t do_ntap_filter(int32_t *p, int x) {
+  int i;
+  int32_t sum = 0;
+  for (i = 0; i < WARPEDPIXEL_FILTER_TAPS; ++i) {
+    sum += p[i - WARPEDPIXEL_FILTER_TAPS / 2 + 1] * filter_ntap[x][i];
+  }
+  return sum;
+}
+
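+// Catmull-Rom cubic interpolation between p[0] and p[1]; x is the fractional
+// position in units of 1/(1 << WARPEDPIXEL_PREC_BITS).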
+static int32_t do_cubic_filter(int32_t *p, int x) {
+  if (x == 0) {
+    return p[0];
+  } else if (x == (1 << WARPEDPIXEL_PREC_BITS)) {
+    return p[1];
+  } else {
+    const int64_t v1 = x * x * x * (3 * (p[0] - p[1]) + p[2] - p[-1]);
+    const int64_t v2 = x * x * (2 * p[-1] - 5 * p[0] + 4 * p[1] - p[2]);
+    const int64_t v3 = x * (p[1] - p[-1]);
+    const int64_t v4 = 2 * p[0];
+    return (int32_t)ROUND_POWER_OF_TWO_SIGNED(
+        (v4 << (3 * WARPEDPIXEL_PREC_BITS)) +
+        (v3 << (2 * WARPEDPIXEL_PREC_BITS)) +
+        (v2 << WARPEDPIXEL_PREC_BITS) + v1,
+        3 * WARPEDPIXEL_PREC_BITS + 1 - WARPEDPIXEL_FILTER_BITS);
+  }
+}
+
+/*
+static int32_t do_linear_filter(int32_t *p, int x) {
+  int32_t sum = 0;
+  sum = p[0] * (WARPEDPIXEL_PREC_SHIFTS - x) + p[1] * x;
+  sum <<= (WARPEDPIXEL_FILTER_BITS - WARPEDPIXEL_PREC_BITS);
+  return sum;
+}
+
+static int32_t do_4tap_filter(int32_t *p, int x) {
+  int i;
+  int32_t sum = 0;
+  for (i = 0; i < 4; ++i) {
+    sum += p[i - 1] * filter_4tap[x][i];
+  }
+  return sum;
+}
+*/
+
+static INLINE void get_subcolumn(int taps, uint8_t *ref, int32_t *col,
+                                 int stride, int x, int y_start) {
+  int i;
+  for (i = 0; i < taps; ++i) {
+    col[i] = ref[(i + y_start) * stride + x];
+  }
+}
+
+static uint8_t bi_ntap_filter(uint8_t *ref, int x, int y, int stride) {
+  int32_t val, arr[WARPEDPIXEL_FILTER_TAPS];
+  int k;
+  int i = (int) x >> WARPEDPIXEL_PREC_BITS;
+  int j = (int) y >> WARPEDPIXEL_PREC_BITS;
+  for (k = 0; k < WARPEDPIXEL_FILTER_TAPS; ++k) {
+    int32_t arr_temp[WARPEDPIXEL_FILTER_TAPS];
+    get_subcolumn(WARPEDPIXEL_FILTER_TAPS, ref, arr_temp, stride,
+                  i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
+                  j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
+    arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
+                            y - (j << WARPEDPIXEL_PREC_BITS));
+  }
+  val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
+                       x - (i << WARPEDPIXEL_PREC_BITS));
+  val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
+  return (uint8_t)clip_pixel(val);
+}
+
+static uint8_t bi_cubic_filter(uint8_t *ref, int x, int y, int stride) {
+  int32_t val, arr[4];
+  int k;
+  int i = (int) x >> WARPEDPIXEL_PREC_BITS;
+  int j = (int) y >> WARPEDPIXEL_PREC_BITS;
+  for (k = 0; k < 4; ++k) {
+    int32_t arr_temp[4];
+    get_subcolumn(4, ref, arr_temp, stride,
+                  i + k - 1, j - 1);
+    arr[k] = do_cubic_filter(arr_temp + 1, y - (j << WARPEDPIXEL_PREC_BITS));
+  }
+  val = do_cubic_filter(arr + 1, x - (i << WARPEDPIXEL_PREC_BITS));
+  val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
+  return (uint8_t)clip_pixel(val);
+}
+
+static uint8_t bi_linear_filter(uint8_t *ref, int x, int y, int stride) {
+  const int ix = x >> WARPEDPIXEL_PREC_BITS;
+  const int iy = y >> WARPEDPIXEL_PREC_BITS;
+  const int sx = x - (ix << WARPEDPIXEL_PREC_BITS);
+  const int sy = y - (iy << WARPEDPIXEL_PREC_BITS);
+  int32_t val;
+  val = ROUND_POWER_OF_TWO_SIGNED(
+      ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
+                              (WARPEDPIXEL_PREC_SHIFTS - sx) +
+      ref[iy * stride + ix + 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) * sx +
+      ref[(iy + 1) * stride + ix] * sy * (WARPEDPIXEL_PREC_SHIFTS - sx) +
+      ref[(iy + 1) * stride + ix + 1] * sy * sx,
+      WARPEDPIXEL_PREC_BITS * 2);
+  return (uint8_t)clip_pixel(val);
+}
+
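+// Sample ref at subpel position (x, y): replicate pixels outside the frame,
+// use the 6-tap filter in the interior, and fall back to cubic and then
+// bilinear interpolation when too close to an edge for the longer filters.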
+static uint8_t warp_interpolate(uint8_t *ref, int x, int y,
+                                int width, int height, int stride) {
+  int ix = x >> WARPEDPIXEL_PREC_BITS;
+  int iy = y >> WARPEDPIXEL_PREC_BITS;
+  int sx = x - (ix << WARPEDPIXEL_PREC_BITS);
+  int sy = y - (iy << WARPEDPIXEL_PREC_BITS);
+  int32_t v;
+
+  if (ix < 0 && iy < 0)
+    return ref[0];
+  else if (ix < 0 && iy > height - 1)
+    return ref[(height - 1) * stride];
+  else if (ix > width - 1 && iy < 0)
+    return ref[width - 1];
+  else if (ix > width - 1 && iy > height - 1)
+    return ref[(height - 1) * stride + (width - 1)];
+  else if (ix < 0) {
+    v = ROUND_POWER_OF_TWO_SIGNED(
+        ref[iy * stride] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
+        ref[(iy + 1) * stride] * sy,
+        WARPEDPIXEL_PREC_BITS);
+    return clip_pixel(v);
+  } else if (iy < 0) {
+    v = ROUND_POWER_OF_TWO_SIGNED(
+        ref[ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) +
+        ref[ix + 1] * sx,
+        WARPEDPIXEL_PREC_BITS);
+    return clip_pixel(v);
+  } else if (ix > width - 1) {
+    v = ROUND_POWER_OF_TWO_SIGNED(
+        ref[iy * stride + width - 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
+        ref[(iy + 1) * stride + width - 1] * sy,
+        WARPEDPIXEL_PREC_BITS);
+    return clip_pixel(v);
+  } else if (iy > height - 1) {
+    v = ROUND_POWER_OF_TWO_SIGNED(
+        ref[(height - 1) * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) +
+        ref[(height - 1) * stride + ix + 1] * sx,
+        WARPEDPIXEL_PREC_BITS);
+    return clip_pixel(v);
+  } else if (ix >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
+             iy >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
+             ix < width - WARPEDPIXEL_FILTER_TAPS / 2 &&
+             iy < height - WARPEDPIXEL_FILTER_TAPS / 2) {
+    return bi_ntap_filter(ref, x, y, stride);
+  } else if (ix >= 1 && iy >= 1 &&
+             ix < width - 2 && iy < height - 2) {
+    return bi_cubic_filter(ref, x, y, stride);
+  } else {
+    return bi_linear_filter(ref, x, y, stride);
+  }
+}
+
+void vp10_warp_plane(WarpedMotionParams *wm,
+                     uint8_t *ref,
+                     int width, int height, int stride,
+                     uint8_t *pred,
+                     int p_col, int p_row,
+                     int p_width, int p_height, int p_stride,
+                     int subsampling_x, int subsampling_y,
+                     int x_scale, int y_scale) {
+  int i, j;
+  projectPointsType projectPoints = get_projectPointsType(wm->wmtype);
+  if (projectPoints == NULL)
+    return;
+  for (i = p_row; i < p_row + p_height; ++i) {
+    for (j = p_col; j < p_col + p_width; ++j) {
+      int in[2], out[2];
+      // Project the (column, row) position of each prediction pixel.
+      in[0] = j;
+      in[1] = i;
+      projectPoints(wm->wmmat, in, out, 1, 2, 2, subsampling_x, subsampling_y);
+      out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, 4);
+      out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, 4);
+      pred[(j - p_col) + (i - p_row) * p_stride] =
+          warp_interpolate(ref, out[0], out[1], width, height, stride);
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_get_subcolumn(int taps, uint16_t *ref, int32_t *col,
+                                        int stride, int x, int y_start) {
+  int i;
+  for (i = 0; i < taps; ++i) {
+    col[i] = ref[(i + y_start) * stride + x];
+  }
+}
+
+static uint16_t highbd_bi_ntap_filter(uint16_t *ref,
+                                      int x, int y, int stride,
+                                      int bd) {
+  int32_t val, arr[WARPEDPIXEL_FILTER_TAPS];
+  int k;
+  int i = (int) x >> WARPEDPIXEL_PREC_BITS;
+  int j = (int) y >> WARPEDPIXEL_PREC_BITS;
+  for (k = 0; k < WARPEDPIXEL_FILTER_TAPS; ++k) {
+    int32_t arr_temp[WARPEDPIXEL_FILTER_TAPS];
+    highbd_get_subcolumn(WARPEDPIXEL_FILTER_TAPS, ref, arr_temp, stride,
+                         i + k + 1 - WARPEDPIXEL_FILTER_TAPS / 2,
+                         j + 1 - WARPEDPIXEL_FILTER_TAPS / 2);
+    arr[k] = do_ntap_filter(arr_temp + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
+                            y - (j << WARPEDPIXEL_PREC_BITS));
+  }
+  val = do_ntap_filter(arr + WARPEDPIXEL_FILTER_TAPS / 2 - 1,
+                       x - (i << WARPEDPIXEL_PREC_BITS));
+  val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
+  return (uint16_t)highbd_clip_pixel(val, bd);
+}
+
+static uint16_t highbd_bi_cubic_filter(uint16_t *ref,
+                                       int x, int y, int stride,
+                                       int bd) {
+  int32_t val, arr[4];
+  int k;
+  int i = (int) x >> WARPEDPIXEL_PREC_BITS;
+  int j = (int) y >> WARPEDPIXEL_PREC_BITS;
+  for (k = 0; k < 4; ++k) {
+    int32_t arr_temp[4];
+    highbd_get_subcolumn(4, ref, arr_temp, stride,
+                         i + k - 1, j - 1);
+    arr[k] = do_cubic_filter(arr_temp + 1, y - (j << WARPEDPIXEL_PREC_BITS));
+  }
+  val = do_cubic_filter(arr + 1, x - (i << WARPEDPIXEL_PREC_BITS));
+  val = ROUND_POWER_OF_TWO_SIGNED(val, WARPEDPIXEL_FILTER_BITS * 2);
+  return (uint16_t)highbd_clip_pixel(val, bd);
+}
+
+static uint16_t highbd_bi_linear_filter(uint16_t *ref,
+                                        int x, int y, int stride,
+                                        int bd) {
+  const int ix = x >> WARPEDPIXEL_PREC_BITS;
+  const int iy = y >> WARPEDPIXEL_PREC_BITS;
+  const int sx = x - (ix << WARPEDPIXEL_PREC_BITS);
+  const int sy = y - (iy << WARPEDPIXEL_PREC_BITS);
+  int32_t val;
+  val = ROUND_POWER_OF_TWO_SIGNED(
+      ref[iy * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sy) *
+                              (WARPEDPIXEL_PREC_SHIFTS - sx) +
+      ref[iy * stride + ix + 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) * sx +
+      ref[(iy + 1) * stride + ix] * sy * (WARPEDPIXEL_PREC_SHIFTS - sx) +
+      ref[(iy + 1) * stride + ix + 1] * sy * sx,
+      WARPEDPIXEL_PREC_BITS * 2);
+  return (uint16_t)highbd_clip_pixel(val, bd);
+}
+
+static uint16_t highbd_warp_interpolate(uint16_t *ref,
+                                        int x, int y,
+                                        int width, int height, int stride,
+                                        int bd) {
+  int ix = x >> WARPEDPIXEL_PREC_BITS;
+  int iy = y >> WARPEDPIXEL_PREC_BITS;
+  int sx = x - (ix << WARPEDPIXEL_PREC_BITS);
+  int sy = y - (iy << WARPEDPIXEL_PREC_BITS);
+  int32_t v;
+
+  if (ix < 0 && iy < 0)
+    return ref[0];
+  else if (ix < 0 && iy > height - 1)
+    return ref[(height - 1) * stride];
+  else if (ix > width - 1 && iy < 0)
+    return ref[width - 1];
+  else if (ix > width - 1 && iy > height - 1)
+    return ref[(height - 1) * stride + (width - 1)];
+  else if (ix < 0) {
+    v = ROUND_POWER_OF_TWO_SIGNED(
+        ref[iy * stride] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
+        ref[(iy + 1) * stride] * sy,
+        WARPEDPIXEL_PREC_BITS);
+    return highbd_clip_pixel(v, bd);
+  } else if (iy < 0) {
+    v = ROUND_POWER_OF_TWO_SIGNED(
+        ref[ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) +
+        ref[ix + 1] * sx,
+        WARPEDPIXEL_PREC_BITS);
+    return highbd_clip_pixel(v, bd);
+  } else if (ix > width - 1) {
+    v = ROUND_POWER_OF_TWO_SIGNED(
+        ref[iy * stride + width - 1] * (WARPEDPIXEL_PREC_SHIFTS - sy) +
+        ref[(iy + 1) * stride + width - 1] * sy,
+        WARPEDPIXEL_PREC_BITS);
+    return highbd_clip_pixel(v, bd);
+  } else if (iy > height - 1) {
+    v = ROUND_POWER_OF_TWO_SIGNED(
+        ref[(height - 1) * stride + ix] * (WARPEDPIXEL_PREC_SHIFTS - sx) +
+        ref[(height - 1) * stride + ix + 1] * sx,
+        WARPEDPIXEL_PREC_BITS);
+    return highbd_clip_pixel(v, bd);
+  } else if (ix >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
+             iy >= WARPEDPIXEL_FILTER_TAPS / 2 - 1 &&
+             ix < width - WARPEDPIXEL_FILTER_TAPS / 2 &&
+             iy < height - WARPEDPIXEL_FILTER_TAPS / 2) {
+    return highbd_bi_ntap_filter(ref, x, y, stride, bd);
+  } else if (ix >= 1 && iy >= 1 &&
+             ix < width - 2 && iy < height - 2) {
+    return highbd_bi_cubic_filter(ref, x, y, stride, bd);
+  } else {
+    return highbd_bi_linear_filter(ref, x, y, stride, bd);
+  }
+}
+
+void vp10_highbd_warp_plane(WarpedMotionParams *wm,
+                            uint8_t *ref8,
+                            int width, int height, int stride,
+                            uint8_t *pred8,
+                            int p_col, int p_row,
+                            int p_width, int p_height, int p_stride,
+                            int subsampling_col, int subsampling_row,
+                            int x_scale, int y_scale,
+                            int bd) {
+  int i, j;
+  projectPointsType projectPoints = get_projectPointsType(wm->wmtype);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  if (projectPoints == NULL)
+    return;
+  for (i = p_row; i < p_row + p_height; ++i) {
+    for (j = p_col; j < p_col + p_width; ++j) {
+      int in[2], out[2];
+      // Project the (column, row) position of each prediction pixel.
+      in[0] = j;
+      in[1] = i;
+      projectPoints(wm->wmmat, in, out, 1, 2, 2,
+                    subsampling_col, subsampling_row);
+      out[0] = ROUND_POWER_OF_TWO_SIGNED(out[0] * x_scale, 4);
+      out[1] = ROUND_POWER_OF_TWO_SIGNED(out[1] * y_scale, 4);
+      pred[(j - p_col) + (i - p_row) * p_stride] =
+          highbd_warp_interpolate(
+              ref, out[0], out[1], width, height, stride, bd);
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vp10_integerize_model(double *H, TransformationType wmtype,
+                           WarpedMotionParams *wm) {
+  wm->wmtype = wmtype;
+  switch (wmtype) {
+    case HOMOGRAPHY:
+      assert(fabs(H[8] - 1.0) < 1e-12);
+      wm->wmmat[7] = rint(H[7] * (1 << WARPEDMODEL_ROW3HOMO_PREC_BITS));
+      wm->wmmat[6] = rint(H[6] * (1 << WARPEDMODEL_ROW3HOMO_PREC_BITS));
+      // fall through: each model is a superset of the ones below
+    case AFFINE:
+      wm->wmmat[5] = rint(H[5] * (1 << WARPEDMODEL_PREC_BITS));
+      wm->wmmat[4] = rint(H[4] * (1 << WARPEDMODEL_PREC_BITS));
+      // fall through
+    case ROTZOOM:
+      wm->wmmat[3] = rint(H[3] * (1 << WARPEDMODEL_PREC_BITS));
+      wm->wmmat[2] = rint(H[2] * (1 << WARPEDMODEL_PREC_BITS));
+      // fall through
+    case TRANSLATION:
+      wm->wmmat[1] = rint(H[1] * (1 << WARPEDMODEL_PREC_BITS));
+      wm->wmmat[0] = rint(H[0] * (1 << WARPEDMODEL_PREC_BITS));
+      break;
+    default:
+      assert(0);
+  }
+}
diff --git a/vp10/common/warped_motion.h b/vp10/common/warped_motion.h
new file mode 100644
index 0000000..1d25688
--- /dev/null
+++ b/vp10/common/warped_motion.h
@@ -0,0 +1,77 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be
+ *  found  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_WARPED_MOTION_H
+#define VP10_COMMON_WARPED_MOTION_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <memory.h>
+#include <math.h>
+#include <assert.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+// Bits of precision used for the model
+#define WARPEDMODEL_PREC_BITS    8
+#define WARPEDMODEL_ROW3HOMO_PREC_BITS    12
+
+// Bits of subpel precision for warped interpolation
+#define WARPEDPIXEL_PREC_BITS    6
+#define WARPEDPIXEL_PREC_SHIFTS  (1 << WARPEDPIXEL_PREC_BITS)
+
+// Taps for ntap filter
+#define WARPEDPIXEL_FILTER_TAPS  6
+
+// Precision of filter taps
+#define WARPEDPIXEL_FILTER_BITS  7
+
+#define WARPEDDIFF_PREC_BITS  (WARPEDMODEL_PREC_BITS - WARPEDPIXEL_PREC_BITS)
+
+typedef enum {
+  UNKNOWN_TRANSFORM = -1,
+  HOMOGRAPHY,      // homography, 8-parameter
+  AFFINE,          // affine, 6-parameter
+  ROTZOOM,         // simplified affine with rotation and zoom only, 4-parameter
+  TRANSLATION      // translational motion, 2-parameter
+} TransformationType;
+
+typedef struct {
+  TransformationType wmtype;
+  int wmmat[8];  // For homography the ninth matrix element, H[8], is assumed to be 1
+} WarpedMotionParams;
+
+// Integerize model into the WarpedMotionParams structure
+void vp10_integerize_model(double *H,
+                           TransformationType wmtype,
+                           WarpedMotionParams *wm);
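+//
+// Minimal usage sketch (the H values below are illustrative only):
+//   double H[9] = {4.0, -2.0, 0, 0, 0, 0, 0, 0, 1.0};
+//   WarpedMotionParams wm;
+//   vp10_integerize_model(H, TRANSLATION, &wm);
+//   // wm.wmmat[0] == 1024, wm.wmmat[1] == -512 (WARPEDMODEL_PREC_BITS == 8)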
+
+void vp10_warp_plane(WarpedMotionParams *wm,
+                     uint8_t *ref,
+                     int width, int height, int stride,
+                     uint8_t *pred,
+                     int p_col, int p_row,
+                     int p_width, int p_height, int p_stride,
+                     int subsampling_col, int subsampling_row,
+                     int x_scale, int y_scale);
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_warp_plane(WarpedMotionParams *wm,
+                            uint8_t *ref,
+                            int width, int height, int stride,
+                            uint8_t *pred,
+                            int p_col, int p_row,
+                            int p_width, int p_height, int p_stride,
+                            int subsampling_col, int subsampling_row,
+                            int x_scale, int y_scale,
+                            int bd);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // VP10_COMMON_WARPED_MOTION_H
diff --git a/vp10/common/x86/highbd_inv_txfm_sse4.c b/vp10/common/x86/highbd_inv_txfm_sse4.c
new file mode 100644
index 0000000..349aec5
--- /dev/null
+++ b/vp10/common/x86/highbd_inv_txfm_sse4.c
@@ -0,0 +1,1401 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>  /* SSE4.1 */
+
+#include "./vp10_rtcd.h"
+#include "./vpx_config.h"
+#include "vp10/common/vp10_inv_txfm2d_cfg.h"
+#include "vp10/common/x86/highbd_txfm_utility_sse4.h"
+
+static INLINE void load_buffer_4x4(const int32_t *coeff, __m128i *in) {
+  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
+  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
+}
+
+static void idct4x4_sse4_1(__m128i *in, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3, x, y;
+
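+  // Transpose the 4x4 input tile: interleave 32-bit lanes, then 64-bit lanes.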
+  v0 = _mm_unpacklo_epi32(in[0], in[1]);
+  v1 = _mm_unpackhi_epi32(in[0], in[1]);
+  v2 = _mm_unpacklo_epi32(in[2], in[3]);
+  v3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+  u0 = _mm_unpacklo_epi64(v0, v2);
+  u1 = _mm_unpackhi_epi64(v0, v2);
+  u2 = _mm_unpacklo_epi64(v1, v3);
+  u3 = _mm_unpackhi_epi64(v1, v3);
+
+  x = _mm_mullo_epi32(u0, cospi32);
+  y = _mm_mullo_epi32(u2, cospi32);
+  v0 = _mm_add_epi32(x, y);
+  v0 = _mm_add_epi32(v0, rnding);
+  v0 = _mm_srai_epi32(v0, bit);
+
+  v1 = _mm_sub_epi32(x, y);
+  v1 = _mm_add_epi32(v1, rnding);
+  v1 = _mm_srai_epi32(v1, bit);
+
+  x = _mm_mullo_epi32(u1, cospi48);
+  y = _mm_mullo_epi32(u3, cospim16);
+  v2 = _mm_add_epi32(x, y);
+  v2 = _mm_add_epi32(v2, rnding);
+  v2 = _mm_srai_epi32(v2, bit);
+
+  x = _mm_mullo_epi32(u1, cospi16);
+  y = _mm_mullo_epi32(u3, cospi48);
+  v3 = _mm_add_epi32(x, y);
+  v3 = _mm_add_epi32(v3, rnding);
+  v3 = _mm_srai_epi32(v3, bit);
+
+  in[0] = _mm_add_epi32(v0, v3);
+  in[1] = _mm_add_epi32(v1, v2);
+  in[2] = _mm_sub_epi32(v1, v2);
+  in[3] = _mm_sub_epi32(v0, v3);
+}
+
+static void iadst4x4_sse4_1(__m128i *in, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i zero = _mm_setzero_si128();
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3, x, y;
+
+  v0 = _mm_unpacklo_epi32(in[0], in[1]);
+  v1 = _mm_unpackhi_epi32(in[0], in[1]);
+  v2 = _mm_unpacklo_epi32(in[2], in[3]);
+  v3 = _mm_unpackhi_epi32(in[2], in[3]);
+
+  u0 = _mm_unpacklo_epi64(v0, v2);
+  u1 = _mm_unpackhi_epi64(v0, v2);
+  u2 = _mm_unpacklo_epi64(v1, v3);
+  u3 = _mm_unpackhi_epi64(v1, v3);
+
+  // stage 0
+  // stage 1
+  u1 = _mm_sub_epi32(zero, u1);
+  u3 = _mm_sub_epi32(zero, u3);
+
+  // stage 2
+  v0 = u0;
+  v1 = u3;
+  x = _mm_mullo_epi32(u1, cospi32);
+  y = _mm_mullo_epi32(u2, cospi32);
+  v2 = _mm_add_epi32(x, y);
+  v2 = _mm_add_epi32(v2, rnding);
+  v2 = _mm_srai_epi32(v2, bit);
+
+  v3 = _mm_sub_epi32(x, y);
+  v3 = _mm_add_epi32(v3, rnding);
+  v3 = _mm_srai_epi32(v3, bit);
+
+  // stage 3
+  u0 = _mm_add_epi32(v0, v2);
+  u1 = _mm_add_epi32(v1, v3);
+  u2 = _mm_sub_epi32(v0, v2);
+  u3 = _mm_sub_epi32(v1, v3);
+
+  // stage 4
+  x = _mm_mullo_epi32(u0, cospi8);
+  y = _mm_mullo_epi32(u1, cospi56);
+  in[3] = _mm_add_epi32(x, y);
+  in[3] = _mm_add_epi32(in[3], rnding);
+  in[3] = _mm_srai_epi32(in[3], bit);
+
+  x = _mm_mullo_epi32(u0, cospi56);
+  y = _mm_mullo_epi32(u1, cospim8);
+  in[0] = _mm_add_epi32(x, y);
+  in[0] = _mm_add_epi32(in[0], rnding);
+  in[0] = _mm_srai_epi32(in[0], bit);
+
+  x = _mm_mullo_epi32(u2, cospi40);
+  y = _mm_mullo_epi32(u3, cospi24);
+  in[1] = _mm_add_epi32(x, y);
+  in[1] = _mm_add_epi32(in[1], rnding);
+  in[1] = _mm_srai_epi32(in[1], bit);
+
+  x = _mm_mullo_epi32(u2, cospi24);
+  y = _mm_mullo_epi32(u3, cospim40);
+  in[2] = _mm_add_epi32(x, y);
+  in[2] = _mm_add_epi32(in[2], rnding);
+  in[2] = _mm_srai_epi32(in[2], bit);
+}
+
+static INLINE void round_shift_4x4(__m128i *in, int shift) {
+  __m128i rnding = _mm_set1_epi32(1 << (shift - 1));
+
+  in[0] = _mm_add_epi32(in[0], rnding);
+  in[1] = _mm_add_epi32(in[1], rnding);
+  in[2] = _mm_add_epi32(in[2], rnding);
+  in[3] = _mm_add_epi32(in[3], rnding);
+
+  in[0] = _mm_srai_epi32(in[0], shift);
+  in[1] = _mm_srai_epi32(in[1], shift);
+  in[2] = _mm_srai_epi32(in[2], shift);
+  in[3] = _mm_srai_epi32(in[3], shift);
+}
+
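+// Clamp eight 16-bit values to [0, (1 << bd) - 1]. Note that
+// _mm_cmpgt_epi16 is a signed compare, so inputs are assumed to fit in
+// the signed 16-bit range.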
+static INLINE __m128i highbd_clamp_epi16(__m128i u, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+  __m128i clamped, mask;
+
+  mask = _mm_cmpgt_epi16(u, max);
+  clamped = _mm_andnot_si128(mask, u);
+  mask = _mm_and_si128(mask, max);
+  clamped = _mm_or_si128(mask, clamped);
+  mask = _mm_cmpgt_epi16(clamped, zero);
+  clamped = _mm_and_si128(clamped, mask);
+
+  return clamped;
+}
+
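+// Add the 4x4 residual in |in| to the prediction pixels already in
+// |output| (optionally flipped left/right and/or up/down), clamp to
+// bd bits and store the result back to |output|.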
+static void write_buffer_4x4(__m128i *in, uint16_t *output, int stride,
+                             int fliplr, int flipud, int shift, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+
+  round_shift_4x4(in, shift);
+
+  v0 = _mm_loadl_epi64((__m128i const *)(output + 0 * stride));
+  v1 = _mm_loadl_epi64((__m128i const *)(output + 1 * stride));
+  v2 = _mm_loadl_epi64((__m128i const *)(output + 2 * stride));
+  v3 = _mm_loadl_epi64((__m128i const *)(output + 3 * stride));
+
+  v0 = _mm_unpacklo_epi16(v0, zero);
+  v1 = _mm_unpacklo_epi16(v1, zero);
+  v2 = _mm_unpacklo_epi16(v2, zero);
+  v3 = _mm_unpacklo_epi16(v3, zero);
+
+  if (fliplr) {
+    in[0] = _mm_shuffle_epi32(in[0], 0x1B);
+    in[1] = _mm_shuffle_epi32(in[1], 0x1B);
+    in[2] = _mm_shuffle_epi32(in[2], 0x1B);
+    in[3] = _mm_shuffle_epi32(in[3], 0x1B);
+  }
+
+  if (flipud) {
+    u0 = _mm_add_epi32(in[3], v0);
+    u1 = _mm_add_epi32(in[2], v1);
+    u2 = _mm_add_epi32(in[1], v2);
+    u3 = _mm_add_epi32(in[0], v3);
+  } else {
+    u0 = _mm_add_epi32(in[0], v0);
+    u1 = _mm_add_epi32(in[1], v1);
+    u2 = _mm_add_epi32(in[2], v2);
+    u3 = _mm_add_epi32(in[3], v3);
+  }
+
+  v0 = _mm_packus_epi32(u0, u1);
+  v2 = _mm_packus_epi32(u2, u3);
+
+  u0 = highbd_clamp_epi16(v0, bd);
+  u2 = highbd_clamp_epi16(v2, bd);
+
+  v0 = _mm_unpacklo_epi64(u0, u0);
+  v1 = _mm_unpackhi_epi64(u0, u0);
+  v2 = _mm_unpacklo_epi64(u2, u2);
+  v3 = _mm_unpackhi_epi64(u2, u2);
+
+  _mm_storel_epi64((__m128i *)(output + 0 * stride), v0);
+  _mm_storel_epi64((__m128i *)(output + 1 * stride), v1);
+  _mm_storel_epi64((__m128i *)(output + 2 * stride), v2);
+  _mm_storel_epi64((__m128i *)(output + 3 * stride), v3);
+}
+
+void vp10_inv_txfm2d_add_4x4_sse4_1(const int32_t *coeff, uint16_t *output,
+                                    int stride, int tx_type, int bd) {
+  __m128i in[4];
+  const TXFM_2D_CFG *cfg = NULL;
+
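+  // The config stores the final shift as a negative value (a right
+  // shift), so -cfg->shift[1] below is the positive shift amount that
+  // write_buffer_4x4() expects.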
+  switch (tx_type) {
+    case DCT_DCT:
+      cfg = &inv_txfm_2d_cfg_dct_dct_4;
+      load_buffer_4x4(coeff, in);
+      idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      break;
+    case ADST_DCT:
+      cfg = &inv_txfm_2d_cfg_adst_dct_4;
+      load_buffer_4x4(coeff, in);
+      idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      break;
+    case DCT_ADST:
+      cfg = &inv_txfm_2d_cfg_dct_adst_4;
+      load_buffer_4x4(coeff, in);
+      iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      break;
+    case ADST_ADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(coeff, in);
+      iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &inv_txfm_2d_cfg_adst_dct_4;
+      load_buffer_4x4(coeff, in);
+      idct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd);
+      break;
+    case DCT_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_dct_adst_4;
+      load_buffer_4x4(coeff, in);
+      iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      idct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd);
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(coeff, in);
+      iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, 1, 1, -cfg->shift[1], bd);
+      break;
+    case ADST_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(coeff, in);
+      iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, 1, 0, -cfg->shift[1], bd);
+      break;
+    case FLIPADST_ADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(coeff, in);
+      iadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      iadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      write_buffer_4x4(in, output, stride, 0, 1, -cfg->shift[1], bd);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+  }
+}
+
+// 8x8
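+// An 8x8 block occupies sixteen registers: row r is split across in[2r]
+// (coeffs 0-3) and in[2r + 1] (coeffs 4-7).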
+static void load_buffer_8x8(const int32_t *coeff, __m128i *in) {
+  in[0] = _mm_load_si128((const __m128i *)(coeff + 0));
+  in[1] = _mm_load_si128((const __m128i *)(coeff + 4));
+  in[2] = _mm_load_si128((const __m128i *)(coeff + 8));
+  in[3] = _mm_load_si128((const __m128i *)(coeff + 12));
+  in[4] = _mm_load_si128((const __m128i *)(coeff + 16));
+  in[5] = _mm_load_si128((const __m128i *)(coeff + 20));
+  in[6] = _mm_load_si128((const __m128i *)(coeff + 24));
+  in[7] = _mm_load_si128((const __m128i *)(coeff + 28));
+  in[8] = _mm_load_si128((const __m128i *)(coeff + 32));
+  in[9] = _mm_load_si128((const __m128i *)(coeff + 36));
+  in[10] = _mm_load_si128((const __m128i *)(coeff + 40));
+  in[11] = _mm_load_si128((const __m128i *)(coeff + 44));
+  in[12] = _mm_load_si128((const __m128i *)(coeff + 48));
+  in[13] = _mm_load_si128((const __m128i *)(coeff + 52));
+  in[14] = _mm_load_si128((const __m128i *)(coeff + 56));
+  in[15] = _mm_load_si128((const __m128i *)(coeff + 60));
+}
+
+static void idct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i x, y;
+  int col;
+
+  // Note:
+  //  Even-indexed registers: in[0], in[2], ..., in[14]
+  //  Odd-indexed registers: in[1], in[3], ..., in[15]
+  //  One even register plus one odd register makes one row of 8 coeffs;
+  //  in total there are 8 rows (8x8).
+  for (col = 0; col < 2; ++col) {
+    // stage 0
+    // stage 1
+    // stage 2
+    u0 = in[0 * 2 + col];
+    u1 = in[4 * 2 + col];
+    u2 = in[2 * 2 + col];
+    u3 = in[6 * 2 + col];
+
+    x = _mm_mullo_epi32(in[1 * 2 + col], cospi56);
+    y = _mm_mullo_epi32(in[7 * 2 + col], cospim8);
+    u4 = _mm_add_epi32(x, y);
+    u4 = _mm_add_epi32(u4, rnding);
+    u4 = _mm_srai_epi32(u4, bit);
+
+    x = _mm_mullo_epi32(in[1 * 2 + col], cospi8);
+    y = _mm_mullo_epi32(in[7 * 2 + col], cospi56);
+    u7 = _mm_add_epi32(x, y);
+    u7 = _mm_add_epi32(u7, rnding);
+    u7 = _mm_srai_epi32(u7, bit);
+
+    x = _mm_mullo_epi32(in[5 * 2 + col], cospi24);
+    y = _mm_mullo_epi32(in[3 * 2 + col], cospim40);
+    u5 = _mm_add_epi32(x, y);
+    u5 = _mm_add_epi32(u5, rnding);
+    u5 = _mm_srai_epi32(u5, bit);
+
+    x = _mm_mullo_epi32(in[5 * 2 + col], cospi40);
+    y = _mm_mullo_epi32(in[3 * 2 + col], cospi24);
+    u6 = _mm_add_epi32(x, y);
+    u6 = _mm_add_epi32(u6, rnding);
+    u6 = _mm_srai_epi32(u6, bit);
+
+    // stage 3
+    x = _mm_mullo_epi32(u0, cospi32);
+    y = _mm_mullo_epi32(u1, cospi32);
+    v0 = _mm_add_epi32(x, y);
+    v0 = _mm_add_epi32(v0, rnding);
+    v0 = _mm_srai_epi32(v0, bit);
+
+    v1 = _mm_sub_epi32(x, y);
+    v1 = _mm_add_epi32(v1, rnding);
+    v1 = _mm_srai_epi32(v1, bit);
+
+    x = _mm_mullo_epi32(u2, cospi48);
+    y = _mm_mullo_epi32(u3, cospim16);
+    v2 = _mm_add_epi32(x, y);
+    v2 = _mm_add_epi32(v2, rnding);
+    v2 = _mm_srai_epi32(v2, bit);
+
+    x = _mm_mullo_epi32(u2, cospi16);
+    y = _mm_mullo_epi32(u3, cospi48);
+    v3 = _mm_add_epi32(x, y);
+    v3 = _mm_add_epi32(v3, rnding);
+    v3 = _mm_srai_epi32(v3, bit);
+
+    v4 = _mm_add_epi32(u4, u5);
+    v5 = _mm_sub_epi32(u4, u5);
+    v6 = _mm_sub_epi32(u7, u6);
+    v7 = _mm_add_epi32(u6, u7);
+
+    // stage 4
+    u0 = _mm_add_epi32(v0, v3);
+    u1 = _mm_add_epi32(v1, v2);
+    u2 = _mm_sub_epi32(v1, v2);
+    u3 = _mm_sub_epi32(v0, v3);
+    u4 = v4;
+    u7 = v7;
+
+    x = _mm_mullo_epi32(v5, cospi32);
+    y = _mm_mullo_epi32(v6, cospi32);
+    u6 = _mm_add_epi32(y, x);
+    u6 = _mm_add_epi32(u6, rnding);
+    u6 = _mm_srai_epi32(u6, bit);
+
+    u5 = _mm_sub_epi32(y, x);
+    u5 = _mm_add_epi32(u5, rnding);
+    u5 = _mm_srai_epi32(u5, bit);
+
+    // stage 5
+    out[0 * 2 + col] = _mm_add_epi32(u0, u7);
+    out[1 * 2 + col] = _mm_add_epi32(u1, u6);
+    out[2 * 2 + col] = _mm_add_epi32(u2, u5);
+    out[3 * 2 + col] = _mm_add_epi32(u3, u4);
+    out[4 * 2 + col] = _mm_sub_epi32(u3, u4);
+    out[5 * 2 + col] = _mm_sub_epi32(u2, u5);
+    out[6 * 2 + col] = _mm_sub_epi32(u1, u6);
+    out[7 * 2 + col] = _mm_sub_epi32(u0, u7);
+  }
+}
+
+static void iadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i zero = _mm_setzero_si128();
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i x, y;
+  int col;
+
+  // Note:
+  //  Even-indexed registers: in[0], in[2], ..., in[14]
+  //  Odd-indexed registers: in[1], in[3], ..., in[15]
+  //  One even register plus one odd register makes one row of 8 coeffs;
+  //  in total there are 8 rows (8x8).
+  for (col = 0; col < 2; ++col) {
+    // stage 0
+    // stage 1
+    u0 = in[2 * 0 + col];
+    u1 = _mm_sub_epi32(zero, in[2 * 7 + col]);
+    u2 = _mm_sub_epi32(zero, in[2 * 3 + col]);
+    u3 = in[2 * 4 + col];
+    u4 = _mm_sub_epi32(zero, in[2 * 1 + col]);
+    u5 = in[2 * 6 + col];
+    u6 = in[2 * 2 + col];
+    u7 = _mm_sub_epi32(zero, in[2 * 5 + col]);
+
+    // stage 2
+    v0 = u0;
+    v1 = u1;
+
+    x = _mm_mullo_epi32(u2, cospi32);
+    y = _mm_mullo_epi32(u3, cospi32);
+    v2 = _mm_add_epi32(x, y);
+    v2 = _mm_add_epi32(v2, rnding);
+    v2 = _mm_srai_epi32(v2, bit);
+
+    v3 = _mm_sub_epi32(x, y);
+    v3 = _mm_add_epi32(v3, rnding);
+    v3 = _mm_srai_epi32(v3, bit);
+
+    v4 = u4;
+    v5 = u5;
+
+    x = _mm_mullo_epi32(u6, cospi32);
+    y = _mm_mullo_epi32(u7, cospi32);
+    v6 = _mm_add_epi32(x, y);
+    v6 = _mm_add_epi32(v6, rnding);
+    v6 = _mm_srai_epi32(v6, bit);
+
+    v7 = _mm_sub_epi32(x, y);
+    v7 = _mm_add_epi32(v7, rnding);
+    v7 = _mm_srai_epi32(v7, bit);
+
+    // stage 3
+    u0 = _mm_add_epi32(v0, v2);
+    u1 = _mm_add_epi32(v1, v3);
+    u2 = _mm_sub_epi32(v0, v2);
+    u3 = _mm_sub_epi32(v1, v3);
+    u4 = _mm_add_epi32(v4, v6);
+    u5 = _mm_add_epi32(v5, v7);
+    u6 = _mm_sub_epi32(v4, v6);
+    u7 = _mm_sub_epi32(v5, v7);
+
+    // stage 4
+    v0 = u0;
+    v1 = u1;
+    v2 = u2;
+    v3 = u3;
+
+    x = _mm_mullo_epi32(u4, cospi16);
+    y = _mm_mullo_epi32(u5, cospi48);
+    v4 = _mm_add_epi32(x, y);
+    v4 = _mm_add_epi32(v4, rnding);
+    v4 = _mm_srai_epi32(v4, bit);
+
+    x = _mm_mullo_epi32(u4, cospi48);
+    y = _mm_mullo_epi32(u5, cospim16);
+    v5 = _mm_add_epi32(x, y);
+    v5 = _mm_add_epi32(v5, rnding);
+    v5 = _mm_srai_epi32(v5, bit);
+
+    x = _mm_mullo_epi32(u6, cospim48);
+    y = _mm_mullo_epi32(u7, cospi16);
+    v6 = _mm_add_epi32(x, y);
+    v6 = _mm_add_epi32(v6, rnding);
+    v6 = _mm_srai_epi32(v6, bit);
+
+    x = _mm_mullo_epi32(u6, cospi16);
+    y = _mm_mullo_epi32(u7, cospi48);
+    v7 = _mm_add_epi32(x, y);
+    v7 = _mm_add_epi32(v7, rnding);
+    v7 = _mm_srai_epi32(v7, bit);
+
+    // stage 5
+    u0 = _mm_add_epi32(v0, v4);
+    u1 = _mm_add_epi32(v1, v5);
+    u2 = _mm_add_epi32(v2, v6);
+    u3 = _mm_add_epi32(v3, v7);
+    u4 = _mm_sub_epi32(v0, v4);
+    u5 = _mm_sub_epi32(v1, v5);
+    u6 = _mm_sub_epi32(v2, v6);
+    u7 = _mm_sub_epi32(v3, v7);
+
+    // stage 6
+    x = _mm_mullo_epi32(u0, cospi4);
+    y = _mm_mullo_epi32(u1, cospi60);
+    v0 = _mm_add_epi32(x, y);
+    v0 = _mm_add_epi32(v0, rnding);
+    v0 = _mm_srai_epi32(v0, bit);
+
+    x = _mm_mullo_epi32(u0, cospi60);
+    y = _mm_mullo_epi32(u1, cospim4);
+    v1 = _mm_add_epi32(x, y);
+    v1 = _mm_add_epi32(v1, rnding);
+    v1 = _mm_srai_epi32(v1, bit);
+
+    x = _mm_mullo_epi32(u2, cospi20);
+    y = _mm_mullo_epi32(u3, cospi44);
+    v2 = _mm_add_epi32(x, y);
+    v2 = _mm_add_epi32(v2, rnding);
+    v2 = _mm_srai_epi32(v2, bit);
+
+    x = _mm_mullo_epi32(u2, cospi44);
+    y = _mm_mullo_epi32(u3, cospim20);
+    v3 = _mm_add_epi32(x, y);
+    v3 = _mm_add_epi32(v3, rnding);
+    v3 = _mm_srai_epi32(v3, bit);
+
+    x = _mm_mullo_epi32(u4, cospi36);
+    y = _mm_mullo_epi32(u5, cospi28);
+    v4 = _mm_add_epi32(x, y);
+    v4 = _mm_add_epi32(v4, rnding);
+    v4 = _mm_srai_epi32(v4, bit);
+
+    x = _mm_mullo_epi32(u4, cospi28);
+    y = _mm_mullo_epi32(u5, cospim36);
+    v5 = _mm_add_epi32(x, y);
+    v5 = _mm_add_epi32(v5, rnding);
+    v5 = _mm_srai_epi32(v5, bit);
+
+    x = _mm_mullo_epi32(u6, cospi52);
+    y = _mm_mullo_epi32(u7, cospi12);
+    v6 = _mm_add_epi32(x, y);
+    v6 = _mm_add_epi32(v6, rnding);
+    v6 = _mm_srai_epi32(v6, bit);
+
+    x = _mm_mullo_epi32(u6, cospi12);
+    y = _mm_mullo_epi32(u7, cospim52);
+    v7 = _mm_add_epi32(x, y);
+    v7 = _mm_add_epi32(v7, rnding);
+    v7 = _mm_srai_epi32(v7, bit);
+
+    // stage 7
+    out[2 * 0 + col] = v1;
+    out[2 * 1 + col] = v6;
+    out[2 * 2 + col] = v3;
+    out[2 * 3 + col] = v4;
+    out[2 * 4 + col] = v5;
+    out[2 * 5 + col] = v2;
+    out[2 * 6 + col] = v7;
+    out[2 * 7 + col] = v0;
+  }
+}
+
+static void round_shift_8x8(__m128i *in, int shift) {
+  round_shift_4x4(&in[0], shift);
+  round_shift_4x4(&in[4], shift);
+  round_shift_4x4(&in[8], shift);
+  round_shift_4x4(&in[12], shift);
+}
+
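+// Reconstruct one 8-pixel row: widen the 16-bit prediction to 32 bits,
+// add the two 32-bit residual halves (reversed when fliplr), then pack
+// with unsigned saturation and clamp to bd bits.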
+static __m128i get_recon_8x8(const __m128i pred, __m128i res_lo,
+                             __m128i res_hi, int fliplr, int bd) {
+  __m128i x0, x1;
+  const __m128i zero = _mm_setzero_si128();
+
+  x0 = _mm_unpacklo_epi16(pred, zero);
+  x1 = _mm_unpackhi_epi16(pred, zero);
+
+  if (fliplr) {
+    res_lo = _mm_shuffle_epi32(res_lo, 0x1B);
+    res_hi = _mm_shuffle_epi32(res_hi, 0x1B);
+    x0 = _mm_add_epi32(res_hi, x0);
+    x1 = _mm_add_epi32(res_lo, x1);
+
+  } else {
+    x0 = _mm_add_epi32(res_lo, x0);
+    x1 = _mm_add_epi32(res_hi, x1);
+  }
+
+  x0 = _mm_packus_epi32(x0, x1);
+  return highbd_clamp_epi16(x0, bd);
+}
+
+static void write_buffer_8x8(__m128i *in, uint16_t *output, int stride,
+                             int fliplr, int flipud, int shift, int bd) {
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+
+  round_shift_8x8(in, shift);
+
+  v0 = _mm_load_si128((__m128i const *)(output + 0 * stride));
+  v1 = _mm_load_si128((__m128i const *)(output + 1 * stride));
+  v2 = _mm_load_si128((__m128i const *)(output + 2 * stride));
+  v3 = _mm_load_si128((__m128i const *)(output + 3 * stride));
+  v4 = _mm_load_si128((__m128i const *)(output + 4 * stride));
+  v5 = _mm_load_si128((__m128i const *)(output + 5 * stride));
+  v6 = _mm_load_si128((__m128i const *)(output + 6 * stride));
+  v7 = _mm_load_si128((__m128i const *)(output + 7 * stride));
+
+  if (flipud) {
+    u0 = get_recon_8x8(v0, in[14], in[15], fliplr, bd);
+    u1 = get_recon_8x8(v1, in[12], in[13], fliplr, bd);
+    u2 = get_recon_8x8(v2, in[10], in[11], fliplr, bd);
+    u3 = get_recon_8x8(v3, in[8], in[9], fliplr, bd);
+    u4 = get_recon_8x8(v4, in[6], in[7], fliplr, bd);
+    u5 = get_recon_8x8(v5, in[4], in[5], fliplr, bd);
+    u6 = get_recon_8x8(v6, in[2], in[3], fliplr, bd);
+    u7 = get_recon_8x8(v7, in[0], in[1], fliplr, bd);
+  } else {
+    u0 = get_recon_8x8(v0, in[0], in[1], fliplr, bd);
+    u1 = get_recon_8x8(v1, in[2], in[3], fliplr, bd);
+    u2 = get_recon_8x8(v2, in[4], in[5], fliplr, bd);
+    u3 = get_recon_8x8(v3, in[6], in[7], fliplr, bd);
+    u4 = get_recon_8x8(v4, in[8], in[9], fliplr, bd);
+    u5 = get_recon_8x8(v5, in[10], in[11], fliplr, bd);
+    u6 = get_recon_8x8(v6, in[12], in[13], fliplr, bd);
+    u7 = get_recon_8x8(v7, in[14], in[15], fliplr, bd);
+  }
+
+  _mm_store_si128((__m128i *)(output + 0 * stride), u0);
+  _mm_store_si128((__m128i *)(output + 1 * stride), u1);
+  _mm_store_si128((__m128i *)(output + 2 * stride), u2);
+  _mm_store_si128((__m128i *)(output + 3 * stride), u3);
+  _mm_store_si128((__m128i *)(output + 4 * stride), u4);
+  _mm_store_si128((__m128i *)(output + 5 * stride), u5);
+  _mm_store_si128((__m128i *)(output + 6 * stride), u6);
+  _mm_store_si128((__m128i *)(output + 7 * stride), u7);
+}
+
+void vp10_inv_txfm2d_add_8x8_sse4_1(const int32_t *coeff, uint16_t *output,
+                                    int stride, int tx_type, int bd) {
+  __m128i in[16], out[16];
+  const TXFM_2D_CFG *cfg = NULL;
+
+  switch (tx_type) {
+    case DCT_DCT:
+      cfg = &inv_txfm_2d_cfg_dct_dct_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      break;
+    case DCT_ADST:
+      cfg = &inv_txfm_2d_cfg_dct_adst_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      break;
+    case ADST_DCT:
+      cfg = &inv_txfm_2d_cfg_adst_dct_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      break;
+    case ADST_ADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &inv_txfm_2d_cfg_adst_dct_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      idct8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd);
+      break;
+    case DCT_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_dct_adst_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      idct8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd);
+      break;
+    case ADST_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, 1, 0, -cfg->shift[1], bd);
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, 1, 1, -cfg->shift[1], bd);
+      break;
+    case FLIPADST_ADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(coeff, in);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_row[2]);
+      transpose_8x8(in, out);
+      iadst8x8_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_8x8(in, output, stride, 0, 1, -cfg->shift[1], bd);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+  }
+}
+
+// 16x16
+static void load_buffer_16x16(const int32_t *coeff, __m128i *in) {
+  int i;
+  for (i = 0; i < 64; ++i) {
+    in[i] = _mm_load_si128((const __m128i *)(coeff + (i << 2)));
+  }
+}
+
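+// Copy one 8x8 quarter out of the 16x16 layout: each 16x16 row spans
+// four registers, so a quarter consists of registers (col, col + 1)
+// from eight consecutive rows.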
+static void assign_8x8_input_from_16x16(const __m128i *in, __m128i *in8x8,
+                                        int col) {
+  int i;
+  for (i = 0; i < 16; i += 2) {
+    in8x8[i] = in[col];
+    in8x8[i + 1] = in[col + 1];
+    col += 4;
+  }
+}
+
+static void swap_addr(uint16_t **output1, uint16_t **output2) {
+  uint16_t *tmp;
+  tmp = *output1;
+  *output1 = *output2;
+  *output2 = tmp;
+}
+
+static void write_buffer_16x16(__m128i *in, uint16_t *output, int stride,
+                               int fliplr, int flipud, int shift, int bd) {
+  __m128i in8x8[16];
+  uint16_t *leftUp = &output[0];
+  uint16_t *rightUp = &output[8];
+  uint16_t *leftDown = &output[8 * stride];
+  uint16_t *rightDown = &output[8 * stride + 8];
+
+  if (fliplr) {
+    swap_addr(&leftUp, &rightUp);
+    swap_addr(&leftDown, &rightDown);
+  }
+
+  if (flipud) {
+    swap_addr(&leftUp, &leftDown);
+    swap_addr(&rightUp, &rightDown);
+  }
+
+  // Left-up quarter
+  assign_8x8_input_from_16x16(in, in8x8, 0);
+  write_buffer_8x8(in8x8, leftUp, stride, fliplr, flipud, shift, bd);
+
+  // Right-up quarter
+  assign_8x8_input_from_16x16(in, in8x8, 2);
+  write_buffer_8x8(in8x8, rightUp, stride, fliplr, flipud, shift, bd);
+
+  // Left-down quarter
+  assign_8x8_input_from_16x16(in, in8x8, 32);
+  write_buffer_8x8(in8x8, leftDown, stride, fliplr, flipud, shift, bd);
+
+  // Right-down quarter
+  assign_8x8_input_from_16x16(in, in8x8, 34);
+  write_buffer_8x8(in8x8, rightDown, stride, fliplr, flipud, shift, bd);
+}
+
+static void idct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospim4 = _mm_set1_epi32(-cospi[4]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospim36 = _mm_set1_epi32(-cospi[36]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospim20 = _mm_set1_epi32(-cospi[20]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospim52 = _mm_set1_epi32(-cospi[52]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  __m128i u[16], v[16], x, y;
+  int col;
+
+  for (col = 0; col < 4; ++col) {
+    // stage 0
+    // stage 1
+    u[0] = in[0 * 4 + col];
+    u[1] = in[8 * 4 + col];
+    u[2] = in[4 * 4 + col];
+    u[3] = in[12 * 4 + col];
+    u[4] = in[2 * 4 + col];
+    u[5] = in[10 * 4 + col];
+    u[6] = in[6 * 4 + col];
+    u[7] = in[14 * 4 + col];
+    u[8] = in[1 * 4 + col];
+    u[9] = in[9 * 4 + col];
+    u[10] = in[5 * 4 + col];
+    u[11] = in[13 * 4 + col];
+    u[12] = in[3 * 4 + col];
+    u[13] = in[11 * 4 + col];
+    u[14] = in[7 * 4 + col];
+    u[15] = in[15 * 4 + col];
+
+    // stage 2
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = half_btf_sse4_1(cospi60, u[8], cospim4, u[15], rnding, bit);
+    v[9] = half_btf_sse4_1(cospi28, u[9], cospim36, u[14], rnding, bit);
+    v[10] = half_btf_sse4_1(cospi44, u[10], cospim20, u[13], rnding, bit);
+    v[11] = half_btf_sse4_1(cospi12, u[11], cospim52, u[12], rnding, bit);
+    v[12] = half_btf_sse4_1(cospi52, u[11], cospi12, u[12], rnding, bit);
+    v[13] = half_btf_sse4_1(cospi20, u[10], cospi44, u[13], rnding, bit);
+    v[14] = half_btf_sse4_1(cospi36, u[9], cospi28, u[14], rnding, bit);
+    v[15] = half_btf_sse4_1(cospi4, u[8], cospi60, u[15], rnding, bit);
+
+    // stage 3
+    u[0] = v[0];
+    u[1] = v[1];
+    u[2] = v[2];
+    u[3] = v[3];
+    u[4] = half_btf_sse4_1(cospi56, v[4], cospim8, v[7], rnding, bit);
+    u[5] = half_btf_sse4_1(cospi24, v[5], cospim40, v[6], rnding, bit);
+    u[6] = half_btf_sse4_1(cospi40, v[5], cospi24, v[6], rnding, bit);
+    u[7] = half_btf_sse4_1(cospi8, v[4], cospi56, v[7], rnding, bit);
+    u[8] = _mm_add_epi32(v[8], v[9]);
+    u[9] = _mm_sub_epi32(v[8], v[9]);
+    u[10] = _mm_sub_epi32(v[11], v[10]);
+    u[11] = _mm_add_epi32(v[10], v[11]);
+    u[12] = _mm_add_epi32(v[12], v[13]);
+    u[13] = _mm_sub_epi32(v[12], v[13]);
+    u[14] = _mm_sub_epi32(v[15], v[14]);
+    u[15] = _mm_add_epi32(v[14], v[15]);
+
+    // stage 4
+    x = _mm_mullo_epi32(u[0], cospi32);
+    y = _mm_mullo_epi32(u[1], cospi32);
+    v[0] = _mm_add_epi32(x, y);
+    v[0] = _mm_add_epi32(v[0], rnding);
+    v[0] = _mm_srai_epi32(v[0], bit);
+
+    v[1] = _mm_sub_epi32(x, y);
+    v[1] = _mm_add_epi32(v[1], rnding);
+    v[1] = _mm_srai_epi32(v[1], bit);
+
+    v[2] = half_btf_sse4_1(cospi48, u[2], cospim16, u[3], rnding, bit);
+    v[3] = half_btf_sse4_1(cospi16, u[2], cospi48, u[3], rnding, bit);
+    v[4] = _mm_add_epi32(u[4], u[5]);
+    v[5] = _mm_sub_epi32(u[4], u[5]);
+    v[6] = _mm_sub_epi32(u[7], u[6]);
+    v[7] = _mm_add_epi32(u[6], u[7]);
+    v[8] = u[8];
+    v[9] = half_btf_sse4_1(cospim16, u[9], cospi48, u[14], rnding, bit);
+    v[10] = half_btf_sse4_1(cospim48, u[10], cospim16, u[13], rnding, bit);
+    v[11] = u[11];
+    v[12] = u[12];
+    v[13] = half_btf_sse4_1(cospim16, u[10], cospi48, u[13], rnding, bit);
+    v[14] = half_btf_sse4_1(cospi48, u[9], cospi16, u[14], rnding, bit);
+    v[15] = u[15];
+
+    // stage 5
+    u[0] = _mm_add_epi32(v[0], v[3]);
+    u[1] = _mm_add_epi32(v[1], v[2]);
+    u[2] = _mm_sub_epi32(v[1], v[2]);
+    u[3] = _mm_sub_epi32(v[0], v[3]);
+    u[4] = v[4];
+
+    x = _mm_mullo_epi32(v[5], cospi32);
+    y = _mm_mullo_epi32(v[6], cospi32);
+    u[5] = _mm_sub_epi32(y, x);
+    u[5] = _mm_add_epi32(u[5], rnding);
+    u[5] = _mm_srai_epi32(u[5], bit);
+
+    u[6] = _mm_add_epi32(y, x);
+    u[6] = _mm_add_epi32(u[6], rnding);
+    u[6] = _mm_srai_epi32(u[6], bit);
+
+    u[7] = v[7];
+    u[8] = _mm_add_epi32(v[8], v[11]);
+    u[9] = _mm_add_epi32(v[9], v[10]);
+    u[10] = _mm_sub_epi32(v[9], v[10]);
+    u[11] = _mm_sub_epi32(v[8], v[11]);
+    u[12] = _mm_sub_epi32(v[15], v[12]);
+    u[13] = _mm_sub_epi32(v[14], v[13]);
+    u[14] = _mm_add_epi32(v[13], v[14]);
+    u[15] = _mm_add_epi32(v[12], v[15]);
+
+    // stage 6
+    v[0] = _mm_add_epi32(u[0], u[7]);
+    v[1] = _mm_add_epi32(u[1], u[6]);
+    v[2] = _mm_add_epi32(u[2], u[5]);
+    v[3] = _mm_add_epi32(u[3], u[4]);
+    v[4] = _mm_sub_epi32(u[3], u[4]);
+    v[5] = _mm_sub_epi32(u[2], u[5]);
+    v[6] = _mm_sub_epi32(u[1], u[6]);
+    v[7] = _mm_sub_epi32(u[0], u[7]);
+    v[8] = u[8];
+    v[9] = u[9];
+
+    x = _mm_mullo_epi32(u[10], cospi32);
+    y = _mm_mullo_epi32(u[13], cospi32);
+    v[10] = _mm_sub_epi32(y, x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[13] = _mm_add_epi32(x, y);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    x = _mm_mullo_epi32(u[11], cospi32);
+    y = _mm_mullo_epi32(u[12], cospi32);
+    v[11] = _mm_sub_epi32(y, x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = _mm_add_epi32(x, y);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+
+    v[14] = u[14];
+    v[15] = u[15];
+
+    // stage 7
+    out[0 * 4 + col] = _mm_add_epi32(v[0], v[15]);
+    out[1 * 4 + col] = _mm_add_epi32(v[1], v[14]);
+    out[2 * 4 + col] = _mm_add_epi32(v[2], v[13]);
+    out[3 * 4 + col] = _mm_add_epi32(v[3], v[12]);
+    out[4 * 4 + col] = _mm_add_epi32(v[4], v[11]);
+    out[5 * 4 + col] = _mm_add_epi32(v[5], v[10]);
+    out[6 * 4 + col] = _mm_add_epi32(v[6], v[9]);
+    out[7 * 4 + col] = _mm_add_epi32(v[7], v[8]);
+    out[8 * 4 + col] = _mm_sub_epi32(v[7], v[8]);
+    out[9 * 4 + col] = _mm_sub_epi32(v[6], v[9]);
+    out[10 * 4 + col] = _mm_sub_epi32(v[5], v[10]);
+    out[11 * 4 + col] = _mm_sub_epi32(v[4], v[11]);
+    out[12 * 4 + col] = _mm_sub_epi32(v[3], v[12]);
+    out[13 * 4 + col] = _mm_sub_epi32(v[2], v[13]);
+    out[14 * 4 + col] = _mm_sub_epi32(v[1], v[14]);
+    out[15 * 4 + col] = _mm_sub_epi32(v[0], v[15]);
+  }
+}
+
+static void iadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim8 = _mm_set1_epi32(-cospi[8]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospim40 = _mm_set1_epi32(-cospi[40]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospim2 = _mm_set1_epi32(-cospi[2]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospim10 = _mm_set1_epi32(-cospi[10]);
+  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+  const __m128i cospim18 = _mm_set1_epi32(-cospi[18]);
+  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+  const __m128i cospim26 = _mm_set1_epi32(-cospi[26]);
+  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+  const __m128i cospim34 = _mm_set1_epi32(-cospi[34]);
+  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+  const __m128i cospim42 = _mm_set1_epi32(-cospi[42]);
+  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospim50 = _mm_set1_epi32(-cospi[50]);
+  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospim58 = _mm_set1_epi32(-cospi[58]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i zero = _mm_setzero_si128();
+
+  __m128i u[16], v[16], x, y;
+  int col;
+
+  for (col = 0; col < 4; ++col) {
+    // stage 0
+    // stage 1
+    u[0] = in[0 * 4 + col];
+    u[1] = _mm_sub_epi32(zero, in[15 * 4 + col]);
+    u[2] = _mm_sub_epi32(zero, in[7 * 4 + col]);
+    u[3] = in[8 * 4 + col];
+    u[4] = _mm_sub_epi32(zero, in[3 * 4 + col]);
+    u[5] = in[12 * 4 + col];
+    u[6] = in[4 * 4 + col];
+    u[7] = _mm_sub_epi32(zero, in[11 * 4 + col]);
+    u[8] = _mm_sub_epi32(zero, in[1 * 4 + col]);
+    u[9] = in[14 * 4 + col];
+    u[10] = in[6 * 4 + col];
+    u[11] = _mm_sub_epi32(zero, in[9 * 4 + col]);
+    u[12] = in[2 * 4 + col];
+    u[13] = _mm_sub_epi32(zero, in[13 * 4 + col]);
+    u[14] = _mm_sub_epi32(zero, in[5 * 4 + col]);
+    u[15] = in[10 * 4 + col];
+
+    // stage 2
+    v[0] = u[0];
+    v[1] = u[1];
+
+    x = _mm_mullo_epi32(u[2], cospi32);
+    y = _mm_mullo_epi32(u[3], cospi32);
+    v[2] = _mm_add_epi32(x, y);
+    v[2] = _mm_add_epi32(v[2], rnding);
+    v[2] = _mm_srai_epi32(v[2], bit);
+
+    v[3] = _mm_sub_epi32(x, y);
+    v[3] = _mm_add_epi32(v[3], rnding);
+    v[3] = _mm_srai_epi32(v[3], bit);
+
+    v[4] = u[4];
+    v[5] = u[5];
+
+    x = _mm_mullo_epi32(u[6], cospi32);
+    y = _mm_mullo_epi32(u[7], cospi32);
+    v[6] = _mm_add_epi32(x, y);
+    v[6] = _mm_add_epi32(v[6], rnding);
+    v[6] = _mm_srai_epi32(v[6], bit);
+
+    v[7] = _mm_sub_epi32(x, y);
+    v[7] = _mm_add_epi32(v[7], rnding);
+    v[7] = _mm_srai_epi32(v[7], bit);
+
+    v[8] = u[8];
+    v[9] = u[9];
+
+    x = _mm_mullo_epi32(u[10], cospi32);
+    y = _mm_mullo_epi32(u[11], cospi32);
+    v[10] = _mm_add_epi32(x, y);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[11] = _mm_sub_epi32(x, y);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = u[12];
+    v[13] = u[13];
+
+    x = _mm_mullo_epi32(u[14], cospi32);
+    y = _mm_mullo_epi32(u[15], cospi32);
+    v[14] = _mm_add_epi32(x, y);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_sub_epi32(x, y);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
+
+    // stage 3
+    u[0] = _mm_add_epi32(v[0], v[2]);
+    u[1] = _mm_add_epi32(v[1], v[3]);
+    u[2] = _mm_sub_epi32(v[0], v[2]);
+    u[3] = _mm_sub_epi32(v[1], v[3]);
+    u[4] = _mm_add_epi32(v[4], v[6]);
+    u[5] = _mm_add_epi32(v[5], v[7]);
+    u[6] = _mm_sub_epi32(v[4], v[6]);
+    u[7] = _mm_sub_epi32(v[5], v[7]);
+    u[8] = _mm_add_epi32(v[8], v[10]);
+    u[9] = _mm_add_epi32(v[9], v[11]);
+    u[10] = _mm_sub_epi32(v[8], v[10]);
+    u[11] = _mm_sub_epi32(v[9], v[11]);
+    u[12] = _mm_add_epi32(v[12], v[14]);
+    u[13] = _mm_add_epi32(v[13], v[15]);
+    u[14] = _mm_sub_epi32(v[12], v[14]);
+    u[15] = _mm_sub_epi32(v[13], v[15]);
+
+    // stage 4
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = half_btf_sse4_1(cospi16, u[4], cospi48, u[5], rnding, bit);
+    v[5] = half_btf_sse4_1(cospi48, u[4], cospim16, u[5], rnding, bit);
+    v[6] = half_btf_sse4_1(cospim48, u[6], cospi16, u[7], rnding, bit);
+    v[7] = half_btf_sse4_1(cospi16, u[6], cospi48, u[7], rnding, bit);
+    v[8] = u[8];
+    v[9] = u[9];
+    v[10] = u[10];
+    v[11] = u[11];
+    v[12] = half_btf_sse4_1(cospi16, u[12], cospi48, u[13], rnding, bit);
+    v[13] = half_btf_sse4_1(cospi48, u[12], cospim16, u[13], rnding, bit);
+    v[14] = half_btf_sse4_1(cospim48, u[14], cospi16, u[15], rnding, bit);
+    v[15] = half_btf_sse4_1(cospi16, u[14], cospi48, u[15], rnding, bit);
+
+    // stage 5
+    u[0] = _mm_add_epi32(v[0], v[4]);
+    u[1] = _mm_add_epi32(v[1], v[5]);
+    u[2] = _mm_add_epi32(v[2], v[6]);
+    u[3] = _mm_add_epi32(v[3], v[7]);
+    u[4] = _mm_sub_epi32(v[0], v[4]);
+    u[5] = _mm_sub_epi32(v[1], v[5]);
+    u[6] = _mm_sub_epi32(v[2], v[6]);
+    u[7] = _mm_sub_epi32(v[3], v[7]);
+    u[8] = _mm_add_epi32(v[8], v[12]);
+    u[9] = _mm_add_epi32(v[9], v[13]);
+    u[10] = _mm_add_epi32(v[10], v[14]);
+    u[11] = _mm_add_epi32(v[11], v[15]);
+    u[12] = _mm_sub_epi32(v[8], v[12]);
+    u[13] = _mm_sub_epi32(v[9], v[13]);
+    u[14] = _mm_sub_epi32(v[10], v[14]);
+    u[15] = _mm_sub_epi32(v[11], v[15]);
+
+    // stage 6
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+    v[8] = half_btf_sse4_1(cospi8, u[8], cospi56, u[9], rnding, bit);
+    v[9] = half_btf_sse4_1(cospi56, u[8], cospim8, u[9], rnding, bit);
+    v[10] = half_btf_sse4_1(cospi40, u[10], cospi24, u[11], rnding, bit);
+    v[11] = half_btf_sse4_1(cospi24, u[10], cospim40, u[11], rnding, bit);
+    v[12] = half_btf_sse4_1(cospim56, u[12], cospi8, u[13], rnding, bit);
+    v[13] = half_btf_sse4_1(cospi8, u[12], cospi56, u[13], rnding, bit);
+    v[14] = half_btf_sse4_1(cospim24, u[14], cospi40, u[15], rnding, bit);
+    v[15] = half_btf_sse4_1(cospi40, u[14], cospi24, u[15], rnding, bit);
+
+    // stage 7
+    u[0] = _mm_add_epi32(v[0], v[8]);
+    u[1] = _mm_add_epi32(v[1], v[9]);
+    u[2] = _mm_add_epi32(v[2], v[10]);
+    u[3] = _mm_add_epi32(v[3], v[11]);
+    u[4] = _mm_add_epi32(v[4], v[12]);
+    u[5] = _mm_add_epi32(v[5], v[13]);
+    u[6] = _mm_add_epi32(v[6], v[14]);
+    u[7] = _mm_add_epi32(v[7], v[15]);
+    u[8] = _mm_sub_epi32(v[0], v[8]);
+    u[9] = _mm_sub_epi32(v[1], v[9]);
+    u[10] = _mm_sub_epi32(v[2], v[10]);
+    u[11] = _mm_sub_epi32(v[3], v[11]);
+    u[12] = _mm_sub_epi32(v[4], v[12]);
+    u[13] = _mm_sub_epi32(v[5], v[13]);
+    u[14] = _mm_sub_epi32(v[6], v[14]);
+    u[15] = _mm_sub_epi32(v[7], v[15]);
+
+    // stage 8
+    v[0] = half_btf_sse4_1(cospi2, u[0], cospi62, u[1], rnding, bit);
+    v[1] = half_btf_sse4_1(cospi62, u[0], cospim2, u[1], rnding, bit);
+    v[2] = half_btf_sse4_1(cospi10, u[2], cospi54, u[3], rnding, bit);
+    v[3] = half_btf_sse4_1(cospi54, u[2], cospim10, u[3], rnding, bit);
+    v[4] = half_btf_sse4_1(cospi18, u[4], cospi46, u[5], rnding, bit);
+    v[5] = half_btf_sse4_1(cospi46, u[4], cospim18, u[5], rnding, bit);
+    v[6] = half_btf_sse4_1(cospi26, u[6], cospi38, u[7], rnding, bit);
+    v[7] = half_btf_sse4_1(cospi38, u[6], cospim26, u[7], rnding, bit);
+    v[8] = half_btf_sse4_1(cospi34, u[8], cospi30, u[9], rnding, bit);
+    v[9] = half_btf_sse4_1(cospi30, u[8], cospim34, u[9], rnding, bit);
+    v[10] = half_btf_sse4_1(cospi42, u[10], cospi22, u[11], rnding, bit);
+    v[11] = half_btf_sse4_1(cospi22, u[10], cospim42, u[11], rnding, bit);
+    v[12] = half_btf_sse4_1(cospi50, u[12], cospi14, u[13], rnding, bit);
+    v[13] = half_btf_sse4_1(cospi14, u[12], cospim50, u[13], rnding, bit);
+    v[14] = half_btf_sse4_1(cospi58, u[14], cospi6, u[15], rnding, bit);
+    v[15] = half_btf_sse4_1(cospi6, u[14], cospim58, u[15], rnding, bit);
+
+    // stage 9
+    out[0 * 4 + col] = v[1];
+    out[1 * 4 + col] = v[14];
+    out[2 * 4 + col] = v[3];
+    out[3 * 4 + col] = v[12];
+    out[4 * 4 + col] = v[5];
+    out[5 * 4 + col] = v[10];
+    out[6 * 4 + col] = v[7];
+    out[7 * 4 + col] = v[8];
+    out[8 * 4 + col] = v[9];
+    out[9 * 4 + col] = v[6];
+    out[10 * 4 + col] = v[11];
+    out[11 * 4 + col] = v[4];
+    out[12 * 4 + col] = v[13];
+    out[13 * 4 + col] = v[2];
+    out[14 * 4 + col] = v[15];
+    out[15 * 4 + col] = v[0];
+  }
+}
+
+static void round_shift_16x16(__m128i *in, int shift) {
+  round_shift_8x8(&in[0], shift);
+  round_shift_8x8(&in[16], shift);
+  round_shift_8x8(&in[32], shift);
+  round_shift_8x8(&in[48], shift);
+}
+
+void vp10_inv_txfm2d_add_16x16_sse4_1(const int32_t *coeff, uint16_t *output,
+                                      int stride, int tx_type, int bd) {
+  __m128i in[64], out[64];
+  const TXFM_2D_CFG *cfg = NULL;
+
+  switch (tx_type) {
+    case DCT_DCT:
+      cfg = &inv_txfm_2d_cfg_dct_dct_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      break;
+    case DCT_ADST:
+      cfg = &inv_txfm_2d_cfg_dct_adst_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      break;
+    case ADST_DCT:
+      cfg = &inv_txfm_2d_cfg_adst_dct_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      break;
+    case ADST_ADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, 0, 0, -cfg->shift[1], bd);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &inv_txfm_2d_cfg_adst_dct_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      idct16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd);
+      break;
+    case DCT_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_dct_adst_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      idct16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd);
+      break;
+    case ADST_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, 1, 0, -cfg->shift[1], bd);
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, 1, 1, -cfg->shift[1], bd);
+      break;
+    case FLIPADST_ADST:
+      cfg = &inv_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(coeff, in);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_row[2]);
+      round_shift_16x16(in, -cfg->shift[0]);
+      transpose_16x16(in, out);
+      iadst16x16_sse4_1(out, in, cfg->cos_bit_col[2]);
+      write_buffer_16x16(in, output, stride, 0, 1, -cfg->shift[1], bd);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+  }
+}
diff --git a/vp10/common/x86/highbd_txfm_utility_sse4.h b/vp10/common/x86/highbd_txfm_utility_sse4.h
new file mode 100644
index 0000000..319b50a
--- /dev/null
+++ b/vp10/common/x86/highbd_txfm_utility_sse4.h
@@ -0,0 +1,96 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H
+#define VP10_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H
+
+#include <smmintrin.h>  /* SSE4.1 */
+
+#define TRANSPOSE_4X4(x0, x1, x2, x3, y0, y1, y2, y3) \
+  do {                                \
+    __m128i u0, u1, u2, u3;           \
+    u0 = _mm_unpacklo_epi32(x0, x1);  \
+    u1 = _mm_unpackhi_epi32(x0, x1);  \
+    u2 = _mm_unpacklo_epi32(x2, x3);  \
+    u3 = _mm_unpackhi_epi32(x2, x3);  \
+    y0 = _mm_unpacklo_epi64(u0, u2);  \
+    y1 = _mm_unpackhi_epi64(u0, u2);  \
+    y2 = _mm_unpacklo_epi64(u1, u3);  \
+    y3 = _mm_unpackhi_epi64(u1, u3);  \
+  } while (0)
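+
+// The 4x4 transpose uses the standard unpack pattern: unpack*_epi32
+// interleaves pairs of rows, then unpack*_epi64 merges the results into
+// the transposed rows. The 8x8/16x16 helpers below apply it tile by
+// tile on the multiple-registers-per-row layout used by the transforms.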
+
+static INLINE void transpose_8x8(const __m128i *in, __m128i *out) {
+  TRANSPOSE_4X4(in[0], in[2], in[4], in[6],
+                out[0], out[2], out[4], out[6]);
+  TRANSPOSE_4X4(in[1], in[3], in[5], in[7],
+                out[8], out[10], out[12], out[14]);
+  TRANSPOSE_4X4(in[8], in[10], in[12], in[14],
+                out[1], out[3], out[5], out[7]);
+  TRANSPOSE_4X4(in[9], in[11], in[13], in[15],
+                out[9], out[11], out[13], out[15]);
+}
+
+static INLINE void transpose_16x16(const __m128i *in, __m128i *out) {
+  // Upper left 8x8
+  TRANSPOSE_4X4(in[0], in[4], in[8], in[12],
+                out[0], out[4], out[8], out[12]);
+  TRANSPOSE_4X4(in[1], in[5], in[9], in[13],
+                out[16], out[20], out[24], out[28]);
+  TRANSPOSE_4X4(in[16], in[20], in[24], in[28],
+                out[1], out[5], out[9], out[13]);
+  TRANSPOSE_4X4(in[17], in[21], in[25], in[29],
+                out[17], out[21], out[25], out[29]);
+
+  // Upper right 8x8
+  TRANSPOSE_4X4(in[2], in[6], in[10], in[14],
+                out[32], out[36], out[40], out[44]);
+  TRANSPOSE_4X4(in[3], in[7], in[11], in[15],
+                out[48], out[52], out[56], out[60]);
+  TRANSPOSE_4X4(in[18], in[22], in[26], in[30],
+                out[33], out[37], out[41], out[45]);
+  TRANSPOSE_4X4(in[19], in[23], in[27], in[31],
+                out[49], out[53], out[57], out[61]);
+
+  // Lower left 8x8
+  TRANSPOSE_4X4(in[32], in[36], in[40], in[44],
+                out[2], out[6], out[10], out[14]);
+  TRANSPOSE_4X4(in[33], in[37], in[41], in[45],
+                out[18], out[22], out[26], out[30]);
+  TRANSPOSE_4X4(in[48], in[52], in[56], in[60],
+                out[3], out[7], out[11], out[15]);
+  TRANSPOSE_4X4(in[49], in[53], in[57], in[61],
+                out[19], out[23], out[27], out[31]);
+  // Lower right 8x8
+  TRANSPOSE_4X4(in[34], in[38], in[42], in[46],
+                out[34], out[38], out[42], out[46]);
+  TRANSPOSE_4X4(in[35], in[39], in[43], in[47],
+                out[50], out[54], out[58], out[62]);
+  TRANSPOSE_4X4(in[50], in[54], in[58], in[62],
+                out[35], out[39], out[43], out[47]);
+  TRANSPOSE_4X4(in[51], in[55], in[59], in[63],
+                out[51], out[55], out[59], out[63]);
+}
+
+// Note:
+//  rounding = 1 << (bit - 1)
+static INLINE __m128i half_btf_sse4_1(__m128i w0, __m128i n0,
+                                      __m128i w1, __m128i n1,
+                                      __m128i rounding, int bit) {
+  __m128i x, y;
+
+  x = _mm_mullo_epi32(w0, n0);
+  y = _mm_mullo_epi32(w1, n1);
+  x = _mm_add_epi32(x, y);
+  x = _mm_add_epi32(x, rounding);
+  x = _mm_srai_epi32(x, bit);
+  return x;
+}
+
+#endif  // VP10_COMMON_X86_HIGHBD_TXFM_UTILITY_SSE4_H
diff --git a/vp10/common/x86/idct_intrin_sse2.c b/vp10/common/x86/idct_intrin_sse2.c
index a2c674b..da60764 100644
--- a/vp10/common/x86/idct_intrin_sse2.c
+++ b/vp10/common/x86/idct_intrin_sse2.c
@@ -8,9 +8,51 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "./vp10_rtcd.h"
 #include "vpx_dsp/x86/inv_txfm_sse2.h"
 #include "vpx_dsp/x86/txfm_common_sse2.h"
 #include "vpx_ports/mem.h"
+#include "vp10/common/enums.h"
+
+#if CONFIG_EXT_TX
+static INLINE void fliplr_4x4(__m128i in[2]) {
+  in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+  in[0] = _mm_shufflehi_epi16(in[0], 0x1b);
+  in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+  in[1] = _mm_shufflehi_epi16(in[1], 0x1b);
+}
+
+static INLINE void fliplr_8x8(__m128i in[8]) {
+  in[0] = mm_reverse_epi16(in[0]);
+  in[1] = mm_reverse_epi16(in[1]);
+  in[2] = mm_reverse_epi16(in[2]);
+  in[3] = mm_reverse_epi16(in[3]);
+
+  in[4] = mm_reverse_epi16(in[4]);
+  in[5] = mm_reverse_epi16(in[5]);
+  in[6] = mm_reverse_epi16(in[6]);
+  in[7] = mm_reverse_epi16(in[7]);
+}
+
+static INLINE void fliplr_16x8(__m128i in[16]) {
+  fliplr_8x8(&in[0]);
+  fliplr_8x8(&in[8]);
+}
+
+#define FLIPLR_16x16(in0, in1) do {             \
+  __m128i *tmp;                                 \
+  fliplr_16x8(in0);                             \
+  fliplr_16x8(in1);                             \
+  tmp = (in0);                                  \
+  (in0) = (in1);                                \
+  (in1) = tmp;                                  \
+} while (0)
+
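+// Flip vertically without touching the residual: point |dest| at the
+// last row and negate |stride| so stores walk upward through the block.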
+#define FLIPUD_PTR(dest, stride, size) do {     \
+    (dest) = (dest) + ((size) - 1) * (stride);  \
+    (stride) = -(stride);                       \
+} while (0)
+#endif
 
 void vp10_iht4x4_16_add_sse2(const tran_low_t *input, uint8_t *dest, int stride,
                              int tx_type) {
@@ -22,22 +64,50 @@
   in[1] = load_input_data(input + 8);
 
   switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
       idct4_sse2(in);
       idct4_sse2(in);
       break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
       idct4_sse2(in);
       iadst4_sse2(in);
       break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
       iadst4_sse2(in);
       idct4_sse2(in);
       break;
-    case 3:  // ADST_ADST
+    case ADST_ADST:
       iadst4_sse2(in);
       iadst4_sse2(in);
       break;
+#if CONFIG_EXT_TX
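+    // FLIPADST reuses the ADST butterflies; the flip is applied
+    // afterwards, either to the residual (fliplr_*) or to the
+    // destination pointer (FLIPUD_PTR).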
+    case FLIPADST_DCT:
+      idct4_sse2(in);
+      iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
+      break;
+    case DCT_FLIPADST:
+      iadst4_sse2(in);
+      idct4_sse2(in);
+      fliplr_4x4(in);
+      break;
+    case FLIPADST_FLIPADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
+      fliplr_4x4(in);
+      break;
+    case ADST_FLIPADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      fliplr_4x4(in);
+      break;
+    case FLIPADST_ADST:
+      iadst4_sse2(in);
+      iadst4_sse2(in);
+      FLIPUD_PTR(dest, stride, 4);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -52,12 +122,12 @@
 
   // Reconstruction and Store
   {
-    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
+    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 0));
+    __m128i d1 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 1));
     __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
-    d0 = _mm_unpacklo_epi32(d0,
-                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
-    d2 = _mm_unpacklo_epi32(
-        d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
+    __m128i d3 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 3));
+    d0 = _mm_unpacklo_epi32(d0, d1);
+    d2 = _mm_unpacklo_epi32(d2, d3);
     d0 = _mm_unpacklo_epi8(d0, zero);
     d2 = _mm_unpacklo_epi8(d2, zero);
     d0 = _mm_add_epi16(d0, in[0]);
@@ -94,22 +164,50 @@
   in[7] = load_input_data(input + 8 * 7);
 
   switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
       idct8_sse2(in);
       idct8_sse2(in);
       break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
       idct8_sse2(in);
       iadst8_sse2(in);
       break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
       iadst8_sse2(in);
       idct8_sse2(in);
       break;
-    case 3:  // ADST_ADST
+    case ADST_ADST:
       iadst8_sse2(in);
       iadst8_sse2(in);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      idct8_sse2(in);
+      iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
+      break;
+    case DCT_FLIPADST:
+      iadst8_sse2(in);
+      idct8_sse2(in);
+      fliplr_8x8(in);
+      break;
+    case FLIPADST_FLIPADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
+      fliplr_8x8(in);
+      break;
+    case ADST_FLIPADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      fliplr_8x8(in);
+      break;
+    case FLIPADST_ADST:
+      iadst8_sse2(in);
+      iadst8_sse2(in);
+      FLIPUD_PTR(dest, stride, 8);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
@@ -146,29 +244,59 @@
 
 void vp10_iht16x16_256_add_sse2(const tran_low_t *input, uint8_t *dest,
                                 int stride, int tx_type) {
-  __m128i in0[16], in1[16];
+  __m128i in[32];
+  __m128i *in0 = &in[0];
+  __m128i *in1 = &in[16];
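+  // Note: in0/in1 point into a single backing array so that FLIPLR_16x16
+  // can swap the two 8-column halves by exchanging pointers, not copying.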
 
   load_buffer_8x16(input, in0);
   input += 8;
   load_buffer_8x16(input, in1);
 
   switch (tx_type) {
-    case 0:  // DCT_DCT
+    case DCT_DCT:
       idct16_sse2(in0, in1);
       idct16_sse2(in0, in1);
       break;
-    case 1:  // ADST_DCT
+    case ADST_DCT:
       idct16_sse2(in0, in1);
       iadst16_sse2(in0, in1);
       break;
-    case 2:  // DCT_ADST
+    case DCT_ADST:
       iadst16_sse2(in0, in1);
       idct16_sse2(in0, in1);
       break;
-    case 3:  // ADST_ADST
+    case ADST_ADST:
       iadst16_sse2(in0, in1);
       iadst16_sse2(in0, in1);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      idct16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      break;
+    case DCT_FLIPADST:
+      iadst16_sse2(in0, in1);
+      idct16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case FLIPADST_FLIPADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case ADST_FLIPADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPLR_16x16(in0, in1);
+      break;
+    case FLIPADST_ADST:
+      iadst16_sse2(in0, in1);
+      iadst16_sse2(in0, in1);
+      FLIPUD_PTR(dest, stride, 16);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
diff --git a/vp10/common/x86/vp10_convolve_filters_ssse3.c b/vp10/common/x86/vp10_convolve_filters_ssse3.c
new file mode 100644
index 0000000..2f7b3c7
--- /dev/null
+++ b/vp10/common/x86/vp10_convolve_filters_ssse3.c
@@ -0,0 +1,942 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "vp10/common/filter.h"
+
+#if CONFIG_EXT_INTERP
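+// Layout note (our reading of the tables below): each of the 15 sub-pel
+// phases stores the 8-bit taps twice, with the second copy shifted by four
+// bytes, so adjacent coefficient pairs line up with the pixel pairs that
+// _mm_maddubs_epi16 consumes in the horizontal pass.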
+DECLARE_ALIGNED(16, const int8_t,
+                sub_pel_filters_10sharp_signal_dir[15][2][16]) = {
+  {
+    {  0,   0,  -1,   3,  -6, 127,   8,  -4,
+       2,  -1,   0,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   0,  -1,   3,  -6, 127,
+       8,  -4,   2,  -1,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,  -2,   5, -12, 124,  18,  -7,
+       3,  -2,   0,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -2,   5, -12, 124,
+      18,  -7,   3,  -2,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,  -3,   7, -17, 119,  28, -11,
+       5,  -2,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -3,   7, -17, 119,
+      28, -11,   5,  -2,   1,   0,   0,   0, },
+  },
+  {
+    {  0,   1,  -4,   8, -20, 114,  38, -14,
+       7,  -3,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -4,   8, -20, 114,
+      38, -14,   7,  -3,   1,   0,   0,   0, },
+  },
+  {
+    {  0,   1,  -4,   9, -22, 107,  49, -17,
+       8,  -4,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -4,   9, -22, 107,
+      49, -17,   8,  -4,   1,   0,   0,   0, },
+  },
+  {
+    {  0,   2,  -5,  10, -24,  99,  59, -20,
+       9,  -4,   2,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   2,  -5,  10, -24,  99,
+      59, -20,   9,  -4,   2,   0,   0,   0, },
+  },
+  {
+    {  0,   2,  -5,  10, -24,  90,  70, -22,
+      10,  -5,   2,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   2,  -5,  10, -24,  90,
+      70, -22,  10,  -5,   2,   0,   0,   0, },
+  },
+  {
+    {  0,   2,  -5,  10, -23,  80,  80, -23,
+      10,  -5,   2,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   2,  -5,  10, -23,  80,
+      80, -23,  10,  -5,   2,   0,   0,   0, },
+  },
+  {
+    {  0,   2,  -5,  10, -22,  70,  90, -24,
+      10,  -5,   2,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   2,  -5,  10, -22,  70,
+      90, -24,  10,  -5,   2,   0,   0,   0, },
+  },
+  {
+    {  0,   2,  -4,   9, -20,  59,  99, -24,
+      10,  -5,   2,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   2,  -4,   9, -20,  59,
+      99, -24,  10,  -5,   2,   0,   0,   0, },
+  },
+  {
+    {  0,   1,  -4,   8, -17,  49, 107, -22,
+       9,  -4,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -4,   8, -17,  49,
+     107, -22,   9,  -4,   1,   0,   0,   0, },
+  },
+  {
+    {  0,   1,  -3,   7, -14,  38, 114, -20,
+       8,  -4,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -3,   7, -14,  38,
+     114, -20,   8,  -4,   1,   0,   0,   0, },
+  },
+  {
+    {  0,   1,  -2,   5, -11,  28, 119, -17,
+       7,  -3,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -2,   5, -11,  28,
+     119, -17,   7,  -3,   1,   0,   0,   0, },
+  },
+  {
+    {  0,   0,  -2,   3,  -7,  18, 124, -12,
+       5,  -2,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   0,  -2,   3,  -7,  18,
+     124, -12,   5,  -2,   1,   0,   0,   0, },
+  },
+  {
+    {  0,   0,  -1,   2,  -4,   8, 127,  -6,
+       3,  -1,   0,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   0,  -1,   2,  -4,   8,
+     127,  -6,   3,  -1,   0,   0,   0,   0, },
+  },
+};
+#endif
+#if CONFIG_EXT_INTERP
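+// The *_ver_signal_dir tables (again, our reading): each phase stores six
+// vectors, each broadcasting one adjacent pair of taps across the whole
+// register, matching a vertical pass that feeds interleaved row pairs to
+// _mm_maddubs_epi16.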
+DECLARE_ALIGNED(16, const int8_t,
+                sub_pel_filters_10sharp_ver_signal_dir[15][6][16]) = {
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0, },
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6, 127,  -6, 127,  -6, 127,  -6, 127,
+      -6, 127,  -6, 127,  -6, 127,  -6, 127, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4,
+       8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+    {  0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   5,  -2,   5,  -2,   5,  -2,   5,
+      -2,   5,  -2,   5,  -2,   5,  -2,   5, },
+    {-12, 124, -12, 124, -12, 124, -12, 124,
+     -12, 124, -12, 124, -12, 124, -12, 124, },
+    { 18,  -7,  18,  -7,  18,  -7,  18,  -7,
+      18,  -7,  18,  -7,  18,  -7,  18,  -7, },
+    {  3,  -2,   3,  -2,   3,  -2,   3,  -2,
+       3,  -2,   3,  -2,   3,  -2,   3,  -2, },
+    {  0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -3,   7,  -3,   7,  -3,   7,  -3,   7,
+      -3,   7,  -3,   7,  -3,   7,  -3,   7, },
+    {-17, 119, -17, 119, -17, 119, -17, 119,
+     -17, 119, -17, 119, -17, 119, -17, 119, },
+    { 28, -11,  28, -11,  28, -11,  28, -11,
+      28, -11,  28, -11,  28, -11,  28, -11, },
+    {  5,  -2,   5,  -2,   5,  -2,   5,  -2,
+       5,  -2,   5,  -2,   5,  -2,   5,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8,
+      -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-20, 114, -20, 114, -20, 114, -20, 114,
+     -20, 114, -20, 114, -20, 114, -20, 114, },
+    { 38, -14,  38, -14,  38, -14,  38, -14,
+      38, -14,  38, -14,  38, -14,  38, -14, },
+    {  7,  -3,   7,  -3,   7,  -3,   7,  -3,
+       7,  -3,   7,  -3,   7,  -3,   7,  -3, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -4,   9,  -4,   9,  -4,   9,  -4,   9,
+      -4,   9,  -4,   9,  -4,   9,  -4,   9, },
+    {-22, 107, -22, 107, -22, 107, -22, 107,
+     -22, 107, -22, 107, -22, 107, -22, 107, },
+    { 49, -17,  49, -17,  49, -17,  49, -17,
+      49, -17,  49, -17,  49, -17,  49, -17, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4,
+       8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2,
+       0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10,
+      -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-24,  99, -24,  99, -24,  99, -24,  99,
+     -24,  99, -24,  99, -24,  99, -24,  99, },
+    { 59, -20,  59, -20,  59, -20,  59, -20,
+      59, -20,  59, -20,  59, -20,  59, -20, },
+    {  9,  -4,   9,  -4,   9,  -4,   9,  -4,
+       9,  -4,   9,  -4,   9,  -4,   9,  -4, },
+    {  2,   0,   2,   0,   2,   0,   2,   0,
+       2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2,
+       0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10,
+      -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-24,  90, -24,  90, -24,  90, -24,  90,
+     -24,  90, -24,  90, -24,  90, -24,  90, },
+    { 70, -22,  70, -22,  70, -22,  70, -22,
+      70, -22,  70, -22,  70, -22,  70, -22, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5,
+      10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0,
+       2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2,
+       0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10,
+      -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-23,  80, -23,  80, -23,  80, -23,  80,
+     -23,  80, -23,  80, -23,  80, -23,  80, },
+    { 80, -23,  80, -23,  80, -23,  80, -23,
+      80, -23,  80, -23,  80, -23,  80, -23, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5,
+      10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0,
+       2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2,
+       0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10,
+      -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-22,  70, -22,  70, -22,  70, -22,  70,
+     -22,  70, -22,  70, -22,  70, -22,  70, },
+    { 90, -24,  90, -24,  90, -24,  90, -24,
+      90, -24,  90, -24,  90, -24,  90, -24, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5,
+      10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0,
+       2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2,
+       0,   2,   0,   2,   0,   2,   0,   2, },
+    { -4,   9,  -4,   9,  -4,   9,  -4,   9,
+      -4,   9,  -4,   9,  -4,   9,  -4,   9, },
+    {-20,  59, -20,  59, -20,  59, -20,  59,
+     -20,  59, -20,  59, -20,  59, -20,  59, },
+    { 99, -24,  99, -24,  99, -24,  99, -24,
+      99, -24,  99, -24,  99, -24,  99, -24, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5,
+      10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0,
+       2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8,
+      -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-17,  49, -17,  49, -17,  49, -17,  49,
+     -17,  49, -17,  49, -17,  49, -17,  49, },
+    {107, -22, 107, -22, 107, -22, 107, -22,
+     107, -22, 107, -22, 107, -22, 107, -22, },
+    {  9,  -4,   9,  -4,   9,  -4,   9,  -4,
+       9,  -4,   9,  -4,   9,  -4,   9,  -4, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -3,   7,  -3,   7,  -3,   7,  -3,   7,
+      -3,   7,  -3,   7,  -3,   7,  -3,   7, },
+    {-14,  38, -14,  38, -14,  38, -14,  38,
+     -14,  38, -14,  38, -14,  38, -14,  38, },
+    {114, -20, 114, -20, 114, -20, 114, -20,
+     114, -20, 114, -20, 114, -20, 114, -20, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4,
+       8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   5,  -2,   5,  -2,   5,  -2,   5,
+      -2,   5,  -2,   5,  -2,   5,  -2,   5, },
+    {-11,  28, -11,  28, -11,  28, -11,  28,
+     -11,  28, -11,  28, -11,  28, -11,  28, },
+    {119, -17, 119, -17, 119, -17, 119, -17,
+     119, -17, 119, -17, 119, -17, 119, -17, },
+    {  7,  -3,   7,  -3,   7,  -3,   7,  -3,
+       7,  -3,   7,  -3,   7,  -3,   7,  -3, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0, },
+    { -2,   3,  -2,   3,  -2,   3,  -2,   3,
+      -2,   3,  -2,   3,  -2,   3,  -2,   3, },
+    { -7,  18,  -7,  18,  -7,  18,  -7,  18,
+      -7,  18,  -7,  18,  -7,  18,  -7,  18, },
+    {124, -12, 124, -12, 124, -12, 124, -12,
+     124, -12, 124, -12, 124, -12, 124, -12, },
+    {  5,  -2,   5,  -2,   5,  -2,   5,  -2,
+       5,  -2,   5,  -2,   5,  -2,   5,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0, },
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8,
+      -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {127,  -6, 127,  -6, 127,  -6, 127,  -6,
+     127,  -6, 127,  -6, 127,  -6, 127,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+    {  0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+};
+#endif
+#if CONFIG_EXT_INTERP
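+// 12-tap variant of the signal_dir layout described above.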
+DECLARE_ALIGNED(16, const int8_t,
+                sub_pel_filters_12sharp_signal_dir[15][2][16]) = {
+  {
+    {  0,   1,  -2,   3,  -7, 127,   8,  -4,
+       2,  -1,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -2,   3,  -7, 127,
+       8,  -4,   2,  -1,   1,   0,   0,   0, },
+  },
+  {
+    { -1,   2,  -3,   6, -13, 124,  18,  -8,
+       4,  -2,   2,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   2,  -3,   6, -13, 124,
+      18,  -8,   4,  -2,   2,  -1,   0,   0, },
+  },
+  {
+    { -1,   3,  -4,   8, -18, 120,  28, -12,
+       7,  -4,   2,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -4,   8, -18, 120,
+      28, -12,   7,  -4,   2,  -1,   0,   0, },
+  },
+  {
+    { -1,   3,  -6,  10, -21, 115,  38, -15,
+       8,  -5,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -6,  10, -21, 115,
+      38, -15,   8,  -5,   3,  -1,   0,   0, },
+  },
+  {
+    { -2,   4,  -6,  12, -24, 108,  49, -18,
+      10,  -6,   3,  -2,   0,   0,   0,   0, },
+    {  0,   0,  -2,   4,  -6,  12, -24, 108,
+      49, -18,  10,  -6,   3,  -2,   0,   0, },
+  },
+  {
+    { -2,   4,  -7,  13, -25, 100,  60, -21,
+      11,  -7,   4,  -2,   0,   0,   0,   0, },
+    {  0,   0,  -2,   4,  -7,  13, -25, 100,
+      60, -21,  11,  -7,   4,  -2,   0,   0, },
+  },
+  {
+    { -2,   4,  -7,  13, -26,  91,  71, -24,
+      13,  -7,   4,  -2,   0,   0,   0,   0, },
+    {  0,   0,  -2,   4,  -7,  13, -26,  91,
+      71, -24,  13,  -7,   4,  -2,   0,   0, },
+  },
+  {
+    { -2,   4,  -7,  13, -25,  81,  81, -25,
+      13,  -7,   4,  -2,   0,   0,   0,   0, },
+    {  0,   0,  -2,   4,  -7,  13, -25,  81,
+      81, -25,  13,  -7,   4,  -2,   0,   0, },
+  },
+  {
+    { -2,   4,  -7,  13, -24,  71,  91, -26,
+      13,  -7,   4,  -2,   0,   0,   0,   0, },
+    {  0,   0,  -2,   4,  -7,  13, -24,  71,
+      91, -26,  13,  -7,   4,  -2,   0,   0, },
+  },
+  {
+    { -2,   4,  -7,  11, -21,  60, 100, -25,
+      13,  -7,   4,  -2,   0,   0,   0,   0, },
+    {  0,   0,  -2,   4,  -7,  11, -21,  60,
+     100, -25,  13,  -7,   4,  -2,   0,   0, },
+  },
+  {
+    { -2,   3,  -6,  10, -18,  49, 108, -24,
+      12,  -6,   4,  -2,   0,   0,   0,   0, },
+    {  0,   0,  -2,   3,  -6,  10, -18,  49,
+     108, -24,  12,  -6,   4,  -2,   0,   0, },
+  },
+  {
+    { -1,   3,  -5,   8, -15,  38, 115, -21,
+      10,  -6,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -5,   8, -15,  38,
+     115, -21,  10,  -6,   3,  -1,   0,   0, },
+  },
+  {
+    { -1,   2,  -4,   7, -12,  28, 120, -18,
+       8,  -4,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   2,  -4,   7, -12,  28,
+     120, -18,   8,  -4,   3,  -1,   0,   0, },
+  },
+  {
+    { -1,   2,  -2,   4,  -8,  18, 124, -13,
+       6,  -3,   2,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   2,  -2,   4,  -8,  18,
+     124, -13,   6,  -3,   2,  -1,   0,   0, },
+  },
+  {
+    {  0,   1,  -1,   2,  -4,   8, 127,  -7,
+       3,  -2,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -1,   2,  -4,   8,
+     127,  -7,   3,  -2,   1,   0,   0,   0, },
+  },
+};
+#endif
+#if CONFIG_EXT_INTERP
+DECLARE_ALIGNED(16, const int8_t,
+                sub_pel_filters_12sharp_ver_signal_dir[15][6][16]) = {
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   3,  -2,   3,  -2,   3,  -2,   3,
+      -2,   3,  -2,   3,  -2,   3,  -2,   3, },
+    { -7, 127,  -7, 127,  -7, 127,  -7, 127,
+      -7, 127,  -7, 127,  -7, 127,  -7, 127, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4,
+       8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -3,   6,  -3,   6,  -3,   6,  -3,   6,
+      -3,   6,  -3,   6,  -3,   6,  -3,   6, },
+    {-13, 124, -13, 124, -13, 124, -13, 124,
+     -13, 124, -13, 124, -13, 124, -13, 124, },
+    { 18,  -8,  18,  -8,  18,  -8,  18,  -8,
+      18,  -8,  18,  -8,  18,  -8,  18,  -8, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2,
+       4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8,
+      -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-18, 120, -18, 120, -18, 120, -18, 120,
+     -18, 120, -18, 120, -18, 120, -18, 120, },
+    { 28, -12,  28, -12,  28, -12,  28, -12,
+      28, -12,  28, -12,  28, -12,  28, -12, },
+    {  7,  -4,   7,  -4,   7,  -4,   7,  -4,
+       7,  -4,   7,  -4,   7,  -4,   7,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  10,  -6,  10,  -6,  10,  -6,  10,
+      -6,  10,  -6,  10,  -6,  10,  -6,  10, },
+    {-21, 115, -21, 115, -21, 115, -21, 115,
+     -21, 115, -21, 115, -21, 115, -21, 115, },
+    { 38, -15,  38, -15,  38, -15,  38, -15,
+      38, -15,  38, -15,  38, -15,  38, -15, },
+    {  8,  -5,   8,  -5,   8,  -5,   8,  -5,
+       8,  -5,   8,  -5,   8,  -5,   8,  -5, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4,
+      -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12,
+      -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-24, 108, -24, 108, -24, 108, -24, 108,
+     -24, 108, -24, 108, -24, 108, -24, 108, },
+    { 49, -18,  49, -18,  49, -18,  49, -18,
+      49, -18,  49, -18,  49, -18,  49, -18, },
+    { 10,  -6,  10,  -6,  10,  -6,  10,  -6,
+      10,  -6,  10,  -6,  10,  -6,  10,  -6, },
+    {  3,  -2,   3,  -2,   3,  -2,   3,  -2,
+       3,  -2,   3,  -2,   3,  -2,   3,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4,
+      -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13,
+      -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-25, 100, -25, 100, -25, 100, -25, 100,
+     -25, 100, -25, 100, -25, 100, -25, 100, },
+    { 60, -21,  60, -21,  60, -21,  60, -21,
+      60, -21,  60, -21,  60, -21,  60, -21, },
+    { 11,  -7,  11,  -7,  11,  -7,  11,  -7,
+      11,  -7,  11,  -7,  11,  -7,  11,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2,
+       4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4,
+      -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13,
+      -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-26,  91, -26,  91, -26,  91, -26,  91,
+     -26,  91, -26,  91, -26,  91, -26,  91, },
+    { 71, -24,  71, -24,  71, -24,  71, -24,
+      71, -24,  71, -24,  71, -24,  71, -24, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7,
+      13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2,
+       4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4,
+      -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13,
+      -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-25,  81, -25,  81, -25,  81, -25,  81,
+     -25,  81, -25,  81, -25,  81, -25,  81, },
+    { 81, -25,  81, -25,  81, -25,  81, -25,
+      81, -25,  81, -25,  81, -25,  81, -25, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7,
+      13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2,
+       4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4,
+      -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13,
+      -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-24,  71, -24,  71, -24,  71, -24,  71,
+     -24,  71, -24,  71, -24,  71, -24,  71, },
+    { 91, -26,  91, -26,  91, -26,  91, -26,
+      91, -26,  91, -26,  91, -26,  91, -26, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7,
+      13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2,
+       4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4,
+      -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  11,  -7,  11,  -7,  11,  -7,  11,
+      -7,  11,  -7,  11,  -7,  11,  -7,  11, },
+    {-21,  60, -21,  60, -21,  60, -21,  60,
+     -21,  60, -21,  60, -21,  60, -21,  60, },
+    {100, -25, 100, -25, 100, -25, 100, -25,
+     100, -25, 100, -25, 100, -25, 100, -25, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7,
+      13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2,
+       4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   3,  -2,   3,  -2,   3,  -2,   3,
+      -2,   3,  -2,   3,  -2,   3,  -2,   3, },
+    { -6,  10,  -6,  10,  -6,  10,  -6,  10,
+      -6,  10,  -6,  10,  -6,  10,  -6,  10, },
+    {-18,  49, -18,  49, -18,  49, -18,  49,
+     -18,  49, -18,  49, -18,  49, -18,  49, },
+    {108, -24, 108, -24, 108, -24, 108, -24,
+     108, -24, 108, -24, 108, -24, 108, -24, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6,
+      12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2,
+       4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -5,   8,  -5,   8,  -5,   8,  -5,   8,
+      -5,   8,  -5,   8,  -5,   8,  -5,   8, },
+    {-15,  38, -15,  38, -15,  38, -15,  38,
+     -15,  38, -15,  38, -15,  38, -15,  38, },
+    {115, -21, 115, -21, 115, -21, 115, -21,
+     115, -21, 115, -21, 115, -21, 115, -21, },
+    { 10,  -6,  10,  -6,  10,  -6,  10,  -6,
+      10,  -6,  10,  -6,  10,  -6,  10,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   7,  -4,   7,  -4,   7,  -4,   7,
+      -4,   7,  -4,   7,  -4,   7,  -4,   7, },
+    {-12,  28, -12,  28, -12,  28, -12,  28,
+     -12,  28, -12,  28, -12,  28, -12,  28, },
+    {120, -18, 120, -18, 120, -18, 120, -18,
+     120, -18, 120, -18, 120, -18, 120, -18, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4,
+       8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4,
+      -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -8,  18,  -8,  18,  -8,  18,  -8,  18,
+      -8,  18,  -8,  18,  -8,  18,  -8,  18, },
+    {124, -13, 124, -13, 124, -13, 124, -13,
+     124, -13, 124, -13, 124, -13, 124, -13, },
+    {  6,  -3,   6,  -3,   6,  -3,   6,  -3,
+       6,  -3,   6,  -3,   6,  -3,   6,  -3, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8,
+      -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {127,  -7, 127,  -7, 127,  -7, 127,  -7,
+     127,  -7, 127,  -7, 127,  -7, 127,  -7, },
+    {  3,  -2,   3,  -2,   3,  -2,   3,  -2,
+       3,  -2,   3,  -2,   3,  -2,   3,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+};
+#endif
+#if USE_TEMPORALFILTER_12TAP
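+// Same shifted-copy layout as the tables above, here for the 12-tap
+// temporal-filter kernel.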
+DECLARE_ALIGNED(16, const int8_t,
+                sub_pel_filters_temporalfilter_12_signal_dir[15][2][16]) = {
+  {
+    {  0,   1,  -1,   3,  -7, 127,   8,  -4,
+       2,  -1,   0,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -1,   3,  -7, 127,
+       8,  -4,   2,  -1,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,  -3,   5, -12, 124,  18,  -8,
+       4,  -2,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -3,   5, -12, 124,
+      18,  -8,   4,  -2,   1,   0,   0,   0, },
+  },
+  {
+    { -1,   2,  -4,   8, -17, 120,  28, -11,
+       6,  -3,   1,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   2,  -4,   8, -17, 120,
+      28, -11,   6,  -3,   1,  -1,   0,   0, },
+  },
+  {
+    { -1,   2,  -4,  10, -21, 114,  38, -15,
+       8,  -4,   2,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   2,  -4,  10, -21, 114,
+      38, -15,   8,  -4,   2,  -1,   0,   0, },
+  },
+  {
+    { -1,   3,  -5,  11, -23, 107,  49, -18,
+       9,  -5,   2,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -5,  11, -23, 107,
+      49, -18,   9,  -5,   2,  -1,   0,   0, },
+  },
+  {
+    { -1,   3,  -6,  12, -25,  99,  60, -21,
+      11,  -6,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -6,  12, -25,  99,
+      60, -21,  11,  -6,   3,  -1,   0,   0, },
+  },
+  {
+    { -1,   3,  -6,  12, -25,  90,  70, -23,
+      12,  -6,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -6,  12, -25,  90,
+      70, -23,  12,  -6,   3,  -1,   0,   0, },
+  },
+  {
+    { -1,   3,  -6,  12, -24,  80,  80, -24,
+      12,  -6,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -6,  12, -24,  80,
+      80, -24,  12,  -6,   3,  -1,   0,   0, },
+  },
+  {
+    { -1,   3,  -6,  12, -23,  70,  90, -25,
+      12,  -6,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -6,  12, -23,  70,
+      90, -25,  12,  -6,   3,  -1,   0,   0, },
+  },
+  {
+    { -1,   3,  -6,  11, -21,  60,  99, -25,
+      12,  -6,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   3,  -6,  11, -21,  60,
+      99, -25,  12,  -6,   3,  -1,   0,   0, },
+  },
+  {
+    { -1,   2,  -5,   9, -18,  49, 107, -23,
+      11,  -5,   3,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   2,  -5,   9, -18,  49,
+     107, -23,  11,  -5,   3,  -1,   0,   0, },
+  },
+  {
+    { -1,   2,  -4,   8, -15,  38, 114, -21,
+      10,  -4,   2,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   2,  -4,   8, -15,  38,
+     114, -21,  10,  -4,   2,  -1,   0,   0, },
+  },
+  {
+    { -1,   1,  -3,   6, -11,  28, 120, -17,
+       8,  -4,   2,  -1,   0,   0,   0,   0, },
+    {  0,   0,  -1,   1,  -3,   6, -11,  28,
+     120, -17,   8,  -4,   2,  -1,   0,   0, },
+  },
+  {
+    {  0,   1,  -2,   4,  -8,  18, 124, -12,
+       5,  -3,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   1,  -2,   4,  -8,  18,
+     124, -12,   5,  -3,   1,   0,   0,   0, },
+  },
+  {
+    {  0,   0,  -1,   2,  -4,   8, 127,  -7,
+       3,  -1,   1,   0,   0,   0,   0,   0, },
+    {  0,   0,   0,   0,  -1,   2,  -4,   8,
+     127,  -7,   3,  -1,   1,   0,   0,   0, },
+  },
+};
+#endif
+#if USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(16, const int8_t,
+                sub_pel_filters_temporalfilter_12_ver_signal_dir[15][6][16]) = {
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -7, 127,  -7, 127,  -7, 127,  -7, 127,
+      -7, 127,  -7, 127,  -7, 127,  -7, 127, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4,
+       8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+    {  0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -3,   5,  -3,   5,  -3,   5,  -3,   5,
+      -3,   5,  -3,   5,  -3,   5,  -3,   5, },
+    {-12, 124, -12, 124, -12, 124, -12, 124,
+     -12, 124, -12, 124, -12, 124, -12, 124, },
+    { 18,  -8,  18,  -8,  18,  -8,  18,  -8,
+      18,  -8,  18,  -8,  18,  -8,  18,  -8, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2,
+       4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8,
+      -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-17, 120, -17, 120, -17, 120, -17, 120,
+     -17, 120, -17, 120, -17, 120, -17, 120, },
+    { 28, -11,  28, -11,  28, -11,  28, -11,
+      28, -11,  28, -11,  28, -11,  28, -11, },
+    {  6,  -3,   6,  -3,   6,  -3,   6,  -3,
+       6,  -3,   6,  -3,   6,  -3,   6,  -3, },
+    {  1,  -1,   1,  -1,   1,  -1,   1,  -1,
+       1,  -1,   1,  -1,   1,  -1,   1,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,  10,  -4,  10,  -4,  10,  -4,  10,
+      -4,  10,  -4,  10,  -4,  10,  -4,  10, },
+    {-21, 114, -21, 114, -21, 114, -21, 114,
+     -21, 114, -21, 114, -21, 114, -21, 114, },
+    { 38, -15,  38, -15,  38, -15,  38, -15,
+      38, -15,  38, -15,  38, -15,  38, -15, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4,
+       8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -5,  11,  -5,  11,  -5,  11,  -5,  11,
+      -5,  11,  -5,  11,  -5,  11,  -5,  11, },
+    {-23, 107, -23, 107, -23, 107, -23, 107,
+     -23, 107, -23, 107, -23, 107, -23, 107, },
+    { 49, -18,  49, -18,  49, -18,  49, -18,
+      49, -18,  49, -18,  49, -18,  49, -18, },
+    {  9,  -5,   9,  -5,   9,  -5,   9,  -5,
+       9,  -5,   9,  -5,   9,  -5,   9,  -5, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12,
+      -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-25,  99, -25,  99, -25,  99, -25,  99,
+     -25,  99, -25,  99, -25,  99, -25,  99, },
+    { 60, -21,  60, -21,  60, -21,  60, -21,
+      60, -21,  60, -21,  60, -21,  60, -21, },
+    { 11,  -6,  11,  -6,  11,  -6,  11,  -6,
+      11,  -6,  11,  -6,  11,  -6,  11,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12,
+      -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-25,  90, -25,  90, -25,  90, -25,  90,
+     -25,  90, -25,  90, -25,  90, -25,  90, },
+    { 70, -23,  70, -23,  70, -23,  70, -23,
+      70, -23,  70, -23,  70, -23,  70, -23, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6,
+      12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12,
+      -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-24,  80, -24,  80, -24,  80, -24,  80,
+     -24,  80, -24,  80, -24,  80, -24,  80, },
+    { 80, -24,  80, -24,  80, -24,  80, -24,
+      80, -24,  80, -24,  80, -24,  80, -24, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6,
+      12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12,
+      -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-23,  70, -23,  70, -23,  70, -23,  70,
+     -23,  70, -23,  70, -23,  70, -23,  70, },
+    { 90, -25,  90, -25,  90, -25,  90, -25,
+      90, -25,  90, -25,  90, -25,  90, -25, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6,
+      12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3,
+      -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  11,  -6,  11,  -6,  11,  -6,  11,
+      -6,  11,  -6,  11,  -6,  11,  -6,  11, },
+    {-21,  60, -21,  60, -21,  60, -21,  60,
+     -21,  60, -21,  60, -21,  60, -21,  60, },
+    { 99, -25,  99, -25,  99, -25,  99, -25,
+      99, -25,  99, -25,  99, -25,  99, -25, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6,
+      12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -5,   9,  -5,   9,  -5,   9,  -5,   9,
+      -5,   9,  -5,   9,  -5,   9,  -5,   9, },
+    {-18,  49, -18,  49, -18,  49, -18,  49,
+     -18,  49, -18,  49, -18,  49, -18,  49, },
+    {107, -23, 107, -23, 107, -23, 107, -23,
+     107, -23, 107, -23, 107, -23, 107, -23, },
+    { 11,  -5,  11,  -5,  11,  -5,  11,  -5,
+      11,  -5,  11,  -5,  11,  -5,  11,  -5, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8,
+      -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-15,  38, -15,  38, -15,  38, -15,  38,
+     -15,  38, -15,  38, -15,  38, -15,  38, },
+    {114, -21, 114, -21, 114, -21, 114, -21,
+     114, -21, 114, -21, 114, -21, 114, -21, },
+    { 10,  -4,  10,  -4,  10,  -4,  10,  -4,
+      10,  -4,  10,  -4,  10,  -4,  10,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   1,  -1,   1,  -1,   1,  -1,   1,
+      -1,   1,  -1,   1,  -1,   1,  -1,   1, },
+    { -3,   6,  -3,   6,  -3,   6,  -3,   6,
+      -3,   6,  -3,   6,  -3,   6,  -3,   6, },
+    {-11,  28, -11,  28, -11,  28, -11,  28,
+     -11,  28, -11,  28, -11,  28, -11,  28, },
+    {120, -17, 120, -17, 120, -17, 120, -17,
+     120, -17, 120, -17, 120, -17, 120, -17, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4,
+       8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1,
+       2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1,
+       0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4,
+      -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -8,  18,  -8,  18,  -8,  18,  -8,  18,
+      -8,  18,  -8,  18,  -8,  18,  -8,  18, },
+    {124, -12, 124, -12, 124, -12, 124, -12,
+     124, -12, 124, -12, 124, -12, 124, -12, },
+    {  5,  -3,   5,  -3,   5,  -3,   5,  -3,
+       5,  -3,   5,  -3,   5,  -3,   5,  -3, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0,
+       0,   0,   0,   0,   0,   0,   0,   0, },
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2,
+      -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8,
+      -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {127,  -7, 127,  -7, 127,  -7, 127,  -7,
+     127,  -7, 127,  -7, 127,  -7, 127,  -7, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1,
+       3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+    {  1,   0,   1,   0,   1,   0,   1,   0,
+       1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+};
+#endif
diff --git a/vp10/common/x86/vp10_convolve_ssse3.c b/vp10/common/x86/vp10_convolve_ssse3.c
new file mode 100644
index 0000000..07dc11d
--- /dev/null
+++ b/vp10/common/x86/vp10_convolve_ssse3.c
@@ -0,0 +1,903 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <tmmintrin.h>
+
+#include "./vp10_rtcd.h"
+#include "vp10/common/filter.h"
+
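+// Note (assumption; the callers are not part of this change): these bounds
+// presumably gate which block sizes take the 8-pixel vs. 4-pixel parallel
+// paths below.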
+#define WIDTH_BOUND  (16)
+#define HEIGHT_BOUND (16)
+
+static INLINE void transpose_4x8(const __m128i *in, __m128i *out) {
+  __m128i t0, t1;
+
+  t0 = _mm_unpacklo_epi16(in[0], in[1]);
+  t1 = _mm_unpacklo_epi16(in[2], in[3]);
+
+  out[0] = _mm_unpacklo_epi32(t0, t1);
+  out[1] = _mm_srli_si128(out[0], 8);
+  out[2] = _mm_unpackhi_epi32(t0, t1);
+  out[3] = _mm_srli_si128(out[2], 8);
+
+  t0 = _mm_unpackhi_epi16(in[0], in[1]);
+  t1 = _mm_unpackhi_epi16(in[2], in[3]);
+
+  out[4] = _mm_unpacklo_epi32(t0, t1);
+  out[5] = _mm_srli_si128(out[4], 8);
+  // Note: We ignore out[6] and out[7] because
+  // they're zero vectors.
+}
+
+typedef void (*store_pixel_t)(const __m128i *x, uint8_t *dst);
+
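+// accumulate_store returns the rounded average of the filtered value *x and
+// the pixels already at src, i.e. (x + pixel + 1) >> 1; presumably this is
+// the store path for the averaging convolve variant.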
+static INLINE __m128i accumulate_store(const __m128i *x, uint8_t *src) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i y = _mm_loadl_epi64((__m128i const *)src);
+  y = _mm_unpacklo_epi8(y, zero);
+  y = _mm_add_epi16(*x, y);
+  y = _mm_add_epi16(y, one);
+  y = _mm_srai_epi16(y, 1);
+  y = _mm_packus_epi16(y, y);
+  return y;
+}
+
+static INLINE void store_2_pixel_only(const __m128i *x, uint8_t *dst) {
+  uint32_t temp;
+  __m128i u = _mm_packus_epi16(*x, *x);
+  temp = _mm_cvtsi128_si32(u);
+  *(uint16_t *)dst = (uint16_t)temp;
+}
+
+static INLINE void accumulate_store_2_pixel(const __m128i *x, uint8_t *dst) {
+  uint32_t temp;
+  __m128i y = accumulate_store(x, dst);
+  temp = _mm_cvtsi128_si32(y);
+  *(uint16_t *)dst = (uint16_t)temp;
+}
+
+static store_pixel_t store2pixelTab[2] = {
+  store_2_pixel_only, accumulate_store_2_pixel
+};
+
+static INLINE void store_4_pixel_only(const __m128i *x, uint8_t *dst) {
+  __m128i u = _mm_packus_epi16(*x, *x);
+  *(int *)dst = _mm_cvtsi128_si32(u);
+}
+
+static INLINE void accumulate_store_4_pixel(const __m128i *x, uint8_t *dst) {
+  __m128i y = accumulate_store(x, dst);
+  *(int *)dst = _mm_cvtsi128_si32(y);
+}
+
+static store_pixel_t store4pixelTab[2] = {
+  store_4_pixel_only, accumulate_store_4_pixel
+};
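+
+// In both store tables, index 0 writes dst directly and index 1 averages
+// into it via accumulate_store; the caller presumably indexes by its
+// average/overwrite flag.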
+
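+// horiz_w4_ssse3, sketched: two unaligned loads offset by one pixel are
+// multiplied against interleaved tap pairs with _mm_maddubs_epi16, a 4x8
+// transpose gathers the partial sums belonging to each output pixel, and
+// the min/max pair keeps the saturating adds order-independent. The final
+// _mm_mulhrs_epi16 with 1 << 8 is a rounding right shift by FILTER_BITS,
+// assuming FILTER_BITS == 7 as elsewhere in the codebase.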
+static void horiz_w4_ssse3(const uint8_t *src, const __m128i *f,
+                           int tapsNum, store_pixel_t store_func,
+                           uint8_t *dst) {
+  __m128i sumPairRow[4];
+  __m128i sumPairCol[8];
+  __m128i pixel;
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i zero = _mm_setzero_si128();
+
+  if (10 == tapsNum) {
+    src -= 1;
+  }
+
+  pixel = _mm_loadu_si128((__m128i const *)src);
+  sumPairRow[0] = _mm_maddubs_epi16(pixel, f[0]);
+  sumPairRow[2] = _mm_maddubs_epi16(pixel, f[1]);
+  sumPairRow[2] = _mm_srli_si128(sumPairRow[2], 2);
+
+  pixel = _mm_loadu_si128((__m128i const *)(src + 1));
+  sumPairRow[1] = _mm_maddubs_epi16(pixel, f[0]);
+  sumPairRow[3] = _mm_maddubs_epi16(pixel, f[1]);
+  sumPairRow[3] = _mm_srli_si128(sumPairRow[3], 2);
+
+  transpose_4x8(sumPairRow, sumPairCol);
+
+  sumPairRow[0] = _mm_adds_epi16(sumPairCol[0], sumPairCol[1]);
+  sumPairRow[1] = _mm_adds_epi16(sumPairCol[4], sumPairCol[5]);
+
+  sumPairRow[2] = _mm_min_epi16(sumPairCol[2], sumPairCol[3]);
+  sumPairRow[3] = _mm_max_epi16(sumPairCol[2], sumPairCol[3]);
+
+  sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[1]);
+  sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[2]);
+  sumPairRow[0] = _mm_adds_epi16(sumPairRow[0], sumPairRow[3]);
+
+  sumPairRow[1] = _mm_mulhrs_epi16(sumPairRow[0], k_256);
+  sumPairRow[1] = _mm_packus_epi16(sumPairRow[1], sumPairRow[1]);
+  sumPairRow[1] = _mm_unpacklo_epi8(sumPairRow[1], zero);
+
+  store_func(&sumPairRow[1], dst);
+}
+
+static void horiz_w8_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+                           store_pixel_t store, uint8_t *buf) {
+  horiz_w4_ssse3(src, f, tapsNum, store, buf);
+  src += 4;
+  buf += 4;
+  horiz_w4_ssse3(src, f, tapsNum, store, buf);
+}
+
+static void horiz_w16_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+                            store_pixel_t store, uint8_t *buf) {
+  horiz_w8_ssse3(src, f, tapsNum, store, buf);
+  src += 8;
+  buf += 8;
+  horiz_w8_ssse3(src, f, tapsNum, store, buf);
+}
+
+static void horiz_w32_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+                            store_pixel_t store, uint8_t *buf) {
+  horiz_w16_ssse3(src, f, tapsNum, store, buf);
+  src += 16;
+  buf += 16;
+  horiz_w16_ssse3(src, f, tapsNum, store, buf);
+}
+
+static void horiz_w64_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+                            store_pixel_t store, uint8_t *buf) {
+  horiz_w32_ssse3(src, f, tapsNum, store, buf);
+  src += 32;
+  buf += 32;
+  horiz_w32_ssse3(src, f, tapsNum, store, buf);
+}
+
+static void horiz_w128_ssse3(const uint8_t *src, const __m128i *f, int tapsNum,
+                             store_pixel_t store, uint8_t *buf) {
+  horiz_w64_ssse3(src, f, tapsNum, store, buf);
+  src += 64;
+  buf += 64;
+  horiz_w64_ssse3(src, f, tapsNum, store, buf);
+}
+
+static void (*horizTab[6])(const uint8_t *, const __m128i *, int,
+                           store_pixel_t, uint8_t *) = {
+  horiz_w4_ssse3,
+  horiz_w8_ssse3,
+  horiz_w16_ssse3,
+  horiz_w32_ssse3,
+  horiz_w64_ssse3,
+  horiz_w128_ssse3,
+};
+
+static void filter_horiz_ssse3(const uint8_t *src, __m128i *f, int tapsNum,
+                               int width, store_pixel_t store, uint8_t *dst) {
+  switch (width) {
+    // Note: for width 2 and width 4 the caller must supply the matching
+    // store function (store2pixelTab vs. store4pixelTab).
+    case 2:
+    case 4:
+      horizTab[0](src, f, tapsNum, store, dst);
+      break;
+    case 8:
+      horizTab[1](src, f, tapsNum, store, dst);
+      break;
+    case 16:
+      horizTab[2](src, f, tapsNum, store, dst);
+      break;
+    case 32:
+      horizTab[3](src, f, tapsNum, store, dst);
+      break;
+    case 64:
+      horizTab[4](src, f, tapsNum, store, dst);
+      break;
+    case 128:
+      horizTab[5](src, f, tapsNum, store, dst);
+      break;
+    default:
+      assert(0);
+  }
+}
+
+// Vertical 8-pixel parallel
+typedef void (*transpose_to_dst_t)(const uint16_t *src, int src_stride,
+                                   uint8_t *dst, int dst_stride);
+
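+// The transpose*_to_dst routines take the 16-bit, column-major intermediate
+// written by the horizontal pass, apply the same mulhrs rounding, pack back
+// to 8 bits and transpose into row-major order in dst.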
+static INLINE void transpose8x8_direct_to_dst(const uint16_t *src,
+                                              int src_stride,
+                                              uint8_t *dst,
+                                              int dst_stride) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  __m128i v0, v1, v2, v3;
+
+  __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+  __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+  __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+  __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+  __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
+  __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
+  __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
+  __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
+
+  u0 = _mm_mulhrs_epi16(u0, k_256);
+  u1 = _mm_mulhrs_epi16(u1, k_256);
+  u2 = _mm_mulhrs_epi16(u2, k_256);
+  u3 = _mm_mulhrs_epi16(u3, k_256);
+  u4 = _mm_mulhrs_epi16(u4, k_256);
+  u5 = _mm_mulhrs_epi16(u5, k_256);
+  u6 = _mm_mulhrs_epi16(u6, k_256);
+  u7 = _mm_mulhrs_epi16(u7, k_256);
+
+  v0 = _mm_packus_epi16(u0, u1);
+  v1 = _mm_packus_epi16(u2, u3);
+  v2 = _mm_packus_epi16(u4, u5);
+  v3 = _mm_packus_epi16(u6, u7);
+
+  u0 = _mm_unpacklo_epi8(v0, v1);
+  u1 = _mm_unpackhi_epi8(v0, v1);
+  u2 = _mm_unpacklo_epi8(v2, v3);
+  u3 = _mm_unpackhi_epi8(v2, v3);
+
+  u4 = _mm_unpacklo_epi8(u0, u1);
+  u5 = _mm_unpacklo_epi8(u2, u3);
+  u6 = _mm_unpackhi_epi8(u0, u1);
+  u7 = _mm_unpackhi_epi8(u2, u3);
+
+  u0 = _mm_unpacklo_epi32(u4, u5);
+  u1 = _mm_unpackhi_epi32(u4, u5);
+  u2 = _mm_unpacklo_epi32(u6, u7);
+  u3 = _mm_unpackhi_epi32(u6, u7);
+
+  u4 = _mm_srli_si128(u0, 8);
+  u5 = _mm_srli_si128(u1, 8);
+  u6 = _mm_srli_si128(u2, 8);
+  u7 = _mm_srli_si128(u3, 8);
+
+  _mm_storel_epi64((__m128i*)dst, u0);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 1), u4);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 2), u1);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 3), u5);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 4), u2);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 5), u6);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 6), u3);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 7), u7);
+}
+
+static INLINE void transpose8x8_accumu_to_dst(const uint16_t *src,
+                                              int src_stride,
+                                              uint8_t *dst,
+                                              int dst_stride) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+
+  __m128i u0 = _mm_loadu_si128((__m128i const *)(src + 0 * src_stride));
+  __m128i u1 = _mm_loadu_si128((__m128i const *)(src + 1 * src_stride));
+  __m128i u2 = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+  __m128i u3 = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+  __m128i u4 = _mm_loadu_si128((__m128i const *)(src + 4 * src_stride));
+  __m128i u5 = _mm_loadu_si128((__m128i const *)(src + 5 * src_stride));
+  __m128i u6 = _mm_loadu_si128((__m128i const *)(src + 6 * src_stride));
+  __m128i u7 = _mm_loadu_si128((__m128i const *)(src + 7 * src_stride));
+
+  u0 = _mm_mulhrs_epi16(u0, k_256);
+  u1 = _mm_mulhrs_epi16(u1, k_256);
+  u2 = _mm_mulhrs_epi16(u2, k_256);
+  u3 = _mm_mulhrs_epi16(u3, k_256);
+  u4 = _mm_mulhrs_epi16(u4, k_256);
+  u5 = _mm_mulhrs_epi16(u5, k_256);
+  u6 = _mm_mulhrs_epi16(u6, k_256);
+  u7 = _mm_mulhrs_epi16(u7, k_256);
+
+  v0 = _mm_packus_epi16(u0, u1);
+  v1 = _mm_packus_epi16(u2, u3);
+  v2 = _mm_packus_epi16(u4, u5);
+  v3 = _mm_packus_epi16(u6, u7);
+
+  u0 = _mm_unpacklo_epi8(v0, v1);
+  u1 = _mm_unpackhi_epi8(v0, v1);
+  u2 = _mm_unpacklo_epi8(v2, v3);
+  u3 = _mm_unpackhi_epi8(v2, v3);
+
+  u4 = _mm_unpacklo_epi8(u0, u1);
+  u5 = _mm_unpacklo_epi8(u2, u3);
+  u6 = _mm_unpackhi_epi8(u0, u1);
+  u7 = _mm_unpackhi_epi8(u2, u3);
+
+  u0 = _mm_unpacklo_epi32(u4, u5);
+  u1 = _mm_unpackhi_epi32(u4, u5);
+  u2 = _mm_unpacklo_epi32(u6, u7);
+  u3 = _mm_unpackhi_epi32(u6, u7);
+
+  u4 = _mm_srli_si128(u0, 8);
+  u5 = _mm_srli_si128(u1, 8);
+  u6 = _mm_srli_si128(u2, 8);
+  u7 = _mm_srli_si128(u3, 8);
+
+  v0 = _mm_loadl_epi64((__m128i const *)(dst + 0 * dst_stride));
+  v1 = _mm_loadl_epi64((__m128i const *)(dst + 1 * dst_stride));
+  v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
+  v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
+  v4 = _mm_loadl_epi64((__m128i const *)(dst + 4 * dst_stride));
+  v5 = _mm_loadl_epi64((__m128i const *)(dst + 5 * dst_stride));
+  v6 = _mm_loadl_epi64((__m128i const *)(dst + 6 * dst_stride));
+  v7 = _mm_loadl_epi64((__m128i const *)(dst + 7 * dst_stride));
+
+  u0 = _mm_unpacklo_epi8(u0, zero);
+  u1 = _mm_unpacklo_epi8(u1, zero);
+  u2 = _mm_unpacklo_epi8(u2, zero);
+  u3 = _mm_unpacklo_epi8(u3, zero);
+  u4 = _mm_unpacklo_epi8(u4, zero);
+  u5 = _mm_unpacklo_epi8(u5, zero);
+  u6 = _mm_unpacklo_epi8(u6, zero);
+  u7 = _mm_unpacklo_epi8(u7, zero);
+
+  v0 = _mm_unpacklo_epi8(v0, zero);
+  v1 = _mm_unpacklo_epi8(v1, zero);
+  v2 = _mm_unpacklo_epi8(v2, zero);
+  v3 = _mm_unpacklo_epi8(v3, zero);
+  v4 = _mm_unpacklo_epi8(v4, zero);
+  v5 = _mm_unpacklo_epi8(v5, zero);
+  v6 = _mm_unpacklo_epi8(v6, zero);
+  v7 = _mm_unpacklo_epi8(v7, zero);
+
+  v0 = _mm_adds_epi16(u0, v0);
+  v1 = _mm_adds_epi16(u4, v1);
+  v2 = _mm_adds_epi16(u1, v2);
+  v3 = _mm_adds_epi16(u5, v3);
+  v4 = _mm_adds_epi16(u2, v4);
+  v5 = _mm_adds_epi16(u6, v5);
+  v6 = _mm_adds_epi16(u3, v6);
+  v7 = _mm_adds_epi16(u7, v7);
+
+  v0 = _mm_adds_epi16(v0, one);
+  v1 = _mm_adds_epi16(v1, one);
+  v2 = _mm_adds_epi16(v2, one);
+  v3 = _mm_adds_epi16(v3, one);
+  v4 = _mm_adds_epi16(v4, one);
+  v5 = _mm_adds_epi16(v5, one);
+  v6 = _mm_adds_epi16(v6, one);
+  v7 = _mm_adds_epi16(v7, one);
+
+  v0 = _mm_srai_epi16(v0, 1);
+  v1 = _mm_srai_epi16(v1, 1);
+  v2 = _mm_srai_epi16(v2, 1);
+  v3 = _mm_srai_epi16(v3, 1);
+  v4 = _mm_srai_epi16(v4, 1);
+  v5 = _mm_srai_epi16(v5, 1);
+  v6 = _mm_srai_epi16(v6, 1);
+  v7 = _mm_srai_epi16(v7, 1);
+
+  u0 = _mm_packus_epi16(v0, v1);
+  u1 = _mm_packus_epi16(v2, v3);
+  u2 = _mm_packus_epi16(v4, v5);
+  u3 = _mm_packus_epi16(v6, v7);
+
+  u4 = _mm_srli_si128(u0, 8);
+  u5 = _mm_srli_si128(u1, 8);
+  u6 = _mm_srli_si128(u2, 8);
+  u7 = _mm_srli_si128(u3, 8);
+
+  _mm_storel_epi64((__m128i*)dst, u0);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 1), u4);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 2), u1);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 3), u5);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 4), u2);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 5), u6);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 6), u3);
+  _mm_storel_epi64((__m128i*)(dst + dst_stride * 7), u7);
+}
+
+static transpose_to_dst_t trans8x8Tab[2] = {
+  transpose8x8_direct_to_dst, transpose8x8_accumu_to_dst
+};
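+// As with the store_pixel_t tables: index 0 overwrites dst, index 1
+// averages into it.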
+
+static INLINE void transpose_8x16(const __m128i *in, __m128i *out) {
+  __m128i t0, t1, t2, t3, u0, u1;
+
+  t0 = _mm_unpacklo_epi16(in[0], in[1]);
+  t1 = _mm_unpacklo_epi16(in[2], in[3]);
+  t2 = _mm_unpacklo_epi16(in[4], in[5]);
+  t3 = _mm_unpacklo_epi16(in[6], in[7]);
+
+  u0 = _mm_unpacklo_epi32(t0, t1);
+  u1 = _mm_unpacklo_epi32(t2, t3);
+
+  out[0] = _mm_unpacklo_epi64(u0, u1);
+  out[1] = _mm_unpackhi_epi64(u0, u1);
+
+  u0 = _mm_unpackhi_epi32(t0, t1);
+  u1 = _mm_unpackhi_epi32(t2, t3);
+
+  out[2] = _mm_unpacklo_epi64(u0, u1);
+  out[3] = _mm_unpackhi_epi64(u0, u1);
+
+  t0 = _mm_unpackhi_epi16(in[0], in[1]);
+  t1 = _mm_unpackhi_epi16(in[2], in[3]);
+  t2 = _mm_unpackhi_epi16(in[4], in[5]);
+  t3 = _mm_unpackhi_epi16(in[6], in[7]);
+
+  u0 = _mm_unpacklo_epi32(t0, t1);
+  u1 = _mm_unpacklo_epi32(t2, t3);
+
+  out[4] = _mm_unpacklo_epi64(u0, u1);
+  out[5] = _mm_unpackhi_epi64(u0, u1);
+
+  // Ignore out[6] and out[7] because they're zero vectors.
+}
+
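+// filter_horiz_v8p_ssse3 filters an 8-row strip at once: after the 8x16
+// transpose each vector holds the same pixel-pair position across all eight
+// rows, so one maddubs/add sequence yields eight outputs. Results stay in
+// 16-bit form; rounding is deferred to the transpose*_to_dst step.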
+static void filter_horiz_v8p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                   __m128i *f, int tapsNum, uint16_t *buf) {
+  __m128i s[8], t[6];
+  __m128i min_x2x3, max_x2x3;
+  __m128i temp;
+
+  if (tapsNum == 10) {
+    src_ptr -= 1;
+  }
+  s[0] = _mm_loadu_si128((const __m128i *)src_ptr);
+  s[1] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+  s[2] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+  s[3] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+  s[4] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
+  s[5] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
+  s[6] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
+  s[7] = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
+
+  // Transpose so that each vector represents column pixel pairs
+  // instead of a row.
+  transpose_8x16(s, t);
+
+  // multiply 2 adjacent elements with the filter and add the result
+  s[0] = _mm_maddubs_epi16(t[0], f[0]);
+  s[1] = _mm_maddubs_epi16(t[1], f[1]);
+  s[2] = _mm_maddubs_epi16(t[2], f[2]);
+  s[3] = _mm_maddubs_epi16(t[3], f[3]);
+  s[4] = _mm_maddubs_epi16(t[4], f[4]);
+  s[5] = _mm_maddubs_epi16(t[5], f[5]);
+
+  // add and saturate the results together
+  min_x2x3 = _mm_min_epi16(s[2], s[3]);
+  max_x2x3 = _mm_max_epi16(s[2], s[3]);
+  temp = _mm_adds_epi16(s[0], s[1]);
+  temp = _mm_adds_epi16(temp, s[5]);
+  temp = _mm_adds_epi16(temp, s[4]);
+
+  temp = _mm_adds_epi16(temp, min_x2x3);
+  temp = _mm_adds_epi16(temp, max_x2x3);
+
+  _mm_storeu_si128((__m128i *)buf, temp);
+}
+
+// Vertical 4-pixel parallel
+static INLINE void transpose4x4_direct_to_dst(const uint16_t *src,
+                                              int src_stride,
+                                              uint8_t *dst,
+                                              int dst_stride) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  __m128i v0, v1, v2, v3;
+
+  // TODO(luoyi): two loads, 8 elements per load (two bytes per element)
+  __m128i u0 = _mm_loadl_epi64((__m128i const *)(src + 0 * src_stride));
+  __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + 1 * src_stride));
+  __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+  __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+
+  v0 = _mm_unpacklo_epi16(u0, u1);
+  v1 = _mm_unpacklo_epi16(u2, u3);
+
+  v2 = _mm_unpacklo_epi32(v0, v1);
+  v3 = _mm_unpackhi_epi32(v0, v1);
+
+  u0 = _mm_mulhrs_epi16(v2, k_256);
+  u1 = _mm_mulhrs_epi16(v3, k_256);
+
+  u0 = _mm_packus_epi16(u0, u1);
+  u1 = _mm_srli_si128(u0, 4);
+  u2 = _mm_srli_si128(u0, 8);
+  u3 = _mm_srli_si128(u0, 12);
+
+  *(int *)(dst) = _mm_cvtsi128_si32(u0);
+  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
+  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
+  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
+}
+
+static INLINE void transpose4x4_accumu_to_dst(const uint16_t *src,
+                                              int src_stride,
+                                              uint8_t *dst,
+                                              int dst_stride) {
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+
+  __m128i v0, v1, v2, v3;
+
+  __m128i u0 = _mm_loadl_epi64((__m128i const *)(src));
+  __m128i u1 = _mm_loadl_epi64((__m128i const *)(src + src_stride));
+  __m128i u2 = _mm_loadl_epi64((__m128i const *)(src + 2 * src_stride));
+  __m128i u3 = _mm_loadl_epi64((__m128i const *)(src + 3 * src_stride));
+
+  v0 = _mm_unpacklo_epi16(u0, u1);
+  v1 = _mm_unpacklo_epi16(u2, u3);
+
+  v2 = _mm_unpacklo_epi32(v0, v1);
+  v3 = _mm_unpackhi_epi32(v0, v1);
+
+  u0 = _mm_mulhrs_epi16(v2, k_256);
+  u1 = _mm_mulhrs_epi16(v3, k_256);
+
+  u2 = _mm_packus_epi16(u0, u1);
+  u0 = _mm_unpacklo_epi8(u2, zero);
+  u1 = _mm_unpackhi_epi8(u2, zero);
+
+  // load the current destination pixels (needed for averaging)
+  v0 = _mm_loadl_epi64((__m128i const *)(dst));
+  v1 = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
+  v2 = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
+  v3 = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
+
+  v0 = _mm_unpacklo_epi8(v0, zero);
+  v1 = _mm_unpacklo_epi8(v1, zero);
+  v2 = _mm_unpacklo_epi8(v2, zero);
+  v3 = _mm_unpacklo_epi8(v3, zero);
+
+  v0 = _mm_unpacklo_epi64(v0, v1);
+  v1 = _mm_unpacklo_epi64(v2, v3);
+
+  u0 = _mm_adds_epi16(u0, v0);
+  u1 = _mm_adds_epi16(u1, v1);
+
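+  // rounded average with the existing destination: (filtered + dst + 1) >> 1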
+  u0 = _mm_adds_epi16(u0, one);
+  u1 = _mm_adds_epi16(u1, one);
+
+  u0 = _mm_srai_epi16(u0, 1);
+  u1 = _mm_srai_epi16(u1, 1);
+
+  // saturate and pack back to 8-bit pixels
+  u0 = _mm_packus_epi16(u0, u1);
+  u1 = _mm_srli_si128(u0, 4);
+  u2 = _mm_srli_si128(u0, 8);
+  u3 = _mm_srli_si128(u0, 12);
+
+  *(int *)(dst) = _mm_cvtsi128_si32(u0);
+  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u1);
+  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(u2);
+  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(u3);
+}
+
+static transpose_to_dst_t trans4x4Tab[2] = {
+  transpose4x4_direct_to_dst, transpose4x4_accumu_to_dst
+};
+
+static void filter_horiz_v4p_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
+                                   __m128i *f, int tapsNum, uint16_t *buf) {
+  __m128i A, B, C, D;
+  __m128i tr0_0, tr0_1, s1s0, s3s2, s5s4, s7s6, s9s8, sbsa;
+  __m128i x0, x1, x2, x3, x4, x5;
+  __m128i min_x2x3, max_x2x3, temp;
+
+  if (tapsNum == 10) {
+    src_ptr -= 1;
+  }
+  A = _mm_loadu_si128((const __m128i *)src_ptr);
+  B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
+  C = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
+  D = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
+
+  // TRANSPOSE...
+  // Each vector represents column pixel pairs instead of a row
+  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
+  tr0_0 = _mm_unpacklo_epi16(A, B);
+  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
+  tr0_1 = _mm_unpacklo_epi16(C, D);
+  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
+  s1s0  = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
+  s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
+  // 02 03 12 13 22 23 32 33
+  s3s2 = _mm_srli_si128(s1s0, 8);
+  // 06 07 16 17 26 27 36 37
+  s7s6 = _mm_srli_si128(s5s4, 8);
+
+  tr0_0 = _mm_unpackhi_epi16(A, B);
+  tr0_1 = _mm_unpackhi_epi16(C, D);
+  s9s8  = _mm_unpacklo_epi32(tr0_0, tr0_1);
+  sbsa = _mm_srli_si128(s9s8, 8);
+
+  // multiply 2 adjacent elements with the filter and add the result
+  x0 = _mm_maddubs_epi16(s1s0, f[0]);
+  x1 = _mm_maddubs_epi16(s3s2, f[1]);
+  x2 = _mm_maddubs_epi16(s5s4, f[2]);
+  x3 = _mm_maddubs_epi16(s7s6, f[3]);
+  x4 = _mm_maddubs_epi16(s9s8, f[4]);
+  x5 = _mm_maddubs_epi16(sbsa, f[5]);
+  // add and saturate the results together
+  min_x2x3 = _mm_min_epi16(x2, x3);
+  max_x2x3 = _mm_max_epi16(x2, x3);
+  temp = _mm_adds_epi16(x0, x1);
+  temp = _mm_adds_epi16(temp, x5);
+  temp = _mm_adds_epi16(temp, x4);
+
+  temp = _mm_adds_epi16(temp, min_x2x3);
+  temp = _mm_adds_epi16(temp, max_x2x3);
+  _mm_storel_epi64((__m128i *)buf, temp);
+}
+
+// Note:
+//  This function assumes:
+//  (1) a 10- or 12-tap filter
+//  (2) x_step_q4 == 16, i.e. the filter is fixed for the whole call
+
+void vp10_convolve_horiz_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
+                               int dst_stride, int w, int h,
+                               const InterpFilterParams filter_params,
+                               const int subpel_x_q4, int x_step_q4, int avg) {
+  DECLARE_ALIGNED(16, uint16_t, temp[8 * 8]);
+  __m128i verf[6];
+  __m128i horf[2];
+  SubpelFilterCoeffs hCoeffs, vCoeffs;
+  const uint8_t *src_ptr;
+  store_pixel_t store2p = store2pixelTab[avg];
+  store_pixel_t store4p = store4pixelTab[avg];
+  transpose_to_dst_t transpose_4x4 = trans4x4Tab[avg];
+  transpose_to_dst_t transpose_8x8 = trans8x8Tab[avg];
+
+  const int tapsNum = filter_params.taps;
+  int block_height, block_residu;
+  int i, col, count;
+  (void)x_step_q4;
+
+  if (0 == subpel_x_q4 || 16 != x_step_q4) {
+    vp10_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+                          subpel_x_q4, x_step_q4, avg);
+    return;
+  }
+
+  hCoeffs = vp10_get_subpel_filter_signal_dir(
+      filter_params, subpel_x_q4 - 1);
+  vCoeffs = vp10_get_subpel_filter_ver_signal_dir(
+      filter_params, subpel_x_q4 - 1);
+
+  if (!hCoeffs || !vCoeffs) {
+    vp10_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+                          subpel_x_q4, x_step_q4, avg);
+    return;
+  }
+
+  verf[0] = *((const __m128i *)(vCoeffs));
+  verf[1] = *((const __m128i *)(vCoeffs + 1));
+  verf[2] = *((const __m128i *)(vCoeffs + 2));
+  verf[3] = *((const __m128i *)(vCoeffs + 3));
+  verf[4] = *((const __m128i *)(vCoeffs + 4));
+  verf[5] = *((const __m128i *)(vCoeffs + 5));
+
+  horf[0] = *((const __m128i *)(hCoeffs));
+  horf[1] = *((const __m128i *)(hCoeffs + 1));
+
+  count = 0;
+
+  // tapsNum is the filter size; back up (tapsNum / 2 - 1) pixels so the
+  // filter window is centered on the source.
+  src -= (tapsNum >> 1) - 1;
+  src_ptr = src;
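+  // Dispatch on block size: large blocks produce 8 output pixels per pass via
+  // the transpose path, mid-size blocks 4 per pass, and w <= 2 falls back to
+  // plain row-wise filtering.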
+  if (w > WIDTH_BOUND && h > HEIGHT_BOUND) {
+    // 8-pixels parallel
+    block_height = h >> 3;
+    block_residu = h & 7;
+
+    do {
+      for (col = 0; col < w; col += 8) {
+        for (i = 0; i < 8; ++i) {
+          filter_horiz_v8p_ssse3(src_ptr, src_stride, verf, tapsNum,
+                                 temp + (i * 8));
+          src_ptr += 1;
+        }
+        transpose_8x8(temp, 8, dst + col, dst_stride);
+      }
+      count++;
+      src_ptr = src + count * src_stride * 8;
+      dst += dst_stride * 8;
+    } while (count < block_height);
+
+    for (i = 0; i < block_residu; ++i) {
+      filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
+      src_ptr += src_stride;
+      dst += dst_stride;
+    }
+  } else {
+    if (w > 2) {
+      // 4-pixels parallel
+      block_height = h >> 2;
+      block_residu = h & 3;
+
+      do {
+        for (col = 0; col < w; col += 4) {
+          for (i = 0; i < 4; ++i) {
+            filter_horiz_v4p_ssse3(src_ptr, src_stride, verf, tapsNum,
+                                   temp + (i * 4));
+            src_ptr += 1;
+          }
+          transpose_4x4(temp, 4, dst + col, dst_stride);
+        }
+        count++;
+        src_ptr = src + count * src_stride * 4;
+        dst += dst_stride * 4;
+      } while (count < block_height);
+
+      for (i = 0; i < block_residu; ++i) {
+        filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store4p, dst);
+        src_ptr += src_stride;
+        dst += dst_stride;
+      }
+    } else {
+      for (i = 0; i < h; i++) {
+        filter_horiz_ssse3(src_ptr, horf, tapsNum, w, store2p, dst);
+        src_ptr += src_stride;
+        dst += dst_stride;
+      }
+    }
+  }
+}
+
+// Vertical convolution filtering
+static INLINE void store_8_pixel_only(const __m128i *x, uint8_t *dst) {
+  __m128i u = _mm_packus_epi16(*x, *x);
+  _mm_storel_epi64((__m128i *)dst, u);
+}
+
+static INLINE void accumulate_store_8_pixel(const __m128i *x, uint8_t *dst) {
+  __m128i y = accumulate_store(x, dst);
+  _mm_storel_epi64((__m128i *)dst, y);
+}
+
+static store_pixel_t store8pixelTab[2] = {
+  store_8_pixel_only, accumulate_store_8_pixel
+};
+
+static __m128i filter_vert_ssse3(const uint8_t *src, int src_stride,
+                                 int tapsNum, __m128i *f) {
+  __m128i s[12];
+  const __m128i k_256 = _mm_set1_epi16(1 << 8);
+  const __m128i zero = _mm_setzero_si128();
+  __m128i min_x2x3, max_x2x3, sum;
+  int i = 0;
+  int r = 0;
+
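+  // A 10-tap filter runs through the 12-tap pipeline by padding a zero row at
+  // the top; the matching outer filter taps are zero.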
+  if (10 == tapsNum) {
+    i += 1;
+    s[0] = zero;
+  }
+  while (i < 12) {
+    s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
+    i += 1;
+    r += 1;
+  }
+
+  s[0] = _mm_unpacklo_epi8(s[0], s[1]);
+  s[2] = _mm_unpacklo_epi8(s[2], s[3]);
+  s[4] = _mm_unpacklo_epi8(s[4], s[5]);
+  s[6] = _mm_unpacklo_epi8(s[6], s[7]);
+  s[8] = _mm_unpacklo_epi8(s[8], s[9]);
+  s[10] = _mm_unpacklo_epi8(s[10], s[11]);
+
+  s[0] = _mm_maddubs_epi16(s[0], f[0]);
+  s[2] = _mm_maddubs_epi16(s[2], f[1]);
+  s[4] = _mm_maddubs_epi16(s[4], f[2]);
+  s[6] = _mm_maddubs_epi16(s[6], f[3]);
+  s[8] = _mm_maddubs_epi16(s[8], f[4]);
+  s[10] = _mm_maddubs_epi16(s[10], f[5]);
+
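+  // add and saturate; same min/max ordering trick as the horizontal filters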
+  min_x2x3 = _mm_min_epi16(s[4], s[6]);
+  max_x2x3 = _mm_max_epi16(s[4], s[6]);
+  sum = _mm_adds_epi16(s[0], s[2]);
+  sum = _mm_adds_epi16(sum, s[10]);
+  sum = _mm_adds_epi16(sum, s[8]);
+
+  sum = _mm_adds_epi16(sum, min_x2x3);
+  sum = _mm_adds_epi16(sum, max_x2x3);
+
+  sum = _mm_mulhrs_epi16(sum, k_256);
+  sum = _mm_packus_epi16(sum, sum);
+  sum = _mm_unpacklo_epi8(sum, zero);
+  return sum;
+}
+
+static void filter_vert_horiz_parallel_ssse3(const uint8_t *src, int src_stride,
+                                             __m128i *f, int tapsNum,
+                                             store_pixel_t store_func,
+                                             uint8_t *dst) {
+  __m128i sum = filter_vert_ssse3(src, src_stride, tapsNum, f);
+  store_func(&sum, dst);
+}
+
+static void filter_vert_compute_small(const uint8_t *src, int src_stride,
+                                      __m128i *f, int tapsNum,
+                                      store_pixel_t store_func, int h,
+                                      uint8_t *dst, int dst_stride) {
+  int rowIndex = 0;
+  do {
+    filter_vert_horiz_parallel_ssse3(src, src_stride, f, tapsNum, store_func,
+                                     dst);
+    rowIndex++;
+    src += src_stride;
+    dst += dst_stride;
+  } while (rowIndex < h);
+}
+
+static void filter_vert_compute_large(const uint8_t *src, int src_stride,
+                                      __m128i *f, int tapsNum,
+                                      store_pixel_t store_func, int w, int h,
+                                      uint8_t *dst, int dst_stride) {
+  int col;
+  int rowIndex = 0;
+  const uint8_t *src_ptr = src;
+  uint8_t *dst_ptr = dst;
+
+  do {
+    for (col = 0; col < w; col += 8) {
+      filter_vert_horiz_parallel_ssse3(src_ptr, src_stride, f, tapsNum,
+                                       store_func, dst_ptr);
+      src_ptr += 8;
+      dst_ptr += 8;
+    }
+    rowIndex++;
+    src_ptr = src + rowIndex * src_stride;
+    dst_ptr = dst + rowIndex * dst_stride;
+  } while (rowIndex < h);
+}
+
+void vp10_convolve_vert_ssse3(const uint8_t *src, int src_stride, uint8_t *dst,
+                              int dst_stride, int w, int h,
+                              const InterpFilterParams filter_params,
+                              const int subpel_y_q4, int y_step_q4, int avg) {
+  __m128i verf[6];
+  SubpelFilterCoeffs vCoeffs;
+  const uint8_t *src_ptr;
+  uint8_t *dst_ptr = dst;
+  store_pixel_t store2p = store2pixelTab[avg];
+  store_pixel_t store4p = store4pixelTab[avg];
+  store_pixel_t store8p = store8pixelTab[avg];
+  const int tapsNum = filter_params.taps;
+
+  if (0 == subpel_y_q4 || 16 != y_step_q4) {
+    vp10_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+                         subpel_y_q4, y_step_q4, avg);
+    return;
+  }
+
+  vCoeffs = vp10_get_subpel_filter_ver_signal_dir(
+      filter_params, subpel_y_q4 - 1);
+
+  if (!vCoeffs) {
+    vp10_convolve_vert_c(src, src_stride, dst, dst_stride, w, h, filter_params,
+                         subpel_y_q4, y_step_q4, avg);
+    return;
+  }
+
+  verf[0] = *((const __m128i *)(vCoeffs));
+  verf[1] = *((const __m128i *)(vCoeffs + 1));
+  verf[2] = *((const __m128i *)(vCoeffs + 2));
+  verf[3] = *((const __m128i *)(vCoeffs + 3));
+  verf[4] = *((const __m128i *)(vCoeffs + 4));
+  verf[5] = *((const __m128i *)(vCoeffs + 5));
+
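+  // Center the filter vertically: back up (tapsNum / 2 - 1) source rows.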
+  src -= src_stride * ((tapsNum >> 1) - 1);
+  src_ptr = src;
+
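+  // Pick the store granularity from the width: 8, 4, or 2 pixels per row.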
+  if (w > 4) {
+    filter_vert_compute_large(src_ptr, src_stride, verf, tapsNum, store8p,
+                              w, h, dst_ptr, dst_stride);
+  } else if (4 == w) {
+    filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store4p,
+                              h, dst_ptr, dst_stride);
+  } else if (2 == w) {
+    filter_vert_compute_small(src_ptr, src_stride, verf, tapsNum, store2p,
+                              h, dst_ptr, dst_stride);
+  } else {
+    assert(0);
+  }
+}
diff --git a/vp10/common/x86/vp10_fwd_txfm1d_sse4.c b/vp10/common/x86/vp10_fwd_txfm1d_sse4.c
new file mode 100644
index 0000000..5ade8bd
--- /dev/null
+++ b/vp10/common/x86/vp10_fwd_txfm1d_sse4.c
@@ -0,0 +1,2594 @@
+#include "vp10/common/x86/vp10_txfm1d_sse4.h"
+
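+// Each __m128i holds four 32-bit coefficients, so a length-N 1-D transform
+// below works on col_num = N / 4 vectors per stage. cos_bit[] gives the
+// cosine precision of each stage and indexes cospi_arr; btf_32_sse4_1_type0
+// and btf_32_sse4_1_type1 are the rotation butterflies (they differ only in
+// the sign convention of the second output; see vp10_txfm1d_sse4.h).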
+void vp10_fdct4_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 4;
+  const int num_per_128 = 4;
+  const int32_t* cospi;
+  __m128i buf0[4];
+  __m128i buf1[4];
+  int col_num = txfm_size / num_per_128;
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0;
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+
+    // stage 1
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[3]);
+    buf1[3] = _mm_sub_epi32(buf0[0], buf0[3]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[2]);
+    buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]);
+
+    // stage 2
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
+                        buf0[1], bit);
+    btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+
+    // stage 3
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[2];
+    buf1[2] = buf0[1];
+    buf1[3] = buf0[3];
+
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+  }
+}
+
+void vp10_fdct8_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 8;
+  const int num_per_128 = 4;
+  const int32_t* cospi;
+  __m128i buf0[8];
+  __m128i buf1[8];
+  int col_num = txfm_size / num_per_128;
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0;
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+    buf0[4] = input[4 * col_num + col];
+    buf0[5] = input[5 * col_num + col];
+    buf0[6] = input[6 * col_num + col];
+    buf0[7] = input[7 * col_num + col];
+
+    // stage 1
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
+    buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
+    buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
+    buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
+    buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
+
+    // stage 2
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
+    buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
+    buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
+    buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
+    buf0[4] = buf1[4];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
+                        buf0[6], bit);
+    buf0[7] = buf1[7];
+
+    // stage 3
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
+                        buf1[1], bit);
+    btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
+                        buf1[3], bit);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
+    buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
+    buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
+
+    // stage 4
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+                        bit);
+    btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
+                        buf0[6], bit);
+
+    // stage 5
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[4];
+    buf1[2] = buf0[2];
+    buf1[3] = buf0[6];
+    buf1[4] = buf0[1];
+    buf1[5] = buf0[5];
+    buf1[6] = buf0[3];
+    buf1[7] = buf0[7];
+
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+    output[4 * col_num + col] = buf1[4];
+    output[5 * col_num + col] = buf1[5];
+    output[6 * col_num + col] = buf1[6];
+    output[7 * col_num + col] = buf1[7];
+  }
+}
+
+void vp10_fdct16_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 16;
+  const int num_per_128 = 4;
+  const int32_t* cospi;
+  __m128i buf0[16];
+  __m128i buf1[16];
+  int col_num = txfm_size / num_per_128;
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0;
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+    buf0[4] = input[4 * col_num + col];
+    buf0[5] = input[5 * col_num + col];
+    buf0[6] = input[6 * col_num + col];
+    buf0[7] = input[7 * col_num + col];
+    buf0[8] = input[8 * col_num + col];
+    buf0[9] = input[9 * col_num + col];
+    buf0[10] = input[10 * col_num + col];
+    buf0[11] = input[11 * col_num + col];
+    buf0[12] = input[12 * col_num + col];
+    buf0[13] = input[13 * col_num + col];
+    buf0[14] = input[14 * col_num + col];
+    buf0[15] = input[15 * col_num + col];
+
+    // stage 1
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[15]);
+    buf1[15] = _mm_sub_epi32(buf0[0], buf0[15]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[14]);
+    buf1[14] = _mm_sub_epi32(buf0[1], buf0[14]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[13]);
+    buf1[13] = _mm_sub_epi32(buf0[2], buf0[13]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[12]);
+    buf1[12] = _mm_sub_epi32(buf0[3], buf0[12]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[11]);
+    buf1[11] = _mm_sub_epi32(buf0[4], buf0[11]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[10]);
+    buf1[10] = _mm_sub_epi32(buf0[5], buf0[10]);
+    buf1[6] = _mm_add_epi32(buf0[6], buf0[9]);
+    buf1[9] = _mm_sub_epi32(buf0[6], buf0[9]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[8]);
+    buf1[8] = _mm_sub_epi32(buf0[7], buf0[8]);
+
+    // stage 2
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = _mm_add_epi32(buf1[0], buf1[7]);
+    buf0[7] = _mm_sub_epi32(buf1[0], buf1[7]);
+    buf0[1] = _mm_add_epi32(buf1[1], buf1[6]);
+    buf0[6] = _mm_sub_epi32(buf1[1], buf1[6]);
+    buf0[2] = _mm_add_epi32(buf1[2], buf1[5]);
+    buf0[5] = _mm_sub_epi32(buf1[2], buf1[5]);
+    buf0[3] = _mm_add_epi32(buf1[3], buf1[4]);
+    buf0[4] = _mm_sub_epi32(buf1[3], buf1[4]);
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11],
+                        buf0[12], bit);
+    buf0[14] = buf1[14];
+    buf0[15] = buf1[15];
+
+    // stage 3
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[3]);
+    buf1[3] = _mm_sub_epi32(buf0[0], buf0[3]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[2]);
+    buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]);
+    buf1[4] = buf0[4];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5],
+                        buf1[6], bit);
+    buf1[7] = buf0[7];
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[11]);
+    buf1[11] = _mm_sub_epi32(buf0[8], buf0[11]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[10]);
+    buf1[10] = _mm_sub_epi32(buf0[9], buf0[10]);
+    buf1[12] = _mm_sub_epi32(buf0[15], buf0[12]);
+    buf1[15] = _mm_add_epi32(buf0[15], buf0[12]);
+    buf1[13] = _mm_sub_epi32(buf0[14], buf0[13]);
+    buf1[14] = _mm_add_epi32(buf0[14], buf0[13]);
+
+    // stage 4
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
+                        buf0[1], bit);
+    btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+    buf0[4] = _mm_add_epi32(buf1[4], buf1[5]);
+    buf0[5] = _mm_sub_epi32(buf1[4], buf1[5]);
+    buf0[6] = _mm_sub_epi32(buf1[7], buf1[6]);
+    buf0[7] = _mm_add_epi32(buf1[7], buf1[6]);
+    buf0[8] = buf1[8];
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9],
+                        buf0[14], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
+    buf0[11] = buf1[11];
+    buf0[12] = buf1[12];
+    buf0[15] = buf1[15];
+
+    // stage 5
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[1];
+    buf1[2] = buf0[2];
+    buf1[3] = buf0[3];
+    btf_32_sse4_1_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7],
+                        bit);
+    btf_32_sse4_1_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5],
+                        buf1[6], bit);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[9]);
+    buf1[9] = _mm_sub_epi32(buf0[8], buf0[9]);
+    buf1[10] = _mm_sub_epi32(buf0[11], buf0[10]);
+    buf1[11] = _mm_add_epi32(buf0[11], buf0[10]);
+    buf1[12] = _mm_add_epi32(buf0[12], buf0[13]);
+    buf1[13] = _mm_sub_epi32(buf0[12], buf0[13]);
+    buf1[14] = _mm_sub_epi32(buf0[15], buf0[14]);
+    buf1[15] = _mm_add_epi32(buf0[15], buf0[14]);
+
+    // stage 6
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    buf0[6] = buf1[6];
+    buf0[7] = buf1[7];
+    btf_32_sse4_1_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8],
+                        buf0[15], bit);
+    btf_32_sse4_1_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9],
+                        buf0[14], bit);
+    btf_32_sse4_1_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
+    btf_32_sse4_1_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11],
+                        buf0[12], bit);
+
+    // stage 7
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[8];
+    buf1[2] = buf0[4];
+    buf1[3] = buf0[12];
+    buf1[4] = buf0[2];
+    buf1[5] = buf0[10];
+    buf1[6] = buf0[6];
+    buf1[7] = buf0[14];
+    buf1[8] = buf0[1];
+    buf1[9] = buf0[9];
+    buf1[10] = buf0[5];
+    buf1[11] = buf0[13];
+    buf1[12] = buf0[3];
+    buf1[13] = buf0[11];
+    buf1[14] = buf0[7];
+    buf1[15] = buf0[15];
+
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+    output[4 * col_num + col] = buf1[4];
+    output[5 * col_num + col] = buf1[5];
+    output[6 * col_num + col] = buf1[6];
+    output[7 * col_num + col] = buf1[7];
+    output[8 * col_num + col] = buf1[8];
+    output[9 * col_num + col] = buf1[9];
+    output[10 * col_num + col] = buf1[10];
+    output[11 * col_num + col] = buf1[11];
+    output[12 * col_num + col] = buf1[12];
+    output[13 * col_num + col] = buf1[13];
+    output[14 * col_num + col] = buf1[14];
+    output[15 * col_num + col] = buf1[15];
+  }
+}
+
+void vp10_fdct32_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 32;
+  const int num_per_128 = 4;
+  const int32_t* cospi;
+  __m128i buf0[32];
+  __m128i buf1[32];
+  int col_num = txfm_size / num_per_128;
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0;
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+    buf0[4] = input[4 * col_num + col];
+    buf0[5] = input[5 * col_num + col];
+    buf0[6] = input[6 * col_num + col];
+    buf0[7] = input[7 * col_num + col];
+    buf0[8] = input[8 * col_num + col];
+    buf0[9] = input[9 * col_num + col];
+    buf0[10] = input[10 * col_num + col];
+    buf0[11] = input[11 * col_num + col];
+    buf0[12] = input[12 * col_num + col];
+    buf0[13] = input[13 * col_num + col];
+    buf0[14] = input[14 * col_num + col];
+    buf0[15] = input[15 * col_num + col];
+    buf0[16] = input[16 * col_num + col];
+    buf0[17] = input[17 * col_num + col];
+    buf0[18] = input[18 * col_num + col];
+    buf0[19] = input[19 * col_num + col];
+    buf0[20] = input[20 * col_num + col];
+    buf0[21] = input[21 * col_num + col];
+    buf0[22] = input[22 * col_num + col];
+    buf0[23] = input[23 * col_num + col];
+    buf0[24] = input[24 * col_num + col];
+    buf0[25] = input[25 * col_num + col];
+    buf0[26] = input[26 * col_num + col];
+    buf0[27] = input[27 * col_num + col];
+    buf0[28] = input[28 * col_num + col];
+    buf0[29] = input[29 * col_num + col];
+    buf0[30] = input[30 * col_num + col];
+    buf0[31] = input[31 * col_num + col];
+
+    // stage 1
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[31]);
+    buf1[31] = _mm_sub_epi32(buf0[0], buf0[31]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[30]);
+    buf1[30] = _mm_sub_epi32(buf0[1], buf0[30]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[29]);
+    buf1[29] = _mm_sub_epi32(buf0[2], buf0[29]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[28]);
+    buf1[28] = _mm_sub_epi32(buf0[3], buf0[28]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[27]);
+    buf1[27] = _mm_sub_epi32(buf0[4], buf0[27]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[26]);
+    buf1[26] = _mm_sub_epi32(buf0[5], buf0[26]);
+    buf1[6] = _mm_add_epi32(buf0[6], buf0[25]);
+    buf1[25] = _mm_sub_epi32(buf0[6], buf0[25]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[24]);
+    buf1[24] = _mm_sub_epi32(buf0[7], buf0[24]);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[23]);
+    buf1[23] = _mm_sub_epi32(buf0[8], buf0[23]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[22]);
+    buf1[22] = _mm_sub_epi32(buf0[9], buf0[22]);
+    buf1[10] = _mm_add_epi32(buf0[10], buf0[21]);
+    buf1[21] = _mm_sub_epi32(buf0[10], buf0[21]);
+    buf1[11] = _mm_add_epi32(buf0[11], buf0[20]);
+    buf1[20] = _mm_sub_epi32(buf0[11], buf0[20]);
+    buf1[12] = _mm_add_epi32(buf0[12], buf0[19]);
+    buf1[19] = _mm_sub_epi32(buf0[12], buf0[19]);
+    buf1[13] = _mm_add_epi32(buf0[13], buf0[18]);
+    buf1[18] = _mm_sub_epi32(buf0[13], buf0[18]);
+    buf1[14] = _mm_add_epi32(buf0[14], buf0[17]);
+    buf1[17] = _mm_sub_epi32(buf0[14], buf0[17]);
+    buf1[15] = _mm_add_epi32(buf0[15], buf0[16]);
+    buf1[16] = _mm_sub_epi32(buf0[15], buf0[16]);
+
+    // stage 2
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = _mm_add_epi32(buf1[0], buf1[15]);
+    buf0[15] = _mm_sub_epi32(buf1[0], buf1[15]);
+    buf0[1] = _mm_add_epi32(buf1[1], buf1[14]);
+    buf0[14] = _mm_sub_epi32(buf1[1], buf1[14]);
+    buf0[2] = _mm_add_epi32(buf1[2], buf1[13]);
+    buf0[13] = _mm_sub_epi32(buf1[2], buf1[13]);
+    buf0[3] = _mm_add_epi32(buf1[3], buf1[12]);
+    buf0[12] = _mm_sub_epi32(buf1[3], buf1[12]);
+    buf0[4] = _mm_add_epi32(buf1[4], buf1[11]);
+    buf0[11] = _mm_sub_epi32(buf1[4], buf1[11]);
+    buf0[5] = _mm_add_epi32(buf1[5], buf1[10]);
+    buf0[10] = _mm_sub_epi32(buf1[5], buf1[10]);
+    buf0[6] = _mm_add_epi32(buf1[6], buf1[9]);
+    buf0[9] = _mm_sub_epi32(buf1[6], buf1[9]);
+    buf0[7] = _mm_add_epi32(buf1[7], buf1[8]);
+    buf0[8] = _mm_sub_epi32(buf1[7], buf1[8]);
+    buf0[16] = buf1[16];
+    buf0[17] = buf1[17];
+    buf0[18] = buf1[18];
+    buf0[19] = buf1[19];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[20], buf1[27], buf0[20],
+                        buf0[27], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[21], buf1[26], buf0[21],
+                        buf0[26], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[22], buf1[25], buf0[22],
+                        buf0[25], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[23], buf1[24], buf0[23],
+                        buf0[24], bit);
+    buf0[28] = buf1[28];
+    buf0[29] = buf1[29];
+    buf0[30] = buf1[30];
+    buf0[31] = buf1[31];
+
+    // stage 3
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[7]);
+    buf1[7] = _mm_sub_epi32(buf0[0], buf0[7]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[6]);
+    buf1[6] = _mm_sub_epi32(buf0[1], buf0[6]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[5]);
+    buf1[5] = _mm_sub_epi32(buf0[2], buf0[5]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[4]);
+    buf1[4] = _mm_sub_epi32(buf0[3], buf0[4]);
+    buf1[8] = buf0[8];
+    buf1[9] = buf0[9];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[10], buf0[13], buf1[10],
+                        buf1[13], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[11], buf0[12], buf1[11],
+                        buf1[12], bit);
+    buf1[14] = buf0[14];
+    buf1[15] = buf0[15];
+    buf1[16] = _mm_add_epi32(buf0[16], buf0[23]);
+    buf1[23] = _mm_sub_epi32(buf0[16], buf0[23]);
+    buf1[17] = _mm_add_epi32(buf0[17], buf0[22]);
+    buf1[22] = _mm_sub_epi32(buf0[17], buf0[22]);
+    buf1[18] = _mm_add_epi32(buf0[18], buf0[21]);
+    buf1[21] = _mm_sub_epi32(buf0[18], buf0[21]);
+    buf1[19] = _mm_add_epi32(buf0[19], buf0[20]);
+    buf1[20] = _mm_sub_epi32(buf0[19], buf0[20]);
+    buf1[24] = _mm_sub_epi32(buf0[31], buf0[24]);
+    buf1[31] = _mm_add_epi32(buf0[31], buf0[24]);
+    buf1[25] = _mm_sub_epi32(buf0[30], buf0[25]);
+    buf1[30] = _mm_add_epi32(buf0[30], buf0[25]);
+    buf1[26] = _mm_sub_epi32(buf0[29], buf0[26]);
+    buf1[29] = _mm_add_epi32(buf0[29], buf0[26]);
+    buf1[27] = _mm_sub_epi32(buf0[28], buf0[27]);
+    buf1[28] = _mm_add_epi32(buf0[28], buf0[27]);
+
+    // stage 4
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = _mm_add_epi32(buf1[0], buf1[3]);
+    buf0[3] = _mm_sub_epi32(buf1[0], buf1[3]);
+    buf0[1] = _mm_add_epi32(buf1[1], buf1[2]);
+    buf0[2] = _mm_sub_epi32(buf1[1], buf1[2]);
+    buf0[4] = buf1[4];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[5], buf1[6], buf0[5],
+                        buf0[6], bit);
+    buf0[7] = buf1[7];
+    buf0[8] = _mm_add_epi32(buf1[8], buf1[11]);
+    buf0[11] = _mm_sub_epi32(buf1[8], buf1[11]);
+    buf0[9] = _mm_add_epi32(buf1[9], buf1[10]);
+    buf0[10] = _mm_sub_epi32(buf1[9], buf1[10]);
+    buf0[12] = _mm_sub_epi32(buf1[15], buf1[12]);
+    buf0[15] = _mm_add_epi32(buf1[15], buf1[12]);
+    buf0[13] = _mm_sub_epi32(buf1[14], buf1[13]);
+    buf0[14] = _mm_add_epi32(buf1[14], buf1[13]);
+    buf0[16] = buf1[16];
+    buf0[17] = buf1[17];
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[18], buf1[29], buf0[18],
+                        buf0[29], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[19], buf1[28], buf0[19],
+                        buf0[28], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[20], buf1[27], buf0[20],
+                        buf0[27], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[21], buf1[26], buf0[21],
+                        buf0[26], bit);
+    buf0[22] = buf1[22];
+    buf0[23] = buf1[23];
+    buf0[24] = buf1[24];
+    buf0[25] = buf1[25];
+    buf0[30] = buf1[30];
+    buf0[31] = buf1[31];
+
+    // stage 5
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf0[0], buf0[1], buf1[0],
+                        buf1[1], bit);
+    btf_32_sse4_1_type1(cospi[48], cospi[16], buf0[2], buf0[3], buf1[2],
+                        buf1[3], bit);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[5]);
+    buf1[5] = _mm_sub_epi32(buf0[4], buf0[5]);
+    buf1[6] = _mm_sub_epi32(buf0[7], buf0[6]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[6]);
+    buf1[8] = buf0[8];
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[9], buf0[14], buf1[9],
+                        buf1[14], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[10], buf0[13], buf1[10],
+                        buf1[13], bit);
+    buf1[11] = buf0[11];
+    buf1[12] = buf0[12];
+    buf1[15] = buf0[15];
+    buf1[16] = _mm_add_epi32(buf0[16], buf0[19]);
+    buf1[19] = _mm_sub_epi32(buf0[16], buf0[19]);
+    buf1[17] = _mm_add_epi32(buf0[17], buf0[18]);
+    buf1[18] = _mm_sub_epi32(buf0[17], buf0[18]);
+    buf1[20] = _mm_sub_epi32(buf0[23], buf0[20]);
+    buf1[23] = _mm_add_epi32(buf0[23], buf0[20]);
+    buf1[21] = _mm_sub_epi32(buf0[22], buf0[21]);
+    buf1[22] = _mm_add_epi32(buf0[22], buf0[21]);
+    buf1[24] = _mm_add_epi32(buf0[24], buf0[27]);
+    buf1[27] = _mm_sub_epi32(buf0[24], buf0[27]);
+    buf1[25] = _mm_add_epi32(buf0[25], buf0[26]);
+    buf1[26] = _mm_sub_epi32(buf0[25], buf0[26]);
+    buf1[28] = _mm_sub_epi32(buf0[31], buf0[28]);
+    buf1[31] = _mm_add_epi32(buf0[31], buf0[28]);
+    buf1[29] = _mm_sub_epi32(buf0[30], buf0[29]);
+    buf1[30] = _mm_add_epi32(buf0[30], buf0[29]);
+
+    // stage 6
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    btf_32_sse4_1_type1(cospi[56], cospi[8], buf1[4], buf1[7], buf0[4], buf0[7],
+                        bit);
+    btf_32_sse4_1_type1(cospi[24], cospi[40], buf1[5], buf1[6], buf0[5],
+                        buf0[6], bit);
+    buf0[8] = _mm_add_epi32(buf1[8], buf1[9]);
+    buf0[9] = _mm_sub_epi32(buf1[8], buf1[9]);
+    buf0[10] = _mm_sub_epi32(buf1[11], buf1[10]);
+    buf0[11] = _mm_add_epi32(buf1[11], buf1[10]);
+    buf0[12] = _mm_add_epi32(buf1[12], buf1[13]);
+    buf0[13] = _mm_sub_epi32(buf1[12], buf1[13]);
+    buf0[14] = _mm_sub_epi32(buf1[15], buf1[14]);
+    buf0[15] = _mm_add_epi32(buf1[15], buf1[14]);
+    buf0[16] = buf1[16];
+    btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[17], buf1[30], buf0[17],
+                        buf0[30], bit);
+    btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[18], buf1[29], buf0[18],
+                        buf0[29], bit);
+    buf0[19] = buf1[19];
+    buf0[20] = buf1[20];
+    btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[21], buf1[26], buf0[21],
+                        buf0[26], bit);
+    btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[22], buf1[25], buf0[22],
+                        buf0[25], bit);
+    buf0[23] = buf1[23];
+    buf0[24] = buf1[24];
+    buf0[27] = buf1[27];
+    buf0[28] = buf1[28];
+    buf0[31] = buf1[31];
+
+    // stage 7
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[1];
+    buf1[2] = buf0[2];
+    buf1[3] = buf0[3];
+    buf1[4] = buf0[4];
+    buf1[5] = buf0[5];
+    buf1[6] = buf0[6];
+    buf1[7] = buf0[7];
+    btf_32_sse4_1_type1(cospi[60], cospi[4], buf0[8], buf0[15], buf1[8],
+                        buf1[15], bit);
+    btf_32_sse4_1_type1(cospi[28], cospi[36], buf0[9], buf0[14], buf1[9],
+                        buf1[14], bit);
+    btf_32_sse4_1_type1(cospi[44], cospi[20], buf0[10], buf0[13], buf1[10],
+                        buf1[13], bit);
+    btf_32_sse4_1_type1(cospi[12], cospi[52], buf0[11], buf0[12], buf1[11],
+                        buf1[12], bit);
+    buf1[16] = _mm_add_epi32(buf0[16], buf0[17]);
+    buf1[17] = _mm_sub_epi32(buf0[16], buf0[17]);
+    buf1[18] = _mm_sub_epi32(buf0[19], buf0[18]);
+    buf1[19] = _mm_add_epi32(buf0[19], buf0[18]);
+    buf1[20] = _mm_add_epi32(buf0[20], buf0[21]);
+    buf1[21] = _mm_sub_epi32(buf0[20], buf0[21]);
+    buf1[22] = _mm_sub_epi32(buf0[23], buf0[22]);
+    buf1[23] = _mm_add_epi32(buf0[23], buf0[22]);
+    buf1[24] = _mm_add_epi32(buf0[24], buf0[25]);
+    buf1[25] = _mm_sub_epi32(buf0[24], buf0[25]);
+    buf1[26] = _mm_sub_epi32(buf0[27], buf0[26]);
+    buf1[27] = _mm_add_epi32(buf0[27], buf0[26]);
+    buf1[28] = _mm_add_epi32(buf0[28], buf0[29]);
+    buf1[29] = _mm_sub_epi32(buf0[28], buf0[29]);
+    buf1[30] = _mm_sub_epi32(buf0[31], buf0[30]);
+    buf1[31] = _mm_add_epi32(buf0[31], buf0[30]);
+
+    // stage 8
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    buf0[6] = buf1[6];
+    buf0[7] = buf1[7];
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    buf0[10] = buf1[10];
+    buf0[11] = buf1[11];
+    buf0[12] = buf1[12];
+    buf0[13] = buf1[13];
+    buf0[14] = buf1[14];
+    buf0[15] = buf1[15];
+    btf_32_sse4_1_type1(cospi[62], cospi[2], buf1[16], buf1[31], buf0[16],
+                        buf0[31], bit);
+    btf_32_sse4_1_type1(cospi[30], cospi[34], buf1[17], buf1[30], buf0[17],
+                        buf0[30], bit);
+    btf_32_sse4_1_type1(cospi[46], cospi[18], buf1[18], buf1[29], buf0[18],
+                        buf0[29], bit);
+    btf_32_sse4_1_type1(cospi[14], cospi[50], buf1[19], buf1[28], buf0[19],
+                        buf0[28], bit);
+    btf_32_sse4_1_type1(cospi[54], cospi[10], buf1[20], buf1[27], buf0[20],
+                        buf0[27], bit);
+    btf_32_sse4_1_type1(cospi[22], cospi[42], buf1[21], buf1[26], buf0[21],
+                        buf0[26], bit);
+    btf_32_sse4_1_type1(cospi[38], cospi[26], buf1[22], buf1[25], buf0[22],
+                        buf0[25], bit);
+    btf_32_sse4_1_type1(cospi[6], cospi[58], buf1[23], buf1[24], buf0[23],
+                        buf0[24], bit);
+
+    // stage 9
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[16];
+    buf1[2] = buf0[8];
+    buf1[3] = buf0[24];
+    buf1[4] = buf0[4];
+    buf1[5] = buf0[20];
+    buf1[6] = buf0[12];
+    buf1[7] = buf0[28];
+    buf1[8] = buf0[2];
+    buf1[9] = buf0[18];
+    buf1[10] = buf0[10];
+    buf1[11] = buf0[26];
+    buf1[12] = buf0[6];
+    buf1[13] = buf0[22];
+    buf1[14] = buf0[14];
+    buf1[15] = buf0[30];
+    buf1[16] = buf0[1];
+    buf1[17] = buf0[17];
+    buf1[18] = buf0[9];
+    buf1[19] = buf0[25];
+    buf1[20] = buf0[5];
+    buf1[21] = buf0[21];
+    buf1[22] = buf0[13];
+    buf1[23] = buf0[29];
+    buf1[24] = buf0[3];
+    buf1[25] = buf0[19];
+    buf1[26] = buf0[11];
+    buf1[27] = buf0[27];
+    buf1[28] = buf0[7];
+    buf1[29] = buf0[23];
+    buf1[30] = buf0[15];
+    buf1[31] = buf0[31];
+
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+    output[4 * col_num + col] = buf1[4];
+    output[5 * col_num + col] = buf1[5];
+    output[6 * col_num + col] = buf1[6];
+    output[7 * col_num + col] = buf1[7];
+    output[8 * col_num + col] = buf1[8];
+    output[9 * col_num + col] = buf1[9];
+    output[10 * col_num + col] = buf1[10];
+    output[11 * col_num + col] = buf1[11];
+    output[12 * col_num + col] = buf1[12];
+    output[13 * col_num + col] = buf1[13];
+    output[14 * col_num + col] = buf1[14];
+    output[15 * col_num + col] = buf1[15];
+    output[16 * col_num + col] = buf1[16];
+    output[17 * col_num + col] = buf1[17];
+    output[18 * col_num + col] = buf1[18];
+    output[19 * col_num + col] = buf1[19];
+    output[20 * col_num + col] = buf1[20];
+    output[21 * col_num + col] = buf1[21];
+    output[22 * col_num + col] = buf1[22];
+    output[23 * col_num + col] = buf1[23];
+    output[24 * col_num + col] = buf1[24];
+    output[25 * col_num + col] = buf1[25];
+    output[26 * col_num + col] = buf1[26];
+    output[27 * col_num + col] = buf1[27];
+    output[28 * col_num + col] = buf1[28];
+    output[29 * col_num + col] = buf1[29];
+    output[30 * col_num + col] = buf1[30];
+    output[31 * col_num + col] = buf1[31];
+  }
+}
+
+void vp10_fadst4_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 4;
+  const int num_per_128 = 4;
+  const int32_t* cospi;
+  __m128i buf0[4];
+  __m128i buf1[4];
+  int col_num = txfm_size / num_per_128;
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0;
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+
+    // stage 1
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[3];
+    buf1[1] = buf0[0];
+    buf1[2] = buf0[1];
+    buf1[3] = buf0[2];
+
+    // stage 2
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
+                        bit);
+    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+
+    // stage 3
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+    buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+    buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+
+    // stage 4
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+
+    // stage 5
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
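+    // output permutation with sign flips (negation written as 0 - x)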
+    buf1[0] = buf0[0];
+    buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
+    buf1[2] = buf0[3];
+    buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
+
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+  }
+}
+
+void vp10_fadst8_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 8;
+  const int num_per_128 = 4;
+  const int32_t* cospi;
+  __m128i buf0[8];
+  __m128i buf1[8];
+  int col_num = txfm_size / num_per_128;
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0;
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+    buf0[4] = input[4 * col_num + col];
+    buf0[5] = input[5 * col_num + col];
+    buf0[6] = input[6 * col_num + col];
+    buf0[7] = input[7 * col_num + col];
+
+    // stage 1
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[7];
+    buf1[1] = buf0[0];
+    buf1[2] = buf0[5];
+    buf1[3] = buf0[2];
+    buf1[4] = buf0[3];
+    buf1[5] = buf0[4];
+    buf1[6] = buf0[1];
+    buf1[7] = buf0[6];
+
+    // stage 2
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[0], buf1[1], buf0[0], buf0[1],
+                        bit);
+    btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+    btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[4], buf1[5], buf0[4],
+                        buf0[5], bit);
+    btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+
+    // stage 3
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
+    buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
+    buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
+    buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
+    buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
+
+    // stage 4
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
+                        buf0[5], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+
+    // stage 5
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+    buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+    buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
+    buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
+    buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
+
+    // stage 6
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+
+    // stage 7
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
+    buf1[2] = buf0[6];
+    buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
+    buf1[4] = buf0[3];
+    buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
+    buf1[6] = buf0[5];
+    buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
+
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+    output[4 * col_num + col] = buf1[4];
+    output[5 * col_num + col] = buf1[5];
+    output[6 * col_num + col] = buf1[6];
+    output[7 * col_num + col] = buf1[7];
+  }
+}
+
+void vp10_fadst16_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 16;
+  const int num_per_128 = 4;
+  const int32_t* cospi;
+  __m128i buf0[16];
+  __m128i buf1[16];
+  int col_num = txfm_size / num_per_128;
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0;
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+    buf0[4] = input[4 * col_num + col];
+    buf0[5] = input[5 * col_num + col];
+    buf0[6] = input[6 * col_num + col];
+    buf0[7] = input[7 * col_num + col];
+    buf0[8] = input[8 * col_num + col];
+    buf0[9] = input[9 * col_num + col];
+    buf0[10] = input[10 * col_num + col];
+    buf0[11] = input[11 * col_num + col];
+    buf0[12] = input[12 * col_num + col];
+    buf0[13] = input[13 * col_num + col];
+    buf0[14] = input[14 * col_num + col];
+    buf0[15] = input[15 * col_num + col];
+
+    // stage 1
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[15];
+    buf1[1] = buf0[0];
+    buf1[2] = buf0[13];
+    buf1[3] = buf0[2];
+    buf1[4] = buf0[11];
+    buf1[5] = buf0[4];
+    buf1[6] = buf0[9];
+    buf1[7] = buf0[6];
+    buf1[8] = buf0[7];
+    buf1[9] = buf0[8];
+    buf1[10] = buf0[5];
+    buf1[11] = buf0[10];
+    buf1[12] = buf0[3];
+    buf1[13] = buf0[12];
+    buf1[14] = buf0[1];
+    buf1[15] = buf0[14];
+
+    // stage 2
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[2], cospi[62], buf1[0], buf1[1], buf0[0], buf0[1],
+                        bit);
+    btf_32_sse4_1_type0(cospi[10], cospi[54], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+    btf_32_sse4_1_type0(cospi[18], cospi[46], buf1[4], buf1[5], buf0[4],
+                        buf0[5], bit);
+    btf_32_sse4_1_type0(cospi[26], cospi[38], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+    btf_32_sse4_1_type0(cospi[34], cospi[30], buf1[8], buf1[9], buf0[8],
+                        buf0[9], bit);
+    btf_32_sse4_1_type0(cospi[42], cospi[22], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
+    btf_32_sse4_1_type0(cospi[50], cospi[14], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(cospi[58], cospi[6], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+
+    // stage 3
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[8]);
+    buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[9]);
+    buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[10]);
+    buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[11]);
+    buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[12]);
+    buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[13]);
+    buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]);
+    buf1[6] = _mm_add_epi32(buf0[6], buf0[14]);
+    buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[15]);
+    buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]);
+
+    // stage 4
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    buf0[6] = buf1[6];
+    buf0[7] = buf1[7];
+    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
+                        bit);
+    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
+    btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+
+    // stage 5
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
+    buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
+    buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
+    buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
+    buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[12]);
+    buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[13]);
+    buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]);
+    buf1[10] = _mm_add_epi32(buf0[10], buf0[14]);
+    buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]);
+    buf1[11] = _mm_add_epi32(buf0[11], buf0[15]);
+    buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]);
+
+    // stage 6
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
+                        buf0[5], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    buf0[10] = buf1[10];
+    buf0[11] = buf1[11];
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+
+    // stage 7
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+    buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+    buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
+    buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
+    buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[10]);
+    buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[11]);
+    buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]);
+    buf1[12] = _mm_add_epi32(buf0[12], buf0[14]);
+    buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]);
+    buf1[13] = _mm_add_epi32(buf0[13], buf0[15]);
+    buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]);
+
+    // stage 8
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
+    buf0[12] = buf1[12];
+    buf0[13] = buf1[13];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+
+    // stage 9
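+    // Final output permutation with alternating sign flips: odd-indexed
+    // outputs are negated (computed as 0 - x).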
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]);
+    buf1[2] = buf0[12];
+    buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
+    buf1[4] = buf0[6];
+    buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]);
+    buf1[6] = buf0[10];
+    buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
+    buf1[8] = buf0[3];
+    buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]);
+    buf1[10] = buf0[15];
+    buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
+    buf1[12] = buf0[5];
+    buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]);
+    buf1[14] = buf0[9];
+    buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
+
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+    output[4 * col_num + col] = buf1[4];
+    output[5 * col_num + col] = buf1[5];
+    output[6 * col_num + col] = buf1[6];
+    output[7 * col_num + col] = buf1[7];
+    output[8 * col_num + col] = buf1[8];
+    output[9 * col_num + col] = buf1[9];
+    output[10 * col_num + col] = buf1[10];
+    output[11 * col_num + col] = buf1[11];
+    output[12 * col_num + col] = buf1[12];
+    output[13 * col_num + col] = buf1[13];
+    output[14 * col_num + col] = buf1[14];
+    output[15 * col_num + col] = buf1[15];
+  }
+}
+
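+// 1D 32-point forward ADST, SSE4.1. Stage 0 loads the input; stages 1-11 run
+// the butterfly network, ping-ponging between buf0 and buf1. Each pass handles
+// four columns at once (one 32-bit lane per column), and every stage fetches
+// its rounding bit from cos_bit[] and the matching cosine row from
+// cospi_arr[]. The btf_32_sse4_1_type0/type1 helpers apply a 2x2 cosine
+// rotation with a rounding shift by `bit` (see vp10_txfm1d_sse4.h for the
+// exact sign conventions).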
+void vp10_fadst32_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 32;
+  const int num_per_128 = 4;
+  const int32_t* cospi;
+  __m128i buf0[32];
+  __m128i buf1[32];
+  int col_num = txfm_size / num_per_128;
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+    buf0[4] = input[4 * col_num + col];
+    buf0[5] = input[5 * col_num + col];
+    buf0[6] = input[6 * col_num + col];
+    buf0[7] = input[7 * col_num + col];
+    buf0[8] = input[8 * col_num + col];
+    buf0[9] = input[9 * col_num + col];
+    buf0[10] = input[10 * col_num + col];
+    buf0[11] = input[11 * col_num + col];
+    buf0[12] = input[12 * col_num + col];
+    buf0[13] = input[13 * col_num + col];
+    buf0[14] = input[14 * col_num + col];
+    buf0[15] = input[15 * col_num + col];
+    buf0[16] = input[16 * col_num + col];
+    buf0[17] = input[17 * col_num + col];
+    buf0[18] = input[18 * col_num + col];
+    buf0[19] = input[19 * col_num + col];
+    buf0[20] = input[20 * col_num + col];
+    buf0[21] = input[21 * col_num + col];
+    buf0[22] = input[22 * col_num + col];
+    buf0[23] = input[23 * col_num + col];
+    buf0[24] = input[24 * col_num + col];
+    buf0[25] = input[25 * col_num + col];
+    buf0[26] = input[26 * col_num + col];
+    buf0[27] = input[27 * col_num + col];
+    buf0[28] = input[28 * col_num + col];
+    buf0[29] = input[29 * col_num + col];
+    buf0[30] = input[30 * col_num + col];
+    buf0[31] = input[31 * col_num + col];
+
+    // stage 1
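+    // Pure input reordering, no arithmetic: even slots take buf0[31 - k],
+    // odd slots take buf0[k - 1].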
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[31];
+    buf1[1] = buf0[0];
+    buf1[2] = buf0[29];
+    buf1[3] = buf0[2];
+    buf1[4] = buf0[27];
+    buf1[5] = buf0[4];
+    buf1[6] = buf0[25];
+    buf1[7] = buf0[6];
+    buf1[8] = buf0[23];
+    buf1[9] = buf0[8];
+    buf1[10] = buf0[21];
+    buf1[11] = buf0[10];
+    buf1[12] = buf0[19];
+    buf1[13] = buf0[12];
+    buf1[14] = buf0[17];
+    buf1[15] = buf0[14];
+    buf1[16] = buf0[15];
+    buf1[17] = buf0[16];
+    buf1[18] = buf0[13];
+    buf1[19] = buf0[18];
+    buf1[20] = buf0[11];
+    buf1[21] = buf0[20];
+    buf1[22] = buf0[9];
+    buf1[23] = buf0[22];
+    buf1[24] = buf0[7];
+    buf1[25] = buf0[24];
+    buf1[26] = buf0[5];
+    buf1[27] = buf0[26];
+    buf1[28] = buf0[3];
+    buf1[29] = buf0[28];
+    buf1[30] = buf0[1];
+    buf1[31] = buf0[30];
+
+    // stage 2
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1],
+                        bit);
+    btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3],
+                        bit);
+    btf_32_sse4_1_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5],
+                        bit);
+    btf_32_sse4_1_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+    btf_32_sse4_1_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8],
+                        buf0[9], bit);
+    btf_32_sse4_1_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
+    btf_32_sse4_1_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+    btf_32_sse4_1_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16],
+                        buf0[17], bit);
+    btf_32_sse4_1_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18],
+                        buf0[19], bit);
+    btf_32_sse4_1_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20],
+                        buf0[21], bit);
+    btf_32_sse4_1_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22],
+                        buf0[23], bit);
+    btf_32_sse4_1_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24],
+                        buf0[25], bit);
+    btf_32_sse4_1_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26],
+                        buf0[27], bit);
+    btf_32_sse4_1_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28],
+                        buf0[29], bit);
+    btf_32_sse4_1_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30],
+                        buf0[31], bit);
+
+    // stage 3
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[16]);
+    buf1[16] = _mm_sub_epi32(buf0[0], buf0[16]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[17]);
+    buf1[17] = _mm_sub_epi32(buf0[1], buf0[17]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[18]);
+    buf1[18] = _mm_sub_epi32(buf0[2], buf0[18]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[19]);
+    buf1[19] = _mm_sub_epi32(buf0[3], buf0[19]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[20]);
+    buf1[20] = _mm_sub_epi32(buf0[4], buf0[20]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[21]);
+    buf1[21] = _mm_sub_epi32(buf0[5], buf0[21]);
+    buf1[6] = _mm_add_epi32(buf0[6], buf0[22]);
+    buf1[22] = _mm_sub_epi32(buf0[6], buf0[22]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[23]);
+    buf1[23] = _mm_sub_epi32(buf0[7], buf0[23]);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[24]);
+    buf1[24] = _mm_sub_epi32(buf0[8], buf0[24]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[25]);
+    buf1[25] = _mm_sub_epi32(buf0[9], buf0[25]);
+    buf1[10] = _mm_add_epi32(buf0[10], buf0[26]);
+    buf1[26] = _mm_sub_epi32(buf0[10], buf0[26]);
+    buf1[11] = _mm_add_epi32(buf0[11], buf0[27]);
+    buf1[27] = _mm_sub_epi32(buf0[11], buf0[27]);
+    buf1[12] = _mm_add_epi32(buf0[12], buf0[28]);
+    buf1[28] = _mm_sub_epi32(buf0[12], buf0[28]);
+    buf1[13] = _mm_add_epi32(buf0[13], buf0[29]);
+    buf1[29] = _mm_sub_epi32(buf0[13], buf0[29]);
+    buf1[14] = _mm_add_epi32(buf0[14], buf0[30]);
+    buf1[30] = _mm_sub_epi32(buf0[14], buf0[30]);
+    buf1[15] = _mm_add_epi32(buf0[15], buf0[31]);
+    buf1[31] = _mm_sub_epi32(buf0[15], buf0[31]);
+
+    // stage 4
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    buf0[6] = buf1[6];
+    buf0[7] = buf1[7];
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    buf0[10] = buf1[10];
+    buf0[11] = buf1[11];
+    buf0[12] = buf1[12];
+    buf0[13] = buf1[13];
+    buf0[14] = buf1[14];
+    buf0[15] = buf1[15];
+    btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16],
+                        buf0[17], bit);
+    btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18],
+                        buf0[19], bit);
+    btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20],
+                        buf0[21], bit);
+    btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22],
+                        buf0[23], bit);
+    btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24],
+                        buf0[25], bit);
+    btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26],
+                        buf0[27], bit);
+    btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28],
+                        buf0[29], bit);
+    btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30],
+                        buf0[31], bit);
+
+    // stage 5
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[8]);
+    buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[9]);
+    buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[10]);
+    buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[11]);
+    buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[12]);
+    buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[13]);
+    buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]);
+    buf1[6] = _mm_add_epi32(buf0[6], buf0[14]);
+    buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[15]);
+    buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]);
+    buf1[16] = _mm_add_epi32(buf0[16], buf0[24]);
+    buf1[24] = _mm_sub_epi32(buf0[16], buf0[24]);
+    buf1[17] = _mm_add_epi32(buf0[17], buf0[25]);
+    buf1[25] = _mm_sub_epi32(buf0[17], buf0[25]);
+    buf1[18] = _mm_add_epi32(buf0[18], buf0[26]);
+    buf1[26] = _mm_sub_epi32(buf0[18], buf0[26]);
+    buf1[19] = _mm_add_epi32(buf0[19], buf0[27]);
+    buf1[27] = _mm_sub_epi32(buf0[19], buf0[27]);
+    buf1[20] = _mm_add_epi32(buf0[20], buf0[28]);
+    buf1[28] = _mm_sub_epi32(buf0[20], buf0[28]);
+    buf1[21] = _mm_add_epi32(buf0[21], buf0[29]);
+    buf1[29] = _mm_sub_epi32(buf0[21], buf0[29]);
+    buf1[22] = _mm_add_epi32(buf0[22], buf0[30]);
+    buf1[30] = _mm_sub_epi32(buf0[22], buf0[30]);
+    buf1[23] = _mm_add_epi32(buf0[23], buf0[31]);
+    buf1[31] = _mm_sub_epi32(buf0[23], buf0[31]);
+
+    // stage 6
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    buf0[6] = buf1[6];
+    buf0[7] = buf1[7];
+    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
+                        bit);
+    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
+    btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+    buf0[16] = buf1[16];
+    buf0[17] = buf1[17];
+    buf0[18] = buf1[18];
+    buf0[19] = buf1[19];
+    buf0[20] = buf1[20];
+    buf0[21] = buf1[21];
+    buf0[22] = buf1[22];
+    buf0[23] = buf1[23];
+    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24],
+                        buf0[25], bit);
+    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26],
+                        buf0[27], bit);
+    btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28],
+                        buf0[29], bit);
+    btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30],
+                        buf0[31], bit);
+
+    // stage 7
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
+    buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
+    buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
+    buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
+    buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[12]);
+    buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[13]);
+    buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]);
+    buf1[10] = _mm_add_epi32(buf0[10], buf0[14]);
+    buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]);
+    buf1[11] = _mm_add_epi32(buf0[11], buf0[15]);
+    buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]);
+    buf1[16] = _mm_add_epi32(buf0[16], buf0[20]);
+    buf1[20] = _mm_sub_epi32(buf0[16], buf0[20]);
+    buf1[17] = _mm_add_epi32(buf0[17], buf0[21]);
+    buf1[21] = _mm_sub_epi32(buf0[17], buf0[21]);
+    buf1[18] = _mm_add_epi32(buf0[18], buf0[22]);
+    buf1[22] = _mm_sub_epi32(buf0[18], buf0[22]);
+    buf1[19] = _mm_add_epi32(buf0[19], buf0[23]);
+    buf1[23] = _mm_sub_epi32(buf0[19], buf0[23]);
+    buf1[24] = _mm_add_epi32(buf0[24], buf0[28]);
+    buf1[28] = _mm_sub_epi32(buf0[24], buf0[28]);
+    buf1[25] = _mm_add_epi32(buf0[25], buf0[29]);
+    buf1[29] = _mm_sub_epi32(buf0[25], buf0[29]);
+    buf1[26] = _mm_add_epi32(buf0[26], buf0[30]);
+    buf1[30] = _mm_sub_epi32(buf0[26], buf0[30]);
+    buf1[27] = _mm_add_epi32(buf0[27], buf0[31]);
+    buf1[31] = _mm_sub_epi32(buf0[27], buf0[31]);
+
+    // stage 8
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
+                        buf0[5], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    buf0[10] = buf1[10];
+    buf0[11] = buf1[11];
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+    buf0[16] = buf1[16];
+    buf0[17] = buf1[17];
+    buf0[18] = buf1[18];
+    buf0[19] = buf1[19];
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20],
+                        buf0[21], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22],
+                        buf0[23], bit);
+    buf0[24] = buf1[24];
+    buf0[25] = buf1[25];
+    buf0[26] = buf1[26];
+    buf0[27] = buf1[27];
+    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28],
+                        buf0[29], bit);
+    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30],
+                        buf0[31], bit);
+
+    // stage 9
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
+    buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
+    buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
+    buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
+    buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[10]);
+    buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[11]);
+    buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]);
+    buf1[12] = _mm_add_epi32(buf0[12], buf0[14]);
+    buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]);
+    buf1[13] = _mm_add_epi32(buf0[13], buf0[15]);
+    buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]);
+    buf1[16] = _mm_add_epi32(buf0[16], buf0[18]);
+    buf1[18] = _mm_sub_epi32(buf0[16], buf0[18]);
+    buf1[17] = _mm_add_epi32(buf0[17], buf0[19]);
+    buf1[19] = _mm_sub_epi32(buf0[17], buf0[19]);
+    buf1[20] = _mm_add_epi32(buf0[20], buf0[22]);
+    buf1[22] = _mm_sub_epi32(buf0[20], buf0[22]);
+    buf1[21] = _mm_add_epi32(buf0[21], buf0[23]);
+    buf1[23] = _mm_sub_epi32(buf0[21], buf0[23]);
+    buf1[24] = _mm_add_epi32(buf0[24], buf0[26]);
+    buf1[26] = _mm_sub_epi32(buf0[24], buf0[26]);
+    buf1[25] = _mm_add_epi32(buf0[25], buf0[27]);
+    buf1[27] = _mm_sub_epi32(buf0[25], buf0[27]);
+    buf1[28] = _mm_add_epi32(buf0[28], buf0[30]);
+    buf1[30] = _mm_sub_epi32(buf0[28], buf0[30]);
+    buf1[29] = _mm_add_epi32(buf0[29], buf0[31]);
+    buf1[31] = _mm_sub_epi32(buf0[29], buf0[31]);
+
+    // stage 10
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
+                        buf0[7], bit);
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
+                        buf0[11], bit);
+    buf0[12] = buf1[12];
+    buf0[13] = buf1[13];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
+                        buf0[15], bit);
+    buf0[16] = buf1[16];
+    buf0[17] = buf1[17];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18],
+                        buf0[19], bit);
+    buf0[20] = buf1[20];
+    buf0[21] = buf1[21];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22],
+                        buf0[23], bit);
+    buf0[24] = buf1[24];
+    buf0[25] = buf1[25];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26],
+                        buf0[27], bit);
+    buf0[28] = buf1[28];
+    buf0[29] = buf1[29];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30],
+                        buf0[31], bit);
+
+    // stage 11
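+    // As in the 16-point ADST above: permute and negate the odd-indexed
+    // outputs.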
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[16]);
+    buf1[2] = buf0[24];
+    buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]);
+    buf1[4] = buf0[12];
+    buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[28]);
+    buf1[6] = buf0[20];
+    buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
+    buf1[8] = buf0[6];
+    buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[22]);
+    buf1[10] = buf0[30];
+    buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]);
+    buf1[12] = buf0[10];
+    buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[26]);
+    buf1[14] = buf0[18];
+    buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
+    buf1[16] = buf0[3];
+    buf1[17] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[19]);
+    buf1[18] = buf0[27];
+    buf1[19] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]);
+    buf1[20] = buf0[15];
+    buf1[21] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[31]);
+    buf1[22] = buf0[23];
+    buf1[23] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
+    buf1[24] = buf0[5];
+    buf1[25] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[21]);
+    buf1[26] = buf0[29];
+    buf1[27] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]);
+    buf1[28] = buf0[9];
+    buf1[29] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[25]);
+    buf1[30] = buf0[17];
+    buf1[31] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);
+
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+    output[4 * col_num + col] = buf1[4];
+    output[5 * col_num + col] = buf1[5];
+    output[6 * col_num + col] = buf1[6];
+    output[7 * col_num + col] = buf1[7];
+    output[8 * col_num + col] = buf1[8];
+    output[9 * col_num + col] = buf1[9];
+    output[10 * col_num + col] = buf1[10];
+    output[11 * col_num + col] = buf1[11];
+    output[12 * col_num + col] = buf1[12];
+    output[13 * col_num + col] = buf1[13];
+    output[14 * col_num + col] = buf1[14];
+    output[15 * col_num + col] = buf1[15];
+    output[16 * col_num + col] = buf1[16];
+    output[17 * col_num + col] = buf1[17];
+    output[18 * col_num + col] = buf1[18];
+    output[19 * col_num + col] = buf1[19];
+    output[20 * col_num + col] = buf1[20];
+    output[21 * col_num + col] = buf1[21];
+    output[22 * col_num + col] = buf1[22];
+    output[23 * col_num + col] = buf1[23];
+    output[24 * col_num + col] = buf1[24];
+    output[25 * col_num + col] = buf1[25];
+    output[26 * col_num + col] = buf1[26];
+    output[27 * col_num + col] = buf1[27];
+    output[28 * col_num + col] = buf1[28];
+    output[29 * col_num + col] = buf1[29];
+    output[30 * col_num + col] = buf1[30];
+    output[31 * col_num + col] = buf1[31];
+  }
+}
+
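+// 1D 64-point forward DCT, SSE4.1. Same column-parallel layout as the
+// transforms above: four columns per __m128i, buf0/buf1 ping-pong, and a
+// per-stage (bit, cospi) lookup. Stages alternate add/sub butterflies with
+// btf_32_sse4_1_type0/type1 cosine rotations on the half-sized sub-problems,
+// ending in a bit-reversal output permutation.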
+void vp10_fdct64_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range) {
+  const int txfm_size = 64;
+  const int num_per_128 = 4;
+  const int32_t* cospi;
+  __m128i buf0[64];
+  __m128i buf1[64];
+  int col_num = txfm_size / num_per_128;
+  int bit;
+  int col;
+  (void)stage_range;
+  for (col = 0; col < col_num; col++) {
+    // stage 0
+    int32_t stage_idx = 0;
+    buf0[0] = input[0 * col_num + col];
+    buf0[1] = input[1 * col_num + col];
+    buf0[2] = input[2 * col_num + col];
+    buf0[3] = input[3 * col_num + col];
+    buf0[4] = input[4 * col_num + col];
+    buf0[5] = input[5 * col_num + col];
+    buf0[6] = input[6 * col_num + col];
+    buf0[7] = input[7 * col_num + col];
+    buf0[8] = input[8 * col_num + col];
+    buf0[9] = input[9 * col_num + col];
+    buf0[10] = input[10 * col_num + col];
+    buf0[11] = input[11 * col_num + col];
+    buf0[12] = input[12 * col_num + col];
+    buf0[13] = input[13 * col_num + col];
+    buf0[14] = input[14 * col_num + col];
+    buf0[15] = input[15 * col_num + col];
+    buf0[16] = input[16 * col_num + col];
+    buf0[17] = input[17 * col_num + col];
+    buf0[18] = input[18 * col_num + col];
+    buf0[19] = input[19 * col_num + col];
+    buf0[20] = input[20 * col_num + col];
+    buf0[21] = input[21 * col_num + col];
+    buf0[22] = input[22 * col_num + col];
+    buf0[23] = input[23 * col_num + col];
+    buf0[24] = input[24 * col_num + col];
+    buf0[25] = input[25 * col_num + col];
+    buf0[26] = input[26 * col_num + col];
+    buf0[27] = input[27 * col_num + col];
+    buf0[28] = input[28 * col_num + col];
+    buf0[29] = input[29 * col_num + col];
+    buf0[30] = input[30 * col_num + col];
+    buf0[31] = input[31 * col_num + col];
+    buf0[32] = input[32 * col_num + col];
+    buf0[33] = input[33 * col_num + col];
+    buf0[34] = input[34 * col_num + col];
+    buf0[35] = input[35 * col_num + col];
+    buf0[36] = input[36 * col_num + col];
+    buf0[37] = input[37 * col_num + col];
+    buf0[38] = input[38 * col_num + col];
+    buf0[39] = input[39 * col_num + col];
+    buf0[40] = input[40 * col_num + col];
+    buf0[41] = input[41 * col_num + col];
+    buf0[42] = input[42 * col_num + col];
+    buf0[43] = input[43 * col_num + col];
+    buf0[44] = input[44 * col_num + col];
+    buf0[45] = input[45 * col_num + col];
+    buf0[46] = input[46 * col_num + col];
+    buf0[47] = input[47 * col_num + col];
+    buf0[48] = input[48 * col_num + col];
+    buf0[49] = input[49 * col_num + col];
+    buf0[50] = input[50 * col_num + col];
+    buf0[51] = input[51 * col_num + col];
+    buf0[52] = input[52 * col_num + col];
+    buf0[53] = input[53 * col_num + col];
+    buf0[54] = input[54 * col_num + col];
+    buf0[55] = input[55 * col_num + col];
+    buf0[56] = input[56 * col_num + col];
+    buf0[57] = input[57 * col_num + col];
+    buf0[58] = input[58 * col_num + col];
+    buf0[59] = input[59 * col_num + col];
+    buf0[60] = input[60 * col_num + col];
+    buf0[61] = input[61 * col_num + col];
+    buf0[62] = input[62 * col_num + col];
+    buf0[63] = input[63 * col_num + col];
+
+    // stage 1
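+    // First butterfly: buf1[k] = buf0[k] + buf0[63 - k],
+    // buf1[63 - k] = buf0[k] - buf0[63 - k], for k = 0..31.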
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[63]);
+    buf1[63] = _mm_sub_epi32(buf0[0], buf0[63]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[62]);
+    buf1[62] = _mm_sub_epi32(buf0[1], buf0[62]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[61]);
+    buf1[61] = _mm_sub_epi32(buf0[2], buf0[61]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[60]);
+    buf1[60] = _mm_sub_epi32(buf0[3], buf0[60]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[59]);
+    buf1[59] = _mm_sub_epi32(buf0[4], buf0[59]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[58]);
+    buf1[58] = _mm_sub_epi32(buf0[5], buf0[58]);
+    buf1[6] = _mm_add_epi32(buf0[6], buf0[57]);
+    buf1[57] = _mm_sub_epi32(buf0[6], buf0[57]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[56]);
+    buf1[56] = _mm_sub_epi32(buf0[7], buf0[56]);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[55]);
+    buf1[55] = _mm_sub_epi32(buf0[8], buf0[55]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[54]);
+    buf1[54] = _mm_sub_epi32(buf0[9], buf0[54]);
+    buf1[10] = _mm_add_epi32(buf0[10], buf0[53]);
+    buf1[53] = _mm_sub_epi32(buf0[10], buf0[53]);
+    buf1[11] = _mm_add_epi32(buf0[11], buf0[52]);
+    buf1[52] = _mm_sub_epi32(buf0[11], buf0[52]);
+    buf1[12] = _mm_add_epi32(buf0[12], buf0[51]);
+    buf1[51] = _mm_sub_epi32(buf0[12], buf0[51]);
+    buf1[13] = _mm_add_epi32(buf0[13], buf0[50]);
+    buf1[50] = _mm_sub_epi32(buf0[13], buf0[50]);
+    buf1[14] = _mm_add_epi32(buf0[14], buf0[49]);
+    buf1[49] = _mm_sub_epi32(buf0[14], buf0[49]);
+    buf1[15] = _mm_add_epi32(buf0[15], buf0[48]);
+    buf1[48] = _mm_sub_epi32(buf0[15], buf0[48]);
+    buf1[16] = _mm_add_epi32(buf0[16], buf0[47]);
+    buf1[47] = _mm_sub_epi32(buf0[16], buf0[47]);
+    buf1[17] = _mm_add_epi32(buf0[17], buf0[46]);
+    buf1[46] = _mm_sub_epi32(buf0[17], buf0[46]);
+    buf1[18] = _mm_add_epi32(buf0[18], buf0[45]);
+    buf1[45] = _mm_sub_epi32(buf0[18], buf0[45]);
+    buf1[19] = _mm_add_epi32(buf0[19], buf0[44]);
+    buf1[44] = _mm_sub_epi32(buf0[19], buf0[44]);
+    buf1[20] = _mm_add_epi32(buf0[20], buf0[43]);
+    buf1[43] = _mm_sub_epi32(buf0[20], buf0[43]);
+    buf1[21] = _mm_add_epi32(buf0[21], buf0[42]);
+    buf1[42] = _mm_sub_epi32(buf0[21], buf0[42]);
+    buf1[22] = _mm_add_epi32(buf0[22], buf0[41]);
+    buf1[41] = _mm_sub_epi32(buf0[22], buf0[41]);
+    buf1[23] = _mm_add_epi32(buf0[23], buf0[40]);
+    buf1[40] = _mm_sub_epi32(buf0[23], buf0[40]);
+    buf1[24] = _mm_add_epi32(buf0[24], buf0[39]);
+    buf1[39] = _mm_sub_epi32(buf0[24], buf0[39]);
+    buf1[25] = _mm_add_epi32(buf0[25], buf0[38]);
+    buf1[38] = _mm_sub_epi32(buf0[25], buf0[38]);
+    buf1[26] = _mm_add_epi32(buf0[26], buf0[37]);
+    buf1[37] = _mm_sub_epi32(buf0[26], buf0[37]);
+    buf1[27] = _mm_add_epi32(buf0[27], buf0[36]);
+    buf1[36] = _mm_sub_epi32(buf0[27], buf0[36]);
+    buf1[28] = _mm_add_epi32(buf0[28], buf0[35]);
+    buf1[35] = _mm_sub_epi32(buf0[28], buf0[35]);
+    buf1[29] = _mm_add_epi32(buf0[29], buf0[34]);
+    buf1[34] = _mm_sub_epi32(buf0[29], buf0[34]);
+    buf1[30] = _mm_add_epi32(buf0[30], buf0[33]);
+    buf1[33] = _mm_sub_epi32(buf0[30], buf0[33]);
+    buf1[31] = _mm_add_epi32(buf0[31], buf0[32]);
+    buf1[32] = _mm_sub_epi32(buf0[31], buf0[32]);
+
+    // stage 2
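+    // Fold 0..31 with a 32-point butterfly; in the odd half, 32..39 and
+    // 56..63 pass through while the middle pairs 40..55 take the
+    // +/-cospi[32] rotation.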
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = _mm_add_epi32(buf1[0], buf1[31]);
+    buf0[31] = _mm_sub_epi32(buf1[0], buf1[31]);
+    buf0[1] = _mm_add_epi32(buf1[1], buf1[30]);
+    buf0[30] = _mm_sub_epi32(buf1[1], buf1[30]);
+    buf0[2] = _mm_add_epi32(buf1[2], buf1[29]);
+    buf0[29] = _mm_sub_epi32(buf1[2], buf1[29]);
+    buf0[3] = _mm_add_epi32(buf1[3], buf1[28]);
+    buf0[28] = _mm_sub_epi32(buf1[3], buf1[28]);
+    buf0[4] = _mm_add_epi32(buf1[4], buf1[27]);
+    buf0[27] = _mm_sub_epi32(buf1[4], buf1[27]);
+    buf0[5] = _mm_add_epi32(buf1[5], buf1[26]);
+    buf0[26] = _mm_sub_epi32(buf1[5], buf1[26]);
+    buf0[6] = _mm_add_epi32(buf1[6], buf1[25]);
+    buf0[25] = _mm_sub_epi32(buf1[6], buf1[25]);
+    buf0[7] = _mm_add_epi32(buf1[7], buf1[24]);
+    buf0[24] = _mm_sub_epi32(buf1[7], buf1[24]);
+    buf0[8] = _mm_add_epi32(buf1[8], buf1[23]);
+    buf0[23] = _mm_sub_epi32(buf1[8], buf1[23]);
+    buf0[9] = _mm_add_epi32(buf1[9], buf1[22]);
+    buf0[22] = _mm_sub_epi32(buf1[9], buf1[22]);
+    buf0[10] = _mm_add_epi32(buf1[10], buf1[21]);
+    buf0[21] = _mm_sub_epi32(buf1[10], buf1[21]);
+    buf0[11] = _mm_add_epi32(buf1[11], buf1[20]);
+    buf0[20] = _mm_sub_epi32(buf1[11], buf1[20]);
+    buf0[12] = _mm_add_epi32(buf1[12], buf1[19]);
+    buf0[19] = _mm_sub_epi32(buf1[12], buf1[19]);
+    buf0[13] = _mm_add_epi32(buf1[13], buf1[18]);
+    buf0[18] = _mm_sub_epi32(buf1[13], buf1[18]);
+    buf0[14] = _mm_add_epi32(buf1[14], buf1[17]);
+    buf0[17] = _mm_sub_epi32(buf1[14], buf1[17]);
+    buf0[15] = _mm_add_epi32(buf1[15], buf1[16]);
+    buf0[16] = _mm_sub_epi32(buf1[15], buf1[16]);
+    buf0[32] = buf1[32];
+    buf0[33] = buf1[33];
+    buf0[34] = buf1[34];
+    buf0[35] = buf1[35];
+    buf0[36] = buf1[36];
+    buf0[37] = buf1[37];
+    buf0[38] = buf1[38];
+    buf0[39] = buf1[39];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[40], buf1[55], buf0[40],
+                        buf0[55], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[41], buf1[54], buf0[41],
+                        buf0[54], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[42], buf1[53], buf0[42],
+                        buf0[53], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[43], buf1[52], buf0[43],
+                        buf0[52], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[44], buf1[51], buf0[44],
+                        buf0[51], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[45], buf1[50], buf0[45],
+                        buf0[50], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[46], buf1[49], buf0[46],
+                        buf0[49], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[47], buf1[48], buf0[47],
+                        buf0[48], bit);
+    buf0[56] = buf1[56];
+    buf0[57] = buf1[57];
+    buf0[58] = buf1[58];
+    buf0[59] = buf1[59];
+    buf0[60] = buf1[60];
+    buf0[61] = buf1[61];
+    buf0[62] = buf1[62];
+    buf0[63] = buf1[63];
+
+    // stage 3
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[15]);
+    buf1[15] = _mm_sub_epi32(buf0[0], buf0[15]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[14]);
+    buf1[14] = _mm_sub_epi32(buf0[1], buf0[14]);
+    buf1[2] = _mm_add_epi32(buf0[2], buf0[13]);
+    buf1[13] = _mm_sub_epi32(buf0[2], buf0[13]);
+    buf1[3] = _mm_add_epi32(buf0[3], buf0[12]);
+    buf1[12] = _mm_sub_epi32(buf0[3], buf0[12]);
+    buf1[4] = _mm_add_epi32(buf0[4], buf0[11]);
+    buf1[11] = _mm_sub_epi32(buf0[4], buf0[11]);
+    buf1[5] = _mm_add_epi32(buf0[5], buf0[10]);
+    buf1[10] = _mm_sub_epi32(buf0[5], buf0[10]);
+    buf1[6] = _mm_add_epi32(buf0[6], buf0[9]);
+    buf1[9] = _mm_sub_epi32(buf0[6], buf0[9]);
+    buf1[7] = _mm_add_epi32(buf0[7], buf0[8]);
+    buf1[8] = _mm_sub_epi32(buf0[7], buf0[8]);
+    buf1[16] = buf0[16];
+    buf1[17] = buf0[17];
+    buf1[18] = buf0[18];
+    buf1[19] = buf0[19];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[20], buf0[27], buf1[20],
+                        buf1[27], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[21], buf0[26], buf1[21],
+                        buf1[26], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[22], buf0[25], buf1[22],
+                        buf1[25], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[23], buf0[24], buf1[23],
+                        buf1[24], bit);
+    buf1[28] = buf0[28];
+    buf1[29] = buf0[29];
+    buf1[30] = buf0[30];
+    buf1[31] = buf0[31];
+    buf1[32] = _mm_add_epi32(buf0[32], buf0[47]);
+    buf1[47] = _mm_sub_epi32(buf0[32], buf0[47]);
+    buf1[33] = _mm_add_epi32(buf0[33], buf0[46]);
+    buf1[46] = _mm_sub_epi32(buf0[33], buf0[46]);
+    buf1[34] = _mm_add_epi32(buf0[34], buf0[45]);
+    buf1[45] = _mm_sub_epi32(buf0[34], buf0[45]);
+    buf1[35] = _mm_add_epi32(buf0[35], buf0[44]);
+    buf1[44] = _mm_sub_epi32(buf0[35], buf0[44]);
+    buf1[36] = _mm_add_epi32(buf0[36], buf0[43]);
+    buf1[43] = _mm_sub_epi32(buf0[36], buf0[43]);
+    buf1[37] = _mm_add_epi32(buf0[37], buf0[42]);
+    buf1[42] = _mm_sub_epi32(buf0[37], buf0[42]);
+    buf1[38] = _mm_add_epi32(buf0[38], buf0[41]);
+    buf1[41] = _mm_sub_epi32(buf0[38], buf0[41]);
+    buf1[39] = _mm_add_epi32(buf0[39], buf0[40]);
+    buf1[40] = _mm_sub_epi32(buf0[39], buf0[40]);
+    buf1[48] = _mm_sub_epi32(buf0[63], buf0[48]);
+    buf1[63] = _mm_add_epi32(buf0[63], buf0[48]);
+    buf1[49] = _mm_sub_epi32(buf0[62], buf0[49]);
+    buf1[62] = _mm_add_epi32(buf0[62], buf0[49]);
+    buf1[50] = _mm_sub_epi32(buf0[61], buf0[50]);
+    buf1[61] = _mm_add_epi32(buf0[61], buf0[50]);
+    buf1[51] = _mm_sub_epi32(buf0[60], buf0[51]);
+    buf1[60] = _mm_add_epi32(buf0[60], buf0[51]);
+    buf1[52] = _mm_sub_epi32(buf0[59], buf0[52]);
+    buf1[59] = _mm_add_epi32(buf0[59], buf0[52]);
+    buf1[53] = _mm_sub_epi32(buf0[58], buf0[53]);
+    buf1[58] = _mm_add_epi32(buf0[58], buf0[53]);
+    buf1[54] = _mm_sub_epi32(buf0[57], buf0[54]);
+    buf1[57] = _mm_add_epi32(buf0[57], buf0[54]);
+    buf1[55] = _mm_sub_epi32(buf0[56], buf0[55]);
+    buf1[56] = _mm_add_epi32(buf0[56], buf0[55]);
+
+    // stage 4
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = _mm_add_epi32(buf1[0], buf1[7]);
+    buf0[7] = _mm_sub_epi32(buf1[0], buf1[7]);
+    buf0[1] = _mm_add_epi32(buf1[1], buf1[6]);
+    buf0[6] = _mm_sub_epi32(buf1[1], buf1[6]);
+    buf0[2] = _mm_add_epi32(buf1[2], buf1[5]);
+    buf0[5] = _mm_sub_epi32(buf1[2], buf1[5]);
+    buf0[3] = _mm_add_epi32(buf1[3], buf1[4]);
+    buf0[4] = _mm_sub_epi32(buf1[3], buf1[4]);
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf1[11], buf1[12], buf0[11],
+                        buf0[12], bit);
+    buf0[14] = buf1[14];
+    buf0[15] = buf1[15];
+    buf0[16] = _mm_add_epi32(buf1[16], buf1[23]);
+    buf0[23] = _mm_sub_epi32(buf1[16], buf1[23]);
+    buf0[17] = _mm_add_epi32(buf1[17], buf1[22]);
+    buf0[22] = _mm_sub_epi32(buf1[17], buf1[22]);
+    buf0[18] = _mm_add_epi32(buf1[18], buf1[21]);
+    buf0[21] = _mm_sub_epi32(buf1[18], buf1[21]);
+    buf0[19] = _mm_add_epi32(buf1[19], buf1[20]);
+    buf0[20] = _mm_sub_epi32(buf1[19], buf1[20]);
+    buf0[24] = _mm_sub_epi32(buf1[31], buf1[24]);
+    buf0[31] = _mm_add_epi32(buf1[31], buf1[24]);
+    buf0[25] = _mm_sub_epi32(buf1[30], buf1[25]);
+    buf0[30] = _mm_add_epi32(buf1[30], buf1[25]);
+    buf0[26] = _mm_sub_epi32(buf1[29], buf1[26]);
+    buf0[29] = _mm_add_epi32(buf1[29], buf1[26]);
+    buf0[27] = _mm_sub_epi32(buf1[28], buf1[27]);
+    buf0[28] = _mm_add_epi32(buf1[28], buf1[27]);
+    buf0[32] = buf1[32];
+    buf0[33] = buf1[33];
+    buf0[34] = buf1[34];
+    buf0[35] = buf1[35];
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[36], buf1[59], buf0[36],
+                        buf0[59], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[37], buf1[58], buf0[37],
+                        buf0[58], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[38], buf1[57], buf0[38],
+                        buf0[57], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[39], buf1[56], buf0[39],
+                        buf0[56], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[40], buf1[55], buf0[40],
+                        buf0[55], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[41], buf1[54], buf0[41],
+                        buf0[54], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[42], buf1[53], buf0[42],
+                        buf0[53], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[43], buf1[52], buf0[43],
+                        buf0[52], bit);
+    buf0[44] = buf1[44];
+    buf0[45] = buf1[45];
+    buf0[46] = buf1[46];
+    buf0[47] = buf1[47];
+    buf0[48] = buf1[48];
+    buf0[49] = buf1[49];
+    buf0[50] = buf1[50];
+    buf0[51] = buf1[51];
+    buf0[60] = buf1[60];
+    buf0[61] = buf1[61];
+    buf0[62] = buf1[62];
+    buf0[63] = buf1[63];
+
+    // stage 5
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = _mm_add_epi32(buf0[0], buf0[3]);
+    buf1[3] = _mm_sub_epi32(buf0[0], buf0[3]);
+    buf1[1] = _mm_add_epi32(buf0[1], buf0[2]);
+    buf1[2] = _mm_sub_epi32(buf0[1], buf0[2]);
+    buf1[4] = buf0[4];
+    btf_32_sse4_1_type0(-cospi[32], cospi[32], buf0[5], buf0[6], buf1[5],
+                        buf1[6], bit);
+    buf1[7] = buf0[7];
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[11]);
+    buf1[11] = _mm_sub_epi32(buf0[8], buf0[11]);
+    buf1[9] = _mm_add_epi32(buf0[9], buf0[10]);
+    buf1[10] = _mm_sub_epi32(buf0[9], buf0[10]);
+    buf1[12] = _mm_sub_epi32(buf0[15], buf0[12]);
+    buf1[15] = _mm_add_epi32(buf0[15], buf0[12]);
+    buf1[13] = _mm_sub_epi32(buf0[14], buf0[13]);
+    buf1[14] = _mm_add_epi32(buf0[14], buf0[13]);
+    buf1[16] = buf0[16];
+    buf1[17] = buf0[17];
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[18], buf0[29], buf1[18],
+                        buf1[29], bit);
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf0[19], buf0[28], buf1[19],
+                        buf1[28], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[20], buf0[27], buf1[20],
+                        buf1[27], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf0[21], buf0[26], buf1[21],
+                        buf1[26], bit);
+    buf1[22] = buf0[22];
+    buf1[23] = buf0[23];
+    buf1[24] = buf0[24];
+    buf1[25] = buf0[25];
+    buf1[30] = buf0[30];
+    buf1[31] = buf0[31];
+    buf1[32] = _mm_add_epi32(buf0[32], buf0[39]);
+    buf1[39] = _mm_sub_epi32(buf0[32], buf0[39]);
+    buf1[33] = _mm_add_epi32(buf0[33], buf0[38]);
+    buf1[38] = _mm_sub_epi32(buf0[33], buf0[38]);
+    buf1[34] = _mm_add_epi32(buf0[34], buf0[37]);
+    buf1[37] = _mm_sub_epi32(buf0[34], buf0[37]);
+    buf1[35] = _mm_add_epi32(buf0[35], buf0[36]);
+    buf1[36] = _mm_sub_epi32(buf0[35], buf0[36]);
+    buf1[40] = _mm_sub_epi32(buf0[47], buf0[40]);
+    buf1[47] = _mm_add_epi32(buf0[47], buf0[40]);
+    buf1[41] = _mm_sub_epi32(buf0[46], buf0[41]);
+    buf1[46] = _mm_add_epi32(buf0[46], buf0[41]);
+    buf1[42] = _mm_sub_epi32(buf0[45], buf0[42]);
+    buf1[45] = _mm_add_epi32(buf0[45], buf0[42]);
+    buf1[43] = _mm_sub_epi32(buf0[44], buf0[43]);
+    buf1[44] = _mm_add_epi32(buf0[44], buf0[43]);
+    buf1[48] = _mm_add_epi32(buf0[48], buf0[55]);
+    buf1[55] = _mm_sub_epi32(buf0[48], buf0[55]);
+    buf1[49] = _mm_add_epi32(buf0[49], buf0[54]);
+    buf1[54] = _mm_sub_epi32(buf0[49], buf0[54]);
+    buf1[50] = _mm_add_epi32(buf0[50], buf0[53]);
+    buf1[53] = _mm_sub_epi32(buf0[50], buf0[53]);
+    buf1[51] = _mm_add_epi32(buf0[51], buf0[52]);
+    buf1[52] = _mm_sub_epi32(buf0[51], buf0[52]);
+    buf1[56] = _mm_sub_epi32(buf0[63], buf0[56]);
+    buf1[63] = _mm_add_epi32(buf0[63], buf0[56]);
+    buf1[57] = _mm_sub_epi32(buf0[62], buf0[57]);
+    buf1[62] = _mm_add_epi32(buf0[62], buf0[57]);
+    buf1[58] = _mm_sub_epi32(buf0[61], buf0[58]);
+    buf1[61] = _mm_add_epi32(buf0[61], buf0[58]);
+    buf1[59] = _mm_sub_epi32(buf0[60], buf0[59]);
+    buf1[60] = _mm_add_epi32(buf0[60], buf0[59]);
+
+    // stage 6
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[0], buf1[1], buf0[0],
+                        buf0[1], bit);
+    btf_32_sse4_1_type1(cospi[48], cospi[16], buf1[2], buf1[3], buf0[2],
+                        buf0[3], bit);
+    buf0[4] = _mm_add_epi32(buf1[4], buf1[5]);
+    buf0[5] = _mm_sub_epi32(buf1[4], buf1[5]);
+    buf0[6] = _mm_sub_epi32(buf1[7], buf1[6]);
+    buf0[7] = _mm_add_epi32(buf1[7], buf1[6]);
+    buf0[8] = buf1[8];
+    btf_32_sse4_1_type0(-cospi[16], cospi[48], buf1[9], buf1[14], buf0[9],
+                        buf0[14], bit);
+    btf_32_sse4_1_type0(-cospi[48], -cospi[16], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
+    buf0[11] = buf1[11];
+    buf0[12] = buf1[12];
+    buf0[15] = buf1[15];
+    buf0[16] = _mm_add_epi32(buf1[16], buf1[19]);
+    buf0[19] = _mm_sub_epi32(buf1[16], buf1[19]);
+    buf0[17] = _mm_add_epi32(buf1[17], buf1[18]);
+    buf0[18] = _mm_sub_epi32(buf1[17], buf1[18]);
+    buf0[20] = _mm_sub_epi32(buf1[23], buf1[20]);
+    buf0[23] = _mm_add_epi32(buf1[23], buf1[20]);
+    buf0[21] = _mm_sub_epi32(buf1[22], buf1[21]);
+    buf0[22] = _mm_add_epi32(buf1[22], buf1[21]);
+    buf0[24] = _mm_add_epi32(buf1[24], buf1[27]);
+    buf0[27] = _mm_sub_epi32(buf1[24], buf1[27]);
+    buf0[25] = _mm_add_epi32(buf1[25], buf1[26]);
+    buf0[26] = _mm_sub_epi32(buf1[25], buf1[26]);
+    buf0[28] = _mm_sub_epi32(buf1[31], buf1[28]);
+    buf0[31] = _mm_add_epi32(buf1[31], buf1[28]);
+    buf0[29] = _mm_sub_epi32(buf1[30], buf1[29]);
+    buf0[30] = _mm_add_epi32(buf1[30], buf1[29]);
+    buf0[32] = buf1[32];
+    buf0[33] = buf1[33];
+    btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[34], buf1[61], buf0[34],
+                        buf0[61], bit);
+    btf_32_sse4_1_type0(-cospi[8], cospi[56], buf1[35], buf1[60], buf0[35],
+                        buf0[60], bit);
+    btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[36], buf1[59], buf0[36],
+                        buf0[59], bit);
+    btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf1[37], buf1[58], buf0[37],
+                        buf0[58], bit);
+    buf0[38] = buf1[38];
+    buf0[39] = buf1[39];
+    buf0[40] = buf1[40];
+    buf0[41] = buf1[41];
+    btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[42], buf1[53], buf0[42],
+                        buf0[53], bit);
+    btf_32_sse4_1_type0(-cospi[40], cospi[24], buf1[43], buf1[52], buf0[43],
+                        buf0[52], bit);
+    btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[44], buf1[51], buf0[44],
+                        buf0[51], bit);
+    btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf1[45], buf1[50], buf0[45],
+                        buf0[50], bit);
+    buf0[46] = buf1[46];
+    buf0[47] = buf1[47];
+    buf0[48] = buf1[48];
+    buf0[49] = buf1[49];
+    buf0[54] = buf1[54];
+    buf0[55] = buf1[55];
+    buf0[56] = buf1[56];
+    buf0[57] = buf1[57];
+    buf0[62] = buf1[62];
+    buf0[63] = buf1[63];
+
+    // stage 7
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[1];
+    buf1[2] = buf0[2];
+    buf1[3] = buf0[3];
+    btf_32_sse4_1_type1(cospi[56], cospi[8], buf0[4], buf0[7], buf1[4], buf1[7],
+                        bit);
+    btf_32_sse4_1_type1(cospi[24], cospi[40], buf0[5], buf0[6], buf1[5],
+                        buf1[6], bit);
+    buf1[8] = _mm_add_epi32(buf0[8], buf0[9]);
+    buf1[9] = _mm_sub_epi32(buf0[8], buf0[9]);
+    buf1[10] = _mm_sub_epi32(buf0[11], buf0[10]);
+    buf1[11] = _mm_add_epi32(buf0[11], buf0[10]);
+    buf1[12] = _mm_add_epi32(buf0[12], buf0[13]);
+    buf1[13] = _mm_sub_epi32(buf0[12], buf0[13]);
+    buf1[14] = _mm_sub_epi32(buf0[15], buf0[14]);
+    buf1[15] = _mm_add_epi32(buf0[15], buf0[14]);
+    buf1[16] = buf0[16];
+    btf_32_sse4_1_type0(-cospi[8], cospi[56], buf0[17], buf0[30], buf1[17],
+                        buf1[30], bit);
+    btf_32_sse4_1_type0(-cospi[56], -cospi[8], buf0[18], buf0[29], buf1[18],
+                        buf1[29], bit);
+    buf1[19] = buf0[19];
+    buf1[20] = buf0[20];
+    btf_32_sse4_1_type0(-cospi[40], cospi[24], buf0[21], buf0[26], buf1[21],
+                        buf1[26], bit);
+    btf_32_sse4_1_type0(-cospi[24], -cospi[40], buf0[22], buf0[25], buf1[22],
+                        buf1[25], bit);
+    buf1[23] = buf0[23];
+    buf1[24] = buf0[24];
+    buf1[27] = buf0[27];
+    buf1[28] = buf0[28];
+    buf1[31] = buf0[31];
+    buf1[32] = _mm_add_epi32(buf0[32], buf0[35]);
+    buf1[35] = _mm_sub_epi32(buf0[32], buf0[35]);
+    buf1[33] = _mm_add_epi32(buf0[33], buf0[34]);
+    buf1[34] = _mm_sub_epi32(buf0[33], buf0[34]);
+    buf1[36] = _mm_sub_epi32(buf0[39], buf0[36]);
+    buf1[39] = _mm_add_epi32(buf0[39], buf0[36]);
+    buf1[37] = _mm_sub_epi32(buf0[38], buf0[37]);
+    buf1[38] = _mm_add_epi32(buf0[38], buf0[37]);
+    buf1[40] = _mm_add_epi32(buf0[40], buf0[43]);
+    buf1[43] = _mm_sub_epi32(buf0[40], buf0[43]);
+    buf1[41] = _mm_add_epi32(buf0[41], buf0[42]);
+    buf1[42] = _mm_sub_epi32(buf0[41], buf0[42]);
+    buf1[44] = _mm_sub_epi32(buf0[47], buf0[44]);
+    buf1[47] = _mm_add_epi32(buf0[47], buf0[44]);
+    buf1[45] = _mm_sub_epi32(buf0[46], buf0[45]);
+    buf1[46] = _mm_add_epi32(buf0[46], buf0[45]);
+    buf1[48] = _mm_add_epi32(buf0[48], buf0[51]);
+    buf1[51] = _mm_sub_epi32(buf0[48], buf0[51]);
+    buf1[49] = _mm_add_epi32(buf0[49], buf0[50]);
+    buf1[50] = _mm_sub_epi32(buf0[49], buf0[50]);
+    buf1[52] = _mm_sub_epi32(buf0[55], buf0[52]);
+    buf1[55] = _mm_add_epi32(buf0[55], buf0[52]);
+    buf1[53] = _mm_sub_epi32(buf0[54], buf0[53]);
+    buf1[54] = _mm_add_epi32(buf0[54], buf0[53]);
+    buf1[56] = _mm_add_epi32(buf0[56], buf0[59]);
+    buf1[59] = _mm_sub_epi32(buf0[56], buf0[59]);
+    buf1[57] = _mm_add_epi32(buf0[57], buf0[58]);
+    buf1[58] = _mm_sub_epi32(buf0[57], buf0[58]);
+    buf1[60] = _mm_sub_epi32(buf0[63], buf0[60]);
+    buf1[63] = _mm_add_epi32(buf0[63], buf0[60]);
+    buf1[61] = _mm_sub_epi32(buf0[62], buf0[61]);
+    buf1[62] = _mm_add_epi32(buf0[62], buf0[61]);
+
+    // stage 8
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    buf0[6] = buf1[6];
+    buf0[7] = buf1[7];
+    btf_32_sse4_1_type1(cospi[60], cospi[4], buf1[8], buf1[15], buf0[8],
+                        buf0[15], bit);
+    btf_32_sse4_1_type1(cospi[28], cospi[36], buf1[9], buf1[14], buf0[9],
+                        buf0[14], bit);
+    btf_32_sse4_1_type1(cospi[44], cospi[20], buf1[10], buf1[13], buf0[10],
+                        buf0[13], bit);
+    btf_32_sse4_1_type1(cospi[12], cospi[52], buf1[11], buf1[12], buf0[11],
+                        buf0[12], bit);
+    buf0[16] = _mm_add_epi32(buf1[16], buf1[17]);
+    buf0[17] = _mm_sub_epi32(buf1[16], buf1[17]);
+    buf0[18] = _mm_sub_epi32(buf1[19], buf1[18]);
+    buf0[19] = _mm_add_epi32(buf1[19], buf1[18]);
+    buf0[20] = _mm_add_epi32(buf1[20], buf1[21]);
+    buf0[21] = _mm_sub_epi32(buf1[20], buf1[21]);
+    buf0[22] = _mm_sub_epi32(buf1[23], buf1[22]);
+    buf0[23] = _mm_add_epi32(buf1[23], buf1[22]);
+    buf0[24] = _mm_add_epi32(buf1[24], buf1[25]);
+    buf0[25] = _mm_sub_epi32(buf1[24], buf1[25]);
+    buf0[26] = _mm_sub_epi32(buf1[27], buf1[26]);
+    buf0[27] = _mm_add_epi32(buf1[27], buf1[26]);
+    buf0[28] = _mm_add_epi32(buf1[28], buf1[29]);
+    buf0[29] = _mm_sub_epi32(buf1[28], buf1[29]);
+    buf0[30] = _mm_sub_epi32(buf1[31], buf1[30]);
+    buf0[31] = _mm_add_epi32(buf1[31], buf1[30]);
+    buf0[32] = buf1[32];
+    btf_32_sse4_1_type0(-cospi[4], cospi[60], buf1[33], buf1[62], buf0[33],
+                        buf0[62], bit);
+    btf_32_sse4_1_type0(-cospi[60], -cospi[4], buf1[34], buf1[61], buf0[34],
+                        buf0[61], bit);
+    buf0[35] = buf1[35];
+    buf0[36] = buf1[36];
+    btf_32_sse4_1_type0(-cospi[36], cospi[28], buf1[37], buf1[58], buf0[37],
+                        buf0[58], bit);
+    btf_32_sse4_1_type0(-cospi[28], -cospi[36], buf1[38], buf1[57], buf0[38],
+                        buf0[57], bit);
+    buf0[39] = buf1[39];
+    buf0[40] = buf1[40];
+    btf_32_sse4_1_type0(-cospi[20], cospi[44], buf1[41], buf1[54], buf0[41],
+                        buf0[54], bit);
+    btf_32_sse4_1_type0(-cospi[44], -cospi[20], buf1[42], buf1[53], buf0[42],
+                        buf0[53], bit);
+    buf0[43] = buf1[43];
+    buf0[44] = buf1[44];
+    btf_32_sse4_1_type0(-cospi[52], cospi[12], buf1[45], buf1[50], buf0[45],
+                        buf0[50], bit);
+    btf_32_sse4_1_type0(-cospi[12], -cospi[52], buf1[46], buf1[49], buf0[46],
+                        buf0[49], bit);
+    buf0[47] = buf1[47];
+    buf0[48] = buf1[48];
+    buf0[51] = buf1[51];
+    buf0[52] = buf1[52];
+    buf0[55] = buf1[55];
+    buf0[56] = buf1[56];
+    buf0[59] = buf1[59];
+    buf0[60] = buf1[60];
+    buf0[63] = buf1[63];
+
+    // stage 9
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[1];
+    buf1[2] = buf0[2];
+    buf1[3] = buf0[3];
+    buf1[4] = buf0[4];
+    buf1[5] = buf0[5];
+    buf1[6] = buf0[6];
+    buf1[7] = buf0[7];
+    buf1[8] = buf0[8];
+    buf1[9] = buf0[9];
+    buf1[10] = buf0[10];
+    buf1[11] = buf0[11];
+    buf1[12] = buf0[12];
+    buf1[13] = buf0[13];
+    buf1[14] = buf0[14];
+    buf1[15] = buf0[15];
+    btf_32_sse4_1_type1(cospi[62], cospi[2], buf0[16], buf0[31], buf1[16],
+                        buf1[31], bit);
+    btf_32_sse4_1_type1(cospi[30], cospi[34], buf0[17], buf0[30], buf1[17],
+                        buf1[30], bit);
+    btf_32_sse4_1_type1(cospi[46], cospi[18], buf0[18], buf0[29], buf1[18],
+                        buf1[29], bit);
+    btf_32_sse4_1_type1(cospi[14], cospi[50], buf0[19], buf0[28], buf1[19],
+                        buf1[28], bit);
+    btf_32_sse4_1_type1(cospi[54], cospi[10], buf0[20], buf0[27], buf1[20],
+                        buf1[27], bit);
+    btf_32_sse4_1_type1(cospi[22], cospi[42], buf0[21], buf0[26], buf1[21],
+                        buf1[26], bit);
+    btf_32_sse4_1_type1(cospi[38], cospi[26], buf0[22], buf0[25], buf1[22],
+                        buf1[25], bit);
+    btf_32_sse4_1_type1(cospi[6], cospi[58], buf0[23], buf0[24], buf1[23],
+                        buf1[24], bit);
+    buf1[32] = _mm_add_epi32(buf0[32], buf0[33]);
+    buf1[33] = _mm_sub_epi32(buf0[32], buf0[33]);
+    buf1[34] = _mm_sub_epi32(buf0[35], buf0[34]);
+    buf1[35] = _mm_add_epi32(buf0[35], buf0[34]);
+    buf1[36] = _mm_add_epi32(buf0[36], buf0[37]);
+    buf1[37] = _mm_sub_epi32(buf0[36], buf0[37]);
+    buf1[38] = _mm_sub_epi32(buf0[39], buf0[38]);
+    buf1[39] = _mm_add_epi32(buf0[39], buf0[38]);
+    buf1[40] = _mm_add_epi32(buf0[40], buf0[41]);
+    buf1[41] = _mm_sub_epi32(buf0[40], buf0[41]);
+    buf1[42] = _mm_sub_epi32(buf0[43], buf0[42]);
+    buf1[43] = _mm_add_epi32(buf0[43], buf0[42]);
+    buf1[44] = _mm_add_epi32(buf0[44], buf0[45]);
+    buf1[45] = _mm_sub_epi32(buf0[44], buf0[45]);
+    buf1[46] = _mm_sub_epi32(buf0[47], buf0[46]);
+    buf1[47] = _mm_add_epi32(buf0[47], buf0[46]);
+    buf1[48] = _mm_add_epi32(buf0[48], buf0[49]);
+    buf1[49] = _mm_sub_epi32(buf0[48], buf0[49]);
+    buf1[50] = _mm_sub_epi32(buf0[51], buf0[50]);
+    buf1[51] = _mm_add_epi32(buf0[51], buf0[50]);
+    buf1[52] = _mm_add_epi32(buf0[52], buf0[53]);
+    buf1[53] = _mm_sub_epi32(buf0[52], buf0[53]);
+    buf1[54] = _mm_sub_epi32(buf0[55], buf0[54]);
+    buf1[55] = _mm_add_epi32(buf0[55], buf0[54]);
+    buf1[56] = _mm_add_epi32(buf0[56], buf0[57]);
+    buf1[57] = _mm_sub_epi32(buf0[56], buf0[57]);
+    buf1[58] = _mm_sub_epi32(buf0[59], buf0[58]);
+    buf1[59] = _mm_add_epi32(buf0[59], buf0[58]);
+    buf1[60] = _mm_add_epi32(buf0[60], buf0[61]);
+    buf1[61] = _mm_sub_epi32(buf0[60], buf0[61]);
+    buf1[62] = _mm_sub_epi32(buf0[63], buf0[62]);
+    buf1[63] = _mm_add_epi32(buf0[63], buf0[62]);
+
+    // stage 10
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf0[0] = buf1[0];
+    buf0[1] = buf1[1];
+    buf0[2] = buf1[2];
+    buf0[3] = buf1[3];
+    buf0[4] = buf1[4];
+    buf0[5] = buf1[5];
+    buf0[6] = buf1[6];
+    buf0[7] = buf1[7];
+    buf0[8] = buf1[8];
+    buf0[9] = buf1[9];
+    buf0[10] = buf1[10];
+    buf0[11] = buf1[11];
+    buf0[12] = buf1[12];
+    buf0[13] = buf1[13];
+    buf0[14] = buf1[14];
+    buf0[15] = buf1[15];
+    buf0[16] = buf1[16];
+    buf0[17] = buf1[17];
+    buf0[18] = buf1[18];
+    buf0[19] = buf1[19];
+    buf0[20] = buf1[20];
+    buf0[21] = buf1[21];
+    buf0[22] = buf1[22];
+    buf0[23] = buf1[23];
+    buf0[24] = buf1[24];
+    buf0[25] = buf1[25];
+    buf0[26] = buf1[26];
+    buf0[27] = buf1[27];
+    buf0[28] = buf1[28];
+    buf0[29] = buf1[29];
+    buf0[30] = buf1[30];
+    buf0[31] = buf1[31];
+    btf_32_sse4_1_type1(cospi[63], cospi[1], buf1[32], buf1[63], buf0[32],
+                        buf0[63], bit);
+    btf_32_sse4_1_type1(cospi[31], cospi[33], buf1[33], buf1[62], buf0[33],
+                        buf0[62], bit);
+    btf_32_sse4_1_type1(cospi[47], cospi[17], buf1[34], buf1[61], buf0[34],
+                        buf0[61], bit);
+    btf_32_sse4_1_type1(cospi[15], cospi[49], buf1[35], buf1[60], buf0[35],
+                        buf0[60], bit);
+    btf_32_sse4_1_type1(cospi[55], cospi[9], buf1[36], buf1[59], buf0[36],
+                        buf0[59], bit);
+    btf_32_sse4_1_type1(cospi[23], cospi[41], buf1[37], buf1[58], buf0[37],
+                        buf0[58], bit);
+    btf_32_sse4_1_type1(cospi[39], cospi[25], buf1[38], buf1[57], buf0[38],
+                        buf0[57], bit);
+    btf_32_sse4_1_type1(cospi[7], cospi[57], buf1[39], buf1[56], buf0[39],
+                        buf0[56], bit);
+    btf_32_sse4_1_type1(cospi[59], cospi[5], buf1[40], buf1[55], buf0[40],
+                        buf0[55], bit);
+    btf_32_sse4_1_type1(cospi[27], cospi[37], buf1[41], buf1[54], buf0[41],
+                        buf0[54], bit);
+    btf_32_sse4_1_type1(cospi[43], cospi[21], buf1[42], buf1[53], buf0[42],
+                        buf0[53], bit);
+    btf_32_sse4_1_type1(cospi[11], cospi[53], buf1[43], buf1[52], buf0[43],
+                        buf0[52], bit);
+    btf_32_sse4_1_type1(cospi[51], cospi[13], buf1[44], buf1[51], buf0[44],
+                        buf0[51], bit);
+    btf_32_sse4_1_type1(cospi[19], cospi[45], buf1[45], buf1[50], buf0[45],
+                        buf0[50], bit);
+    btf_32_sse4_1_type1(cospi[35], cospi[29], buf1[46], buf1[49], buf0[46],
+                        buf0[49], bit);
+    btf_32_sse4_1_type1(cospi[3], cospi[61], buf1[47], buf1[48], buf0[47],
+                        buf0[48], bit);
+
+    // stage 11
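+    // Output reordering: a 6-bit bit-reversal permutation
+    // (buf1[k] = buf0[bit-reverse of k]) puts the coefficients in natural
+    // order.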
+    stage_idx++;
+    bit = cos_bit[stage_idx];
+    cospi = cospi_arr[bit - cos_bit_min];
+    buf1[0] = buf0[0];
+    buf1[1] = buf0[32];
+    buf1[2] = buf0[16];
+    buf1[3] = buf0[48];
+    buf1[4] = buf0[8];
+    buf1[5] = buf0[40];
+    buf1[6] = buf0[24];
+    buf1[7] = buf0[56];
+    buf1[8] = buf0[4];
+    buf1[9] = buf0[36];
+    buf1[10] = buf0[20];
+    buf1[11] = buf0[52];
+    buf1[12] = buf0[12];
+    buf1[13] = buf0[44];
+    buf1[14] = buf0[28];
+    buf1[15] = buf0[60];
+    buf1[16] = buf0[2];
+    buf1[17] = buf0[34];
+    buf1[18] = buf0[18];
+    buf1[19] = buf0[50];
+    buf1[20] = buf0[10];
+    buf1[21] = buf0[42];
+    buf1[22] = buf0[26];
+    buf1[23] = buf0[58];
+    buf1[24] = buf0[6];
+    buf1[25] = buf0[38];
+    buf1[26] = buf0[22];
+    buf1[27] = buf0[54];
+    buf1[28] = buf0[14];
+    buf1[29] = buf0[46];
+    buf1[30] = buf0[30];
+    buf1[31] = buf0[62];
+    buf1[32] = buf0[1];
+    buf1[33] = buf0[33];
+    buf1[34] = buf0[17];
+    buf1[35] = buf0[49];
+    buf1[36] = buf0[9];
+    buf1[37] = buf0[41];
+    buf1[38] = buf0[25];
+    buf1[39] = buf0[57];
+    buf1[40] = buf0[5];
+    buf1[41] = buf0[37];
+    buf1[42] = buf0[21];
+    buf1[43] = buf0[53];
+    buf1[44] = buf0[13];
+    buf1[45] = buf0[45];
+    buf1[46] = buf0[29];
+    buf1[47] = buf0[61];
+    buf1[48] = buf0[3];
+    buf1[49] = buf0[35];
+    buf1[50] = buf0[19];
+    buf1[51] = buf0[51];
+    buf1[52] = buf0[11];
+    buf1[53] = buf0[43];
+    buf1[54] = buf0[27];
+    buf1[55] = buf0[59];
+    buf1[56] = buf0[7];
+    buf1[57] = buf0[39];
+    buf1[58] = buf0[23];
+    buf1[59] = buf0[55];
+    buf1[60] = buf0[15];
+    buf1[61] = buf0[47];
+    buf1[62] = buf0[31];
+    buf1[63] = buf0[63];
+
+    output[0 * col_num + col] = buf1[0];
+    output[1 * col_num + col] = buf1[1];
+    output[2 * col_num + col] = buf1[2];
+    output[3 * col_num + col] = buf1[3];
+    output[4 * col_num + col] = buf1[4];
+    output[5 * col_num + col] = buf1[5];
+    output[6 * col_num + col] = buf1[6];
+    output[7 * col_num + col] = buf1[7];
+    output[8 * col_num + col] = buf1[8];
+    output[9 * col_num + col] = buf1[9];
+    output[10 * col_num + col] = buf1[10];
+    output[11 * col_num + col] = buf1[11];
+    output[12 * col_num + col] = buf1[12];
+    output[13 * col_num + col] = buf1[13];
+    output[14 * col_num + col] = buf1[14];
+    output[15 * col_num + col] = buf1[15];
+    output[16 * col_num + col] = buf1[16];
+    output[17 * col_num + col] = buf1[17];
+    output[18 * col_num + col] = buf1[18];
+    output[19 * col_num + col] = buf1[19];
+    output[20 * col_num + col] = buf1[20];
+    output[21 * col_num + col] = buf1[21];
+    output[22 * col_num + col] = buf1[22];
+    output[23 * col_num + col] = buf1[23];
+    output[24 * col_num + col] = buf1[24];
+    output[25 * col_num + col] = buf1[25];
+    output[26 * col_num + col] = buf1[26];
+    output[27 * col_num + col] = buf1[27];
+    output[28 * col_num + col] = buf1[28];
+    output[29 * col_num + col] = buf1[29];
+    output[30 * col_num + col] = buf1[30];
+    output[31 * col_num + col] = buf1[31];
+    output[32 * col_num + col] = buf1[32];
+    output[33 * col_num + col] = buf1[33];
+    output[34 * col_num + col] = buf1[34];
+    output[35 * col_num + col] = buf1[35];
+    output[36 * col_num + col] = buf1[36];
+    output[37 * col_num + col] = buf1[37];
+    output[38 * col_num + col] = buf1[38];
+    output[39 * col_num + col] = buf1[39];
+    output[40 * col_num + col] = buf1[40];
+    output[41 * col_num + col] = buf1[41];
+    output[42 * col_num + col] = buf1[42];
+    output[43 * col_num + col] = buf1[43];
+    output[44 * col_num + col] = buf1[44];
+    output[45 * col_num + col] = buf1[45];
+    output[46 * col_num + col] = buf1[46];
+    output[47 * col_num + col] = buf1[47];
+    output[48 * col_num + col] = buf1[48];
+    output[49 * col_num + col] = buf1[49];
+    output[50 * col_num + col] = buf1[50];
+    output[51 * col_num + col] = buf1[51];
+    output[52 * col_num + col] = buf1[52];
+    output[53 * col_num + col] = buf1[53];
+    output[54 * col_num + col] = buf1[54];
+    output[55 * col_num + col] = buf1[55];
+    output[56 * col_num + col] = buf1[56];
+    output[57 * col_num + col] = buf1[57];
+    output[58 * col_num + col] = buf1[58];
+    output[59 * col_num + col] = buf1[59];
+    output[60 * col_num + col] = buf1[60];
+    output[61 * col_num + col] = buf1[61];
+    output[62 * col_num + col] = buf1[62];
+    output[63 * col_num + col] = buf1[63];
+  }
+}
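
The stage-11 shuffle above is the standard DCT output reordering: output
index i reads input index bit-reverse(i) over the 6 bits of the 64-point
block (e.g. buf1[3] = buf0[48], since 000011b reversed is 110000b = 48).
A minimal scalar sketch reproducing the mapping:

#include <stdio.h>

/* Reverse the low 6 bits of idx (0..63). */
static unsigned bitrev6(unsigned idx) {
  unsigned r = 0;
  int b;
  for (b = 0; b < 6; b++) r |= ((idx >> b) & 1) << (5 - b);
  return r;
}

int main(void) {
  /* Prints exactly the stage-11 assignments: buf1[i] = buf0[bitrev6(i)]. */
  unsigned i;
  for (i = 0; i < 64; i++) printf("buf1[%u] = buf0[%u];\n", i, bitrev6(i));
  return 0;
}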
diff --git a/vp10/common/x86/vp10_fwd_txfm2d_sse4.c b/vp10/common/x86/vp10_fwd_txfm2d_sse4.c
new file mode 100644
index 0000000..ff04dc8
--- /dev/null
+++ b/vp10/common/x86/vp10_fwd_txfm2d_sse4.c
@@ -0,0 +1,108 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp10_rtcd.h"
+#include "vp10/common/enums.h"
+#include "vp10/common/vp10_txfm.h"
+#include "vp10/common/x86/vp10_txfm1d_sse4.h"
+
+static INLINE void int16_array_with_stride_to_int32_array_without_stride(
+    const int16_t *input, int stride, int32_t *output, int txfm1d_size) {
+  int r, c;
+  for (r = 0; r < txfm1d_size; r++) {
+    for (c = 0; c < txfm1d_size; c++) {
+      output[r * txfm1d_size + c] = (int32_t)input[r * stride + c];
+    }
+  }
+}
+
+typedef void (*TxfmFuncSSE2)(const __m128i *input, __m128i *output,
+                             const int8_t *cos_bit, const int8_t *stage_range);
+
+static INLINE TxfmFuncSSE2 fwd_txfm_type_to_func(TXFM_TYPE txfm_type) {
+  switch (txfm_type) {
+    case TXFM_TYPE_DCT4:
+      return vp10_fdct4_new_sse4_1;
+    case TXFM_TYPE_DCT8:
+      return vp10_fdct8_new_sse4_1;
+    case TXFM_TYPE_DCT16:
+      return vp10_fdct16_new_sse4_1;
+    case TXFM_TYPE_DCT32:
+      return vp10_fdct32_new_sse4_1;
+    case TXFM_TYPE_DCT64:
+      return vp10_fdct64_new_sse4_1;
+    case TXFM_TYPE_ADST4:
+      return vp10_fadst4_new_sse4_1;
+    case TXFM_TYPE_ADST8:
+      return vp10_fadst8_new_sse4_1;
+    case TXFM_TYPE_ADST16:
+      return vp10_fadst16_new_sse4_1;
+    case TXFM_TYPE_ADST32:
+      return vp10_fadst32_new_sse4_1;
+    default:
+      assert(0);
+  }
+  return NULL;
+}
+
+static INLINE void fwd_txfm2d_sse4_1(const int16_t *input, int32_t *output,
+                                     const int stride, const TXFM_2D_CFG *cfg,
+                                     int32_t *txfm_buf) {
+  const int txfm_size = cfg->txfm_size;
+  const int8_t *shift = cfg->shift;
+  const int8_t *stage_range_col = cfg->stage_range_col;
+  const int8_t *stage_range_row = cfg->stage_range_row;
+  const int8_t *cos_bit_col = cfg->cos_bit_col;
+  const int8_t *cos_bit_row = cfg->cos_bit_row;
+  const TxfmFuncSSE2 txfm_func_col = fwd_txfm_type_to_func(cfg->txfm_type_col);
+  const TxfmFuncSSE2 txfm_func_row = fwd_txfm_type_to_func(cfg->txfm_type_row);
+
+  __m128i *buf_128 = (__m128i *)txfm_buf;
+  __m128i *out_128 = (__m128i *)output;
+  int num_per_128 = 4;
+  int txfm2d_size_128 = txfm_size * txfm_size / num_per_128;
+
+  int16_array_with_stride_to_int32_array_without_stride(input, stride, txfm_buf,
+                                                        txfm_size);
+  round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[0]);
+  txfm_func_col(out_128, buf_128, cos_bit_col, stage_range_col);
+  round_shift_array_32_sse4_1(buf_128, out_128, txfm2d_size_128, -shift[1]);
+  transpose_32(txfm_size, out_128, buf_128);
+  txfm_func_row(buf_128, out_128, cos_bit_row, stage_range_row);
+  round_shift_array_32_sse4_1(out_128, buf_128, txfm2d_size_128, -shift[2]);
+  transpose_32(txfm_size, buf_128, out_128);
+}
+
+void vp10_fwd_txfm2d_32x32_sse4_1(const int16_t *input, int32_t *output,
+                                  const int stride, int tx_type,
+                                  const int bd) {
+  // 16-byte aligned: the SSE4.1 helpers access this buffer as __m128i
+  DECLARE_ALIGNED(16, int32_t, txfm_buf[1024]);
+  TXFM_2D_FLIP_CFG cfg = vp10_get_fwd_txfm_cfg(tx_type, TX_32X32);
+  (void)bd;
+  fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf);
+}
+
+void vp10_fwd_txfm2d_64x64_sse4_1(const int16_t *input, int32_t *output,
+                                  const int stride, int tx_type,
+                                  const int bd) {
+  // 16-byte aligned: the SSE4.1 helpers access this buffer as __m128i
+  DECLARE_ALIGNED(16, int32_t, txfm_buf[4096]);
+  TXFM_2D_FLIP_CFG cfg = vp10_get_fwd_txfm_64x64_cfg(tx_type);
+  (void)bd;
+  fwd_txfm2d_sse4_1(input, output, stride, cfg.cfg, txfm_buf);
+}
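
fwd_txfm2d_sse4_1() above is the vectorized form of a separable 2-D
transform: widen + shift[0], 1-D transform down each column, shift[1],
1-D transform along each row, shift[2]; the two transpose_32() calls
exist only so the vector kernels always walk contiguous __m128i lanes.
A scalar model of the same arithmetic, where col_txfm/row_txfm are
hypothetical contiguous-vector stand-ins for the fdct/fadst kernels
(n <= 64):

#include <stdint.h>

typedef void (*Txfm1d)(const int32_t *in, int32_t *out, int n);

/* Rounded right shift for bit > 0, plain left shift otherwise. */
static int32_t round_shift_s(int32_t x, int bit) {
  return (bit > 0) ? ((x + (1 << (bit - 1))) >> bit)
                   : (int32_t)((uint32_t)x << -bit);
}

static void fwd_txfm2d_model(const int16_t *input, int32_t *output,
                             int stride, int n, const int8_t *shift,
                             Txfm1d col_txfm, Txfm1d row_txfm) {
  int32_t buf[64 * 64], col_in[64], col_out[64];
  int r, c;
  /* widen the input and apply shift[0] */
  for (r = 0; r < n; r++)
    for (c = 0; c < n; c++)
      buf[r * n + c] =
          round_shift_s((int32_t)input[r * stride + c], -shift[0]);
  /* 1-D transform down each column */
  for (c = 0; c < n; c++) {
    for (r = 0; r < n; r++) col_in[r] = buf[r * n + c];
    col_txfm(col_in, col_out, n);
    for (r = 0; r < n; r++) buf[r * n + c] = col_out[r];
  }
  for (r = 0; r < n * n; r++) buf[r] = round_shift_s(buf[r], -shift[1]);
  /* 1-D transform along each row, then the final shift */
  for (r = 0; r < n; r++) row_txfm(buf + r * n, output + r * n, n);
  for (r = 0; r < n * n; r++) output[r] = round_shift_s(output[r], -shift[2]);
}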
diff --git a/vp10/common/x86/vp10_fwd_txfm_sse2.c b/vp10/common/x86/vp10_fwd_txfm_sse2.c
index 032c3cc..30bce5f 100644
--- a/vp10/common/x86/vp10_fwd_txfm_sse2.c
+++ b/vp10/common/x86/vp10_fwd_txfm_sse2.c
@@ -10,6 +10,7 @@
 
 #include <emmintrin.h>  // SSE2
 
+#include "./vp10_rtcd.h"
 #include "./vpx_config.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/x86/fwd_txfm_sse2.h"
diff --git a/vp10/common/x86/vp10_highbd_convolve_filters_sse4.c b/vp10/common/x86/vp10_highbd_convolve_filters_sse4.c
new file mode 100644
index 0000000..0251022
--- /dev/null
+++ b/vp10/common/x86/vp10_highbd_convolve_filters_sse4.c
@@ -0,0 +1,393 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+#include "./vpx_config.h"
+#include "vp10/common/filter.h"
+
+#if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_EXT_INTERP
+DECLARE_ALIGNED(16, const int16_t,
+  sub_pel_filters_10sharp_highbd_ver_signal_dir[15][6][8]) = {
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6, 127,  -6, 127,  -6, 127,  -6, 127, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   5,  -2,   5,  -2,   5,  -2,   5, },
+    {-12, 124, -12, 124, -12, 124, -12, 124, },
+    { 18,  -7,  18,  -7,  18,  -7,  18,  -7, },
+    {  3,  -2,   3,  -2,   3,  -2,   3,  -2, },
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -3,   7,  -3,   7,  -3,   7,  -3,   7, },
+    {-17, 119, -17, 119, -17, 119, -17, 119, },
+    { 28, -11,  28, -11,  28, -11,  28, -11, },
+    {  5,  -2,   5,  -2,   5,  -2,   5,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-20, 114, -20, 114, -20, 114, -20, 114, },
+    { 38, -14,  38, -14,  38, -14,  38, -14, },
+    {  7,  -3,   7,  -3,   7,  -3,   7,  -3, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -4,   9,  -4,   9,  -4,   9,  -4,   9, },
+    {-22, 107, -22, 107, -22, 107, -22, 107, },
+    { 49, -17,  49, -17,  49, -17,  49, -17, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-24,  99, -24,  99, -24,  99, -24,  99, },
+    { 59, -20,  59, -20,  59, -20,  59, -20, },
+    {  9,  -4,   9,  -4,   9,  -4,   9,  -4, },
+    {  2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-24,  90, -24,  90, -24,  90, -24,  90, },
+    { 70, -22,  70, -22,  70, -22,  70, -22, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-23,  80, -23,  80, -23,  80, -23,  80, },
+    { 80, -23,  80, -23,  80, -23,  80, -23, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2, },
+    { -5,  10,  -5,  10,  -5,  10,  -5,  10, },
+    {-22,  70, -22,  70, -22,  70, -22,  70, },
+    { 90, -24,  90, -24,  90, -24,  90, -24, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   2,   0,   2,   0,   2,   0,   2, },
+    { -4,   9,  -4,   9,  -4,   9,  -4,   9, },
+    {-20,  59, -20,  59, -20,  59, -20,  59, },
+    { 99, -24,  99, -24,  99, -24,  99, -24, },
+    { 10,  -5,  10,  -5,  10,  -5,  10,  -5, },
+    {  2,   0,   2,   0,   2,   0,   2,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-17,  49, -17,  49, -17,  49, -17,  49, },
+    {107, -22, 107, -22, 107, -22, 107, -22, },
+    {  9,  -4,   9,  -4,   9,  -4,   9,  -4, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -3,   7,  -3,   7,  -3,   7,  -3,   7, },
+    {-14,  38, -14,  38, -14,  38, -14,  38, },
+    {114, -20, 114, -20, 114, -20, 114, -20, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   5,  -2,   5,  -2,   5,  -2,   5, },
+    {-11,  28, -11,  28, -11,  28, -11,  28, },
+    {119, -17, 119, -17, 119, -17, 119, -17, },
+    {  7,  -3,   7,  -3,   7,  -3,   7,  -3, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+    { -2,   3,  -2,   3,  -2,   3,  -2,   3, },
+    { -7,  18,  -7,  18,  -7,  18,  -7,  18, },
+    {124, -12, 124, -12, 124, -12, 124, -12, },
+    {  5,  -2,   5,  -2,   5,  -2,   5,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {127,  -6, 127,  -6, 127,  -6, 127,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+};
+#endif  // CONFIG_EXT_INTERP
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_EXT_INTERP
+DECLARE_ALIGNED(16, const int16_t,
+  sub_pel_filters_12sharp_highbd_ver_signal_dir[15][6][8]) = {
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   3,  -2,   3,  -2,   3,  -2,   3, },
+    { -7, 127,  -7, 127,  -7, 127,  -7, 127, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -3,   6,  -3,   6,  -3,   6,  -3,   6, },
+    {-13, 124, -13, 124, -13, 124, -13, 124, },
+    { 18,  -8,  18,  -8,  18,  -8,  18,  -8, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-18, 120, -18, 120, -18, 120, -18, 120, },
+    { 28, -12,  28, -12,  28, -12,  28, -12, },
+    {  7,  -4,   7,  -4,   7,  -4,   7,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  10,  -6,  10,  -6,  10,  -6,  10, },
+    {-21, 115, -21, 115, -21, 115, -21, 115, },
+    { 38, -15,  38, -15,  38, -15,  38, -15, },
+    {  8,  -5,   8,  -5,   8,  -5,   8,  -5, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-24, 108, -24, 108, -24, 108, -24, 108, },
+    { 49, -18,  49, -18,  49, -18,  49, -18, },
+    { 10,  -6,  10,  -6,  10,  -6,  10,  -6, },
+    {  3,  -2,   3,  -2,   3,  -2,   3,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-25, 100, -25, 100, -25, 100, -25, 100, },
+    { 60, -21,  60, -21,  60, -21,  60, -21, },
+    { 11,  -7,  11,  -7,  11,  -7,  11,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-26,  91, -26,  91, -26,  91, -26,  91, },
+    { 71, -24,  71, -24,  71, -24,  71, -24, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-25,  81, -25,  81, -25,  81, -25,  81, },
+    { 81, -25,  81, -25,  81, -25,  81, -25, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  13,  -7,  13,  -7,  13,  -7,  13, },
+    {-24,  71, -24,  71, -24,  71, -24,  71, },
+    { 91, -26,  91, -26,  91, -26,  91, -26, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -7,  11,  -7,  11,  -7,  11,  -7,  11, },
+    {-21,  60, -21,  60, -21,  60, -21,  60, },
+    {100, -25, 100, -25, 100, -25, 100, -25, },
+    { 13,  -7,  13,  -7,  13,  -7,  13,  -7, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -2,   3,  -2,   3,  -2,   3,  -2,   3, },
+    { -6,  10,  -6,  10,  -6,  10,  -6,  10, },
+    {-18,  49, -18,  49, -18,  49, -18,  49, },
+    {108, -24, 108, -24, 108, -24, 108, -24, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -5,   8,  -5,   8,  -5,   8,  -5,   8, },
+    {-15,  38, -15,  38, -15,  38, -15,  38, },
+    {115, -21, 115, -21, 115, -21, 115, -21, },
+    { 10,  -6,  10,  -6,  10,  -6,  10,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   7,  -4,   7,  -4,   7,  -4,   7, },
+    {-12,  28, -12,  28, -12,  28, -12,  28, },
+    {120, -18, 120, -18, 120, -18, 120, -18, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -8,  18,  -8,  18,  -8,  18,  -8,  18, },
+    {124, -13, 124, -13, 124, -13, 124, -13, },
+    {  6,  -3,   6,  -3,   6,  -3,   6,  -3, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {127,  -7, 127,  -7, 127,  -7, 127,  -7, },
+    {  3,  -2,   3,  -2,   3,  -2,   3,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+};
+#endif  // CONFIG_EXT_INTERP
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#if CONFIG_VP9_HIGHBITDEPTH
+#if USE_TEMPORALFILTER_12TAP
+DECLARE_ALIGNED(16, const int16_t,
+  sub_pel_filters_temporalfilter_12_highbd_ver_signal_dir[15][6][8]) = {
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -7, 127,  -7, 127,  -7, 127,  -7, 127, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -3,   5,  -3,   5,  -3,   5,  -3,   5, },
+    {-12, 124, -12, 124, -12, 124, -12, 124, },
+    { 18,  -8,  18,  -8,  18,  -8,  18,  -8, },
+    {  4,  -2,   4,  -2,   4,  -2,   4,  -2, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-17, 120, -17, 120, -17, 120, -17, 120, },
+    { 28, -11,  28, -11,  28, -11,  28, -11, },
+    {  6,  -3,   6,  -3,   6,  -3,   6,  -3, },
+    {  1,  -1,   1,  -1,   1,  -1,   1,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,  10,  -4,  10,  -4,  10,  -4,  10, },
+    {-21, 114, -21, 114, -21, 114, -21, 114, },
+    { 38, -15,  38, -15,  38, -15,  38, -15, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -5,  11,  -5,  11,  -5,  11,  -5,  11, },
+    {-23, 107, -23, 107, -23, 107, -23, 107, },
+    { 49, -18,  49, -18,  49, -18,  49, -18, },
+    {  9,  -5,   9,  -5,   9,  -5,   9,  -5, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-25,  99, -25,  99, -25,  99, -25,  99, },
+    { 60, -21,  60, -21,  60, -21,  60, -21, },
+    { 11,  -6,  11,  -6,  11,  -6,  11,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-25,  90, -25,  90, -25,  90, -25,  90, },
+    { 70, -23,  70, -23,  70, -23,  70, -23, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-24,  80, -24,  80, -24,  80, -24,  80, },
+    { 80, -24,  80, -24,  80, -24,  80, -24, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  12,  -6,  12,  -6,  12,  -6,  12, },
+    {-23,  70, -23,  70, -23,  70, -23,  70, },
+    { 90, -25,  90, -25,  90, -25,  90, -25, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   3,  -1,   3,  -1,   3,  -1,   3, },
+    { -6,  11,  -6,  11,  -6,  11,  -6,  11, },
+    {-21,  60, -21,  60, -21,  60, -21,  60, },
+    { 99, -25,  99, -25,  99, -25,  99, -25, },
+    { 12,  -6,  12,  -6,  12,  -6,  12,  -6, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -5,   9,  -5,   9,  -5,   9,  -5,   9, },
+    {-18,  49, -18,  49, -18,  49, -18,  49, },
+    {107, -23, 107, -23, 107, -23, 107, -23, },
+    { 11,  -5,  11,  -5,  11,  -5,  11,  -5, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+  },
+  {
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {-15,  38, -15,  38, -15,  38, -15,  38, },
+    {114, -21, 114, -21, 114, -21, 114, -21, },
+    { 10,  -4,  10,  -4,  10,  -4,  10,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    { -1,   1,  -1,   1,  -1,   1,  -1,   1, },
+    { -3,   6,  -3,   6,  -3,   6,  -3,   6, },
+    {-11,  28, -11,  28, -11,  28, -11,  28, },
+    {120, -17, 120, -17, 120, -17, 120, -17, },
+    {  8,  -4,   8,  -4,   8,  -4,   8,  -4, },
+    {  2,  -1,   2,  -1,   2,  -1,   2,  -1, },
+  },
+  {
+    {  0,   1,   0,   1,   0,   1,   0,   1, },
+    { -2,   4,  -2,   4,  -2,   4,  -2,   4, },
+    { -8,  18,  -8,  18,  -8,  18,  -8,  18, },
+    {124, -12, 124, -12, 124, -12, 124, -12, },
+    {  5,  -3,   5,  -3,   5,  -3,   5,  -3, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+  {
+    {  0,   0,   0,   0,   0,   0,   0,   0, },
+    { -1,   2,  -1,   2,  -1,   2,  -1,   2, },
+    { -4,   8,  -4,   8,  -4,   8,  -4,   8, },
+    {127,  -7, 127,  -7, 127,  -7, 127,  -7, },
+    {  3,  -1,   3,  -1,   3,  -1,   3,  -1, },
+    {  1,   0,   1,   0,   1,   0,   1,   0, },
+  },
+};
+#endif  // USE_TEMPORALFILTER_12TAP
+#endif  // CONFIG_VP9_HIGHBITDEPTH
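
Each [6][8] entry in the tables above packs a 12-tap filter as six
adjacent-tap pairs, each pair repeated four times across the 8 lanes, so
one _mm_madd_epi16 per pair filters four pixels at once; 10-tap filters
are zero-padded into the same layout, and the 15 entries cover the 15
non-zero subpel phases (the lookup below uses subpel_q4 - 1). A
hypothetical generator for this layout:

#include <stdint.h>

/* Expand 12 filter taps into the [6][8] "signal direction" layout used
 * in the tables above: row k holds the pair (taps[2k], taps[2k+1])
 * repeated four times. */
static void pack_signal_dir(const int16_t taps[12], int16_t out[6][8]) {
  int k, j;
  for (k = 0; k < 6; k++) {
    for (j = 0; j < 4; j++) {
      out[k][2 * j] = taps[2 * k];
      out[k][2 * j + 1] = taps[2 * k + 1];
    }
  }
}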
diff --git a/vp10/common/x86/vp10_highbd_convolve_sse4.c b/vp10/common/x86/vp10_highbd_convolve_sse4.c
new file mode 100644
index 0000000..e8e4f77
--- /dev/null
+++ b/vp10/common/x86/vp10_highbd_convolve_sse4.c
@@ -0,0 +1,477 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h>
+
+#include "./vp10_rtcd.h"
+#include "vp10/common/filter.h"
+
+typedef void (*TransposeSave)(const int width, int pixelsNum,
+                              uint32_t *src, int src_stride,
+                              uint16_t *dst, int dst_stride,
+                              int bd);
+
+// pixelsNum 0: write all 4 rows of pixels
+//           1/2/3: write only the residual 1/2/3 rows
+static void writePixel(__m128i *u, int width, int pixelsNum,
+                       uint16_t *dst, int dst_stride) {
+  if (2 == width) {
+    if (0 == pixelsNum) {
+      *(int *)dst = _mm_cvtsi128_si32(u[0]);
+      *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+      *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
+      *(int *)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(u[3]);
+    } else if (1 == pixelsNum) {
+      *(int *)dst = _mm_cvtsi128_si32(u[0]);
+    } else if (2 == pixelsNum) {
+      *(int *)dst = _mm_cvtsi128_si32(u[0]);
+      *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+    } else if (3 == pixelsNum) {
+      *(int *)dst = _mm_cvtsi128_si32(u[0]);
+      *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
+      *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
+    }
+  } else {
+    if (0 == pixelsNum) {
+      _mm_storel_epi64((__m128i *)dst, u[0]);
+      _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
+      _mm_storel_epi64((__m128i *)(dst + 3 * dst_stride), u[3]);
+    } else if (1 == pixelsNum) {
+      _mm_storel_epi64((__m128i *)dst, u[0]);
+    } else if (2 == pixelsNum) {
+      _mm_storel_epi64((__m128i *)dst, u[0]);
+      _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+    } else if (3 == pixelsNum) {
+      _mm_storel_epi64((__m128i *)dst, u[0]);
+      _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
+      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
+    }
+  }
+}
+
+// 16-bit pixels clip with bd (10/12)
+static void highbd_clip(__m128i *p, int numVecs, int bd) {
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i one = _mm_set1_epi16(1);
+  const __m128i max = _mm_sub_epi16(_mm_slli_epi16(one, bd), one);
+  __m128i clamped, mask;
+  int i;
+
+  for (i = 0; i < numVecs; i++) {
+    mask = _mm_cmpgt_epi16(p[i], max);
+    clamped = _mm_andnot_si128(mask, p[i]);
+    mask = _mm_and_si128(mask, max);
+    clamped = _mm_or_si128(mask, clamped);
+    mask = _mm_cmpgt_epi16(clamped, zero);
+    p[i] = _mm_and_si128(clamped, mask);
+  }
+}
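
Per 16-bit pixel, highbd_clip() above is an unsigned clamp to
[0, (1 << bd) - 1] built from signed compare/mask ops. Scalar equivalent,
with bd being 10 or 12:

#include <stdint.h>

/* Scalar twin of highbd_clip() for one pixel (bd = 10 or 12). */
static int16_t clip_pixel_highbd_s(int16_t v, int bd) {
  const int16_t max = (int16_t)((1 << bd) - 1);
  if (v > max) v = max;  /* cmpgt(p, max) selects max */
  if (v < 0) v = 0;      /* cmpgt(clamped, 0) masks non-positives; 0 stays 0 */
  return v;
}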
+
+static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
+  __m128i v0, v1;
+  __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+
+  u[0] = _mm_loadu_si128((__m128i const *)src);
+  u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
+  u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+  u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+  u[0] = _mm_add_epi32(u[0], rnd);
+  u[1] = _mm_add_epi32(u[1], rnd);
+  u[2] = _mm_add_epi32(u[2], rnd);
+  u[3] = _mm_add_epi32(u[3], rnd);
+
+  u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
+  u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
+  u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
+  u[3] = _mm_srai_epi32(u[3], FILTER_BITS);
+
+  u[0] = _mm_packus_epi32(u[0], u[1]);
+  u[1] = _mm_packus_epi32(u[2], u[3]);
+
+  highbd_clip(u, 2, bd);
+
+  v0 = _mm_unpacklo_epi16(u[0], u[1]);
+  v1 = _mm_unpackhi_epi16(u[0], u[1]);
+
+  u[0] = _mm_unpacklo_epi16(v0, v1);
+  u[2] = _mm_unpackhi_epi16(v0, v1);
+
+  u[1] = _mm_srli_si128(u[0], 8);
+  u[3] = _mm_srli_si128(u[2], 8);
+}
+
+// pixelsNum = 0     : all 4 rows of pixels will be saved.
+// pixelsNum = 1/2/3 : residual 1/2/3 rows of pixels will be saved.
+static void trans_save_4x4(const int width, int pixelsNum,
+                    uint32_t *src, int src_stride,
+                    uint16_t *dst, int dst_stride,
+                    int bd) {
+  __m128i u[4];
+  transClipPixel(src, src_stride, u, bd);
+  writePixel(u, width, pixelsNum, dst, dst_stride);
+}
+
+static void trans_accum_save_4x4(const int width, int pixelsNum,
+                          uint32_t *src, int src_stride,
+                          uint16_t *dst, int dst_stride,
+                          int bd) {
+  __m128i u[4], v[4];
+  const __m128i ones = _mm_set1_epi16(1);
+
+  transClipPixel(src, src_stride, u, bd);
+
+  v[0] = _mm_loadl_epi64((__m128i const *)dst);
+  v[1] = _mm_loadl_epi64((__m128i const *)(dst + dst_stride));
+  v[2] = _mm_loadl_epi64((__m128i const *)(dst + 2 * dst_stride));
+  v[3] = _mm_loadl_epi64((__m128i const *)(dst + 3 * dst_stride));
+
+  u[0] = _mm_add_epi16(u[0], v[0]);
+  u[1] = _mm_add_epi16(u[1], v[1]);
+  u[2] = _mm_add_epi16(u[2], v[2]);
+  u[3] = _mm_add_epi16(u[3], v[3]);
+
+  u[0] = _mm_add_epi16(u[0], ones);
+  u[1] = _mm_add_epi16(u[1], ones);
+  u[2] = _mm_add_epi16(u[2], ones);
+  u[3] = _mm_add_epi16(u[3], ones);
+
+  u[0] = _mm_srai_epi16(u[0], 1);
+  u[1] = _mm_srai_epi16(u[1], 1);
+  u[2] = _mm_srai_epi16(u[2], 1);
+  u[3] = _mm_srai_epi16(u[3], 1);
+
+  writePixel(u, width, pixelsNum, dst, dst_stride);
+}
+
+static TransposeSave transSaveTab[2] = {
+  trans_save_4x4, trans_accum_save_4x4};
+
+static INLINE void transpose_pair(__m128i *in, __m128i *out) {
+  __m128i x0, x1;
+
+  x0 = _mm_unpacklo_epi32(in[0], in[1]);
+  x1 = _mm_unpacklo_epi32(in[2], in[3]);
+
+  out[0] = _mm_unpacklo_epi64(x0, x1);
+  out[1] = _mm_unpackhi_epi64(x0, x1);
+
+  x0 = _mm_unpackhi_epi32(in[0], in[1]);
+  x1 = _mm_unpackhi_epi32(in[2], in[3]);
+
+  out[2] = _mm_unpacklo_epi64(x0, x1);
+  out[3] = _mm_unpackhi_epi64(x0, x1);
+
+  x0 = _mm_unpacklo_epi32(in[4], in[5]);
+  x1 = _mm_unpacklo_epi32(in[6], in[7]);
+
+  out[4] = _mm_unpacklo_epi64(x0, x1);
+  out[5] = _mm_unpackhi_epi64(x0, x1);
+}
+
+static void highbd_filter_horiz(const uint16_t *src, int src_stride,
+                                __m128i *f, int tapsNum, uint32_t *buf) {
+  __m128i u[8], v[6];
+
+  if (tapsNum == 10) {
+    src -= 1;
+  }
+
+  u[0] = _mm_loadu_si128((__m128i const *)src);
+  u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
+  u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
+  u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));
+
+  u[4] = _mm_loadu_si128((__m128i const *)(src + 8));
+  u[5] = _mm_loadu_si128((__m128i const *)(src + src_stride + 8));
+  u[6] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride + 8));
+  u[7] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride + 8));
+
+  transpose_pair(u, v);
+
+  u[0] = _mm_madd_epi16(v[0], f[0]);
+  u[1] = _mm_madd_epi16(v[1], f[1]);
+  u[2] = _mm_madd_epi16(v[2], f[2]);
+  u[3] = _mm_madd_epi16(v[3], f[3]);
+  u[4] = _mm_madd_epi16(v[4], f[4]);
+  u[5] = _mm_madd_epi16(v[5], f[5]);
+
+  u[6] = _mm_min_epi32(u[2], u[3]);
+  u[7] = _mm_max_epi32(u[2], u[3]);
+
+  u[0] = _mm_add_epi32(u[0], u[1]);
+  u[0] = _mm_add_epi32(u[0], u[5]);
+  u[0] = _mm_add_epi32(u[0], u[4]);
+  u[0] = _mm_add_epi32(u[0], u[6]);
+  u[0] = _mm_add_epi32(u[0], u[7]);
+
+  _mm_storeu_si128((__m128i *)buf, u[0]);
+}
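
highbd_filter_horiz() above computes four 12-tap dot products at once:
each _mm_madd_epi16 sums one adjacent product pair, and since
min(a, b) + max(a, b) == a + b, the u[6]/u[7] pair leaves the total
unchanged and only reorders how the two largest partial sums are
accumulated (the vertical path in filter_vert_horiz_parallel() below
uses the same trick). Scalar model of one output value:

#include <stdint.h>

/* Scalar model of one 12-tap horizontal tap-sum: rounding and the
 * >> FILTER_BITS happen later, in transClipPixel(). */
static int32_t filter_horiz_one_s(const uint16_t *src, const int16_t *taps) {
  int32_t sum = 0;
  int k;
  for (k = 0; k < 12; k++) sum += (int32_t)src[k] * taps[k];
  return sum;
}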
+
+void vp10_highbd_convolve_horiz_sse4_1(const uint16_t *src, int src_stride,
+                                       uint16_t *dst, int dst_stride,
+                                       int w, int h,
+                                       const InterpFilterParams filter_params,
+                                       const int subpel_x_q4, int x_step_q4,
+                                       int avg, int bd) {
+  DECLARE_ALIGNED(16, uint32_t, temp[4 * 4]);
+  __m128i verf[6];
+  HbdSubpelFilterCoeffs vCoeffs;
+  const uint16_t *srcPtr;
+  const int tapsNum = filter_params.taps;
+  int i, col, count, blkResidu, blkHeight;
+  TransposeSave transSave = transSaveTab[avg];
+  (void)x_step_q4;
+
+  if (0 == subpel_x_q4 || 16 != x_step_q4) {
+    vp10_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
+                                 filter_params, subpel_x_q4, x_step_q4, avg,
+                                 bd);
+    return;
+  }
+
+  vCoeffs = vp10_hbd_get_subpel_filter_ver_signal_dir(
+      filter_params, subpel_x_q4 - 1);
+  if (!vCoeffs) {
+    vp10_highbd_convolve_horiz_c(src, src_stride, dst, dst_stride, w, h,
+                                 filter_params, subpel_x_q4, x_step_q4, avg,
+                                 bd);
+    return;
+  }
+
+  verf[0] = *((const __m128i *)(vCoeffs));
+  verf[1] = *((const __m128i *)(vCoeffs + 1));
+  verf[2] = *((const __m128i *)(vCoeffs + 2));
+  verf[3] = *((const __m128i *)(vCoeffs + 3));
+  verf[4] = *((const __m128i *)(vCoeffs + 4));
+  verf[5] = *((const __m128i *)(vCoeffs + 5));
+
+  src -= (tapsNum >> 1) - 1;
+  srcPtr = src;
+
+  count = 0;
+  blkHeight = h >> 2;
+  blkResidu = h & 3;
+
+  while (blkHeight != 0) {
+    for (col = 0; col < w; col += 4) {
+      for (i = 0; i < 4; ++i) {
+        highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
+        srcPtr += 1;
+      }
+      transSave(w, 0, temp, 4, dst + col, dst_stride, bd);
+    }
+    count++;
+    srcPtr = src + count * src_stride * 4;
+    dst += dst_stride * 4;
+    blkHeight--;
+  }
+
+  if (blkResidu == 0)
+    return;
+
+  for (col = 0; col < w; col += 4) {
+    for (i = 0; i < 4; ++i) {
+      highbd_filter_horiz(srcPtr, src_stride, verf, tapsNum, temp + (i * 4));
+      srcPtr += 1;
+    }
+    transSave(w, blkResidu, temp, 4, dst + col, dst_stride, bd);
+  }
+}
+
+// Vertical convolution filter
+
+typedef void (*WritePixels)(__m128i *u, int bd, uint16_t *dst);
+
+static void highbdRndingPacks(__m128i *u) {
+  __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+  u[0] = _mm_add_epi32(u[0], rnd);
+  u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
+  u[0] = _mm_packus_epi32(u[0], u[0]);
+}
+
+static void write2pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
+  highbdRndingPacks(u);
+  highbd_clip(u, 1, bd);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(u[0]);
+}
+
+static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
+  __m128i v = _mm_loadl_epi64((__m128i const *)dst);
+  const __m128i ones = _mm_set1_epi16(1);
+
+  highbdRndingPacks(u);
+  highbd_clip(u, 1, bd);
+
+  v = _mm_add_epi16(v, u[0]);
+  v = _mm_add_epi16(v, ones);
+  v = _mm_srai_epi16(v, 1);
+  *(uint32_t *)dst = _mm_cvtsi128_si32(v);
+}
+
+static WritePixels write2pixelsTab[2] = {write2pixelsOnly, write2pixelsAccum};
+
+static void write4pixelsOnly(__m128i *u, int bd, uint16_t *dst) {
+  highbdRndingPacks(u);
+  highbd_clip(u, 1, bd);
+  _mm_storel_epi64((__m128i *)dst, u[0]);
+}
+
+static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
+  __m128i v = _mm_loadl_epi64((__m128i const *)dst);
+  const __m128i ones = _mm_set1_epi16(1);
+
+  highbdRndingPacks(u);
+  highbd_clip(u, 1, bd);
+
+  v = _mm_add_epi16(v, u[0]);
+  v = _mm_add_epi16(v, ones);
+  v = _mm_srai_epi16(v, 1);
+  _mm_storel_epi64((__m128i *)dst, v);
+}
+
+static WritePixels write4pixelsTab[2] = {write4pixelsOnly, write4pixelsAccum};
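
The *Accum variants above implement the compound average: round and
shift the filtered value by FILTER_BITS, clip to bd, then average with
the pixel already in dst, rounding up. Scalar form for one pixel
(FILTER_BITS is 7, from vpx_dsp/vpx_filter.h):

#include <stdint.h>
#define FILTER_BITS 7 /* matches vpx_dsp/vpx_filter.h */

/* Scalar twin of highbdRndingPacks() + highbd_clip() + the Accum write. */
static uint16_t accum_pixel_s(int32_t filtered, uint16_t dst, int bd) {
  int32_t v = (filtered + (1 << (FILTER_BITS - 1))) >> FILTER_BITS;
  const int32_t max = (1 << bd) - 1;
  if (v > max) v = max;
  if (v < 0) v = 0;
  return (uint16_t)((dst + v + 1) >> 1); /* compound average, rounded up */
}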
+
+static void filter_vert_horiz_parallel(const uint16_t *src, int src_stride,
+                                       const __m128i *f, int taps,
+                                       uint16_t *dst, WritePixels saveFunc,
+                                       int bd) {
+  __m128i s[12];
+  __m128i zero = _mm_setzero_si128();
+  int i = 0;
+  int r = 0;
+
+  // TODO(luoyi) treat s[12] as a circular buffer in width = 2 case
+  if (10 == taps) {
+    i += 1;
+    s[0] = zero;
+  }
+  while (i < 12) {
+    s[i] = _mm_loadu_si128((__m128i const *)(src + r * src_stride));
+    i += 1;
+    r += 1;
+  }
+
+  s[0] = _mm_unpacklo_epi16(s[0], s[1]);
+  s[2] = _mm_unpacklo_epi16(s[2], s[3]);
+  s[4] = _mm_unpacklo_epi16(s[4], s[5]);
+  s[6] = _mm_unpacklo_epi16(s[6], s[7]);
+  s[8] = _mm_unpacklo_epi16(s[8], s[9]);
+  s[10] = _mm_unpacklo_epi16(s[10], s[11]);
+
+  s[0] = _mm_madd_epi16(s[0], f[0]);
+  s[2] = _mm_madd_epi16(s[2], f[1]);
+  s[4] = _mm_madd_epi16(s[4], f[2]);
+  s[6] = _mm_madd_epi16(s[6], f[3]);
+  s[8] = _mm_madd_epi16(s[8], f[4]);
+  s[10] = _mm_madd_epi16(s[10], f[5]);
+
+  s[1] = _mm_min_epi32(s[4], s[6]);
+  s[3] = _mm_max_epi32(s[4], s[6]);
+
+  s[0] = _mm_add_epi32(s[0], s[2]);
+  s[0] = _mm_add_epi32(s[0], s[10]);
+  s[0] = _mm_add_epi32(s[0], s[8]);
+  s[0] = _mm_add_epi32(s[0], s[1]);
+  s[0] = _mm_add_epi32(s[0], s[3]);
+
+  saveFunc(s, bd, dst);
+}
+
+static void highbd_filter_vert_compute_large(const uint16_t *src,
+                                             int src_stride,
+                                             const __m128i *f, int taps,
+                                             int w, int h,
+                                             uint16_t *dst, int dst_stride,
+                                             int avg, int bd) {
+  int col;
+  int rowIndex = 0;
+  const uint16_t *src_ptr = src;
+  uint16_t *dst_ptr = dst;
+  const int step = 4;
+  WritePixels write4pixels = write4pixelsTab[avg];
+
+  do {
+    for (col = 0; col < w; col += step) {
+      filter_vert_horiz_parallel(src_ptr, src_stride, f, taps,
+                                 dst_ptr, write4pixels, bd);
+      src_ptr += step;
+      dst_ptr += step;
+    }
+    rowIndex++;
+    src_ptr = src + rowIndex * src_stride;
+    dst_ptr = dst + rowIndex * dst_stride;
+  } while (rowIndex < h);
+}
+
+static void highbd_filter_vert_compute_small(const uint16_t *src,
+                                             int src_stride,
+                                             const __m128i *f, int taps,
+                                             int w, int h,
+                                             uint16_t *dst, int dst_stride,
+                                             int avg, int bd) {
+  int rowIndex = 0;
+  WritePixels write2pixels = write2pixelsTab[avg];
+  (void)w;
+
+  do {
+    filter_vert_horiz_parallel(src, src_stride, f, taps, dst, write2pixels,
+                               bd);
+    rowIndex++;
+    src += src_stride;
+    dst += dst_stride;
+  } while (rowIndex < h);
+}
+
+void vp10_highbd_convolve_vert_sse4_1(const uint16_t *src, int src_stride,
+                                      uint16_t *dst, int dst_stride,
+                                      int w, int h,
+                                      const InterpFilterParams filter_params,
+                                      const int subpel_y_q4, int y_step_q4,
+                                      int avg, int bd) {
+  __m128i verf[6];
+  HbdSubpelFilterCoeffs vCoeffs;
+  const int tapsNum = filter_params.taps;
+
+  if (0 == subpel_y_q4 || 16 != y_step_q4) {
+    vp10_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
+                                filter_params, subpel_y_q4, y_step_q4, avg,
+                                bd);
+    return;
+  }
+
+  vCoeffs = vp10_hbd_get_subpel_filter_ver_signal_dir(
+      filter_params, subpel_y_q4 - 1);
+  if (!vCoeffs) {
+    vp10_highbd_convolve_vert_c(src, src_stride, dst, dst_stride, w, h,
+                                filter_params, subpel_y_q4, y_step_q4, avg,
+                                bd);
+    return;
+  }
+
+  verf[0] = *((const __m128i *)(vCoeffs));
+  verf[1] = *((const __m128i *)(vCoeffs + 1));
+  verf[2] = *((const __m128i *)(vCoeffs + 2));
+  verf[3] = *((const __m128i *)(vCoeffs + 3));
+  verf[4] = *((const __m128i *)(vCoeffs + 4));
+  verf[5] = *((const __m128i *)(vCoeffs + 5));
+
+  src -= src_stride * ((tapsNum >> 1) - 1);
+
+  if (w > 2) {
+    highbd_filter_vert_compute_large(src, src_stride, verf, tapsNum, w, h,
+                                     dst, dst_stride, avg, bd);
+  } else {
+    highbd_filter_vert_compute_small(src, src_stride, verf, tapsNum, w, h,
+                                     dst, dst_stride, avg, bd);
+  }
+}
diff --git a/vp10/common/x86/vp10_txfm1d_sse4.h b/vp10/common/x86/vp10_txfm1d_sse4.h
new file mode 100644
index 0000000..86ab660
--- /dev/null
+++ b/vp10/common/x86/vp10_txfm1d_sse4.h
@@ -0,0 +1,144 @@
+#ifndef VP10_TXFM1D_SSE4_H_
+#define VP10_TXFM1D_SSE4_H_
+
+#include <smmintrin.h>
+#include "vp10/common/vp10_txfm.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_fdct4_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fdct8_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fdct16_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fdct32_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fdct64_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+
+void vp10_fadst4_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fadst8_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fadst16_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_fadst32_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range);
+
+void vp10_idct4_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_idct8_new_sse4_1(const __m128i* input, __m128i* output,
+                           const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_idct16_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_idct32_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_idct64_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+
+void vp10_iadst4_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_iadst8_new_sse4_1(const __m128i* input, __m128i* output,
+                            const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_iadst16_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range);
+void vp10_iadst32_new_sse4_1(const __m128i* input, __m128i* output,
+                             const int8_t* cos_bit, const int8_t* stage_range);
+
+static INLINE void transpose_32_4x4(int stride, const __m128i* input,
+                                    __m128i* output) {
+  __m128i temp0 = _mm_unpacklo_epi32(input[0 * stride], input[2 * stride]);
+  __m128i temp1 = _mm_unpackhi_epi32(input[0 * stride], input[2 * stride]);
+  __m128i temp2 = _mm_unpacklo_epi32(input[1 * stride], input[3 * stride]);
+  __m128i temp3 = _mm_unpackhi_epi32(input[1 * stride], input[3 * stride]);
+
+  output[0 * stride] = _mm_unpacklo_epi32(temp0, temp2);
+  output[1 * stride] = _mm_unpackhi_epi32(temp0, temp2);
+  output[2 * stride] = _mm_unpacklo_epi32(temp1, temp3);
+  output[3 * stride] = _mm_unpackhi_epi32(temp1, temp3);
+}
+
+// the entire input block can be represented by a grid of 4x4 blocks
+// each 4x4 block can be represented by 4 vertical __m128i
+// we first transpose each 4x4 block internally
+// then transpose the grid
+static INLINE void transpose_32(int txfm_size, const __m128i* input,
+                                __m128i* output) {
+  const int num_per_128 = 4;
+  const int row_size = txfm_size;
+  const int col_size = txfm_size / num_per_128;
+  int r, c;
+
+  // transpose each 4x4 block internally
+  for (r = 0; r < row_size; r += 4) {
+    for (c = 0; c < col_size; c++) {
+      transpose_32_4x4(col_size, &input[r * col_size + c],
+                       &output[c * 4 * col_size + r / 4]);
+    }
+  }
+}
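
The loop above folds the two steps of the comment into one pass:
transpose_32_4x4() flips each 4x4 tile internally while the output index
c * 4 * col_size + r / 4 drops the tile at its transposed grid position.
The net effect is an ordinary NxN transpose, which this scalar reference
can be checked against:

#include <stdint.h>

/* Plain 32-bit NxN transpose; transpose_32() must match this on the same
 * data viewed as N*N int32_t values (N a multiple of 4). */
static void transpose_ref(int n, const int32_t *in, int32_t *out) {
  int r, c;
  for (r = 0; r < n; r++)
    for (c = 0; c < n; c++) out[c * n + r] = in[r * n + c];
}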
+
+static INLINE __m128i round_shift_32_sse4_1(__m128i vec, int bit) {
+  __m128i tmp, round;
+  round = _mm_set1_epi32(1 << (bit - 1));
+  tmp = _mm_add_epi32(vec, round);
+  return _mm_srai_epi32(tmp, bit);
+}
+
+static INLINE void round_shift_array_32_sse4_1(__m128i *input, __m128i *output,
+                                               const int size, const int bit) {
+  if (bit > 0) {
+    int i;
+    for (i = 0; i < size; i++) {
+      output[i] = round_shift_32_sse4_1(input[i], bit);
+    }
+  } else {
+    int i;
+    for (i = 0; i < size; i++) {
+      output[i] = _mm_slli_epi32(input[i], -bit);
+    }
+  }
+}
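
Since the 2-D wrapper passes -shift[i], a negative bit here means a
plain left shift; in scalar form:

#include <stdint.h>

/* Scalar twin of round_shift_32_sse4_1()/round_shift_array_32_sse4_1(). */
static int32_t round_shift_s(int32_t x, int bit) {
  if (bit > 0) return (x + (1 << (bit - 1))) >> bit; /* rounded right shift */
  return (int32_t)((uint32_t)x << -bit);             /* plain left shift */
}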
+
+// out0 = in0*w0 + in1*w1
+// out1 = -in1*w0 + in0*w1
+#define btf_32_sse4_1_type0(w0, w1, in0, in1, out0, out1, bit) \
+  do {                                                         \
+    __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;          \
+    ww0 = _mm_set1_epi32(w0);                                  \
+    ww1 = _mm_set1_epi32(w1);                                  \
+    in0_w0 = _mm_mullo_epi32(in0, ww0);                        \
+    in1_w1 = _mm_mullo_epi32(in1, ww1);                        \
+    out0 = _mm_add_epi32(in0_w0, in1_w1);                      \
+    out0 = round_shift_32_sse4_1(out0, bit);                   \
+    in0_w1 = _mm_mullo_epi32(in0, ww1);                        \
+    in1_w0 = _mm_mullo_epi32(in1, ww0);                        \
+    out1 = _mm_sub_epi32(in0_w1, in1_w0);                      \
+    out1 = round_shift_32_sse4_1(out1, bit);                   \
+  } while (0)
+
+// out0 = in0*w0 + in1*w1
+// out1 = in1*w0 - in0*w1
+#define btf_32_sse4_1_type1(w0, w1, in0, in1, out0, out1, bit) \
+  do {                                                         \
+    __m128i ww0, ww1, in0_w0, in1_w1, in0_w1, in1_w0;          \
+    ww0 = _mm_set1_epi32(w0);                                  \
+    ww1 = _mm_set1_epi32(w1);                                  \
+    in0_w0 = _mm_mullo_epi32(in0, ww0);                        \
+    in1_w1 = _mm_mullo_epi32(in1, ww1);                        \
+    out0 = _mm_add_epi32(in0_w0, in1_w1);                      \
+    out0 = round_shift_32_sse4_1(out0, bit);                   \
+    in0_w1 = _mm_mullo_epi32(in0, ww1);                        \
+    in1_w0 = _mm_mullo_epi32(in1, ww0);                        \
+    out1 = _mm_sub_epi32(in1_w0, in0_w1);                      \
+    out1 = round_shift_32_sse4_1(out1, bit);                   \
+  } while (0)
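
Both macros are the usual planar-rotation butterfly; they differ only in
the sign layout of the second output. Scalar twins, reusing
round_shift_s() from the previous sketch:

#include <stdint.h>

static void btf_type0_s(int32_t w0, int32_t w1, int32_t in0, int32_t in1,
                        int32_t *out0, int32_t *out1, int bit) {
  *out0 = round_shift_s(in0 * w0 + in1 * w1, bit);
  *out1 = round_shift_s(in0 * w1 - in1 * w0, bit);
}

static void btf_type1_s(int32_t w0, int32_t w1, int32_t in0, int32_t in1,
                        int32_t *out0, int32_t *out1, int bit) {
  *out0 = round_shift_s(in0 * w0 + in1 * w1, bit);
  *out1 = round_shift_s(in1 * w0 - in0 * w1, bit);
}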
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif  // VP10_TXFM1D_SSE4_H_
diff --git a/vp10/decoder/bitreader.h b/vp10/decoder/bitreader.h
new file mode 100644
index 0000000..baf8f03
--- /dev/null
+++ b/vp10/decoder/bitreader.h
@@ -0,0 +1,38 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* The purpose of this header is to provide compile-time pluggable bit
+ * reader implementations behind a common interface. */
+
+#ifndef VP10_DECODER_BITREADER_H_
+#define VP10_DECODER_BITREADER_H_
+
+#include "./vpx_config.h"
+
+#if CONFIG_ANS
+#include "vp10/common/ans.h"
+#include "vpx/vp8dx.h"  // for vp10_decrypt_cb
+#define vp10_reader struct AnsDecoder
+#define vp10_reader_has_error ans_reader_has_error
+#define vp10_read uabs_read
+#define vp10_read_bit uabs_read_bit
+#define vp10_read_literal uabs_read_literal
+#define vp10_read_tree uabs_read_tree
+#else
+#include "vpx_dsp/bitreader.h"
+#define vp10_reader vpx_reader
+#define vp10_reader_has_error vpx_reader_has_error
+#define vp10_read vpx_read
+#define vp10_read_bit vpx_read_bit
+#define vp10_read_literal vpx_read_literal
+#define vp10_read_tree vpx_read_tree
+#endif
+
+#endif  // VP10_DECODER_BITREADER_H_
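
With this indirection, decoder code is written once against the
vp10_read* names and compiles against either backend. A hypothetical
call site (read_example_syntax is illustrative, not part of the tree):

#include "vp10/decoder/bitreader.h"

/* Reads a 3-bit literal plus a flag; works unchanged whether vp10_reader
 * is the ANS decoder (CONFIG_ANS) or the vpx_dsp bool-coder. */
static int read_example_syntax(vp10_reader *r) {
  int value = vp10_read_literal(r, 3);
  if (vp10_read_bit(r)) value += 8;
  return value;
}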
diff --git a/vp10/decoder/decodeframe.c b/vp10/decoder/decodeframe.c
index d750c33..2f341b5 100644
--- a/vp10/decoder/decodeframe.c
+++ b/vp10/decoder/decodeframe.c
@@ -16,7 +16,7 @@
 #include "./vpx_scale_rtcd.h"
 
 #include "vpx_dsp/bitreader_buffer.h"
-#include "vpx_dsp/bitreader.h"
+#include "vp10/decoder/bitreader.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
@@ -57,6 +57,15 @@
 }
 
 static void setup_compound_reference_mode(VP10_COMMON *cm) {
+#if CONFIG_EXT_REFS
+  cm->comp_fwd_ref[0] = LAST_FRAME;
+  cm->comp_fwd_ref[1] = LAST2_FRAME;
+  cm->comp_fwd_ref[2] = LAST3_FRAME;
+  cm->comp_fwd_ref[3] = GOLDEN_FRAME;
+
+  cm->comp_bwd_ref[0] = BWDREF_FRAME;
+  cm->comp_bwd_ref[1] = ALTREF_FRAME;
+#else
   if (cm->ref_frame_sign_bias[LAST_FRAME] ==
           cm->ref_frame_sign_bias[GOLDEN_FRAME]) {
     cm->comp_fixed_ref = ALTREF_FRAME;
@@ -72,6 +81,7 @@
     cm->comp_var_ref[0] = GOLDEN_FRAME;
     cm->comp_var_ref[1] = ALTREF_FRAME;
   }
+#endif  // CONFIG_EXT_REFS
 }
 
 static int read_is_valid(const uint8_t *start, size_t len, const uint8_t *end) {
@@ -83,50 +93,52 @@
   return data > max ? max : data;
 }
 
-#if CONFIG_MISC_FIXES
 static TX_MODE read_tx_mode(struct vpx_read_bit_buffer *rb) {
   return vpx_rb_read_bit(rb) ? TX_MODE_SELECT : vpx_rb_read_literal(rb, 2);
 }
-#else
-static TX_MODE read_tx_mode(vpx_reader *r) {
-  TX_MODE tx_mode = vpx_read_literal(r, 2);
-  if (tx_mode == ALLOW_32X32)
-    tx_mode += vpx_read_bit(r);
-  return tx_mode;
-}
-#endif
 
-static void read_tx_mode_probs(struct tx_probs *tx_probs, vpx_reader *r) {
-  int i, j;
-
-  for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
-    for (j = 0; j < TX_SIZES - 3; ++j)
-      vp10_diff_update_prob(r, &tx_probs->p8x8[i][j]);
-
-  for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
-    for (j = 0; j < TX_SIZES - 2; ++j)
-      vp10_diff_update_prob(r, &tx_probs->p16x16[i][j]);
-
-  for (i = 0; i < TX_SIZE_CONTEXTS; ++i)
-    for (j = 0; j < TX_SIZES - 1; ++j)
-      vp10_diff_update_prob(r, &tx_probs->p32x32[i][j]);
-}
-
-static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
+static void read_switchable_interp_probs(FRAME_CONTEXT *fc, vp10_reader *r) {
   int i, j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
     for (i = 0; i < SWITCHABLE_FILTERS - 1; ++i)
       vp10_diff_update_prob(r, &fc->switchable_interp_prob[j][i]);
 }
 
-static void read_inter_mode_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
-  int i, j;
+static void read_inter_mode_probs(FRAME_CONTEXT *fc, vp10_reader *r) {
+  int i;
+#if CONFIG_REF_MV
+  for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
+    vp10_diff_update_prob(r, &fc->newmv_prob[i]);
+  for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
+    vp10_diff_update_prob(r, &fc->zeromv_prob[i]);
+  for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
+    vp10_diff_update_prob(r, &fc->refmv_prob[i]);
+  for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
+    vp10_diff_update_prob(r, &fc->drl_prob[i]);
+#if CONFIG_EXT_INTER
+  vp10_diff_update_prob(r, &fc->new2mv_prob);
+#endif  // CONFIG_EXT_INTER
+#else
+  int j;
   for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
     for (j = 0; j < INTER_MODES - 1; ++j)
       vp10_diff_update_prob(r, &fc->inter_mode_probs[i][j]);
+#endif
 }
 
-#if CONFIG_MISC_FIXES
+#if CONFIG_EXT_INTER
+static void read_inter_compound_mode_probs(FRAME_CONTEXT *fc, vp10_reader *r) {
+  int i, j;
+  if (vp10_read(r, GROUP_DIFF_UPDATE_PROB)) {
+    for (j = 0; j < INTER_MODE_CONTEXTS; ++j) {
+      for (i = 0; i < INTER_COMPOUND_MODES - 1; ++i) {
+        vp10_diff_update_prob(r, &fc->inter_compound_mode_probs[j][i]);
+      }
+    }
+  }
+}
+#endif  // CONFIG_EXT_INTER
+
 static REFERENCE_MODE read_frame_reference_mode(const VP10_COMMON *cm,
     struct vpx_read_bit_buffer *rb) {
   if (is_compound_reference_allowed(cm)) {
@@ -137,54 +149,53 @@
     return SINGLE_REFERENCE;
   }
 }
-#else
-static REFERENCE_MODE read_frame_reference_mode(const VP10_COMMON *cm,
-                                                vpx_reader *r) {
-  if (is_compound_reference_allowed(cm)) {
-    return vpx_read_bit(r) ? (vpx_read_bit(r) ? REFERENCE_MODE_SELECT
-                                              : COMPOUND_REFERENCE)
-                           : SINGLE_REFERENCE;
-  } else {
-    return SINGLE_REFERENCE;
-  }
-}
-#endif
 
-static void read_frame_reference_mode_probs(VP10_COMMON *cm, vpx_reader *r) {
+static void read_frame_reference_mode_probs(VP10_COMMON *cm, vp10_reader *r) {
   FRAME_CONTEXT *const fc = cm->fc;
-  int i;
+  int i, j;
 
   if (cm->reference_mode == REFERENCE_MODE_SELECT)
     for (i = 0; i < COMP_INTER_CONTEXTS; ++i)
       vp10_diff_update_prob(r, &fc->comp_inter_prob[i]);
 
-  if (cm->reference_mode != COMPOUND_REFERENCE)
+  if (cm->reference_mode != COMPOUND_REFERENCE) {
     for (i = 0; i < REF_CONTEXTS; ++i) {
-      vp10_diff_update_prob(r, &fc->single_ref_prob[i][0]);
-      vp10_diff_update_prob(r, &fc->single_ref_prob[i][1]);
+      for (j = 0; j < (SINGLE_REFS - 1); ++j) {
+        vp10_diff_update_prob(r, &fc->single_ref_prob[i][j]);
+      }
     }
+  }
 
-  if (cm->reference_mode != SINGLE_REFERENCE)
-    for (i = 0; i < REF_CONTEXTS; ++i)
-      vp10_diff_update_prob(r, &fc->comp_ref_prob[i]);
+  if (cm->reference_mode != SINGLE_REFERENCE) {
+    for (i = 0; i < REF_CONTEXTS; ++i) {
+#if CONFIG_EXT_REFS
+      for (j = 0; j < (FWD_REFS - 1); ++j)
+        vp10_diff_update_prob(r, &fc->comp_ref_prob[i][j]);
+      for (j = 0; j < (BWD_REFS - 1); ++j)
+        vp10_diff_update_prob(r, &fc->comp_bwdref_prob[i][j]);
+#else
+      for (j = 0; j < (COMP_REFS - 1); ++j)
+        vp10_diff_update_prob(r, &fc->comp_ref_prob[i][j]);
+#endif  // CONFIG_EXT_REFS
+    }
+  }
 }
 
-static void update_mv_probs(vpx_prob *p, int n, vpx_reader *r) {
+static void update_mv_probs(vpx_prob *p, int n, vp10_reader *r) {
   int i;
   for (i = 0; i < n; ++i)
-#if CONFIG_MISC_FIXES
     vp10_diff_update_prob(r, &p[i]);
-#else
-    if (vpx_read(r, MV_UPDATE_PROB))
-      p[i] = (vpx_read_literal(r, 7) << 1) | 1;
-#endif
 }
 
-static void read_mv_probs(nmv_context *ctx, int allow_hp, vpx_reader *r) {
+static void read_mv_probs(nmv_context *ctx, int allow_hp, vp10_reader *r) {
   int i, j;
 
   update_mv_probs(ctx->joints, MV_JOINTS - 1, r);
 
+#if CONFIG_REF_MV
+  vp10_diff_update_prob(r, &ctx->zero_rmv);
+#endif
+
   for (i = 0; i < 2; ++i) {
     nmv_component *const comp_ctx = &ctx->comps[i];
     update_mv_probs(&comp_ctx->sign, 1, r);
@@ -209,58 +220,27 @@
   }
 }
 
-static void inverse_transform_block_inter(MACROBLOCKD* xd, int plane,
-                                          const TX_SIZE tx_size,
-                                          uint8_t *dst, int stride,
-                                          int eob, int block) {
+static void inverse_transform_block(MACROBLOCKD* xd, int plane,
+                                    const TX_TYPE tx_type,
+                                    const TX_SIZE tx_size,
+                                    uint8_t *dst, int stride,
+                                    int eob) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block);
-  const int seg_id = xd->mi[0]->mbmi.segment_id;
   if (eob > 0) {
     tran_low_t *const dqcoeff = pd->dqcoeff;
+    INV_TXFM_PARAM inv_txfm_param;
+    inv_txfm_param.tx_type = tx_type;
+    inv_txfm_param.tx_size = tx_size;
+    inv_txfm_param.eob = eob;
+    inv_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      switch (tx_size) {
-        case TX_4X4:
-          vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, xd->bd,
-                                       tx_type, xd->lossless[seg_id]);
-          break;
-        case TX_8X8:
-          vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, xd->bd,
-                                       tx_type);
-          break;
-        case TX_16X16:
-          vp10_highbd_inv_txfm_add_16x16(dqcoeff, dst, stride, eob, xd->bd,
-                                         tx_type);
-          break;
-        case TX_32X32:
-          vp10_highbd_inv_txfm_add_32x32(dqcoeff, dst, stride, eob, xd->bd,
-                                         tx_type);
-          break;
-        default:
-          assert(0 && "Invalid transform size");
-          return;
-      }
+      inv_txfm_param.bd = xd->bd;
+      highbd_inv_txfm_add(dqcoeff, dst, stride, &inv_txfm_param);
     } else {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-      switch (tx_size) {
-        case TX_4X4:
-          vp10_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, tx_type,
-                                xd->lossless[seg_id]);
-          break;
-        case TX_8X8:
-          vp10_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, tx_type);
-          break;
-        case TX_16X16:
-          vp10_inv_txfm_add_16x16(dqcoeff, dst, stride, eob, tx_type);
-          break;
-        case TX_32X32:
-          vp10_inv_txfm_add_32x32(dqcoeff, dst, stride, eob, tx_type);
-          break;
-        default:
-          assert(0 && "Invalid transform size");
-          return;
-      }
+      inv_txfm_add(dqcoeff, dst, stride, &inv_txfm_param);
 #if CONFIG_VP9_HIGHBITDEPTH
     }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -270,85 +250,25 @@
     } else {
       if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
         memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
+#if CONFIG_EXT_TX
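+      // Under ext-tx, skip the 32x32 partial clear and zero the full block.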
+      else
+        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
+#else
       else if (tx_size == TX_32X32 && eob <= 34)
         memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
       else
         memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
-    }
-  }
-}
-
-static void inverse_transform_block_intra(MACROBLOCKD* xd, int plane,
-                                          const TX_TYPE tx_type,
-                                          const TX_SIZE tx_size,
-                                          uint8_t *dst, int stride,
-                                          int eob) {
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  const int seg_id = xd->mi[0]->mbmi.segment_id;
-  if (eob > 0) {
-    tran_low_t *const dqcoeff = pd->dqcoeff;
-#if CONFIG_VP9_HIGHBITDEPTH
-    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      switch (tx_size) {
-        case TX_4X4:
-          vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, xd->bd,
-                                       tx_type, xd->lossless[seg_id]);
-          break;
-        case TX_8X8:
-          vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, xd->bd,
-                                       tx_type);
-          break;
-        case TX_16X16:
-          vp10_highbd_inv_txfm_add_16x16(dqcoeff, dst, stride, eob, xd->bd,
-                                         tx_type);
-          break;
-        case TX_32X32:
-          vp10_highbd_inv_txfm_add_32x32(dqcoeff, dst, stride, eob, xd->bd,
-                                         tx_type);
-          break;
-        default:
-          assert(0 && "Invalid transform size");
-          return;
-      }
-    } else {
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-      switch (tx_size) {
-        case TX_4X4:
-          vp10_inv_txfm_add_4x4(dqcoeff, dst, stride, eob, tx_type,
-                                xd->lossless[seg_id]);
-          break;
-        case TX_8X8:
-          vp10_inv_txfm_add_8x8(dqcoeff, dst, stride, eob, tx_type);
-          break;
-        case TX_16X16:
-          vp10_inv_txfm_add_16x16(dqcoeff, dst, stride, eob, tx_type);
-          break;
-        case TX_32X32:
-          vp10_inv_txfm_add_32x32(dqcoeff, dst, stride, eob, tx_type);
-          break;
-        default:
-          assert(0 && "Invalid transform size");
-          return;
-      }
-#if CONFIG_VP9_HIGHBITDEPTH
-    }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-    if (eob == 1) {
-      dqcoeff[0] = 0;
-    } else {
-      if (tx_type == DCT_DCT && tx_size <= TX_16X16 && eob <= 10)
-        memset(dqcoeff, 0, 4 * (4 << tx_size) * sizeof(dqcoeff[0]));
-      else if (tx_size == TX_32X32 && eob <= 34)
-        memset(dqcoeff, 0, 256 * sizeof(dqcoeff[0]));
-      else
-        memset(dqcoeff, 0, (16 << (tx_size << 1)) * sizeof(dqcoeff[0]));
+#endif  // CONFIG_EXT_TX
     }
   }
 }
 
 static void predict_and_reconstruct_intra_block(MACROBLOCKD *const xd,
-                                                vpx_reader *r,
+#if CONFIG_ANS
+                                                struct AnsDecoder *const r,
+#else
+                                                vp10_reader *r,
+#endif  // CONFIG_ANS
                                                 MB_MODE_INFO *const mbmi,
                                                 int plane,
                                                 int row, int col,
@@ -369,386 +289,98 @@
                           col, row, plane);
 
   if (!mbmi->skip) {
-    TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx);
-    const scan_order *sc = get_scan(tx_size, tx_type);
-    const int eob = vp10_decode_block_tokens(xd, plane, sc, col, row, tx_size,
-                                             r, mbmi->segment_id);
-    inverse_transform_block_intra(xd, plane, tx_type, tx_size,
-                                  dst, pd->dst.stride, eob);
+    TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, tx_size);
+    const scan_order *sc = get_scan(tx_size, tx_type, 0);
+    const int eob = vp10_decode_block_tokens(xd,
+                                             plane, sc, col, row, tx_size,
+                                             tx_type, r, mbmi->segment_id);
+    inverse_transform_block(xd, plane, tx_type, tx_size,
+                            dst, pd->dst.stride, eob);
   }
 }
 
-static int reconstruct_inter_block(MACROBLOCKD *const xd, vpx_reader *r,
-                                   MB_MODE_INFO *const mbmi, int plane,
+#if CONFIG_VAR_TX
+static void decode_reconstruct_tx(MACROBLOCKD *const xd, vp10_reader *r,
+                                  MB_MODE_INFO *const mbmi,
+                                  int plane, BLOCK_SIZE plane_bsize,
+                                  int block, int blk_row, int blk_col,
+                                  TX_SIZE tx_size, int *eob_total) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  const TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_row][tx_col], bsize, 0, 0) :
+      mbmi->inter_tx_size[tx_row][tx_col];
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+
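+  // Clamp to the frame border: mb_to_*_edge is in 1/8-pel units, so
+  // >> (5 + subsampling) converts it to 4x4 block units.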
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  if (tx_size == plane_tx_size) {
+    PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+    TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+    const scan_order *sc = get_scan(tx_size, tx_type, 1);
+    const int eob = vp10_decode_block_tokens(xd, plane, sc,
+                                             blk_col, blk_row, tx_size,
+                                             tx_type, r, mbmi->segment_id);
+    inverse_transform_block(xd, plane, tx_type, tx_size,
+        &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col],
+        pd->dst.stride, eob);
+    *eob_total += eob;
+  } else {
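+    // The coded tx size is smaller than this block: recurse into the four
+    // quadrants at the next smaller tx size.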
+    int bsl = b_width_log2_lookup[bsize];
+    int i;
+
+    assert(bsl > 0);
+    --bsl;
+
+    for (i = 0; i < 4; ++i) {
+      const int offsetr = blk_row + ((i >> 1) << bsl);
+      const int offsetc = blk_col + ((i & 0x01) << bsl);
+      int step = 1 << (2 * (tx_size - 1));
+
+      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide)
+        continue;
+
+      decode_reconstruct_tx(xd, r, mbmi, plane, plane_bsize, block + i * step,
+                            offsetr, offsetc, tx_size - 1, eob_total);
+    }
+  }
+}
+#endif  // CONFIG_VAR_TX
+
+#if !CONFIG_VAR_TX || CONFIG_SUPERTX
+static int reconstruct_inter_block(MACROBLOCKD *const xd,
+#if CONFIG_ANS
+                                   struct AnsDecoder *const r,
+#else
+                                   vp10_reader *r,
+#endif
+                                   int segment_id, int plane,
                                    int row, int col, TX_SIZE tx_size) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
   int block_idx = (row << 1) + col;
-  TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx);
-  const scan_order *sc = get_scan(tx_size, tx_type);
-  const int eob = vp10_decode_block_tokens(xd, plane, sc, col, row, tx_size, r,
-                                          mbmi->segment_id);
+  TX_TYPE tx_type = get_tx_type(plane_type, xd, block_idx, tx_size);
+  const scan_order *sc = get_scan(tx_size, tx_type, 1);
+  const int eob = vp10_decode_block_tokens(xd,
+                                           plane, sc, col, row,
+                                           tx_size, tx_type, r,
+                                           segment_id);
 
-  inverse_transform_block_inter(xd, plane, tx_size,
-                            &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
-                            pd->dst.stride, eob, block_idx);
+  inverse_transform_block(xd, plane, tx_type, tx_size,
+                          &pd->dst.buf[4 * row * pd->dst.stride + 4 * col],
+                          pd->dst.stride, eob);
   return eob;
 }
-
-static void build_mc_border(const uint8_t *src, int src_stride,
-                            uint8_t *dst, int dst_stride,
-                            int x, int y, int b_w, int b_h, int w, int h) {
-  // Get a pointer to the start of the real data for this row.
-  const uint8_t *ref_row = src - x - y * src_stride;
-
-  if (y >= h)
-    ref_row += (h - 1) * src_stride;
-  else if (y > 0)
-    ref_row += y * src_stride;
-
-  do {
-    int right = 0, copy;
-    int left = x < 0 ? -x : 0;
-
-    if (left > b_w)
-      left = b_w;
-
-    if (x + b_w > w)
-      right = x + b_w - w;
-
-    if (right > b_w)
-      right = b_w;
-
-    copy = b_w - left - right;
-
-    if (left)
-      memset(dst, ref_row[0], left);
-
-    if (copy)
-      memcpy(dst + left, ref_row + x + left, copy);
-
-    if (right)
-      memset(dst + left + copy, ref_row[w - 1], right);
-
-    dst += dst_stride;
-    ++y;
-
-    if (y > 0 && y < h)
-      ref_row += src_stride;
-  } while (--b_h);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void high_build_mc_border(const uint8_t *src8, int src_stride,
-                                 uint16_t *dst, int dst_stride,
-                                 int x, int y, int b_w, int b_h,
-                                 int w, int h) {
-  // Get a pointer to the start of the real data for this row.
-  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
-  const uint16_t *ref_row = src - x - y * src_stride;
-
-  if (y >= h)
-    ref_row += (h - 1) * src_stride;
-  else if (y > 0)
-    ref_row += y * src_stride;
-
-  do {
-    int right = 0, copy;
-    int left = x < 0 ? -x : 0;
-
-    if (left > b_w)
-      left = b_w;
-
-    if (x + b_w > w)
-      right = x + b_w - w;
-
-    if (right > b_w)
-      right = b_w;
-
-    copy = b_w - left - right;
-
-    if (left)
-      vpx_memset16(dst, ref_row[0], left);
-
-    if (copy)
-      memcpy(dst + left, ref_row + x + left, copy * sizeof(uint16_t));
-
-    if (right)
-      vpx_memset16(dst + left + copy, ref_row[w - 1], right);
-
-    dst += dst_stride;
-    ++y;
-
-    if (y > 0 && y < h)
-      ref_row += src_stride;
-  } while (--b_h);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
-                               int x0, int y0, int b_w, int b_h,
-                               int frame_width, int frame_height,
-                               int border_offset,
-                               uint8_t *const dst, int dst_buf_stride,
-                               int subpel_x, int subpel_y,
-                               const InterpKernel *kernel,
-                               const struct scale_factors *sf,
-                               MACROBLOCKD *xd,
-                               int w, int h, int ref, int xs, int ys) {
-  DECLARE_ALIGNED(16, uint16_t, mc_buf_high[80 * 2 * 80 * 2]);
-  const uint8_t *buf_ptr;
-
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    high_build_mc_border(buf_ptr1, pre_buf_stride, mc_buf_high, b_w,
-                         x0, y0, b_w, b_h, frame_width, frame_height);
-    buf_ptr = CONVERT_TO_BYTEPTR(mc_buf_high) + border_offset;
-  } else {
-    build_mc_border(buf_ptr1, pre_buf_stride, (uint8_t *)mc_buf_high, b_w,
-                    x0, y0, b_w, b_h, frame_width, frame_height);
-    buf_ptr = ((uint8_t *)mc_buf_high) + border_offset;
-  }
-
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    high_inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
-                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
-  } else {
-    inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
-                    subpel_y, sf, w, h, ref, kernel, xs, ys);
-  }
-}
-#else
-static void extend_and_predict(const uint8_t *buf_ptr1, int pre_buf_stride,
-                               int x0, int y0, int b_w, int b_h,
-                               int frame_width, int frame_height,
-                               int border_offset,
-                               uint8_t *const dst, int dst_buf_stride,
-                               int subpel_x, int subpel_y,
-                               const InterpKernel *kernel,
-                               const struct scale_factors *sf,
-                               int w, int h, int ref, int xs, int ys) {
-  DECLARE_ALIGNED(16, uint8_t, mc_buf[80 * 2 * 80 * 2]);
-  const uint8_t *buf_ptr;
-
-  build_mc_border(buf_ptr1, pre_buf_stride, mc_buf, b_w,
-                  x0, y0, b_w, b_h, frame_width, frame_height);
-  buf_ptr = mc_buf + border_offset;
-
-  inter_predictor(buf_ptr, b_w, dst, dst_buf_stride, subpel_x,
-                  subpel_y, sf, w, h, ref, kernel, xs, ys);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-static void dec_build_inter_predictors(VP10Decoder *const pbi, MACROBLOCKD *xd,
-                                       int plane, int bw, int bh, int x,
-                                       int y, int w, int h, int mi_x, int mi_y,
-                                       const InterpKernel *kernel,
-                                       const struct scale_factors *sf,
-                                       struct buf_2d *pre_buf,
-                                       struct buf_2d *dst_buf, const MV* mv,
-                                       RefCntBuffer *ref_frame_buf,
-                                       int is_scaled, int ref) {
-  VP10_COMMON *const cm = &pbi->common;
-  struct macroblockd_plane *const pd = &xd->plane[plane];
-  uint8_t *const dst = dst_buf->buf + dst_buf->stride * y + x;
-  MV32 scaled_mv;
-  int xs, ys, x0, y0, x0_16, y0_16, frame_width, frame_height,
-      buf_stride, subpel_x, subpel_y;
-  uint8_t *ref_frame, *buf_ptr;
-
-  // Get reference frame pointer, width and height.
-  if (plane == 0) {
-    frame_width = ref_frame_buf->buf.y_crop_width;
-    frame_height = ref_frame_buf->buf.y_crop_height;
-    ref_frame = ref_frame_buf->buf.y_buffer;
-  } else {
-    frame_width = ref_frame_buf->buf.uv_crop_width;
-    frame_height = ref_frame_buf->buf.uv_crop_height;
-    ref_frame = plane == 1 ? ref_frame_buf->buf.u_buffer
-                         : ref_frame_buf->buf.v_buffer;
-  }
-
-  if (is_scaled) {
-    const MV mv_q4 = clamp_mv_to_umv_border_sb(xd, mv, bw, bh,
-                                               pd->subsampling_x,
-                                               pd->subsampling_y);
-    // Co-ordinate of containing block to pixel precision.
-    int x_start = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x));
-    int y_start = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y));
-
-    // Co-ordinate of the block to 1/16th pixel precision.
-    x0_16 = (x_start + x) << SUBPEL_BITS;
-    y0_16 = (y_start + y) << SUBPEL_BITS;
-
-    // Co-ordinate of current block in reference frame
-    // to 1/16th pixel precision.
-    x0_16 = sf->scale_value_x(x0_16, sf);
-    y0_16 = sf->scale_value_y(y0_16, sf);
-
-    // Map the top left corner of the block into the reference frame.
-    x0 = sf->scale_value_x(x_start + x, sf);
-    y0 = sf->scale_value_y(y_start + y, sf);
-
-    // Scale the MV and incorporate the sub-pixel offset of the block
-    // in the reference frame.
-    scaled_mv = vp10_scale_mv(&mv_q4, mi_x + x, mi_y + y, sf);
-    xs = sf->x_step_q4;
-    ys = sf->y_step_q4;
-  } else {
-    // Co-ordinate of containing block to pixel precision.
-    x0 = (-xd->mb_to_left_edge >> (3 + pd->subsampling_x)) + x;
-    y0 = (-xd->mb_to_top_edge >> (3 + pd->subsampling_y)) + y;
-
-    // Co-ordinate of the block to 1/16th pixel precision.
-    x0_16 = x0 << SUBPEL_BITS;
-    y0_16 = y0 << SUBPEL_BITS;
-
-    scaled_mv.row = mv->row * (1 << (1 - pd->subsampling_y));
-    scaled_mv.col = mv->col * (1 << (1 - pd->subsampling_x));
-    xs = ys = 16;
-  }
-  subpel_x = scaled_mv.col & SUBPEL_MASK;
-  subpel_y = scaled_mv.row & SUBPEL_MASK;
-
-  // Calculate the top left corner of the best matching block in the
-  // reference frame.
-  x0 += scaled_mv.col >> SUBPEL_BITS;
-  y0 += scaled_mv.row >> SUBPEL_BITS;
-  x0_16 += scaled_mv.col;
-  y0_16 += scaled_mv.row;
-
-  // Get reference block pointer.
-  buf_ptr = ref_frame + y0 * pre_buf->stride + x0;
-  buf_stride = pre_buf->stride;
-
-  // Do border extension if there is motion or the
-  // width/height is not a multiple of 8 pixels.
-  if (is_scaled || scaled_mv.col || scaled_mv.row ||
-      (frame_width & 0x7) || (frame_height & 0x7)) {
-    int y1 = ((y0_16 + (h - 1) * ys) >> SUBPEL_BITS) + 1;
-
-    // Get reference block bottom right horizontal coordinate.
-    int x1 = ((x0_16 + (w - 1) * xs) >> SUBPEL_BITS) + 1;
-    int x_pad = 0, y_pad = 0;
-
-    if (subpel_x || (sf->x_step_q4 != SUBPEL_SHIFTS)) {
-      x0 -= VP9_INTERP_EXTEND - 1;
-      x1 += VP9_INTERP_EXTEND;
-      x_pad = 1;
-    }
-
-    if (subpel_y || (sf->y_step_q4 != SUBPEL_SHIFTS)) {
-      y0 -= VP9_INTERP_EXTEND - 1;
-      y1 += VP9_INTERP_EXTEND;
-      y_pad = 1;
-    }
-
-    // Wait until reference block is ready. Pad 7 more pixels as last 7
-    // pixels of each superblock row can be changed by next superblock row.
-    if (cm->frame_parallel_decode)
-      vp10_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
-                            VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
-
-    // Skip border extension if block is inside the frame.
-    if (x0 < 0 || x0 > frame_width - 1 || x1 < 0 || x1 > frame_width - 1 ||
-        y0 < 0 || y0 > frame_height - 1 || y1 < 0 || y1 > frame_height - 1) {
-      // Extend the border.
-      const uint8_t *const buf_ptr1 = ref_frame + y0 * buf_stride + x0;
-      const int b_w = x1 - x0 + 1;
-      const int b_h = y1 - y0 + 1;
-      const int border_offset = y_pad * 3 * b_w + x_pad * 3;
-
-      extend_and_predict(buf_ptr1, buf_stride, x0, y0, b_w, b_h,
-                         frame_width, frame_height, border_offset,
-                         dst, dst_buf->stride,
-                         subpel_x, subpel_y,
-                         kernel, sf,
-#if CONFIG_VP9_HIGHBITDEPTH
-                         xd,
-#endif
-                         w, h, ref, xs, ys);
-      return;
-    }
-  } else {
-    // Wait until reference block is ready. Pad 7 more pixels as last 7
-    // pixels of each superblock row can be changed by next superblock row.
-     if (cm->frame_parallel_decode) {
-       const int y1 = (y0_16 + (h - 1) * ys) >> SUBPEL_BITS;
-       vp10_frameworker_wait(pbi->frame_worker_owner, ref_frame_buf,
-                             VPXMAX(0, (y1 + 7)) << (plane == 0 ? 0 : 1));
-     }
-  }
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    high_inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
-                         subpel_y, sf, w, h, ref, kernel, xs, ys, xd->bd);
-  } else {
-    inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
-                    subpel_y, sf, w, h, ref, kernel, xs, ys);
-  }
-#else
-  inter_predictor(buf_ptr, buf_stride, dst, dst_buf->stride, subpel_x,
-                  subpel_y, sf, w, h, ref, kernel, xs, ys);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-}
-
-static void dec_build_inter_predictors_sb(VP10Decoder *const pbi,
-                                          MACROBLOCKD *xd,
-                                          int mi_row, int mi_col) {
-  int plane;
-  const int mi_x = mi_col * MI_SIZE;
-  const int mi_y = mi_row * MI_SIZE;
-  const MODE_INFO *mi = xd->mi[0];
-  const InterpKernel *kernel = vp10_filter_kernels[mi->mbmi.interp_filter];
-  const BLOCK_SIZE sb_type = mi->mbmi.sb_type;
-  const int is_compound = has_second_ref(&mi->mbmi);
-
-  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-    struct macroblockd_plane *const pd = &xd->plane[plane];
-    struct buf_2d *const dst_buf = &pd->dst;
-    const int num_4x4_w = pd->n4_w;
-    const int num_4x4_h = pd->n4_h;
-
-    const int n4w_x4 = 4 * num_4x4_w;
-    const int n4h_x4 = 4 * num_4x4_h;
-    int ref;
-
-    for (ref = 0; ref < 1 + is_compound; ++ref) {
-      const struct scale_factors *const sf = &xd->block_refs[ref]->sf;
-      struct buf_2d *const pre_buf = &pd->pre[ref];
-      const int idx = xd->block_refs[ref]->idx;
-      BufferPool *const pool = pbi->common.buffer_pool;
-      RefCntBuffer *const ref_frame_buf = &pool->frame_bufs[idx];
-      const int is_scaled = vp10_is_scaled(sf);
-
-      if (sb_type < BLOCK_8X8) {
-        const PARTITION_TYPE bp = BLOCK_8X8 - sb_type;
-        const int have_vsplit = bp != PARTITION_HORZ;
-        const int have_hsplit = bp != PARTITION_VERT;
-        const int num_4x4_w = 2 >> ((!have_vsplit) | pd->subsampling_x);
-        const int num_4x4_h = 2 >> ((!have_hsplit) | pd->subsampling_y);
-        const int pw = 8 >> (have_vsplit | pd->subsampling_x);
-        const int ph = 8 >> (have_hsplit | pd->subsampling_y);
-        int x, y;
-        for (y = 0; y < num_4x4_h; ++y) {
-          for (x = 0; x < num_4x4_w; ++x) {
-            const MV mv = average_split_mvs(pd, mi, ref, y * 2 + x);
-            dec_build_inter_predictors(pbi, xd, plane, n4w_x4, n4h_x4,
-                                       4 * x, 4 * y, pw, ph, mi_x, mi_y, kernel,
-                                       sf, pre_buf, dst_buf, &mv,
-                                       ref_frame_buf, is_scaled, ref);
-          }
-        }
-      } else {
-        const MV mv = mi->mbmi.mv[ref].as_mv;
-        dec_build_inter_predictors(pbi, xd, plane, n4w_x4, n4h_x4,
-                                   0, 0, n4w_x4, n4h_x4, mi_x, mi_y, kernel,
-                                   sf, pre_buf, dst_buf, &mv, ref_frame_buf,
-                                   is_scaled, ref);
-      }
-    }
-  }
-}
+#endif  // !CONFIG_VAR_TX || CONFIG_SUPERTX
 
 static INLINE TX_SIZE dec_get_uv_tx_size(const MB_MODE_INFO *mbmi,
                                          int n4_wl, int n4_hl) {
@@ -799,6 +431,11 @@
 
   set_skip_context(xd, mi_row, mi_col);
 
+#if CONFIG_VAR_TX
+  xd->max_tx_size = max_txsize_lookup[bsize];
+#endif
+
   // Distance of Mb to the various image edges. These are specified to 8th pel
   // as they are always compared to values that are in 1/8th pel units
   set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
@@ -807,9 +444,819 @@
   return &xd->mi[0]->mbmi;
 }
 
+#if CONFIG_SUPERTX
+static MB_MODE_INFO *set_offsets_extend(VP10_COMMON *const cm,
+                                        MACROBLOCKD *const xd,
+                                        const TileInfo *const tile,
+                                        BLOCK_SIZE bsize_pred,
+                                        int mi_row_pred, int mi_col_pred,
+                                        int mi_row_ori, int mi_col_ori) {
+  // Used in supertx
+  // (mi_row_ori, mi_col_ori): location for mv
+  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+  const int bw = num_8x8_blocks_wide_lookup[bsize_pred];
+  const int bh = num_8x8_blocks_high_lookup[bsize_pred];
+  const int offset = mi_row_ori * cm->mi_stride + mi_col_ori;
+  const int bwl = b_width_log2_lookup[bsize_pred];
+  const int bhl = b_height_log2_lookup[bsize_pred];
+  xd->mi = cm->mi_grid_visible + offset;
+  xd->mi[0] = cm->mi + offset;
+  set_mi_row_col(xd, tile, mi_row_pred, bh, mi_col_pred, bw,
+                 cm->mi_rows, cm->mi_cols);
+
+  xd->up_available    = (mi_row_ori > tile->mi_row_start);
+  xd->left_available  = (mi_col_ori > tile->mi_col_start);
+
+  set_plane_n4(xd, bw, bh, bwl, bhl);
+
+  return &xd->mi[0]->mbmi;
+}
+
+static MB_MODE_INFO *set_mb_offsets(VP10_COMMON *const cm,
+                                    MACROBLOCKD *const xd,
+                                    BLOCK_SIZE bsize,
+                                    int mi_row, int mi_col,
+                                    int bw, int bh,
+                                    int x_mis, int y_mis) {
+  const int offset = mi_row * cm->mi_stride + mi_col;
+  const TileInfo *const tile = &xd->tile;
+  int x, y;
+
+  xd->mi = cm->mi_grid_visible + offset;
+  xd->mi[0] = cm->mi + offset;
+  xd->mi[0]->mbmi.sb_type = bsize;
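+  // Fan the top-left mi pointer out to the whole block; x starts at !y to
+  // skip the (0, 0) entry that was just set.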
+  for (y = 0; y < y_mis; ++y)
+    for (x = !y; x < x_mis; ++x)
+      xd->mi[y * cm->mi_stride + x] = xd->mi[0];
+
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+  return &xd->mi[0]->mbmi;
+}
+
+static void set_offsets_topblock(VP10_COMMON *const cm, MACROBLOCKD *const xd,
+                                 const TileInfo *const tile,
+                                 BLOCK_SIZE bsize, int mi_row, int mi_col) {
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int offset = mi_row * cm->mi_stride + mi_col;
+  const int bwl = b_width_log2_lookup[bsize];
+  const int bhl = b_height_log2_lookup[bsize];
+
+  xd->mi = cm->mi_grid_visible + offset;
+  xd->mi[0] = cm->mi + offset;
+
+  set_plane_n4(xd, bw, bh, bwl, bhl);
+
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
+
+  vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+}
+
+static void set_param_topblock(VP10_COMMON *const cm,  MACROBLOCKD *const xd,
+                               BLOCK_SIZE bsize, int mi_row, int mi_col,
+                               int txfm, int skip) {
+  const int bw = num_8x8_blocks_wide_lookup[bsize];
+  const int bh = num_8x8_blocks_high_lookup[bsize];
+  const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
+  const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
+  const int offset = mi_row * cm->mi_stride + mi_col;
+  int x, y;
+
+  xd->mi = cm->mi_grid_visible + offset;
+  xd->mi[0] = cm->mi + offset;
+
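+  // Propagate the decoded skip flag and tx type to every mi in the block.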
+  for (y = 0; y < y_mis; ++y)
+    for (x = 0; x < x_mis; ++x) {
+      xd->mi[y * cm->mi_stride + x]->mbmi.skip = skip;
+      xd->mi[y * cm->mi_stride + x]->mbmi.tx_type = txfm;
+    }
+#if CONFIG_VAR_TX
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context =
+    xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  set_txfm_ctx(xd->left_txfm_context, xd->mi[0]->mbmi.tx_size, bh);
+  set_txfm_ctx(xd->above_txfm_context, xd->mi[0]->mbmi.tx_size, bw);
+#endif
+}
+
+static void set_ref(VP10_COMMON *const cm, MACROBLOCKD *const xd,
+                    int idx, int mi_row, int mi_col) {
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  RefBuffer *ref_buffer = &cm->frame_refs[mbmi->ref_frame[idx] - LAST_FRAME];
+  xd->block_refs[idx] = ref_buffer;
+  if (!vp10_is_valid_scale(&ref_buffer->sf))
+    vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                       "Invalid scale factors");
+  vp10_setup_pre_planes(xd, idx, ref_buffer->buf, mi_row, mi_col,
+                        &ref_buffer->sf);
+  xd->corrupted |= ref_buffer->buf->corrupted;
+}
+
+static void dec_predict_b_extend(
+    VP10Decoder *const pbi, MACROBLOCKD *const xd,
+    const TileInfo *const tile, int block,
+    int mi_row_ori, int mi_col_ori,
+    int mi_row_pred, int mi_col_pred,
+    int mi_row_top, int mi_col_top,
+    uint8_t *dst_buf[3], int dst_stride[3],
+    BLOCK_SIZE bsize_top,
+    BLOCK_SIZE bsize_pred,
+    int b_sub8x8, int bextend) {
+  // Used in supertx
+  // (mi_row_ori, mi_col_ori): location for mv
+  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+  // (mi_row_top, mi_col_top, bsize_top): region of the top partition size
+  // block: sub location of sub8x8 blocks
+  // b_sub8x8: 1: ori is sub8x8; 0: ori is not sub8x8
+  // bextend: 1: region to predict is an extension of ori; 0: not
+  int r = (mi_row_pred - mi_row_top) * MI_SIZE;
+  int c = (mi_col_pred - mi_col_top) * MI_SIZE;
+  const int mi_width_top = num_8x8_blocks_wide_lookup[bsize_top];
+  const int mi_height_top = num_8x8_blocks_high_lookup[bsize_top];
+  MB_MODE_INFO *mbmi;
+  VP10_COMMON *const cm = &pbi->common;
+
+  if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top ||
+      mi_row_pred >= mi_row_top + mi_height_top ||
+      mi_col_pred >= mi_col_top + mi_width_top ||
+      mi_row_pred >= cm->mi_rows || mi_col_pred >= cm->mi_cols)
+    return;
+
+  mbmi = set_offsets_extend(cm, xd, tile, bsize_pred,
+                            mi_row_pred, mi_col_pred,
+                            mi_row_ori, mi_col_ori);
+  set_ref(cm, xd, 0, mi_row_pred, mi_col_pred);
+  if (has_second_ref(&xd->mi[0]->mbmi))
+    set_ref(cm, xd, 1, mi_row_pred, mi_col_pred);
+
+  if (!bextend) {
+    mbmi->tx_size = b_width_log2_lookup[bsize_top];
+  }
+
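+  // Redirect the dst planes to the (r, c) offset inside the top block's
+  // prediction buffers.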
+  xd->plane[0].dst.stride = dst_stride[0];
+  xd->plane[1].dst.stride = dst_stride[1];
+  xd->plane[2].dst.stride = dst_stride[2];
+  xd->plane[0].dst.buf = dst_buf[0] +
+                         (r >> xd->plane[0].subsampling_y) * dst_stride[0] +
+                         (c >> xd->plane[0].subsampling_x);
+  xd->plane[1].dst.buf = dst_buf[1] +
+                         (r >> xd->plane[1].subsampling_y) * dst_stride[1] +
+                         (c >> xd->plane[1].subsampling_x);
+  xd->plane[2].dst.buf = dst_buf[2] +
+                         (r >> xd->plane[2].subsampling_y) * dst_stride[2] +
+                         (c >> xd->plane[2].subsampling_x);
+
+  if (!b_sub8x8)
+    vp10_build_inter_predictors_sb_extend(
+        xd,
+#if CONFIG_EXT_INTER
+        mi_row_ori, mi_col_ori,
+#endif  // CONFIG_EXT_INTER
+        mi_row_pred, mi_col_pred, bsize_pred);
+  else
+    vp10_build_inter_predictors_sb_sub8x8_extend(
+        xd,
+#if CONFIG_EXT_INTER
+        mi_row_ori, mi_col_ori,
+#endif  // CONFIG_EXT_INTER
+        mi_row_pred, mi_col_pred, bsize_pred, block);
+}
+
+static void dec_extend_dir(VP10Decoder *const pbi, MACROBLOCKD *const xd,
+                           const TileInfo *const tile, int block,
+                           BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+                           int mi_row, int mi_col,
+                           int mi_row_top, int mi_col_top,
+                           uint8_t *dst_buf[3], int dst_stride[3], int dir) {
+  // dir: 0-lower, 1-upper, 2-left, 3-right
+  //      4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  int xss = xd->plane[1].subsampling_x;
+  int yss = xd->plane[1].subsampling_y;
+  int b_sub8x8 = (bsize < BLOCK_8X8) ? 1 : 0;
+  BLOCK_SIZE extend_bsize;
+  int unit, mi_row_pred, mi_col_pred;
+
+  if (dir == 0 || dir == 1) {
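+    // Extend into the row above/below: use a 16x8 unit unless the block
+    // width, a sub8x8 size, or chroma subsampling forces 8x8.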
+    extend_bsize = (mi_width == 1 || bsize < BLOCK_8X8 || xss < yss) ?
+                    BLOCK_8X8 : BLOCK_16X8;
+    unit = num_8x8_blocks_wide_lookup[extend_bsize];
+    mi_row_pred = mi_row + ((dir == 0) ? mi_height : -1);
+    mi_col_pred = mi_col;
+
+    dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col,
+                         mi_row_pred, mi_col_pred,
+                         mi_row_top, mi_col_top,
+                         dst_buf, dst_stride,
+                         top_bsize, extend_bsize, b_sub8x8, 1);
+
+    if (mi_width > unit) {
+      int i;
+      assert(!b_sub8x8);
+      for (i = 0; i < mi_width / unit - 1; i++) {
+        mi_col_pred += unit;
+        dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col,
+                             mi_row_pred, mi_col_pred,
+                             mi_row_top, mi_col_top,
+                             dst_buf, dst_stride,
+                             top_bsize, extend_bsize, b_sub8x8, 1);
+      }
+    }
+  } else if (dir == 2 || dir == 3) {
+    extend_bsize = (mi_height == 1 || bsize < BLOCK_8X8 || yss < xss) ?
+                    BLOCK_8X8 : BLOCK_8X16;
+    unit = num_8x8_blocks_high_lookup[extend_bsize];
+    mi_row_pred = mi_row;
+    mi_col_pred = mi_col + ((dir == 3) ? mi_width : -1);
+
+    dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col,
+                         mi_row_pred, mi_col_pred,
+                         mi_row_top, mi_col_top,
+                         dst_buf, dst_stride,
+                         top_bsize, extend_bsize, b_sub8x8, 1);
+
+    if (mi_height > unit) {
+      int i;
+      for (i = 0; i < mi_height / unit - 1; i++) {
+        mi_row_pred += unit;
+        dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col,
+                             mi_row_pred, mi_col_pred,
+                             mi_row_top, mi_col_top,
+                             dst_buf, dst_stride,
+                             top_bsize, extend_bsize, b_sub8x8, 1);
+      }
+    }
+  } else {
+    extend_bsize = BLOCK_8X8;
+    mi_row_pred = mi_row + ((dir == 4 || dir == 6) ? mi_height : -1);
+    mi_col_pred = mi_col + ((dir == 6 || dir == 7) ? mi_width : -1);
+    dec_predict_b_extend(pbi, xd, tile, block, mi_row, mi_col,
+                         mi_row_pred, mi_col_pred,
+                         mi_row_top, mi_col_top,
+                         dst_buf, dst_stride,
+                         top_bsize, extend_bsize, b_sub8x8, 1);
+  }
+}
+
+static void dec_extend_all(VP10Decoder *const pbi, MACROBLOCKD *const xd,
+                           const TileInfo *const tile, int block,
+                           BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+                           int mi_row, int mi_col,
+                           int mi_row_top, int mi_col_top,
+                           uint8_t *dst_buf[3], int dst_stride[3]) {
+  dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 0);
+  dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 1);
+  dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 2);
+  dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 3);
+  dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 4);
+  dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 5);
+  dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 6);
+  dec_extend_dir(pbi, xd, tile, block, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, dst_buf, dst_stride, 7);
+}
+
+static void dec_predict_sb_complex(VP10Decoder *const pbi,
+                                   MACROBLOCKD *const xd,
+                                   const TileInfo *const tile,
+                                   int mi_row, int mi_col,
+                                   int mi_row_top, int mi_col_top,
+                                   BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+                                   uint8_t *dst_buf[3], int dst_stride[3]) {
+  const VP10_COMMON *const cm = &pbi->common;
+  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+  const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
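+  // bsize2: the split (half-width, half-height) unit used by the extended
+  // A/B partitions.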
+#endif
+  int i;
+  const int mi_offset = mi_row * cm->mi_stride + mi_col;
+  uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
+
+  DECLARE_ALIGNED(16, uint8_t,
+                  tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+  DECLARE_ALIGNED(16, uint8_t,
+                  tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+  DECLARE_ALIGNED(16, uint8_t,
+                  tmp_buf3[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+  int dst_stride1[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE};
+  int dst_stride2[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE};
+  int dst_stride3[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE};
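+  // tmp_buf1..3 hold the sibling partitions' predictions; they are blended
+  // into dst_buf along the partition seams below.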
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    int len = sizeof(uint16_t);
+    dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+    dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_TX_SQUARE * len);
+    dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_TX_SQUARE * len);
+    dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+    dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_TX_SQUARE * len);
+    dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_TX_SQUARE * len);
+    dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3);
+    dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAX_TX_SQUARE * len);
+    dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAX_TX_SQUARE * len);
+  } else {
+#endif
+    dst_buf1[0] = tmp_buf1;
+    dst_buf1[1] = tmp_buf1 + MAX_TX_SQUARE;
+    dst_buf1[2] = tmp_buf1 + 2 * MAX_TX_SQUARE;
+    dst_buf2[0] = tmp_buf2;
+    dst_buf2[1] = tmp_buf2 + MAX_TX_SQUARE;
+    dst_buf2[2] = tmp_buf2 + 2 * MAX_TX_SQUARE;
+    dst_buf3[0] = tmp_buf3;
+    dst_buf3[1] = tmp_buf3 + MAX_TX_SQUARE;
+    dst_buf3[2] = tmp_buf3 + 2 * MAX_TX_SQUARE;
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  xd->mi = cm->mi_grid_visible + mi_offset;
+  xd->mi[0] = cm->mi + mi_offset;
+
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].dst.buf = dst_buf[i];
+    xd->plane[i].dst.stride = dst_stride[i];
+  }
+
+  switch (partition) {
+    case PARTITION_NONE:
+      assert(bsize < top_bsize);
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, dst_buf, dst_stride,
+                           top_bsize, bsize, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, dst_buf, dst_stride);
+      break;
+    case PARTITION_HORZ:
+      if (bsize == BLOCK_8X8) {
+        // For sub8x8, predict in 8x8 units
+        // First half
+        dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf, dst_stride,
+                             top_bsize, BLOCK_8X8, 1, 0);
+        if (bsize < top_bsize)
+          dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+        // Second half
+        dec_predict_b_extend(pbi, xd, tile, 2, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                             top_bsize, BLOCK_8X8, 1, 1);
+        if (bsize < top_bsize)
+          dec_extend_all(pbi, xd, tile, 2, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+        // weighted average to smooth the boundary
+        xd->plane[0].dst.buf = dst_buf[0];
+        xd->plane[0].dst.stride = dst_stride[0];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[0], dst_stride[0],
+                                                  dst_buf1[0], dst_stride1[0],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_HORZ, 0);
+      } else {
+        // First half
+        dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf, dst_stride,
+                             top_bsize, subsize, 0, 0);
+        if (bsize < top_bsize)
+          dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride);
+        else
+          dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride, 0);
+
+        if (mi_row + hbs < cm->mi_rows) {
+          // Second half
+          dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col,
+                               mi_row + hbs, mi_col,
+                               mi_row_top, mi_col_top,
+                               dst_buf1, dst_stride1,
+                               top_bsize, subsize, 0, 0);
+          if (bsize < top_bsize)
+            dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize,
+                           mi_row + hbs, mi_col,
+                           mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1);
+          else
+            dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize,
+                           mi_row + hbs, mi_col,
+                           mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, 1);
+
+          // weighted average to smooth the boundary
+          for (i = 0; i < MAX_MB_PLANE; i++) {
+            xd->plane[i].dst.buf = dst_buf[i];
+            xd->plane[i].dst.stride = dst_stride[i];
+            vp10_build_masked_inter_predictor_complex(
+                xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+                mi_row, mi_col, mi_row_top, mi_col_top,
+                bsize, top_bsize, PARTITION_HORZ, i);
+          }
+        }
+      }
+      break;
+    case PARTITION_VERT:
+      if (bsize == BLOCK_8X8) {
+        // First half
+        dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf, dst_stride,
+                             top_bsize, BLOCK_8X8, 1, 0);
+        if (bsize < top_bsize)
+          dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+        // Second half
+        dec_predict_b_extend(pbi, xd, tile, 1, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                             top_bsize, BLOCK_8X8, 1, 1);
+        if (bsize < top_bsize)
+          dec_extend_all(pbi, xd, tile, 1, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+        // Smooth
+        xd->plane[0].dst.buf = dst_buf[0];
+        xd->plane[0].dst.stride = dst_stride[0];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[0], dst_stride[0],
+                                                  dst_buf1[0], dst_stride1[0],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_VERT, 0);
+      } else {
+        // First half
+        dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf, dst_stride,
+                             top_bsize, subsize, 0, 0);
+        if (bsize < top_bsize)
+          dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride);
+        else
+          dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride, 3);
+
+        // Second half
+        if (mi_col + hbs < cm->mi_cols) {
+          dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs,
+                               mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                               dst_buf1, dst_stride1, top_bsize, subsize, 0, 0);
+          if (bsize < top_bsize)
+            dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize,
+                           mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1);
+          else
+            dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize,
+                           mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, 2);
+
+          // Smooth
+          for (i = 0; i < MAX_MB_PLANE; i++) {
+            xd->plane[i].dst.buf = dst_buf[i];
+            xd->plane[i].dst.stride = dst_stride[i];
+            vp10_build_masked_inter_predictor_complex(
+                xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+                mi_row, mi_col, mi_row_top, mi_col_top,
+                bsize, top_bsize, PARTITION_VERT, i);
+          }
+        }
+      }
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8) {
+        dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf, dst_stride,
+                             top_bsize, BLOCK_8X8, 1, 0);
+        dec_predict_b_extend(pbi, xd, tile, 1, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                             top_bsize, BLOCK_8X8, 1, 1);
+        dec_predict_b_extend(pbi, xd, tile, 2, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+                             top_bsize, BLOCK_8X8, 1, 1);
+        dec_predict_b_extend(pbi, xd, tile, 3, mi_row, mi_col, mi_row, mi_col,
+                             mi_row_top, mi_col_top, dst_buf3, dst_stride3,
+                             top_bsize, BLOCK_8X8, 1, 1);
+        if (bsize < top_bsize) {
+          dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride);
+          dec_extend_all(pbi, xd, tile, 1, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+          dec_extend_all(pbi, xd, tile, 2, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf2, dst_stride2);
+          dec_extend_all(pbi, xd, tile, 3, subsize, top_bsize, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf3, dst_stride3);
+        }
+      } else {
+        dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col,
+                               mi_row_top, mi_col_top, subsize, top_bsize,
+                               dst_buf, dst_stride);
+        if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+          dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col + hbs,
+                                 mi_row_top, mi_col_top, subsize, top_bsize,
+                                 dst_buf1, dst_stride1);
+        if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
+          dec_predict_sb_complex(pbi, xd, tile, mi_row + hbs, mi_col,
+                                 mi_row_top, mi_col_top, subsize, top_bsize,
+                                 dst_buf2, dst_stride2);
+        if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+          dec_predict_sb_complex(pbi, xd, tile, mi_row + hbs, mi_col + hbs,
+                                 mi_row_top, mi_col_top, subsize, top_bsize,
+                                 dst_buf3, dst_stride3);
+      }
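+      // Blend the quadrant predictions: vertical seams first, then the
+      // horizontal seam between the top and bottom halves.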
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        if (bsize == BLOCK_8X8 && i != 0)
+          continue;  // Skip <4x4 chroma smoothing
+        if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
+          vp10_build_masked_inter_predictor_complex(xd,
+                                                    dst_buf[i], dst_stride[i],
+                                                    dst_buf1[i],
+                                                    dst_stride1[i],
+                                                    mi_row, mi_col,
+                                                    mi_row_top, mi_col_top,
+                                                    bsize, top_bsize,
+                                                    PARTITION_VERT, i);
+          if (mi_row + hbs < cm->mi_rows) {
+            vp10_build_masked_inter_predictor_complex(xd,
+                                                      dst_buf2[i],
+                                                      dst_stride2[i],
+                                                      dst_buf3[i],
+                                                      dst_stride3[i],
+                                                      mi_row, mi_col,
+                                                      mi_row_top, mi_col_top,
+                                                      bsize, top_bsize,
+                                                      PARTITION_VERT, i);
+            vp10_build_masked_inter_predictor_complex(xd,
+                                                      dst_buf[i],
+                                                      dst_stride[i],
+                                                      dst_buf2[i],
+                                                      dst_stride2[i],
+                                                      mi_row, mi_col,
+                                                      mi_row_top, mi_col_top,
+                                                      bsize, top_bsize,
+                                                      PARTITION_HORZ, i);
+          }
+        } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
+          vp10_build_masked_inter_predictor_complex(xd,
+                                                    dst_buf[i],
+                                                    dst_stride[i],
+                                                    dst_buf2[i],
+                                                    dst_stride2[i],
+                                                    mi_row, mi_col,
+                                                    mi_row_top, mi_col_top,
+                                                    bsize, top_bsize,
+                                                    PARTITION_HORZ, i);
+        }
+      }
+      break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_HORZ_A:
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, dst_buf, dst_stride,
+                           top_bsize, bsize2, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs, mi_row,
+                           mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, top_bsize, bsize2, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+                     mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col,
+                           mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                           dst_buf2, dst_stride2, top_bsize, subsize, 0, 0);
+      if (bsize < top_bsize)
+        dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize,
+                       mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2);
+      else
+        dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize,
+                       mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2, 1);
+
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf[i];
+        xd->plane[i].dst.stride = dst_stride[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_VERT, i);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf2[i], dst_stride2[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_HORZ, i);
+      }
+      break;
+    case PARTITION_VERT_A:
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, dst_buf, dst_stride,
+                           top_bsize, bsize2, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, dst_buf, dst_stride);
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col,
+                           mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, top_bsize, bsize2, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+                     mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs,
+                           mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf2, dst_stride2,
+                           top_bsize, subsize, 0, 0);
+      if (bsize < top_bsize)
+        dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize,
+                       mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2);
+      else
+        dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize,
+                       mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2, 2);
+
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf[i];
+        xd->plane[i].dst.stride = dst_stride[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_HORZ, i);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf2[i], dst_stride2[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_VERT, i);
+      }
+      break;
+    case PARTITION_HORZ_B:
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, dst_buf, dst_stride,
+                           top_bsize, subsize, 0, 0);
+      if (bsize < top_bsize)
+        dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride);
+      else
+        dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride, 0);
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+                           mi_col, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, top_bsize, bsize2, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+                     mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col + hbs,
+                           mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf2, dst_stride2, top_bsize, bsize2, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize,
+                     mi_row + hbs, mi_col + hbs,
+                     mi_row_top, mi_col_top, dst_buf2, dst_stride2);
+
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf1[i];
+        xd->plane[i].dst.stride = dst_stride1[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  dst_buf2[i], dst_stride2[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_VERT, i);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf[i];
+        xd->plane[i].dst.stride = dst_stride[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_HORZ, i);
+      }
+      break;
+    case PARTITION_VERT_B:
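+      // VERT_B: the left half is a single vertical (subsize) block; the
+      // right half is coded as two square (bsize2) blocks stacked vertically.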
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                           mi_row_top, mi_col_top, dst_buf, dst_stride,
+                           top_bsize, subsize, 0, 0);
+      if (bsize < top_bsize)
+        dec_extend_all(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride);
+      else
+        dec_extend_dir(pbi, xd, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride, 3);
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row, mi_col + hbs,
+                           mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, top_bsize, bsize2, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+                     mi_row_top, mi_col_top, dst_buf1, dst_stride1);
+
+      dec_predict_b_extend(pbi, xd, tile, 0, mi_row + hbs, mi_col + hbs,
+                           mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf2, dst_stride2, top_bsize, bsize2, 0, 0);
+      dec_extend_all(pbi, xd, tile, 0, bsize2, top_bsize,
+                     mi_row + hbs, mi_col + hbs,
+                     mi_row_top, mi_col_top, dst_buf2, dst_stride2);
+
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf1[i];
+        xd->plane[i].dst.stride = dst_stride1[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  dst_buf2[i], dst_stride2[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_HORZ, i);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf[i];
+        xd->plane[i].dst.stride = dst_stride[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_VERT, i);
+      }
+      break;
+#endif  // CONFIG_EXT_PARTITION_TYPES
+    default:
+      assert(0);
+  }
+}
+
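+// Every mi unit covered by a supertx block must share one segment id: take
+// the minimum segment_id over the covered grid and write it back to each
+// block's segment_id_supertx.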
+static void set_segment_id_supertx(const VP10_COMMON *const cm,
+                                   const int mi_row, const int mi_col,
+                                   const BLOCK_SIZE bsize) {
+  const struct segmentation *seg = &cm->seg;
+  const int miw =
+      VPXMIN(num_8x8_blocks_wide_lookup[bsize], cm->mi_cols - mi_col);
+  const int mih =
+      VPXMIN(num_8x8_blocks_high_lookup[bsize], cm->mi_rows - mi_row);
+  const int mi_offset = mi_row * cm->mi_stride + mi_col;
+  MODE_INFO **const mip = cm->mi_grid_visible + mi_offset;
+  int r, c;
+  int seg_id_supertx = MAX_SEGMENTS;
+
+  if (!seg->enabled) {
+    seg_id_supertx = 0;
+  } else {
+    // Find the minimum segment_id
+    for (r = 0; r < mih; r++)
+      for (c = 0; c < miw; c++)
+        seg_id_supertx = VPXMIN(mip[r * cm->mi_stride + c]->mbmi.segment_id,
+                                seg_id_supertx);
+    assert(0 <= seg_id_supertx && seg_id_supertx < MAX_SEGMENTS);
+  }
+
+  // Assign the derived segment_id to each block's segment_id_supertx
+  for (r = 0; r < mih; r++)
+    for (c = 0; c < miw; c++)
+      mip[r * cm->mi_stride + c]->mbmi.segment_id_supertx = seg_id_supertx;
+}
+#endif  // CONFIG_SUPERTX
+
 static void decode_block(VP10Decoder *const pbi, MACROBLOCKD *const xd,
+#if CONFIG_SUPERTX
+                         int supertx_enabled,
+#endif  // CONFIG_SUPERTX
                          int mi_row, int mi_col,
-                         vpx_reader *r, BLOCK_SIZE bsize,
+                         vp10_reader *r,
+#if CONFIG_EXT_PARTITION_TYPES
+                         PARTITION_TYPE partition,
+#endif  // CONFIG_EXT_PARTITION_TYPES
+                         BLOCK_SIZE bsize,
                          int bwl, int bhl) {
   VP10_COMMON *const cm = &pbi->common;
   const int less8x8 = bsize < BLOCK_8X8;
@@ -818,8 +1265,28 @@
   const int x_mis = VPXMIN(bw, cm->mi_cols - mi_col);
   const int y_mis = VPXMIN(bh, cm->mi_rows - mi_row);
 
+#if CONFIG_SUPERTX
+  MB_MODE_INFO *mbmi;
+  if (supertx_enabled) {
+    mbmi = set_mb_offsets(cm, xd, bsize, mi_row, mi_col,
+                          bw, bh, x_mis, y_mis);
+  } else {
+    mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col,
+                       bw, bh, x_mis, y_mis, bwl, bhl);
+  }
+#if CONFIG_EXT_PARTITION_TYPES
+  xd->mi[0]->mbmi.partition = partition;
+#endif
+  vp10_read_mode_info(pbi, xd, supertx_enabled,
+                      mi_row, mi_col, r, x_mis, y_mis);
+#else
   MB_MODE_INFO *mbmi = set_offsets(cm, xd, bsize, mi_row, mi_col,
                                    bw, bh, x_mis, y_mis, bwl, bhl);
+#if CONFIG_EXT_PARTITION_TYPES
+  xd->mi[0]->mbmi.partition = partition;
+#endif
+  vp10_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
+#endif  // CONFIG_SUPERTX
 
   if (bsize >= BLOCK_8X8 && (cm->subsampling_x || cm->subsampling_y)) {
     const BLOCK_SIZE uv_subsize =
@@ -829,36 +1296,100 @@
                          VPX_CODEC_CORRUPT_FRAME, "Invalid block size.");
   }
 
-  vp10_read_mode_info(pbi, xd, mi_row, mi_col, r, x_mis, y_mis);
+#if CONFIG_SUPERTX
+  mbmi->segment_id_supertx = MAX_SEGMENTS;
+
+  if (supertx_enabled) {
+    xd->corrupted |= vp10_reader_has_error(r);
+    return;
+  }
+#endif  // CONFIG_SUPERTX
 
   if (mbmi->skip) {
     dec_reset_skip_context(xd);
   }
-
   if (!is_inter_block(mbmi)) {
     int plane;
+    for (plane = 0; plane <= 1; ++plane) {
+      if (mbmi->palette_mode_info.palette_size[plane])
+        vp10_decode_palette_tokens(xd, plane, r);
+    }
     for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
       const struct macroblockd_plane *const pd = &xd->plane[plane];
       const TX_SIZE tx_size =
           plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
-                  : mbmi->tx_size;
+          : mbmi->tx_size;
       const int num_4x4_w = pd->n4_w;
       const int num_4x4_h = pd->n4_h;
       const int step = (1 << tx_size);
       int row, col;
-      const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ?
-          0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-      const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
-          0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+      const int max_blocks_wide = num_4x4_w +
+          (xd->mb_to_right_edge >= 0 ?
+           0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+      const int max_blocks_high = num_4x4_h +
+          (xd->mb_to_bottom_edge >= 0 ?
+           0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
       for (row = 0; row < max_blocks_high; row += step)
         for (col = 0; col < max_blocks_wide; col += step)
-          predict_and_reconstruct_intra_block(xd, r, mbmi, plane,
+          predict_and_reconstruct_intra_block(xd,
+                                              r,
+                                              mbmi, plane,
                                               row, col, tx_size);
     }
   } else {
     // Prediction
-    dec_build_inter_predictors_sb(pbi, xd, mi_row, mi_col);
+    vp10_build_inter_predictors_sb(xd, mi_row, mi_col,
+                                   VPXMAX(bsize, BLOCK_8X8));
+#if CONFIG_OBMC
+    if (mbmi->motion_variation == OBMC_CAUSAL) {
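+      // OBMC: build predictions extended from the above and left neighbors
+      // into tmp_buf1/tmp_buf2, then blend them into the causal prediction.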
+#if CONFIG_VP9_HIGHBITDEPTH
+      DECLARE_ALIGNED(16, uint8_t,
+                      tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+      DECLARE_ALIGNED(16, uint8_t,
+                      tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+#else
+      DECLARE_ALIGNED(16, uint8_t,
+                      tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]);
+      DECLARE_ALIGNED(16, uint8_t,
+                      tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
+      int dst_stride1[MAX_MB_PLANE] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE};
+      int dst_stride2[MAX_MB_PLANE] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE};
+
+      assert(mbmi->sb_type >= BLOCK_8X8);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        int len = sizeof(uint16_t);
+        dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+        dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
+        dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len);
+        dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+        dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
+        dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len);
+      } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        dst_buf1[0] = tmp_buf1;
+        dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
+        dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2;
+        dst_buf2[0] = tmp_buf2;
+        dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
+        dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2;
+#if CONFIG_VP9_HIGHBITDEPTH
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      vp10_build_prediction_by_above_preds(cm, xd, mi_row, mi_col,
+                                           dst_buf1, dst_stride1);
+      vp10_build_prediction_by_left_preds(cm, xd, mi_row, mi_col,
+                                          dst_buf2, dst_stride2);
+      vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm),
+                            mi_row, mi_col);
+      vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col,
+                                       dst_buf1, dst_stride1,
+                                       dst_buf2, dst_stride2);
+    }
+#endif  // CONFIG_OBMC
 
     // Reconstruction
     if (!mbmi->skip) {
@@ -867,41 +1398,62 @@
 
       for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
         const struct macroblockd_plane *const pd = &xd->plane[plane];
-        const TX_SIZE tx_size =
-            plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
-                    : mbmi->tx_size;
         const int num_4x4_w = pd->n4_w;
         const int num_4x4_h = pd->n4_h;
-        const int step = (1 << tx_size);
         int row, col;
-        const int max_blocks_wide = num_4x4_w + (xd->mb_to_right_edge >= 0 ?
-            0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
-        const int max_blocks_high = num_4x4_h + (xd->mb_to_bottom_edge >= 0 ?
-            0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+#if CONFIG_VAR_TX
+        // TODO(jingning): This can be simplified for decoder performance.
+        const BLOCK_SIZE plane_bsize =
+            get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), pd);
+        const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+        const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+        int bw = num_4x4_blocks_wide_lookup[txb_size];
+        int block = 0;
+        const int step = 1 << (max_tx_size << 1);
+
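+        // Walk the plane in maximum-size transform units; for each unit
+        // decode_reconstruct_tx descends recursively to the actual
+        // transform split.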
+        for (row = 0; row < num_4x4_h; row += bw) {
+          for (col = 0; col < num_4x4_w; col += bw) {
+            decode_reconstruct_tx(xd, r, mbmi, plane, plane_bsize,
+                                  block, row, col, max_tx_size, &eobtotal);
+            block += step;
+          }
+        }
+#else
+        const TX_SIZE tx_size =
+            plane ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
+            : mbmi->tx_size;
+        const int step = (1 << tx_size);
+        const int max_blocks_wide = num_4x4_w +
+            (xd->mb_to_right_edge >= 0 ?
+             0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+        const int max_blocks_high = num_4x4_h +
+            (xd->mb_to_bottom_edge >= 0 ?
+             0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
 
         for (row = 0; row < max_blocks_high; row += step)
           for (col = 0; col < max_blocks_wide; col += step)
-            eobtotal += reconstruct_inter_block(xd, r, mbmi, plane, row, col,
+            eobtotal += reconstruct_inter_block(xd,
+                                                r,
+                                                mbmi->segment_id,
+                                                plane, row, col,
                                                 tx_size);
+#endif
       }
 
       if (!less8x8 && eobtotal == 0)
-#if CONFIG_MISC_FIXES
         mbmi->has_no_coeffs = 1;  // skip loopfilter
-#else
-        mbmi->skip = 1;  // skip loopfilter
-#endif
     }
   }
 
-  xd->corrupted |= vpx_reader_has_error(r);
+  xd->corrupted |= vp10_reader_has_error(r);
 }
 
 static INLINE int dec_partition_plane_context(const MACROBLOCKD *xd,
                                               int mi_row, int mi_col,
                                               int bsl) {
   const PARTITION_CONTEXT *above_ctx = xd->above_seg_context + mi_col;
-  const PARTITION_CONTEXT *left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+  const PARTITION_CONTEXT *left_ctx =
+    xd->left_seg_context + (mi_row & MAX_MIB_MASK);
   int above = (*above_ctx >> bsl) & 1, left = (*left_ctx >> bsl) & 1;
 
 //  assert(bsl >= 0);
@@ -909,12 +1461,14 @@
   return (left * 2 + above) + bsl * PARTITION_PLOFFSET;
 }
 
+#if !CONFIG_EXT_PARTITION_TYPES
 static INLINE void dec_update_partition_context(MACROBLOCKD *xd,
                                                 int mi_row, int mi_col,
                                                 BLOCK_SIZE subsize,
                                                 int bw) {
   PARTITION_CONTEXT *const above_ctx = xd->above_seg_context + mi_col;
-  PARTITION_CONTEXT *const left_ctx = xd->left_seg_context + (mi_row & MI_MASK);
+  PARTITION_CONTEXT *const left_ctx =
+    xd->left_seg_context + (mi_row & MAX_MIB_MASK);
 
   // update the partition context at the end notes. set partition bits
   // of block sizes larger than the current one to be one, and partition
@@ -922,21 +1476,33 @@
   memset(above_ctx, partition_context_lookup[subsize].above, bw);
   memset(left_ctx, partition_context_lookup[subsize].left, bw);
 }
+#endif  // !CONFIG_EXT_PARTITION_TYPES
 
 static PARTITION_TYPE read_partition(VP10_COMMON *cm, MACROBLOCKD *xd,
-                                     int mi_row, int mi_col, vpx_reader *r,
-                                     int has_rows, int has_cols, int bsl) {
+                                     int mi_row, int mi_col, vp10_reader *r,
+                                     int has_rows, int has_cols,
+#if CONFIG_EXT_PARTITION_TYPES
+                                     BLOCK_SIZE bsize,
+#endif
+                                     int bsl) {
   const int ctx = dec_partition_plane_context(xd, mi_row, mi_col, bsl);
   const vpx_prob *const probs = cm->fc->partition_prob[ctx];
   FRAME_COUNTS *counts = xd->counts;
   PARTITION_TYPE p;
 
   if (has_rows && has_cols)
-    p = (PARTITION_TYPE)vpx_read_tree(r, vp10_partition_tree, probs);
+#if CONFIG_EXT_PARTITION_TYPES
+    if (bsize <= BLOCK_8X8)
+      p = (PARTITION_TYPE)vp10_read_tree(r, vp10_partition_tree, probs);
+    else
+      p = (PARTITION_TYPE)vp10_read_tree(r, vp10_ext_partition_tree, probs);
+#else
+    p = (PARTITION_TYPE)vp10_read_tree(r, vp10_partition_tree, probs);
+#endif  // CONFIG_EXT_PARTITION_TYPES
   else if (!has_rows && has_cols)
-    p = vpx_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
+    p = vp10_read(r, probs[1]) ? PARTITION_SPLIT : PARTITION_HORZ;
   else if (has_rows && !has_cols)
-    p = vpx_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
+    p = vp10_read(r, probs[2]) ? PARTITION_SPLIT : PARTITION_VERT;
   else
     p = PARTITION_SPLIT;
 
@@ -946,72 +1512,389 @@
   return p;
 }
 
+#if CONFIG_SUPERTX
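+// Reads the skip flag for a supertx block, using the same contexts and
+// counters as the regular per-block skip flag.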
+static int read_skip(VP10_COMMON *cm, const MACROBLOCKD *xd,
+                     int segment_id, vp10_reader *r) {
+  if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
+    return 1;
+  } else {
+    const int ctx = vp10_get_skip_context(xd);
+    const int skip = vp10_read(r, cm->fc->skip_probs[ctx]);
+    FRAME_COUNTS *counts = xd->counts;
+    if (counts)
+      ++counts->skip[ctx][skip];
+    return skip;
+  }
+}
+#endif  // CONFIG_SUPERTX
+
 // TODO(slavarnway): eliminate bsize and subsize in future commits
 static void decode_partition(VP10Decoder *const pbi, MACROBLOCKD *const xd,
+#if CONFIG_SUPERTX
+                             int supertx_enabled,
+#endif
                              int mi_row, int mi_col,
-                             vpx_reader* r, BLOCK_SIZE bsize, int n4x4_l2) {
+                             vp10_reader* r,
+                             BLOCK_SIZE bsize, int n4x4_l2) {
   VP10_COMMON *const cm = &pbi->common;
   const int n8x8_l2 = n4x4_l2 - 1;
   const int num_8x8_wh = 1 << n8x8_l2;
   const int hbs = num_8x8_wh >> 1;
   PARTITION_TYPE partition;
   BLOCK_SIZE subsize;
+#if CONFIG_EXT_PARTITION_TYPES
+  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
+#if CONFIG_SUPERTX
+  const int read_token = !supertx_enabled;
+  int skip = 0;
+  TX_SIZE supertx_size = b_width_log2_lookup[bsize];
+  const TileInfo *const tile = &xd->tile;
+  int txfm = DCT_DCT;
+#endif  // CONFIG_SUPERTX
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
   partition = read_partition(cm, xd, mi_row, mi_col, r, has_rows, has_cols,
+#if CONFIG_EXT_PARTITION_TYPES
+                             bsize,
+#endif
                              n8x8_l2);
   subsize = subsize_lookup[partition][bsize];  // get_subsize(bsize, partition);
+#if CONFIG_SUPERTX
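+  // Read the supertx flag for this partition node unless a parent already
+  // enabled it: inter frames only, non-NONE partitions, block sizes up to
+  // MAX_SUPERTX_BLOCK_SIZE, and never in lossless mode.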
+  if (!frame_is_intra_only(cm) &&
+      partition != PARTITION_NONE &&
+      bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+      !supertx_enabled &&
+      !xd->lossless[0]) {
+    const int supertx_context =
+        partition_supertx_context_lookup[partition];
+    supertx_enabled = vp10_read(
+        r, cm->fc->supertx_prob[supertx_context][supertx_size]);
+    if (xd->counts)
+      xd->counts->supertx[supertx_context][supertx_size][supertx_enabled]++;
+#if CONFIG_VAR_TX
+    if (supertx_enabled)
+      xd->supertx_size = supertx_size;
+#endif
+  }
+#endif  // CONFIG_SUPERTX
   if (!hbs) {
     // calculate bmode block dimensions (log 2)
     xd->bmode_blocks_wl = 1 >> !!(partition & PARTITION_VERT);
     xd->bmode_blocks_hl = 1 >> !!(partition & PARTITION_HORZ);
-    decode_block(pbi, xd, mi_row, mi_col, r, subsize, 1, 1);
+    decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                 supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                 mi_row, mi_col, r,
+#if CONFIG_EXT_PARTITION_TYPES
+                 partition,
+#endif  // CONFIG_EXT_PARTITION_TYPES
+                 subsize, 1, 1);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n4x4_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                     mi_row, mi_col, r,
+#if CONFIG_EXT_PARTITION_TYPES
+                     partition,
+#endif  // CONFIG_EXT_PARTITION_TYPES
+                     subsize, n4x4_l2, n4x4_l2);
         break;
       case PARTITION_HORZ:
-        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n4x4_l2, n8x8_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                     mi_row, mi_col, r,
+#if CONFIG_EXT_PARTITION_TYPES
+                     partition,
+#endif  // CONFIG_EXT_PARTITION_TYPES
+                     subsize, n4x4_l2, n8x8_l2);
         if (has_rows)
-          decode_block(pbi, xd, mi_row + hbs, mi_col, r, subsize, n4x4_l2,
-                       n8x8_l2);
+          decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                       supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                       mi_row + hbs, mi_col, r,
+#if CONFIG_EXT_PARTITION_TYPES
+                       partition,
+#endif  // CONFIG_EXT_PARTITION_TYPES
+                       subsize, n4x4_l2, n8x8_l2);
         break;
       case PARTITION_VERT:
-        decode_block(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2, n4x4_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                     mi_row, mi_col, r,
+#if CONFIG_EXT_PARTITION_TYPES
+                     partition,
+#endif  // CONFIG_EXT_PARTITION_TYPES
+                     subsize, n8x8_l2, n4x4_l2);
         if (has_cols)
-          decode_block(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2,
-                       n4x4_l2);
+          decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                       supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                       mi_row, mi_col + hbs, r,
+#if CONFIG_EXT_PARTITION_TYPES
+                       partition,
+#endif  // CONFIG_EXT_PARTITION_TYPES
+                       subsize, n8x8_l2, n4x4_l2);
         break;
       case PARTITION_SPLIT:
-        decode_partition(pbi, xd, mi_row, mi_col, r, subsize, n8x8_l2);
-        decode_partition(pbi, xd, mi_row, mi_col + hbs, r, subsize, n8x8_l2);
-        decode_partition(pbi, xd, mi_row + hbs, mi_col, r, subsize, n8x8_l2);
-        decode_partition(pbi, xd, mi_row + hbs, mi_col + hbs, r, subsize,
-                         n8x8_l2);
+        decode_partition(pbi, xd,
+#if CONFIG_SUPERTX
+                         supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                         mi_row, mi_col, r,
+                         subsize, n8x8_l2);
+        decode_partition(pbi, xd,
+#if CONFIG_SUPERTX
+                         supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                         mi_row, mi_col + hbs, r,
+                         subsize, n8x8_l2);
+        decode_partition(pbi, xd,
+#if CONFIG_SUPERTX
+                         supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                         mi_row + hbs, mi_col, r,
+                         subsize, n8x8_l2);
+        decode_partition(pbi, xd,
+#if CONFIG_SUPERTX
+                         supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                         mi_row + hbs, mi_col + hbs, r,
+                         subsize, n8x8_l2);
         break;
+#if CONFIG_EXT_PARTITION_TYPES
+      case PARTITION_HORZ_A:
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row,       mi_col,       r,
+                     partition, bsize2, n8x8_l2, n8x8_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row,       mi_col + hbs, r,
+                     partition, bsize2, n8x8_l2, n8x8_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row + hbs, mi_col, r,
+                     partition, subsize, n4x4_l2, n8x8_l2);
+        break;
+      case PARTITION_HORZ_B:
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row, mi_col, r,
+                     partition, subsize, n4x4_l2, n8x8_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row + hbs, mi_col,       r,
+                     partition, bsize2, n8x8_l2, n8x8_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row + hbs, mi_col + hbs, r,
+                     partition, bsize2, n8x8_l2, n8x8_l2);
+        break;
+      case PARTITION_VERT_A:
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row,       mi_col,       r,
+                     partition, bsize2, n8x8_l2, n8x8_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row + hbs, mi_col,       r,
+                     partition, bsize2, n8x8_l2, n8x8_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row, mi_col + hbs, r,
+                     partition, subsize, n8x8_l2, n4x4_l2);
+        break;
+      case PARTITION_VERT_B:
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row, mi_col, r,
+                     partition, subsize, n8x8_l2, n4x4_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row,       mi_col + hbs, r,
+                     partition, bsize2, n8x8_l2, n8x8_l2);
+        decode_block(pbi, xd,
+#if CONFIG_SUPERTX
+                     supertx_enabled,
+#endif
+                     mi_row + hbs, mi_col + hbs, r,
+                     partition, bsize2, n8x8_l2, n8x8_l2);
+        break;
+#endif
       default:
         assert(0 && "Invalid partition type");
     }
   }
 
+#if CONFIG_SUPERTX
+  if (supertx_enabled && read_token) {
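+    // This node enabled supertx: decode the region as one unit. Read the
+    // skip flag and transform type, build the prediction for the whole
+    // block, then reconstruct the residue if not skipped.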
+    uint8_t *dst_buf[3];
+    int dst_stride[3], i;
+    int offset = mi_row * cm->mi_stride + mi_col;
+
+    set_segment_id_supertx(cm, mi_row, mi_col, bsize);
+
+    xd->mi = cm->mi_grid_visible + offset;
+    xd->mi[0] = cm->mi + offset;
+    set_mi_row_col(xd, tile, mi_row, num_8x8_blocks_high_lookup[bsize],
+                   mi_col, num_8x8_blocks_wide_lookup[bsize],
+                   cm->mi_rows, cm->mi_cols);
+    set_skip_context(xd, mi_row, mi_col);
+    skip = read_skip(cm, xd, xd->mi[0]->mbmi.segment_id_supertx, r);
+    if (skip) {
+      reset_skip_context(xd, bsize);
+    } else {
+#if CONFIG_EXT_TX
+      if (get_ext_tx_types(supertx_size, bsize, 1) > 1) {
+        int eset = get_ext_tx_set(supertx_size, bsize, 1);
+        if (eset > 0) {
+          txfm = vp10_read_tree(r, vp10_ext_tx_inter_tree[eset],
+                               cm->fc->inter_ext_tx_prob[eset][supertx_size]);
+          if (xd->counts)
+            ++xd->counts->inter_ext_tx[eset][supertx_size][txfm];
+        }
+      }
+#else
+      if (supertx_size < TX_32X32) {
+        txfm = vp10_read_tree(r, vp10_ext_tx_tree,
+                             cm->fc->inter_ext_tx_prob[supertx_size]);
+        if (xd->counts)
+          ++xd->counts->inter_ext_tx[supertx_size][txfm];
+      }
+#endif  // CONFIG_EXT_TX
+    }
+
+    vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+    for (i = 0; i < MAX_MB_PLANE; i++) {
+      dst_buf[i] = xd->plane[i].dst.buf;
+      dst_stride[i] = xd->plane[i].dst.stride;
+    }
+    dec_predict_sb_complex(pbi, xd, tile, mi_row, mi_col, mi_row, mi_col,
+                           bsize, bsize, dst_buf, dst_stride);
+
+    if (!skip) {
+      int eobtotal = 0;
+      MB_MODE_INFO *mbmi;
+      set_offsets_topblock(cm, xd, tile, bsize, mi_row, mi_col);
+      mbmi = &xd->mi[0]->mbmi;
+      mbmi->tx_type = txfm;
+      assert(mbmi->segment_id_supertx != MAX_SEGMENTS);
+      for (i = 0; i < MAX_MB_PLANE; ++i) {
+        const struct macroblockd_plane *const pd = &xd->plane[i];
+        const int num_4x4_w = pd->n4_w;
+        const int num_4x4_h = pd->n4_h;
+        int row, col;
+        const TX_SIZE tx_size =
+            i ? dec_get_uv_tx_size(mbmi, pd->n4_wl, pd->n4_hl)
+            : mbmi->tx_size;
+        const int step = (1 << tx_size);
+        const int max_blocks_wide = num_4x4_w +
+            (xd->mb_to_right_edge >= 0 ?
+             0 : xd->mb_to_right_edge >> (5 + pd->subsampling_x));
+        const int max_blocks_high = num_4x4_h +
+            (xd->mb_to_bottom_edge >= 0 ?
+             0 : xd->mb_to_bottom_edge >> (5 + pd->subsampling_y));
+
+        for (row = 0; row < max_blocks_high; row += step)
+          for (col = 0; col < max_blocks_wide; col += step)
+            eobtotal += reconstruct_inter_block(xd,
+                                                r,
+                                                mbmi->segment_id_supertx,
+                                                i, row, col,
+                                                tx_size);
+      }
+      if (!(subsize < BLOCK_8X8) && eobtotal == 0)
+        skip = 1;
+    }
+    set_param_topblock(cm, xd, bsize, mi_row, mi_col, txfm, skip);
+  }
+#endif  // CONFIG_SUPERTX
+
+#if CONFIG_EXT_PARTITION_TYPES
+  if (bsize >= BLOCK_8X8) {
+    switch (partition) {
+      case PARTITION_SPLIT:
+        if (bsize > BLOCK_8X8)
+          break;
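+        // fall through for BLOCK_8X8: updated like the simple partitions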
+      case PARTITION_NONE:
+      case PARTITION_HORZ:
+      case PARTITION_VERT:
+        update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+        break;
+      case PARTITION_HORZ_A:
+        update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+        update_partition_context(xd, mi_row + hbs, mi_col, subsize, subsize);
+        break;
+      case PARTITION_HORZ_B:
+        update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+        update_partition_context(xd, mi_row + hbs, mi_col, bsize2, subsize);
+        break;
+      case PARTITION_VERT_A:
+        update_partition_context(xd, mi_row, mi_col, bsize2, subsize);
+        update_partition_context(xd, mi_row, mi_col + hbs, subsize, subsize);
+        break;
+      case PARTITION_VERT_B:
+        update_partition_context(xd, mi_row, mi_col, subsize, subsize);
+        update_partition_context(xd, mi_row, mi_col + hbs, bsize2, subsize);
+        break;
+      default:
+        assert(0 && "Invalid partition type");
+    }
+  }
+#else
   // update partition context
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
     dec_update_partition_context(xd, mi_row, mi_col, subsize, num_8x8_wh);
+#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
-static void setup_token_decoder(const uint8_t *data,
-                                const uint8_t *data_end,
-                                size_t read_size,
-                                struct vpx_internal_error_info *error_info,
-                                vpx_reader *r,
-                                vpx_decrypt_cb decrypt_cb,
-                                void *decrypt_state) {
+#if !CONFIG_ANS
+static void setup_bool_decoder(const uint8_t *data,
+                               const uint8_t *data_end,
+                               const size_t read_size,
+                               struct vpx_internal_error_info *error_info,
+                               vp10_reader *r,
+                               vpx_decrypt_cb decrypt_cb,
+                               void *decrypt_state) {
   // Validate the calculated partition length. If the buffer
   // described by the partition can't be fully read, then restrict
   // it to the portion that can be (for EC mode) or throw an error.
@@ -1023,12 +1906,34 @@
     vpx_internal_error(error_info, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder %d", 1);
 }
+#else
+static void setup_token_decoder(const uint8_t *data,
+                                const uint8_t *data_end,
+                                const size_t read_size,
+                                struct vpx_internal_error_info *error_info,
+                                struct AnsDecoder *const ans,
+                                vpx_decrypt_cb decrypt_cb,
+                                void *decrypt_state) {
+  (void) decrypt_cb;
+  (void) decrypt_state;
+  // Validate the calculated partition length. If the buffer
+  // described by the partition can't be fully read, then restrict
+  // it to the portion that can be (for EC mode) or throw an error.
+  if (!read_is_valid(data, read_size, data_end))
+    vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
+                       "Truncated packet or corrupt tile length");
+
+  if (read_size > INT_MAX || ans_read_init(ans, data, (int)read_size))
+    vpx_internal_error(error_info, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate token decoder %d", 1);
+}
+#endif
 
 static void read_coef_probs_common(vp10_coeff_probs_model *coef_probs,
-                                   vpx_reader *r) {
+                                   vp10_reader *r) {
   int i, j, k, l, m;
 
-  if (vpx_read_bit(r))
+  if (vp10_read_bit(r))
     for (i = 0; i < PLANE_TYPES; ++i)
       for (j = 0; j < REF_TYPES; ++j)
         for (k = 0; k < COEF_BANDS; ++k)
@@ -1038,19 +1943,19 @@
 }
 
 static void read_coef_probs(FRAME_CONTEXT *fc, TX_MODE tx_mode,
-                            vpx_reader *r) {
+                            vp10_reader *r) {
     const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
     TX_SIZE tx_size;
     for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
       read_coef_probs_common(fc->coef_probs[tx_size], r);
+#if CONFIG_ANS
+    vp10_coef_pareto_cdfs(fc);
+#endif  // CONFIG_ANS
 }
 
 static void setup_segmentation(VP10_COMMON *const cm,
                                struct vpx_read_bit_buffer *rb) {
   struct segmentation *const seg = &cm->seg;
-#if !CONFIG_MISC_FIXES
-  struct segmentation_probs *const segp = &cm->segp;
-#endif
   int i, j;
 
   seg->update_map = 0;
@@ -1067,26 +1972,11 @@
     seg->update_map = vpx_rb_read_bit(rb);
   }
   if (seg->update_map) {
-#if !CONFIG_MISC_FIXES
-    for (i = 0; i < SEG_TREE_PROBS; i++)
-      segp->tree_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8)
-                                                : MAX_PROB;
-#endif
     if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
       seg->temporal_update = 0;
     } else {
       seg->temporal_update = vpx_rb_read_bit(rb);
     }
-#if !CONFIG_MISC_FIXES
-    if (seg->temporal_update) {
-      for (i = 0; i < PREDICTION_PROBS; i++)
-        segp->pred_probs[i] = vpx_rb_read_bit(rb) ? vpx_rb_read_literal(rb, 8)
-                                                  : MAX_PROB;
-    } else {
-      for (i = 0; i < PREDICTION_PROBS; i++)
-        segp->pred_probs[i] = MAX_PROB;
-    }
-#endif
   }
 
   // Segmentation data update
@@ -1112,8 +2002,39 @@
   }
 }
 
-static void setup_loopfilter(struct loopfilter *lf,
+#if CONFIG_LOOP_RESTORATION
+static void setup_restoration(VP10_COMMON *cm,
+                              struct vpx_read_bit_buffer *rb) {
+  RestorationInfo *rst = &cm->rst_info;
+  if (vpx_rb_read_bit(rb)) {
+    if (vpx_rb_read_bit(rb)) {
+      rst->restoration_type = RESTORE_BILATERAL;
+      rst->restoration_level =
+          vpx_rb_read_literal(rb, vp10_restoration_level_bits(cm));
+    } else {
+      rst->restoration_type = RESTORE_WIENER;
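+      // Each Wiener tap is coded as an unsigned literal offset by that
+      // tap's minimum value.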
+      rst->vfilter[0] = vpx_rb_read_literal(rb, WIENER_FILT_TAP0_BITS) +
+          WIENER_FILT_TAP0_MINV;
+      rst->vfilter[1] = vpx_rb_read_literal(rb, WIENER_FILT_TAP1_BITS) +
+          WIENER_FILT_TAP1_MINV;
+      rst->vfilter[2] = vpx_rb_read_literal(rb, WIENER_FILT_TAP2_BITS) +
+          WIENER_FILT_TAP2_MINV;
+      rst->hfilter[0] = vpx_rb_read_literal(rb, WIENER_FILT_TAP0_BITS) +
+          WIENER_FILT_TAP0_MINV;
+      rst->hfilter[1] = vpx_rb_read_literal(rb, WIENER_FILT_TAP1_BITS) +
+          WIENER_FILT_TAP1_MINV;
+      rst->hfilter[2] = vpx_rb_read_literal(rb, WIENER_FILT_TAP2_BITS) +
+          WIENER_FILT_TAP2_MINV;
+    }
+  } else {
+    rst->restoration_type = RESTORE_NONE;
+  }
+}
+#endif  // CONFIG_LOOP_RESTORATION
+
+static void setup_loopfilter(VP10_COMMON *cm,
                              struct vpx_read_bit_buffer *rb) {
+  struct loopfilter *lf = &cm->lf;
   lf->filter_level = vpx_rb_read_literal(rb, 6);
   lf->sharpness_level = vpx_rb_read_literal(rb, 3);
 
@@ -1140,7 +2061,7 @@
 
 static INLINE int read_delta_q(struct vpx_read_bit_buffer *rb) {
   return vpx_rb_read_bit(rb) ?
-      vpx_rb_read_inv_signed_literal(rb, CONFIG_MISC_FIXES ? 6 : 4) : 0;
+      vpx_rb_read_inv_signed_literal(rb, 6) : 0;
 }
 
 static void setup_quantization(VP10_COMMON *const cm,
@@ -1154,6 +2075,10 @@
 
 static void setup_segmentation_dequant(VP10_COMMON *const cm) {
   // Build y/uv dequant values based on segmentation.
+#if CONFIG_NEW_QUANT
+  int b;
+  int dq;
+#endif  //  CONFIG_NEW_QUANT
   if (cm->seg.enabled) {
     int i;
     for (i = 0; i < MAX_SEGMENTS; ++i) {
@@ -1165,6 +2090,18 @@
                                           cm->bit_depth);
       cm->uv_dequant[i][1] = vp10_ac_quant(qindex, cm->uv_ac_delta_q,
                                           cm->bit_depth);
+#if CONFIG_NEW_QUANT
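+      // Build the per-band non-uniform (NUQ) dequant tables for each of the
+      // QUANT_PROFILES quantizer profiles.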
+      for (dq = 0; dq < QUANT_PROFILES; dq++) {
+        for (b = 0; b < COEF_BANDS; ++b) {
+          vp10_get_dequant_val_nuq(
+              cm->y_dequant[i][b != 0], qindex, b,
+              cm->y_dequant_nuq[i][dq][b], NULL, dq);
+          vp10_get_dequant_val_nuq(
+              cm->uv_dequant[i][b != 0], qindex, b,
+              cm->uv_dequant_nuq[i][dq][b], NULL, dq);
+        }
+      }
+#endif  //  CONFIG_NEW_QUANT
     }
   } else {
     const int qindex = cm->base_qindex;
@@ -1176,11 +2113,24 @@
                                         cm->bit_depth);
     cm->uv_dequant[0][1] = vp10_ac_quant(qindex, cm->uv_ac_delta_q,
                                         cm->bit_depth);
+#if CONFIG_NEW_QUANT
+    for (dq = 0; dq < QUANT_PROFILES; dq++) {
+      for (b = 0; b < COEF_BANDS; ++b) {
+        vp10_get_dequant_val_nuq(
+            cm->y_dequant[0][b != 0], qindex, b,
+            cm->y_dequant_nuq[0][dq][b], NULL, dq);
+        vp10_get_dequant_val_nuq(
+            cm->uv_dequant[0][b != 0], qindex, b,
+            cm->uv_dequant_nuq[0][dq][b], NULL, dq);
+      }
+    }
+#endif  //  CONFIG_NEW_QUANT
   }
 }
 
 static INTERP_FILTER read_interp_filter(struct vpx_read_bit_buffer *rb) {
-  return vpx_rb_read_bit(rb) ? SWITCHABLE : vpx_rb_read_literal(rb, 2);
+  return vpx_rb_read_bit(rb) ?
+      SWITCHABLE : vpx_rb_read_literal(rb, 2 + CONFIG_EXT_INTERP);
 }
 
 static void setup_render_size(VP10_COMMON *cm,
@@ -1284,10 +2234,8 @@
       YV12_BUFFER_CONFIG *const buf = cm->frame_refs[i].buf;
       width = buf->y_crop_width;
       height = buf->y_crop_height;
-#if CONFIG_MISC_FIXES
       cm->render_width = buf->render_width;
       cm->render_height = buf->render_height;
-#endif
       found = 1;
       break;
     }
@@ -1295,9 +2243,7 @@
 
   if (!found) {
     vp10_read_frame_size(rb, &width, &height);
-#if CONFIG_MISC_FIXES
     setup_render_size(cm, rb);
-#endif
   }
 
   if (width <= 0 || height <= 0)
@@ -1329,9 +2275,6 @@
   }
 
   resize_context_buffers(cm, width, height);
-#if !CONFIG_MISC_FIXES
-  setup_render_size(cm, rb);
-#endif
 
   lock_buffer_pool(pool);
   if (vpx_realloc_frame_buffer(
@@ -1359,7 +2302,43 @@
   pool->frame_bufs[cm->new_fb_idx].buf.render_height = cm->render_height;
 }
 
-static void setup_tile_info(VP10_COMMON *cm, struct vpx_read_bit_buffer *rb) {
+static void read_tile_info(VP10Decoder *const pbi,
+                            struct vpx_read_bit_buffer *const rb) {
+  VP10_COMMON *const cm = &pbi->common;
+#if CONFIG_EXT_TILE
+  // Read the tile width/height, coded in superblock units minus one
+#if CONFIG_EXT_PARTITION
+  if (cm->sb_size == BLOCK_128X128) {
+    cm->tile_width  = vpx_rb_read_literal(rb, 5) + 1;
+    cm->tile_height = vpx_rb_read_literal(rb, 5) + 1;
+  } else
+#endif  // CONFIG_EXT_PARTITION
+  {
+    cm->tile_width  = vpx_rb_read_literal(rb, 6) + 1;
+    cm->tile_height = vpx_rb_read_literal(rb, 6) + 1;
+  }
+
+  cm->tile_width  <<= cm->mib_size_log2;
+  cm->tile_height <<= cm->mib_size_log2;
+
+  cm->tile_width  = VPXMIN(cm->tile_width, cm->mi_cols);
+  cm->tile_height = VPXMIN(cm->tile_height, cm->mi_rows);
+
+  // Get the number of tiles
+  cm->tile_cols = 1;
+  while (cm->tile_cols * cm->tile_width < cm->mi_cols)
+    ++cm->tile_cols;
+
+  cm->tile_rows = 1;
+  while (cm->tile_rows * cm->tile_height < cm->mi_rows)
+    ++cm->tile_rows;
+
+  if (cm->tile_cols * cm->tile_rows > 1) {
+    // Read the number of bytes used to store tile size
+    pbi->tile_col_size_bytes  = vpx_rb_read_literal(rb, 2) + 1;
+    pbi->tile_size_bytes = vpx_rb_read_literal(rb, 2) + 1;
+  }
+#else
   int min_log2_tile_cols, max_log2_tile_cols, max_ones;
   vp10_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
 
@@ -1378,47 +2357,199 @@
   if (cm->log2_tile_rows)
     cm->log2_tile_rows += vpx_rb_read_bit(rb);
 
-#if CONFIG_MISC_FIXES
+  cm->tile_cols = 1 << cm->log2_tile_cols;
+  cm->tile_rows = 1 << cm->log2_tile_rows;
+
+  cm->tile_width = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+  cm->tile_width >>= cm->log2_tile_cols;
+  cm->tile_height = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
+  cm->tile_height >>= cm->log2_tile_rows;
+
+  // round to integer multiples of superblock size
+  cm->tile_width  = ALIGN_POWER_OF_TWO(cm->tile_width, MAX_MIB_SIZE_LOG2);
+  cm->tile_height = ALIGN_POWER_OF_TWO(cm->tile_height, MAX_MIB_SIZE_LOG2);
+
   // Read how many bytes are used to code each tile size
-  if (cm->log2_tile_rows > 0 || cm->log2_tile_cols > 0) {
-    cm->tile_sz_mag = vpx_rb_read_literal(rb, 2);
+  if (cm->tile_rows > 1 || cm->tile_cols > 1) {
+    pbi->tile_size_bytes = vpx_rb_read_literal(rb, 2) + 1;
   }
-#else
-  cm->tile_sz_mag = 3;
-#endif
+#endif  // CONFIG_EXT_TILE
 }
 
-typedef struct TileBuffer {
-  const uint8_t *data;
-  size_t size;
-  int col;  // only used with multi-threaded decoding
-} TileBuffer;
-
-static int mem_get_varsize(const uint8_t *data, const int mag) {
-  switch (mag) {
-    case 0:
-      return data[0];
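+// Reads an unsigned little-endian value of 'sz' bytes (1 to 4) from 'src'.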
+static int mem_get_varsize(const uint8_t *src, const int sz) {
+  switch (sz) {
     case 1:
-      return mem_get_le16(data);
+      return src[0];
     case 2:
-      return mem_get_le24(data);
+      return mem_get_le16(src);
     case 3:
-      return mem_get_le32(data);
+      return mem_get_le24(src);
+    case 4:
+      return mem_get_le32(src);
+    default:
+      assert("Invalid size" && 0);
+      return -1;
   }
-
-  assert("Invalid tile size marker value" && 0);
-
-  return -1;
 }
 
+#if CONFIG_EXT_TILE
 // Reads the next tile returning its size and adjusting '*data' accordingly,
 // handling copy mode by reusing the data of an earlier tile in the same column.
 static void get_tile_buffer(const uint8_t *const data_end,
-                            const int tile_sz_mag, int is_last,
                             struct vpx_internal_error_info *error_info,
                             const uint8_t **data,
                             vpx_decrypt_cb decrypt_cb, void *decrypt_state,
-                            TileBuffer *buf) {
+                            TileBufferDec (*const tile_buffers)[MAX_TILE_COLS],
+                            int tile_size_bytes, int col, int row) {
+  size_t size;
+
+  size_t copy_size = 0;
+  const uint8_t *copy_data = NULL;
+
+  if (!read_is_valid(*data, tile_size_bytes, data_end))
+    vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
+                       "Truncated packet or corrupt tile length");
+  if (decrypt_cb) {
+    uint8_t be_data[4];
+    decrypt_cb(decrypt_state, *data, be_data, tile_size_bytes);
+
+    // Only read the number of bytes given by pbi->tile_size_bytes.
+    size = mem_get_varsize(be_data, tile_size_bytes);
+  } else {
+    size = mem_get_varsize(*data, tile_size_bytes);
+  }
+
+  // The top bit indicates copy mode
+  if ((size >> (tile_size_bytes * 8 - 1)) == 1) {
+    // The low 7 bits of the top byte give the row offset of the tile whose
+    // data is reused.
+    int offset = (size >> ((tile_size_bytes - 1) * 8)) & 0x7f;
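+    // e.g. with tile_size_bytes == 4, a coded size of 0x83000000 selects
+    // the tile three rows above as the copy source.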
+
+    // Currently, only use tiles in same column as reference tiles.
+    copy_data = tile_buffers[row - offset][col].data;
+    copy_size = tile_buffers[row - offset][col].size;
+    size = 0;
+  }
+
+  *data += tile_size_bytes;
+
+  if (size > (size_t)(data_end - *data))
+    vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
+                       "Truncated packet or corrupt tile size");
+
+  if (size > 0) {
+    tile_buffers[row][col].data = *data;
+    tile_buffers[row][col].size = size;
+  } else {
+    tile_buffers[row][col].data = copy_data;
+    tile_buffers[row][col].size = copy_size;
+  }
+
+  *data += size;
+
+  tile_buffers[row][col].raw_data_end = *data;
+}
+
+static void get_tile_buffers(
+    VP10Decoder *pbi,
+    const uint8_t *data, const uint8_t *data_end,
+    TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
+  VP10_COMMON *const cm = &pbi->common;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
+  const int have_tiles = tile_cols * tile_rows > 1;
+
+  if (!have_tiles)  {
+    const uint32_t tile_size = data_end - data;
+    tile_buffers[0][0].data = data;
+    tile_buffers[0][0].size = tile_size;
+    tile_buffers[0][0].raw_data_end = NULL;
+  } else {
+    // We locate only the tile buffers that are required, which are the ones
+    // specified by pbi->dec_tile_col and pbi->dec_tile_row. Also, we always
+    // need the last (bottom right) tile buffer, as we need to know where the
+    // end of the compressed frame buffer is for proper superframe decoding.
+
+    const uint8_t *tile_col_data_end[MAX_TILE_COLS];
+    const uint8_t *const data_start = data;
+
+    const int dec_tile_row = VPXMIN(pbi->dec_tile_row, tile_rows);
+    const int single_row = pbi->dec_tile_row >= 0;
+    const int tile_rows_start = single_row ? dec_tile_row : 0;
+    const int tile_rows_end = single_row ? tile_rows_start + 1 : tile_rows;
+    const int dec_tile_col = VPXMIN(pbi->dec_tile_col, tile_cols);
+    const int single_col = pbi->dec_tile_col >= 0;
+    const int tile_cols_start = single_col ? dec_tile_col : 0;
+    const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+
+    const int tile_col_size_bytes = pbi->tile_col_size_bytes;
+    const int tile_size_bytes = pbi->tile_size_bytes;
+
+    size_t tile_col_size;
+    int r, c;
+
+    // Read tile column sizes for all columns (we need the last tile buffer)
+    for (c = 0; c < tile_cols; ++c) {
+      const int is_last = c == tile_cols - 1;
+      if (!is_last) {
+        tile_col_size = mem_get_varsize(data, tile_col_size_bytes);
+        data += tile_col_size_bytes;
+        tile_col_data_end[c] = data + tile_col_size;
+      } else {
+        tile_col_size = data_end - data;
+        tile_col_data_end[c] = data_end;
+      }
+      data += tile_col_size;
+    }
+
+    data = data_start;
+
+    // Read the required tile sizes.
+    for (c = tile_cols_start; c < tile_cols_end; ++c) {
+      const int is_last = c == tile_cols - 1;
+
+      if (c > 0)
+        data = tile_col_data_end[c - 1];
+
+      if (!is_last)
+        data += tile_col_size_bytes;
+
+      // Get the whole of the last column, otherwise stop at the required tile.
+      for (r = 0; r < (is_last ? tile_rows : tile_rows_end); ++r) {
+        tile_buffers[r][c].col = c;
+
+        get_tile_buffer(tile_col_data_end[c],
+                        &pbi->common.error, &data,
+                        pbi->decrypt_cb, pbi->decrypt_state,
+                        tile_buffers, tile_size_bytes, c, r);
+      }
+    }
+
+    // If we have not read the last column, then read it to get the last tile.
+    if (tile_cols_end != tile_cols) {
+      c = tile_cols - 1;
+
+      data = tile_col_data_end[c - 1];
+
+      for (r = 0; r < tile_rows; ++r) {
+        tile_buffers[r][c].col = c;
+
+        get_tile_buffer(tile_col_data_end[c],
+                        &pbi->common.error, &data,
+                        pbi->decrypt_cb, pbi->decrypt_state,
+                        tile_buffers, tile_size_bytes, c, r);
+      }
+    }
+  }
+}
+#else
+// Reads the next tile returning its size and adjusting '*data' accordingly
+// based on 'is_last'.
+static void get_tile_buffer(const uint8_t *const data_end,
+                            const int tile_size_bytes, int is_last,
+                            struct vpx_internal_error_info *error_info,
+                            const uint8_t **data,
+                            vpx_decrypt_cb decrypt_cb, void *decrypt_state,
+                            TileBufferDec *const buf) {
   size_t size;
 
   if (!is_last) {
@@ -1428,12 +2559,12 @@
 
     if (decrypt_cb) {
       uint8_t be_data[4];
-      decrypt_cb(decrypt_state, *data, be_data, tile_sz_mag + 1);
-      size = mem_get_varsize(be_data, tile_sz_mag) + CONFIG_MISC_FIXES;
+      decrypt_cb(decrypt_state, *data, be_data, tile_size_bytes);
+      size = mem_get_varsize(be_data, tile_size_bytes);
     } else {
-      size = mem_get_varsize(*data, tile_sz_mag) + CONFIG_MISC_FIXES;
+      size = mem_get_varsize(*data, tile_size_bytes);
     }
-    *data += tile_sz_mag + 1;
+    *data += tile_size_bytes;
 
     if (size > (size_t)(data_end - *data))
       vpx_internal_error(error_info, VPX_CODEC_CORRUPT_FRAME,
@@ -1448,36 +2579,61 @@
   *data += size;
 }
 
-static void get_tile_buffers(VP10Decoder *pbi,
-                             const uint8_t *data, const uint8_t *data_end,
-                             int tile_cols, int tile_rows,
-                             TileBuffer (*tile_buffers)[1 << 6]) {
+static void get_tile_buffers(
+    VP10Decoder *pbi,
+    const uint8_t *data, const uint8_t *data_end,
+    TileBufferDec (*const tile_buffers)[MAX_TILE_COLS]) {
+  VP10_COMMON *const cm = &pbi->common;
   int r, c;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
 
   for (r = 0; r < tile_rows; ++r) {
     for (c = 0; c < tile_cols; ++c) {
       const int is_last = (r == tile_rows - 1) && (c == tile_cols - 1);
-      TileBuffer *const buf = &tile_buffers[r][c];
+      TileBufferDec *const buf = &tile_buffers[r][c];
       buf->col = c;
-      get_tile_buffer(data_end, pbi->common.tile_sz_mag,
-                      is_last, &pbi->common.error, &data,
+      get_tile_buffer(data_end, pbi->tile_size_bytes,
+                      is_last, &cm->error, &data,
                       pbi->decrypt_cb, pbi->decrypt_state, buf);
     }
   }
 }
+#endif  // CONFIG_EXT_TILE
 
 static const uint8_t *decode_tiles(VP10Decoder *pbi,
                                    const uint8_t *data,
                                    const uint8_t *data_end) {
   VP10_COMMON *const cm = &pbi->common;
   const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
-  const int aligned_cols = mi_cols_aligned_to_sb(cm->mi_cols);
-  const int tile_cols = 1 << cm->log2_tile_cols;
-  const int tile_rows = 1 << cm->log2_tile_rows;
-  TileBuffer tile_buffers[4][1 << 6];
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
+  const int n_tiles = tile_cols * tile_rows;
+  TileBufferDec (*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+#if CONFIG_EXT_TILE
+  const int dec_tile_row = VPXMIN(pbi->dec_tile_row, tile_rows);
+  const int single_row = pbi->dec_tile_row >= 0;
+  const int tile_rows_start = single_row ? dec_tile_row : 0;
+  const int tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
+  const int dec_tile_col = VPXMIN(pbi->dec_tile_col, tile_cols);
+  const int single_col = pbi->dec_tile_col >= 0;
+  const int tile_cols_start = single_col ? dec_tile_col : 0;
+  const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+  const int inv_col_order = pbi->inv_tile_order && !single_col;
+  const int inv_row_order = pbi->inv_tile_order && !single_row;
+#else
+  const int tile_rows_start = 0;
+  const int tile_rows_end = tile_rows;
+  const int tile_cols_start = 0;
+  const int tile_cols_end = tile_cols;
+  const int inv_col_order = pbi->inv_tile_order;
+  const int inv_row_order = pbi->inv_tile_order;
+#endif  // CONFIG_EXT_TILE
   int tile_row, tile_col;
-  int mi_row, mi_col;
-  TileData *tile_data = NULL;
+
+#if CONFIG_ENTROPY
+  cm->do_subframe_update = n_tiles == 1;
+#endif  // CONFIG_ENTROPY
 
   if (cm->lf.filter_level && !cm->skip_loop_filter &&
       pbi->lf_worker.data1 == NULL) {
@@ -1498,102 +2654,138 @@
                                pbi->mb.plane);
   }
 
-  assert(tile_rows <= 4);
-  assert(tile_cols <= (1 << 6));
+  assert(tile_rows <= MAX_TILE_ROWS);
+  assert(tile_cols <= MAX_TILE_COLS);
 
-  // Note: this memset assumes above_context[0], [1] and [2]
-  // are allocated as part of the same buffer.
-  memset(cm->above_context, 0,
-         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_cols);
+  get_tile_buffers(pbi, data, data_end, tile_buffers);
 
-  memset(cm->above_seg_context, 0,
-         sizeof(*cm->above_seg_context) * aligned_cols);
-
-  get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
-
-  if (pbi->tile_data == NULL ||
-      (tile_cols * tile_rows) != pbi->total_tiles) {
+  if (pbi->tile_data == NULL || n_tiles != pbi->allocated_tiles) {
     vpx_free(pbi->tile_data);
     CHECK_MEM_ERROR(
         cm,
         pbi->tile_data,
-        vpx_memalign(32, tile_cols * tile_rows * (sizeof(*pbi->tile_data))));
-    pbi->total_tiles = tile_rows * tile_cols;
+        vpx_memalign(32, n_tiles * (sizeof(*pbi->tile_data))));
+    pbi->allocated_tiles = n_tiles;
   }
 
   // Load all tile information into tile_data.
-  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
-    for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-      const TileBuffer *const buf = &tile_buffers[tile_row][tile_col];
-      tile_data = pbi->tile_data + tile_cols * tile_row + tile_col;
-      tile_data->cm = cm;
-      tile_data->xd = pbi->mb;
-      tile_data->xd.corrupted = 0;
-      tile_data->xd.counts =
+  for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
+    for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
+      const TileBufferDec *const buf = &tile_buffers[tile_row][tile_col];
+      TileData *const td = pbi->tile_data + tile_cols * tile_row + tile_col;
+
+      td->cm = cm;
+      td->xd = pbi->mb;
+      td->xd.corrupted = 0;
+      td->xd.counts =
           cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD ?
               &cm->counts : NULL;
-      vp10_zero(tile_data->dqcoeff);
-      vp10_tile_init(&tile_data->xd.tile, tile_data->cm, tile_row, tile_col);
+      vp10_zero(td->dqcoeff);
+      vp10_tile_init(&td->xd.tile, td->cm, tile_row, tile_col);
+#if !CONFIG_ANS
+      setup_bool_decoder(buf->data, data_end, buf->size, &cm->error,
+                         &td->bit_reader, pbi->decrypt_cb,
+                         pbi->decrypt_state);
+#else
       setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
-                          &tile_data->bit_reader, pbi->decrypt_cb,
+                          &td->bit_reader, pbi->decrypt_cb,
                           pbi->decrypt_state);
-      vp10_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
-      tile_data->xd.plane[0].color_index_map = tile_data->color_index_map[0];
-      tile_data->xd.plane[1].color_index_map = tile_data->color_index_map[1];
+#endif
+      vp10_init_macroblockd(cm, &td->xd, td->dqcoeff);
+      td->xd.plane[0].color_index_map = td->color_index_map[0];
+      td->xd.plane[1].color_index_map = td->color_index_map[1];
     }
   }
 
-  for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
-    TileInfo tile;
-    vp10_tile_set_row(&tile, cm, tile_row);
-    for (mi_row = tile.mi_row_start; mi_row < tile.mi_row_end;
-         mi_row += MI_BLOCK_SIZE) {
-      for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-        const int col = pbi->inv_tile_order ?
-                        tile_cols - tile_col - 1 : tile_col;
-        tile_data = pbi->tile_data + tile_cols * tile_row + col;
-        vp10_tile_set_col(&tile, tile_data->cm, col);
-        vp10_zero(tile_data->xd.left_context);
-        vp10_zero(tile_data->xd.left_seg_context);
-        for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
-             mi_col += MI_BLOCK_SIZE) {
-          decode_partition(pbi, &tile_data->xd, mi_row,
-                           mi_col, &tile_data->bit_reader, BLOCK_64X64, 4);
+  for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
+    const int row = inv_row_order ? tile_rows - 1 - tile_row : tile_row;
+    int mi_row = 0;
+    TileInfo tile_info;
+
+    vp10_tile_set_row(&tile_info, cm, row);
+
+    for (tile_col = tile_cols_start; tile_col < tile_cols_end; ++tile_col) {
+      const int col = inv_col_order ? tile_cols - 1 - tile_col : tile_col;
+      TileData *const td = pbi->tile_data + tile_cols * row + col;
+
+      vp10_tile_set_col(&tile_info, cm, col);
+
+      vp10_zero_above_context(cm, tile_info.mi_col_start, tile_info.mi_col_end);
+
+      for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
+           mi_row += cm->mib_size) {
+        int mi_col;
+
+        vp10_zero_left_context(&td->xd);
+
+        for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+             mi_col += cm->mib_size) {
+          decode_partition(pbi, &td->xd,
+#if CONFIG_SUPERTX
+                           0,
+#endif  // CONFIG_SUPERTX
+                           mi_row, mi_col, &td->bit_reader,
+                           cm->sb_size, b_width_log2_lookup[cm->sb_size]);
         }
-        pbi->mb.corrupted |= tile_data->xd.corrupted;
+        pbi->mb.corrupted |= td->xd.corrupted;
         if (pbi->mb.corrupted)
             vpx_internal_error(&cm->error, VPX_CODEC_CORRUPT_FRAME,
                                "Failed to decode tile data");
-      }
-      // Loopfilter one row.
-      if (cm->lf.filter_level && !cm->skip_loop_filter) {
-        const int lf_start = mi_row - MI_BLOCK_SIZE;
-        LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
-
-        // delay the loopfilter by 1 macroblock row.
-        if (lf_start < 0) continue;
-
-        // decoding has completed: finish up the loop filter in this thread.
-        if (mi_row + MI_BLOCK_SIZE >= cm->mi_rows) continue;
-
-        winterface->sync(&pbi->lf_worker);
-        lf_data->start = lf_start;
-        lf_data->stop = mi_row;
-        if (pbi->max_threads > 1) {
-          winterface->launch(&pbi->lf_worker);
-        } else {
-          winterface->execute(&pbi->lf_worker);
+#if CONFIG_ENTROPY
+        if (cm->do_subframe_update &&
+            cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+          if ((mi_row + MI_SIZE) % (MI_SIZE *
+              VPXMAX(cm->mi_rows / MI_SIZE / COEF_PROBS_BUFS, 1)) == 0 &&
+              mi_row + MI_SIZE < cm->mi_rows &&
+              cm->coef_probs_update_idx < COEF_PROBS_BUFS - 1) {
+            vp10_partial_adapt_probs(cm, mi_row, mi_col);
+            ++cm->coef_probs_update_idx;
+          }
         }
+#endif  // CONFIG_ENTROPY
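+        // Note: with CONFIG_ENTROPY the coefficient probabilities are thus
+        // partially re-adapted up to COEF_PROBS_BUFS - 1 times per frame,
+        // at (roughly) evenly spaced superblock-row boundaries.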
       }
-      // After loopfiltering, the last 7 row pixels in each superblock row may
-      // still be changed by the longest loopfilter of the next superblock
-      // row.
-      if (cm->frame_parallel_decode)
-        vp10_frameworker_broadcast(pbi->cur_buf,
-                                  mi_row << MI_BLOCK_SIZE_LOG2);
     }
+
+    assert(mi_row > 0);
+
+#if !CONFIG_VAR_TX
+    // Loopfilter one tile row.
+    if (cm->lf.filter_level && !cm->skip_loop_filter) {
+      LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
+      const int lf_start = VPXMAX(0, tile_info.mi_row_start - cm->mib_size);
+      const int lf_end = tile_info.mi_row_end - cm->mib_size;
+
+      // Delay the loopfilter if the first tile row is only
+      // a single superblock high.
+      if (lf_end <= 0)
+        continue;
+
+      // The last tile row has been decoded: leave the remaining rows to
+      // be loop filtered in this thread once the tile loop finishes.
+      if (tile_info.mi_row_end >= cm->mi_rows)
+        continue;
+
+      winterface->sync(&pbi->lf_worker);
+      lf_data->start = lf_start;
+      lf_data->stop = lf_end;
+      if (pbi->max_threads > 1) {
+        winterface->launch(&pbi->lf_worker);
+      } else {
+        winterface->execute(&pbi->lf_worker);
+      }
+    }
+
+    // After loopfiltering, the last 7 row pixels in each superblock row may
+    // still be changed by the longest loopfilter of the next superblock row.
+    if (cm->frame_parallel_decode)
+      vp10_frameworker_broadcast(pbi->cur_buf, mi_row << cm->mib_size_log2);
+#endif  // !CONFIG_VAR_TX
   }
 
+#if CONFIG_VAR_TX
+  // Loopfilter the whole frame.
+  vp10_loop_filter_frame(get_frame_new_buffer(cm), cm, &pbi->mb,
+                         cm->lf.filter_level, 0, 0);
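+  // With CONFIG_VAR_TX the per-tile-row loopfilter pipeline above is
+  // compiled out, so the whole frame is filtered in a single pass once
+  // all tiles have been decoded.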
+#else
   // Loopfilter remaining rows in the frame.
   if (cm->lf.filter_level && !cm->skip_loop_filter) {
     LFWorkerData *const lf_data = (LFWorkerData*)pbi->lf_worker.data1;
@@ -1602,17 +2794,40 @@
     lf_data->stop = cm->mi_rows;
     winterface->execute(&pbi->lf_worker);
   }
-
-  // Get last tile data.
-  tile_data = pbi->tile_data + tile_cols * tile_rows - 1;
+#endif  // CONFIG_VAR_TX
 
   if (cm->frame_parallel_decode)
     vp10_frameworker_broadcast(pbi->cur_buf, INT_MAX);
-  return vpx_reader_find_end(&tile_data->bit_reader);
+
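+  // The pointer returned below marks the end of the tile data that was
+  // consumed: with ANS there is no trailing bool-coder state to locate,
+  // so data_end is returned directly; otherwise the end is recovered from
+  // the relevant tile's bit reader (or, for EXT_TILE with several tiles,
+  // from the recorded raw_data_end of the last tile buffer).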
+#if CONFIG_EXT_TILE
+  if (n_tiles == 1) {
+#if CONFIG_ANS
+    return data_end;
+#else
+    // Find the end of the single tile buffer
+    return vpx_reader_find_end(&pbi->tile_data->bit_reader);
+#endif  // CONFIG_ANS
+  } else {
+    // Return the end of the last tile buffer
+    return tile_buffers[tile_rows - 1][tile_cols - 1].raw_data_end;
+  }
+#else
+#if CONFIG_ANS
+  return data_end;
+#else
+  {
+    // Get last tile data.
+    TileData *const td = pbi->tile_data + tile_cols * tile_rows - 1;
+    return vpx_reader_find_end(&td->bit_reader);
+  }
+#endif  // CONFIG_ANS
+#endif  // CONFIG_EXT_TILE
 }
 
 static int tile_worker_hook(TileWorkerData *const tile_data,
                             const TileInfo *const tile) {
+  VP10Decoder *const pbi = tile_data->pbi;
+  const VP10_COMMON *const cm = &pbi->common;
   int mi_row, mi_col;
 
   if (setjmp(tile_data->error_info.jmp)) {
@@ -1624,15 +2839,20 @@
   tile_data->error_info.setjmp = 1;
   tile_data->xd.error_info = &tile_data->error_info;
 
+  vp10_zero_above_context(&pbi->common, tile->mi_col_start, tile->mi_col_end);
+
   for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
-       mi_row += MI_BLOCK_SIZE) {
-    vp10_zero(tile_data->xd.left_context);
-    vp10_zero(tile_data->xd.left_seg_context);
+       mi_row += cm->mib_size) {
+    vp10_zero_left_context(&tile_data->xd);
+
     for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
-         mi_col += MI_BLOCK_SIZE) {
-      decode_partition(tile_data->pbi, &tile_data->xd,
+         mi_col += cm->mib_size) {
+      decode_partition(pbi, &tile_data->xd,
+#if CONFIG_SUPERTX
+                       0,
+#endif
                        mi_row, mi_col, &tile_data->bit_reader,
-                       BLOCK_64X64, 4);
+                       cm->sb_size, b_width_log2_lookup[cm->sb_size]);
     }
   }
   return !tile_data->xd.corrupted;
@@ -1640,8 +2860,8 @@
 
 // sorts in descending order
 static int compare_tile_buffers(const void *a, const void *b) {
-  const TileBuffer *const buf1 = (const TileBuffer*)a;
-  const TileBuffer *const buf2 = (const TileBuffer*)b;
+  const TileBufferDec *const buf1 = (const TileBufferDec*)a;
+  const TileBufferDec *const buf2 = (const TileBufferDec*)b;
   return (int)(buf2->size - buf1->size);
 }
 
@@ -1650,24 +2870,46 @@
                                       const uint8_t *data_end) {
   VP10_COMMON *const cm = &pbi->common;
   const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
-  const uint8_t *bit_reader_end = NULL;
-  const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
-  const int tile_cols = 1 << cm->log2_tile_cols;
-  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
   const int num_workers = VPXMIN(pbi->max_threads & ~1, tile_cols);
-  TileBuffer tile_buffers[1][1 << 6];
-  int n;
-  int final_worker = -1;
+  TileBufferDec (*const tile_buffers)[MAX_TILE_COLS] = pbi->tile_buffers;
+#if CONFIG_EXT_TILE
+  const int dec_tile_row = VPXMIN(pbi->dec_tile_row, tile_rows);
+  const int single_row = pbi->dec_tile_row >= 0;
+  const int tile_rows_start = single_row ? dec_tile_row : 0;
+  const int tile_rows_end = single_row ? dec_tile_row + 1 : tile_rows;
+  const int dec_tile_col = VPXMIN(pbi->dec_tile_col, tile_cols);
+  const int single_col = pbi->dec_tile_col >= 0;
+  const int tile_cols_start = single_col ? dec_tile_col : 0;
+  const int tile_cols_end = single_col ? tile_cols_start + 1 : tile_cols;
+#else
+  const int tile_rows_start = 0;
+  const int tile_rows_end = tile_rows;
+  const int tile_cols_start = 0;
+  const int tile_cols_end = tile_cols;
+#endif  // CONFIG_EXT_TILE
+  int tile_row, tile_col;
+  int i;
 
-  assert(tile_cols <= (1 << 6));
-  assert(tile_rows == 1);
-  (void)tile_rows;
+#if !(CONFIG_ANS || CONFIG_EXT_TILE)
+  int final_worker = -1;
+#endif  // !(CONFIG_ANS || CONFIG_EXT_TILE)
+
+  assert(tile_rows <= MAX_TILE_ROWS);
+  assert(tile_cols <= MAX_TILE_COLS);
+
+  assert(tile_cols * tile_rows > 1);
+
+#if CONFIG_ANS
+  // TODO(any): This might just work now. Needs to be tested.
+  abort();  // FIXME: Tile parsing broken
+#endif  // CONFIG_ANS
 
   // TODO(jzern): See if we can remove the restriction of passing in max
   // threads to the decoder.
   if (pbi->num_tile_workers == 0) {
     const int num_threads = pbi->max_threads & ~1;
-    int i;
     CHECK_MEM_ERROR(cm, pbi->tile_workers,
                     vpx_malloc(num_threads * sizeof(*pbi->tile_workers)));
     // Ensure tile data offsets will be properly aligned. This may fail on
@@ -1691,121 +2933,128 @@
   }
 
   // Reset tile decoding hook
-  for (n = 0; n < num_workers; ++n) {
-    VPxWorker *const worker = &pbi->tile_workers[n];
+  for (i = 0; i < num_workers; ++i) {
+    VPxWorker *const worker = &pbi->tile_workers[i];
     winterface->sync(worker);
     worker->hook = (VPxWorkerHook)tile_worker_hook;
-    worker->data1 = &pbi->tile_worker_data[n];
-    worker->data2 = &pbi->tile_worker_info[n];
-  }
-
-  // Note: this memset assumes above_context[0], [1] and [2]
-  // are allocated as part of the same buffer.
-  memset(cm->above_context, 0,
-         sizeof(*cm->above_context) * MAX_MB_PLANE * 2 * aligned_mi_cols);
-  memset(cm->above_seg_context, 0,
-         sizeof(*cm->above_seg_context) * aligned_mi_cols);
-
-  // Load tile data into tile_buffers
-  get_tile_buffers(pbi, data, data_end, tile_cols, tile_rows, tile_buffers);
-
-  // Sort the buffers based on size in descending order.
-  qsort(tile_buffers[0], tile_cols, sizeof(tile_buffers[0][0]),
-        compare_tile_buffers);
-
-  // Rearrange the tile buffers such that per-tile group the largest, and
-  // presumably the most difficult, tile will be decoded in the main thread.
-  // This should help minimize the number of instances where the main thread is
-  // waiting for a worker to complete.
-  {
-    int group_start = 0;
-    while (group_start < tile_cols) {
-      const TileBuffer largest = tile_buffers[0][group_start];
-      const int group_end = VPXMIN(group_start + num_workers, tile_cols) - 1;
-      memmove(tile_buffers[0] + group_start, tile_buffers[0] + group_start + 1,
-              (group_end - group_start) * sizeof(tile_buffers[0][0]));
-      tile_buffers[0][group_end] = largest;
-      group_start = group_end + 1;
-    }
+    worker->data1 = &pbi->tile_worker_data[i];
+    worker->data2 = &pbi->tile_worker_info[i];
   }
 
   // Initialize thread frame counts.
   if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
-    int i;
-
     for (i = 0; i < num_workers; ++i) {
-      TileWorkerData *const tile_data =
-          (TileWorkerData*)pbi->tile_workers[i].data1;
-      vp10_zero(tile_data->counts);
+      TileWorkerData *const twd = (TileWorkerData*)pbi->tile_workers[i].data1;
+      vp10_zero(twd->counts);
     }
   }
 
-  n = 0;
-  while (n < tile_cols) {
-    int i;
-    for (i = 0; i < num_workers && n < tile_cols; ++i) {
-      VPxWorker *const worker = &pbi->tile_workers[i];
-      TileWorkerData *const tile_data = (TileWorkerData*)worker->data1;
-      TileInfo *const tile = (TileInfo*)worker->data2;
-      TileBuffer *const buf = &tile_buffers[0][n];
+  // Load tile data into tile_buffers
+  get_tile_buffers(pbi, data, data_end, tile_buffers);
 
-      tile_data->pbi = pbi;
-      tile_data->xd = pbi->mb;
-      tile_data->xd.corrupted = 0;
-      tile_data->xd.counts =
-          cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD ?
-              &tile_data->counts : NULL;
-      vp10_zero(tile_data->dqcoeff);
-      vp10_tile_init(tile, cm, 0, buf->col);
-      vp10_tile_init(&tile_data->xd.tile, cm, 0, buf->col);
-      setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
-                          &tile_data->bit_reader, pbi->decrypt_cb,
-                          pbi->decrypt_state);
-      vp10_init_macroblockd(cm, &tile_data->xd, tile_data->dqcoeff);
-      tile_data->xd.plane[0].color_index_map = tile_data->color_index_map[0];
-      tile_data->xd.plane[1].color_index_map = tile_data->color_index_map[1];
+  for (tile_row = tile_rows_start; tile_row < tile_rows_end; ++tile_row) {
+    // Sort the buffers in this tile row based on size in descending order.
+    qsort(&tile_buffers[tile_row][tile_cols_start],
+          tile_cols_end - tile_cols_start, sizeof(tile_buffers[0][0]),
+          compare_tile_buffers);
 
-      worker->had_error = 0;
-      if (i == num_workers - 1 || n == tile_cols - 1) {
-        winterface->execute(worker);
-      } else {
-        winterface->launch(worker);
+    // Rearrange the tile buffers in this tile row such that, per tile
+    // group, the largest (and presumably the most difficult) tile is
+    // decoded in the main thread. This should help minimize the number of
+    // instances where the main thread is waiting for a worker to complete.
+    {
+      int group_start;
+      for (group_start = tile_cols_start; group_start < tile_cols_end;
+           group_start += num_workers) {
+        const int group_end = VPXMIN(group_start + num_workers, tile_cols);
+        const TileBufferDec largest = tile_buffers[tile_row][group_start];
+        memmove(&tile_buffers[tile_row][group_start],
+                &tile_buffers[tile_row][group_start + 1],
+                (group_end - group_start - 1) * sizeof(tile_buffers[0][0]));
+        tile_buffers[tile_row][group_end - 1] = largest;
+      }
+    }
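+    // For example, with num_workers == 2 and four tiles sorted descending
+    // by size as T0 > T1 > T2 > T3, the groups {T0, T1} and {T2, T3}
+    // become {T1, T0} and {T3, T2}; the main thread (the last worker slot
+    // of each group, which is executed synchronously) thus decodes T0 and
+    // T2, the largest tile of each group.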
+
+    for (tile_col = tile_cols_start; tile_col < tile_cols_end;) {
+      // Launch workers for individual columns
+      for (i = 0; i < num_workers && tile_col < tile_cols_end;
+           ++i, ++tile_col) {
+        TileBufferDec *const buf = &tile_buffers[tile_row][tile_col];
+        VPxWorker *const worker = &pbi->tile_workers[i];
+        TileWorkerData *const twd = (TileWorkerData*)worker->data1;
+        TileInfo *const tile_info = (TileInfo*)worker->data2;
+
+        twd->pbi = pbi;
+        twd->xd = pbi->mb;
+        twd->xd.corrupted = 0;
+        twd->xd.counts =
+            cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD ?
+                &twd->counts : NULL;
+        vp10_zero(twd->dqcoeff);
+        vp10_tile_init(tile_info, cm, tile_row, buf->col);
+        vp10_tile_init(&twd->xd.tile, cm, tile_row, buf->col);
+#if !CONFIG_ANS
+        setup_bool_decoder(buf->data, data_end, buf->size, &cm->error,
+                           &twd->bit_reader,
+                           pbi->decrypt_cb, pbi->decrypt_state);
+#else
+        setup_token_decoder(buf->data, data_end, buf->size, &cm->error,
+                            &twd->bit_reader, pbi->decrypt_cb,
+                            pbi->decrypt_state);
+#endif  // CONFIG_ANS
+        vp10_init_macroblockd(cm, &twd->xd, twd->dqcoeff);
+        twd->xd.plane[0].color_index_map = twd->color_index_map[0];
+        twd->xd.plane[1].color_index_map = twd->color_index_map[1];
+
+        worker->had_error = 0;
+        if (i == num_workers - 1 || tile_col == tile_cols_end - 1) {
+          winterface->execute(worker);
+        } else {
+          winterface->launch(worker);
+        }
+
+#if !(CONFIG_ANS || CONFIG_EXT_TILE)
+        if (tile_row == tile_rows - 1 && buf->col == tile_cols - 1) {
+          final_worker = i;
+        }
+#endif  // !(CONFIG_ANS || CONFIG_EXT_TILE)
       }
 
-      if (buf->col == tile_cols - 1) {
-        final_worker = i;
-      }
-
-      ++n;
-    }
-
-    for (; i > 0; --i) {
-      VPxWorker *const worker = &pbi->tile_workers[i - 1];
-      // TODO(jzern): The tile may have specific error data associated with
-      // its vpx_internal_error_info which could be propagated to the main info
-      // in cm. Additionally once the threads have been synced and an error is
-      // detected, there's no point in continuing to decode tiles.
-      pbi->mb.corrupted |= !winterface->sync(worker);
-    }
-    if (final_worker > -1) {
-      TileWorkerData *const tile_data =
-          (TileWorkerData*)pbi->tile_workers[final_worker].data1;
-      bit_reader_end = vpx_reader_find_end(&tile_data->bit_reader);
-      final_worker = -1;
-    }
-
-    // Accumulate thread frame counts.
-    if (n >= tile_cols &&
-        cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
-      for (i = 0; i < num_workers; ++i) {
-        TileWorkerData *const tile_data =
-            (TileWorkerData*)pbi->tile_workers[i].data1;
-        vp10_accumulate_frame_counts(cm, &tile_data->counts, 1);
+      // Sync all workers
+      for (; i > 0; --i) {
+        VPxWorker *const worker = &pbi->tile_workers[i - 1];
+        // TODO(jzern): The tile may have specific error data associated with
+        // its vpx_internal_error_info which could be propagated to the main
+        // info in cm. Additionally once the threads have been synced and an
+        // error is detected, there's no point in continuing to decode tiles.
+        pbi->mb.corrupted |= !winterface->sync(worker);
       }
     }
   }
 
-  return bit_reader_end;
+  // Accumulate thread frame counts.
+  if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+    for (i = 0; i < num_workers; ++i) {
+      TileWorkerData *const twd = (TileWorkerData*)pbi->tile_workers[i].data1;
+      vp10_accumulate_frame_counts(cm, &twd->counts);
+    }
+  }
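+  // Each worker gathered its symbol counts into a private
+  // TileWorkerData::counts, keeping decoding lock-free; the per-thread
+  // counts are only merged into cm after all workers have been synced.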
+
+#if CONFIG_EXT_TILE
+  // Return the end of the last tile buffer
+  return tile_buffers[tile_rows - 1][tile_cols - 1].raw_data_end;
+#else
+#if CONFIG_ANS
+  return data_end;
+#else
+  assert(final_worker != -1);
+  {
+    TileWorkerData *const twd =
+        (TileWorkerData*)pbi->tile_workers[final_worker].data1;
+    return vpx_reader_find_end(&twd->bit_reader);
+  }
+#endif  // CONFIG_ANS
+#endif  // CONFIG_EXT_TILE
 }
 
 static void error_handler(void *data) {
@@ -1865,10 +3114,18 @@
   RefCntBuffer *const frame_bufs = pool->frame_bufs;
   int i, mask, ref_index = 0;
   size_t sz;
-
+#if CONFIG_EXT_REFS
+  cm->last3_frame_type = cm->last2_frame_type;
+  cm->last2_frame_type = cm->last_frame_type;
+#endif  // CONFIG_EXT_REFS
   cm->last_frame_type = cm->frame_type;
   cm->last_intra_only = cm->intra_only;
 
+#if CONFIG_EXT_REFS
+  // NOTE: By default, all coded frames are to be used as references.
+  cm->is_reference_frame = 1;
+#endif  // CONFIG_EXT_REFS
+
   if (vpx_rb_read_literal(rb, 2) != VP9_FRAME_MARKER)
       vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                          "Invalid frame marker");
@@ -1885,9 +3142,11 @@
 #endif
 
   cm->show_existing_frame = vpx_rb_read_bit(rb);
+
   if (cm->show_existing_frame) {
     // Show an existing frame directly.
     const int frame_to_show = cm->ref_frame_map[vpx_rb_read_literal(rb, 3)];
+
     lock_buffer_pool(pool);
     if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
       unlock_buffer_pool(pool);
@@ -1895,17 +3154,18 @@
                          "Buffer %d does not contain a decoded frame",
                          frame_to_show);
     }
-
     ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
     unlock_buffer_pool(pool);
-    pbi->refresh_frame_flags = 0;
+
     cm->lf.filter_level = 0;
     cm->show_frame = 1;
+    pbi->refresh_frame_flags = 0;
 
     if (cm->frame_parallel_decode) {
       for (i = 0; i < REF_FRAMES; ++i)
         cm->next_ref_frame_map[i] = cm->ref_frame_map[i];
     }
+
     return 0;
   }
 
@@ -1931,13 +3191,14 @@
       memset(&cm->ref_frame_map, -1, sizeof(cm->ref_frame_map));
       pbi->need_resync = 0;
     }
+    if (frame_is_intra_only(cm))
+      cm->allow_screen_content_tools = vpx_rb_read_bit(rb);
   } else {
     cm->intra_only = cm->show_frame ? 0 : vpx_rb_read_bit(rb);
 
     if (cm->error_resilient_mode) {
         cm->reset_frame_context = RESET_FRAME_CONTEXT_ALL;
     } else {
-#if CONFIG_MISC_FIXES
       if (cm->intra_only) {
           cm->reset_frame_context =
               vpx_rb_read_bit(rb) ? RESET_FRAME_CONTEXT_ALL
@@ -1951,40 +3212,14 @@
                   vpx_rb_read_bit(rb) ? RESET_FRAME_CONTEXT_ALL
                                       : RESET_FRAME_CONTEXT_CURRENT;
       }
-#else
-      static const RESET_FRAME_CONTEXT_MODE reset_frame_context_conv_tbl[4] = {
-        RESET_FRAME_CONTEXT_NONE, RESET_FRAME_CONTEXT_NONE,
-        RESET_FRAME_CONTEXT_CURRENT, RESET_FRAME_CONTEXT_ALL
-      };
-
-      cm->reset_frame_context =
-          reset_frame_context_conv_tbl[vpx_rb_read_literal(rb, 2)];
-#endif
     }
 
     if (cm->intra_only) {
       if (!vp10_read_sync_code(rb))
         vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
                            "Invalid frame sync code");
-#if CONFIG_MISC_FIXES
+
       read_bitdepth_colorspace_sampling(cm, rb);
-#else
-      if (cm->profile > PROFILE_0) {
-        read_bitdepth_colorspace_sampling(cm, rb);
-      } else {
-        // NOTE: The intra-only frame header does not include the specification
-        // of either the color format or color sub-sampling in profile 0. VP9
-        // specifies that the default color format should be YUV 4:2:0 in this
-        // case (normative).
-        cm->color_space = VPX_CS_BT_601;
-        cm->color_range = 0;
-        cm->subsampling_y = cm->subsampling_x = 1;
-        cm->bit_depth = VPX_BITS_8;
-#if CONFIG_VP9_HIGHBITDEPTH
-        cm->use_highbitdepth = 0;
-#endif
-      }
-#endif
 
       pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES);
       setup_frame_size(cm, rb);
@@ -1994,6 +3229,15 @@
       }
     } else if (pbi->need_resync != 1) {  /* Skip if need resync */
       pbi->refresh_frame_flags = vpx_rb_read_literal(rb, REF_FRAMES);
+
+#if CONFIG_EXT_REFS
+      if (!pbi->refresh_frame_flags) {
+        // NOTE: "pbi->refresh_frame_flags == 0" indicates that the coded frame
+        //       will not be used as a reference
+        cm->is_reference_frame = 0;
+      }
+#endif  // CONFIG_EXT_REFS
+
       for (i = 0; i < REFS_PER_FRAME; ++i) {
         const int ref = vpx_rb_read_literal(rb, REF_FRAMES_LOG2);
         const int idx = cm->ref_frame_map[ref];
@@ -2042,18 +3286,9 @@
   if (!cm->error_resilient_mode) {
     cm->refresh_frame_context =
         vpx_rb_read_bit(rb) ? REFRESH_FRAME_CONTEXT_FORWARD
-                            : REFRESH_FRAME_CONTEXT_OFF;
-    if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD) {
-        cm->refresh_frame_context =
-            vpx_rb_read_bit(rb) ? REFRESH_FRAME_CONTEXT_FORWARD
-                                : REFRESH_FRAME_CONTEXT_BACKWARD;
-#if !CONFIG_MISC_FIXES
-    } else {
-      vpx_rb_read_bit(rb);  // parallel decoding mode flag
-#endif
-    }
+                            : REFRESH_FRAME_CONTEXT_BACKWARD;
   } else {
-    cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_OFF;
+    cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_FORWARD;
   }
 
   // This flag will be overridden by the call to vp10_setup_past_independence
@@ -2077,6 +3312,7 @@
 
   for (; ref_index < REF_FRAMES; ++ref_index) {
     cm->next_ref_frame_map[ref_index] = cm->ref_frame_map[ref_index];
+
     // Current thread holds the reference frame.
     if (cm->ref_frame_map[ref_index] >= 0)
       ++frame_bufs[cm->ref_frame_map[ref_index]].ref_count;
@@ -2087,20 +3323,39 @@
   if (frame_is_intra_only(cm) || cm->error_resilient_mode)
     vp10_setup_past_independence(cm);
 
-  setup_loopfilter(&cm->lf, rb);
+#if CONFIG_EXT_PARTITION
+  set_sb_size(cm, vpx_rb_read_bit(rb) ? BLOCK_128X128 : BLOCK_64X64);
+#else
+  set_sb_size(cm, BLOCK_64X64);
+#endif  // CONFIG_EXT_PARTITION
+
+  setup_loopfilter(cm, rb);
+#if CONFIG_LOOP_RESTORATION
+  setup_restoration(cm, rb);
+#endif  // CONFIG_LOOP_RESTORATION
   setup_quantization(cm, rb);
 #if CONFIG_VP9_HIGHBITDEPTH
   xd->bd = (int)cm->bit_depth;
 #endif
 
+#if CONFIG_ENTROPY
+  vp10_default_coef_probs(cm);
+  if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode ||
+      cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) {
+    for (i = 0; i < FRAME_CONTEXTS; ++i)
+      cm->frame_contexts[i] = *cm->fc;
+  } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) {
+    cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+  }
+#endif  // CONFIG_ENTROPY
+
   setup_segmentation(cm, rb);
 
   {
     int i;
     for (i = 0; i < MAX_SEGMENTS; ++i) {
-      const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ?
-          vp10_get_qindex(&cm->seg, i, cm->base_qindex) :
-          cm->base_qindex;
+      const int qindex = cm->seg.enabled ?
+          vp10_get_qindex(&cm->seg, i, cm->base_qindex) : cm->base_qindex;
       xd->lossless[i] = qindex == 0 &&
           cm->y_dc_delta_q == 0 &&
           cm->uv_dc_delta_q == 0 &&
@@ -2109,13 +3364,11 @@
   }
 
   setup_segmentation_dequant(cm);
-#if CONFIG_MISC_FIXES
   cm->tx_mode = (!cm->seg.enabled && xd->lossless[0]) ? ONLY_4X4
                                                       : read_tx_mode(rb);
   cm->reference_mode = read_frame_reference_mode(cm, rb);
-#endif
 
-  setup_tile_info(cm, rb);
+  read_tile_info(pbi, rb);
   sz = vpx_rb_read_literal(rb, 16);
 
   if (sz == 0)
@@ -2125,49 +3378,103 @@
   return sz;
 }
 
-static void read_ext_tx_probs(FRAME_CONTEXT *fc, vpx_reader *r) {
+#if CONFIG_EXT_TX
+static void read_ext_tx_probs(FRAME_CONTEXT *fc, vp10_reader *r) {
   int i, j, k;
-  if (vpx_read(r, GROUP_DIFF_UPDATE_PROB)) {
+  int s;
+  for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+    if (vp10_read(r, GROUP_DIFF_UPDATE_PROB)) {
+      for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+        if (!use_inter_ext_tx_for_txsize[s][i]) continue;
+        for (j = 0; j < num_ext_tx_set_inter[s] - 1; ++j)
+          vp10_diff_update_prob(r, &fc->inter_ext_tx_prob[s][i][j]);
+      }
+    }
+  }
+
+  for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+    if (vp10_read(r, GROUP_DIFF_UPDATE_PROB)) {
+      for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+        if (!use_intra_ext_tx_for_txsize[s][i]) continue;
+        for (j = 0; j < INTRA_MODES; ++j)
+          for (k = 0; k < num_ext_tx_set_intra[s] - 1; ++k)
+            vp10_diff_update_prob(r, &fc->intra_ext_tx_prob[s][i][j][k]);
+      }
+    }
+  }
+}
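+// Note: the loops above start at s == 1 because transform set 0
+// (presumably the trivial single-transform set) carries no probabilities
+// to update; within each set, one GROUP_DIFF_UPDATE_PROB flag gates all
+// of that set's differential updates.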
+
+#else
+
+static void read_ext_tx_probs(FRAME_CONTEXT *fc, vp10_reader *r) {
+  int i, j, k;
+  if (vp10_read(r, GROUP_DIFF_UPDATE_PROB)) {
     for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
       for (j = 0; j < TX_TYPES; ++j)
         for (k = 0; k < TX_TYPES - 1; ++k)
           vp10_diff_update_prob(r, &fc->intra_ext_tx_prob[i][j][k]);
     }
   }
-  if (vpx_read(r, GROUP_DIFF_UPDATE_PROB)) {
+  if (vp10_read(r, GROUP_DIFF_UPDATE_PROB)) {
     for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
       for (k = 0; k < TX_TYPES - 1; ++k)
         vp10_diff_update_prob(r, &fc->inter_ext_tx_prob[i][k]);
     }
   }
 }
+#endif  // CONFIG_EXT_TX
+
+#if CONFIG_SUPERTX
+static void read_supertx_probs(FRAME_CONTEXT *fc, vp10_reader *r) {
+  int i, j;
+  if (vp10_read(r, GROUP_DIFF_UPDATE_PROB)) {
+    for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
+      for (j = 1; j < TX_SIZES; ++j) {
+        vp10_diff_update_prob(r, &fc->supertx_prob[i][j]);
+      }
+    }
+  }
+}
+#endif  // CONFIG_SUPERTX
 
 static int read_compressed_header(VP10Decoder *pbi, const uint8_t *data,
                                   size_t partition_size) {
   VP10_COMMON *const cm = &pbi->common;
-#if !CONFIG_MISC_FIXES
+#if CONFIG_SUPERTX
   MACROBLOCKD *const xd = &pbi->mb;
 #endif
   FRAME_CONTEXT *const fc = cm->fc;
-  vpx_reader r;
+  vp10_reader r;
   int k, i, j;
 
+#if !CONFIG_ANS
   if (vpx_reader_init(&r, data, partition_size, pbi->decrypt_cb,
                       pbi->decrypt_state))
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate bool decoder 0");
+#else
+  if (ans_read_init(&r, data, partition_size))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate compressed header ANS decoder");
+#endif  // !CONFIG_ANS
 
-#if !CONFIG_MISC_FIXES
-  cm->tx_mode = xd->lossless[0] ? ONLY_4X4 : read_tx_mode(&r);
-#endif
-  if (cm->tx_mode == TX_MODE_SELECT)
-    read_tx_mode_probs(&fc->tx_probs, &r);
+  if (cm->tx_mode == TX_MODE_SELECT) {
+    for (i = 0; i < TX_SIZES - 1; ++i)
+      for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+        for (k = 0; k < i + 1; ++k)
+          vp10_diff_update_prob(&r, &fc->tx_size_probs[i][j][k]);
+  }
+
   read_coef_probs(fc, cm->tx_mode, &r);
 
+#if CONFIG_VAR_TX
+  for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k)
+    vp10_diff_update_prob(&r, &fc->txfm_partition_prob[k]);
+#endif
+
   for (k = 0; k < SKIP_CONTEXTS; ++k)
     vp10_diff_update_prob(&r, &fc->skip_probs[k]);
 
-#if CONFIG_MISC_FIXES
   if (cm->seg.enabled) {
     if (cm->seg.temporal_update) {
       for (k = 0; k < PREDICTION_PROBS; k++)
@@ -2181,52 +3488,100 @@
     for (i = 0; i < INTRA_MODES - 1; ++i)
       vp10_diff_update_prob(&r, &fc->uv_mode_prob[j][i]);
 
+#if CONFIG_EXT_PARTITION_TYPES
+  for (i = 0; i < PARTITION_TYPES - 1; ++i)
+    vp10_diff_update_prob(&r, &fc->partition_prob[0][i]);
+  for (j = 1; j < PARTITION_CONTEXTS; ++j)
+    for (i = 0; i < EXT_PARTITION_TYPES - 1; ++i)
+      vp10_diff_update_prob(&r, &fc->partition_prob[j][i]);
+#else
   for (j = 0; j < PARTITION_CONTEXTS; ++j)
     for (i = 0; i < PARTITION_TYPES - 1; ++i)
       vp10_diff_update_prob(&r, &fc->partition_prob[j][i]);
-#endif
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_EXT_INTRA
+  for (i = 0; i < INTRA_FILTERS + 1; ++i)
+    for (j = 0; j < INTRA_FILTERS - 1; ++j)
+      vp10_diff_update_prob(&r, &fc->intra_filter_probs[i][j]);
+#endif  // CONFIG_EXT_INTRA
 
   if (frame_is_intra_only(cm)) {
     vp10_copy(cm->kf_y_prob, vp10_kf_y_mode_prob);
-#if CONFIG_MISC_FIXES
     for (k = 0; k < INTRA_MODES; k++)
       for (j = 0; j < INTRA_MODES; j++)
         for (i = 0; i < INTRA_MODES - 1; ++i)
           vp10_diff_update_prob(&r, &cm->kf_y_prob[k][j][i]);
-#endif
   } else {
+#if !CONFIG_REF_MV
     nmv_context *const nmvc = &fc->nmvc;
+#endif
 
     read_inter_mode_probs(fc, &r);
 
+#if CONFIG_EXT_INTER
+    read_inter_compound_mode_probs(fc, &r);
+    if (cm->reference_mode != COMPOUND_REFERENCE) {
+      for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+        if (is_interintra_allowed_bsize_group(i)) {
+          vp10_diff_update_prob(&r, &fc->interintra_prob[i]);
+        }
+      }
+      for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+        for (j = 0; j < INTERINTRA_MODES - 1; j++)
+          vp10_diff_update_prob(&r, &fc->interintra_mode_prob[i][j]);
+      }
+      for (i = 0; i < BLOCK_SIZES; i++) {
+        if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i)) {
+          vp10_diff_update_prob(&r, &fc->wedge_interintra_prob[i]);
+        }
+      }
+    }
+    if (cm->reference_mode != SINGLE_REFERENCE) {
+      for (i = 0; i < BLOCK_SIZES; i++) {
+        if (is_interinter_wedge_used(i)) {
+          vp10_diff_update_prob(&r, &fc->wedge_interinter_prob[i]);
+        }
+      }
+    }
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+    for (i = BLOCK_8X8; i < BLOCK_SIZES; ++i) {
+      for (j = 0; j < MOTION_VARIATIONS - 1; ++j)
+        vp10_diff_update_prob(&r, &fc->motvar_prob[i][j]);
+    }
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+
     if (cm->interp_filter == SWITCHABLE)
       read_switchable_interp_probs(fc, &r);
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
       vp10_diff_update_prob(&r, &fc->intra_inter_prob[i]);
 
-#if !CONFIG_MISC_FIXES
-    cm->reference_mode = read_frame_reference_mode(cm, &r);
-#endif
     if (cm->reference_mode != SINGLE_REFERENCE)
       setup_compound_reference_mode(cm);
+
     read_frame_reference_mode_probs(cm, &r);
 
     for (j = 0; j < BLOCK_SIZE_GROUPS; j++)
       for (i = 0; i < INTRA_MODES - 1; ++i)
         vp10_diff_update_prob(&r, &fc->y_mode_prob[j][i]);
 
-#if !CONFIG_MISC_FIXES
-    for (j = 0; j < PARTITION_CONTEXTS; ++j)
-      for (i = 0; i < PARTITION_TYPES - 1; ++i)
-        vp10_diff_update_prob(&r, &fc->partition_prob[j][i]);
-#endif
-
+#if CONFIG_REF_MV
+    for (i = 0; i < NMV_CONTEXTS; ++i)
+      read_mv_probs(&fc->nmvc[i], cm->allow_high_precision_mv, &r);
+#else
     read_mv_probs(nmvc, cm->allow_high_precision_mv, &r);
+#endif
     read_ext_tx_probs(fc, &r);
+#if CONFIG_SUPERTX
+    if (!xd->lossless[0])
+      read_supertx_probs(fc, &r);
+#endif
   }
 
-  return vpx_reader_has_error(&r);
+  return vp10_reader_has_error(&r);
 }
 
 #ifdef NDEBUG
@@ -2253,6 +3608,21 @@
                  sizeof(cm->counts.switchable_interp)));
   assert(!memcmp(cm->counts.inter_mode, zero_counts.inter_mode,
                  sizeof(cm->counts.inter_mode)));
+#if CONFIG_EXT_INTER
+  assert(!memcmp(cm->counts.inter_compound_mode,
+                 zero_counts.inter_compound_mode,
+                 sizeof(cm->counts.inter_compound_mode)));
+  assert(!memcmp(cm->counts.interintra, zero_counts.interintra,
+                 sizeof(cm->counts.interintra)));
+  assert(!memcmp(cm->counts.wedge_interintra, zero_counts.wedge_interintra,
+                 sizeof(cm->counts.wedge_interintra)));
+  assert(!memcmp(cm->counts.wedge_interinter, zero_counts.wedge_interinter,
+                 sizeof(cm->counts.wedge_interinter)));
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+  assert(!memcmp(cm->counts.motvar, zero_counts.motvar,
+                 sizeof(cm->counts.motvar)));
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
   assert(!memcmp(cm->counts.intra_inter, zero_counts.intra_inter,
                  sizeof(cm->counts.intra_inter)));
   assert(!memcmp(cm->counts.comp_inter, zero_counts.comp_inter,
@@ -2261,13 +3631,25 @@
                  sizeof(cm->counts.single_ref)));
   assert(!memcmp(cm->counts.comp_ref, zero_counts.comp_ref,
                  sizeof(cm->counts.comp_ref)));
-  assert(!memcmp(&cm->counts.tx, &zero_counts.tx, sizeof(cm->counts.tx)));
+#if CONFIG_EXT_REFS
+  assert(!memcmp(cm->counts.comp_bwdref, zero_counts.comp_bwdref,
+                 sizeof(cm->counts.comp_bwdref)));
+#endif  // CONFIG_EXT_REFS
+  assert(!memcmp(&cm->counts.tx_size, &zero_counts.tx_size,
+                 sizeof(cm->counts.tx_size)));
   assert(!memcmp(cm->counts.skip, zero_counts.skip, sizeof(cm->counts.skip)));
+#if CONFIG_REF_MV
+  assert(!memcmp(&cm->counts.mv[0], &zero_counts.mv[0],
+                 sizeof(cm->counts.mv[0])));
+  assert(!memcmp(&cm->counts.mv[1], &zero_counts.mv[1],
+                 sizeof(cm->counts.mv[1])));
+#else
   assert(!memcmp(&cm->counts.mv, &zero_counts.mv, sizeof(cm->counts.mv)));
-  assert(!memcmp(cm->counts.intra_ext_tx, zero_counts.intra_ext_tx,
-                 sizeof(cm->counts.intra_ext_tx)));
+#endif
   assert(!memcmp(cm->counts.inter_ext_tx, zero_counts.inter_ext_tx,
                  sizeof(cm->counts.inter_ext_tx)));
+  assert(!memcmp(cm->counts.intra_ext_tx, zero_counts.intra_ext_tx,
+                 sizeof(cm->counts.intra_ext_tx)));
 }
 #endif  // NDEBUG
 
@@ -2324,14 +3706,18 @@
   uint8_t clear_data[MAX_VP9_HEADER_SIZE];
   const size_t first_partition_size = read_uncompressed_header(pbi,
       init_read_bit_buffer(pbi, &rb, data, data_end, clear_data));
-  const int tile_rows = 1 << cm->log2_tile_rows;
-  const int tile_cols = 1 << cm->log2_tile_cols;
   YV12_BUFFER_CONFIG *const new_fb = get_frame_new_buffer(cm);
   xd->cur_buf = new_fb;
 
   if (!first_partition_size) {
     // showing a frame directly
-    *p_data_end = data + (cm->profile <= PROFILE_2 ? 1 : 2);
+#if CONFIG_EXT_REFS
+    if (cm->show_existing_frame)
+      *p_data_end = data + vpx_rb_bytes_read(&rb);
+    else
+#endif  // CONFIG_EXT_REFS
+      *p_data_end = data + (cm->profile <= PROFILE_2 ? 1 : 2);
+
     return;
   }
 
@@ -2346,6 +3732,23 @@
                            !cm->last_intra_only &&
                            cm->last_show_frame &&
                            (cm->last_frame_type != KEY_FRAME);
+#if CONFIG_EXT_REFS
+  // NOTE(zoeliu): As cm->prev_frame can be neither a frame with
+  //               show_existing_frame=1 nor a frame that is not used as a
+  //               reference, it is possible that by the time it is being
+  //               referred to, the frame buffer it originally pointed to
+  //               may already have expired and been reassigned to the
+  //               current newly coded frame. Hence, we need to check
+  //               whether this is the case, and if so, we have two choices:
+  //               (1) Simply disable the use of previous frame mvs; or
+  //               (2) Have cm->prev_frame point to one reference frame
+  //                   buffer, e.g. LAST_FRAME.
+  if (cm->use_prev_frame_mvs && !dec_is_ref_frame_buf(pbi, cm->prev_frame)) {
+    // Reassign the LAST_FRAME buffer to cm->prev_frame.
+    RefBuffer *last_fb_ref_buf = &cm->frame_refs[LAST_FRAME - LAST_FRAME];
+    cm->prev_frame = &cm->buffer_pool->frame_bufs[last_fb_ref_buf->idx];
+  }
+#endif  // CONFIG_EXT_REFS
 
   vp10_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
 
@@ -2385,7 +3788,16 @@
     vp10_frameworker_unlock_stats(worker);
   }
 
-  if (pbi->max_threads > 1 && tile_rows == 1 && tile_cols > 1) {
+#if CONFIG_ENTROPY
+  vp10_copy(cm->starting_coef_probs, cm->fc->coef_probs);
+  cm->coef_probs_update_idx = 0;
+#endif  // CONFIG_ENTROPY
+
+  if (pbi->max_threads > 1
+#if CONFIG_EXT_TILE
+      && pbi->dec_tile_col < 0  // Decoding all columns
+#endif  // CONFIG_EXT_TILE
+      && cm->tile_cols > 1) {
     // Multi-threaded tile decoder
     *p_data_end = decode_tiles_mt(pbi, data + first_partition_size, data_end);
     if (!xd->corrupted) {
@@ -2404,18 +3816,24 @@
   } else {
     *p_data_end = decode_tiles(pbi, data + first_partition_size, data_end);
   }
+#if CONFIG_LOOP_RESTORATION
+  if (cm->rst_info.restoration_type != RESTORE_NONE) {
+    vp10_loop_restoration_init(&cm->rst_internal,
+                               &cm->rst_info,
+                               cm->frame_type == KEY_FRAME);
+    vp10_loop_restoration_rows(new_fb, cm, 0, cm->mi_rows, 0);
+  }
+#endif  // CONFIG_LOOP_RESTORATION
 
   if (!xd->corrupted) {
     if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+#if CONFIG_ENTROPY
+      cm->partial_prob_update = 0;
+#endif  // CONFIG_ENTROPY
       vp10_adapt_coef_probs(cm);
-#if CONFIG_MISC_FIXES
       vp10_adapt_intra_frame_probs(cm);
-#endif
 
       if (!frame_is_intra_only(cm)) {
-#if !CONFIG_MISC_FIXES
-        vp10_adapt_intra_frame_probs(cm);
-#endif
         vp10_adapt_inter_frame_probs(cm);
         vp10_adapt_mv_probs(cm, cm->allow_high_precision_mv);
       }
@@ -2428,7 +3846,6 @@
   }
 
   // Non frame parallel update frame context here.
-  if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF &&
-      !context_updated)
+  if (!cm->error_resilient_mode && !context_updated)
     cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
 }
diff --git a/vp10/decoder/decodemv.c b/vp10/decoder/decodemv.c
index 01b796c..8528370 100644
--- a/vp10/decoder/decodemv.c
+++ b/vp10/decoder/decodemv.c
@@ -24,12 +24,25 @@
 
 #include "vpx_dsp/vpx_dsp_common.h"
 
-static PREDICTION_MODE read_intra_mode(vpx_reader *r, const vpx_prob *p) {
-  return (PREDICTION_MODE)vpx_read_tree(r, vp10_intra_mode_tree, p);
+static INLINE int read_uniform(vp10_reader *r, int n) {
+  const int l = get_unsigned_bits(n);
+  const int m = (1 << l) - n;
+  int v;
+
+  assert(l != 0);
+  v = vp10_read_literal(r, l - 1);
+
+  if (v < m)
+    return v;
+  else
+    return (v << 1) - m + vp10_read_literal(r, 1);
+}
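+// read_uniform() above decodes a value in [0, n) with a quasi-uniform
+// code: with l = get_unsigned_bits(n) (the bit width of n) and
+// m = 2^l - n, the first m symbols take l - 1 bits and the remaining
+// n - m take l bits. E.g. n = 5 gives l = 3 and m = 3, so symbols 0..2
+// use two bits while 3 and 4 consume a third bit.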
+
+static PREDICTION_MODE read_intra_mode(vp10_reader *r, const vpx_prob *p) {
+  return (PREDICTION_MODE)vp10_read_tree(r, vp10_intra_mode_tree, p);
 }
 
 static PREDICTION_MODE read_intra_mode_y(VP10_COMMON *cm, MACROBLOCKD *xd,
-                                         vpx_reader *r, int size_group) {
+                                         vp10_reader *r, int size_group) {
   const PREDICTION_MODE y_mode =
       read_intra_mode(r, cm->fc->y_mode_prob[size_group]);
   FRAME_COUNTS *counts = xd->counts;
@@ -39,7 +52,7 @@
 }
 
 static PREDICTION_MODE read_intra_mode_uv(VP10_COMMON *cm, MACROBLOCKD *xd,
-                                          vpx_reader *r,
+                                          vp10_reader *r,
                                           PREDICTION_MODE y_mode) {
   const PREDICTION_MODE uv_mode = read_intra_mode(r,
                                          cm->fc->uv_mode_prob[y_mode]);
@@ -49,41 +62,255 @@
   return uv_mode;
 }
 
+#if CONFIG_EXT_INTER
+static INTERINTRA_MODE read_interintra_mode(VP10_COMMON *cm, MACROBLOCKD *xd,
+                                            vp10_reader *r, int size_group) {
+  const INTERINTRA_MODE ii_mode =
+      (INTERINTRA_MODE)vp10_read_tree(r, vp10_interintra_mode_tree,
+                                      cm->fc->interintra_mode_prob[size_group]);
+  FRAME_COUNTS *counts = xd->counts;
+  if (counts)
+    ++counts->interintra_mode[size_group][ii_mode];
+  return ii_mode;
+}
+#endif  // CONFIG_EXT_INTER
+
 static PREDICTION_MODE read_inter_mode(VP10_COMMON *cm, MACROBLOCKD *xd,
-                                       vpx_reader *r, int ctx) {
-  const int mode = vpx_read_tree(r, vp10_inter_mode_tree,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+                                       MB_MODE_INFO *mbmi,
+#endif
+                                       vp10_reader *r, int16_t ctx) {
+#if CONFIG_REF_MV
+  FRAME_COUNTS *counts = xd->counts;
+  int16_t mode_ctx = ctx & NEWMV_CTX_MASK;
+  vpx_prob mode_prob = cm->fc->newmv_prob[mode_ctx];
+
+  if (vp10_read(r, mode_prob) == 0) {
+    if (counts)
+      ++counts->newmv_mode[mode_ctx][0];
+
+#if CONFIG_EXT_INTER
+    if (has_second_ref(mbmi)) {
+#endif  // CONFIG_EXT_INTER
+    return NEWMV;
+#if CONFIG_EXT_INTER
+    } else {
+      mode_prob = cm->fc->new2mv_prob;
+      if (vp10_read(r, mode_prob) == 0) {
+        if (counts)
+          ++counts->new2mv_mode[0];
+        return NEWMV;
+      } else {
+        if (counts)
+          ++counts->new2mv_mode[1];
+        return NEWFROMNEARMV;
+      }
+    }
+#endif  // CONFIG_EXT_INTER
+  }
+  if (counts)
+    ++counts->newmv_mode[mode_ctx][1];
+
+  if (ctx & (1 << ALL_ZERO_FLAG_OFFSET))
+    return ZEROMV;
+
+  mode_ctx = (ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
+
+  mode_prob = cm->fc->zeromv_prob[mode_ctx];
+  if (vp10_read(r, mode_prob) == 0) {
+    if (counts)
+      ++counts->zeromv_mode[mode_ctx][0];
+    return ZEROMV;
+  }
+  if (counts)
+    ++counts->zeromv_mode[mode_ctx][1];
+
+  mode_ctx = (ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+
+  if (ctx & (1 << SKIP_NEARESTMV_OFFSET))
+    mode_ctx = 6;
+  if (ctx & (1 << SKIP_NEARMV_OFFSET))
+    mode_ctx = 7;
+  if (ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET))
+    mode_ctx = 8;
+
+  mode_prob = cm->fc->refmv_prob[mode_ctx];
+
+  if (vp10_read(r, mode_prob) == 0) {
+    if (counts)
+      ++counts->refmv_mode[mode_ctx][0];
+
+    return NEARESTMV;
+  } else {
+    if (counts)
+      ++counts->refmv_mode[mode_ctx][1];
+    return NEARMV;
+  }
+
+  // Invalid prediction mode.
+  assert(0);
+#else
+  const int mode = vp10_read_tree(r, vp10_inter_mode_tree,
                                  cm->fc->inter_mode_probs[ctx]);
   FRAME_COUNTS *counts = xd->counts;
   if (counts)
     ++counts->inter_mode[ctx][mode];
 
   return NEARESTMV + mode;
+#endif
 }
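+// With CONFIG_REF_MV the inter mode is no longer a single tree read:
+// ctx arrives as a bitfield packing separate NEWMV, ZEROMV and REFMV
+// contexts plus skip hints, and the mode is resolved through at most
+// three binary decisions (NEWMV?, then ZEROMV?, then NEARESTMV vs
+// NEARMV).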
 
-static int read_segment_id(vpx_reader *r,
-    const struct segmentation_probs *segp) {
-  return vpx_read_tree(r, vp10_segment_tree, segp->tree_probs);
-}
+#if CONFIG_REF_MV
+static void read_drl_idx(const VP10_COMMON *cm,
+                         MACROBLOCKD *xd,
+                         MB_MODE_INFO *mbmi,
+                         vp10_reader *r) {
+  uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
+  mbmi->ref_mv_idx = 0;
 
-static TX_SIZE read_selected_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd,
-                                     TX_SIZE max_tx_size, vpx_reader *r) {
-  FRAME_COUNTS *counts = xd->counts;
-  const int ctx = get_tx_size_context(xd);
-  const vpx_prob *tx_probs = get_tx_probs(max_tx_size, ctx, &cm->fc->tx_probs);
-  int tx_size = vpx_read(r, tx_probs[0]);
-  if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
-    tx_size += vpx_read(r, tx_probs[1]);
-    if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
-      tx_size += vpx_read(r, tx_probs[2]);
+  if (mbmi->mode == NEWMV) {
+    int idx;
+    for (idx = 0; idx < 2; ++idx) {
+      if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
+        uint8_t drl_ctx = vp10_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx);
+        vpx_prob drl_prob = cm->fc->drl_prob[drl_ctx];
+        if (!vp10_read(r, drl_prob)) {
+          mbmi->ref_mv_idx = idx;
+          if (xd->counts)
+            ++xd->counts->drl_mode[drl_ctx][0];
+          return;
+        }
+        mbmi->ref_mv_idx = idx + 1;
+        if (xd->counts)
+          ++xd->counts->drl_mode[drl_ctx][1];
+      }
+    }
   }
 
+  if (mbmi->mode == NEARMV) {
+    int idx;
+    // Offset the NEARESTMV mode.
+    // TODO(jingning): Unify the two syntax decoding loops after the NEARESTMV
+    // mode is factored in.
+    for (idx = 1; idx < 3; ++idx) {
+      if (xd->ref_mv_count[ref_frame_type] > idx + 1) {
+        uint8_t drl_ctx = vp10_drl_ctx(xd->ref_mv_stack[ref_frame_type], idx);
+        vpx_prob drl_prob = cm->fc->drl_prob[drl_ctx];
+        if (!vp10_read(r, drl_prob)) {
+          mbmi->ref_mv_idx = idx - 1;
+          if (xd->counts)
+            ++xd->counts->drl_mode[drl_ctx][0];
+          return;
+        }
+        mbmi->ref_mv_idx = idx;
+        if (xd->counts)
+          ++xd->counts->drl_mode[drl_ctx][1];
+      }
+    }
+  }
+}
+#endif
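+// read_drl_idx() above resolves the dynamic reference-MV list index: for
+// NEWMV up to two additional candidates may be signalled, while for
+// NEARMV the scan starts at idx == 1 (index 0 belongs to NEARESTMV) and
+// the decoded ref_mv_idx is shifted down by one accordingly.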
+
+#if CONFIG_EXT_INTER
+static PREDICTION_MODE read_inter_compound_mode(VP10_COMMON *cm,
+                                                MACROBLOCKD *xd,
+                                                vp10_reader *r, int16_t ctx) {
+  const int mode = vp10_read_tree(r, vp10_inter_compound_mode_tree,
+                                 cm->fc->inter_compound_mode_probs[ctx]);
+  FRAME_COUNTS *counts = xd->counts;
+
   if (counts)
-    ++get_tx_counts(max_tx_size, ctx, &counts->tx)[tx_size];
+    ++counts->inter_compound_mode[ctx][mode];
+
+  assert(is_inter_compound_mode(NEAREST_NEARESTMV + mode));
+  return NEAREST_NEARESTMV + mode;
+}
+#endif  // CONFIG_EXT_INTER
+
+static int read_segment_id(vp10_reader *r,
+    const struct segmentation_probs *segp) {
+  return vp10_read_tree(r, vp10_segment_tree, segp->tree_probs);
+}
+
+#if CONFIG_VAR_TX
+static void read_tx_size_inter(VP10_COMMON *cm, MACROBLOCKD *xd,
+                               MB_MODE_INFO *mbmi, FRAME_COUNTS *counts,
+                               TX_SIZE tx_size, int blk_row, int blk_col,
+                               vp10_reader *r) {
+  int is_split = 0;
+  const int tx_row = blk_row >> 1;
+  const int tx_col = blk_col >> 1;
+  int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
+                                   xd->left_txfm_context + tx_row,
+                                   tx_size);
+  TX_SIZE (*const inter_tx_size)[MAX_MIB_SIZE] =
+    (TX_SIZE (*)[MAX_MIB_SIZE])&mbmi->inter_tx_size[tx_row][tx_col];
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> 5;
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> 5;
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  is_split = vp10_read(r, cm->fc->txfm_partition_prob[ctx]);
+
+  if (is_split) {
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int bsl = b_width_log2_lookup[bsize];
+    int i;
+
+    if (counts)
+      ++counts->txfm_partition[ctx][1];
+
+    if (tx_size == TX_8X8) {
+      inter_tx_size[0][0] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + tx_col,
+                            xd->left_txfm_context + tx_row, TX_4X4);
+      return;
+    }
+
+    assert(bsl > 0);
+    --bsl;
+    for (i = 0; i < 4; ++i) {
+      int offsetr = blk_row + ((i >> 1) << bsl);
+      int offsetc = blk_col + ((i & 0x01) << bsl);
+      read_tx_size_inter(cm, xd, mbmi, counts,
+                         tx_size - 1, offsetr, offsetc, r);
+    }
+  } else {
+    int idx, idy;
+    inter_tx_size[0][0] = tx_size;
+    for (idy = 0; idy < (1 << tx_size) / 2; ++idy)
+      for (idx = 0; idx < (1 << tx_size) / 2; ++idx)
+        inter_tx_size[idy][idx] = tx_size;
+    mbmi->tx_size = tx_size;
+    if (counts)
+      ++counts->txfm_partition[ctx][0];
+    txfm_partition_update(xd->above_txfm_context + tx_col,
+                          xd->left_txfm_context + tx_row, tx_size);
+  }
+}
+#endif
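+// read_tx_size_inter() walks the transform partition tree recursively:
+// each node reads a split flag in a context derived from the above/left
+// txfm contexts, recursing into four quadrants on a split (bottoming out
+// at TX_4X4) and otherwise recording tx_size across the covered 8x8
+// units before updating the boundary contexts.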
+
+static TX_SIZE read_selected_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd,
+                                     TX_SIZE max_tx_size, vp10_reader *r) {
+  FRAME_COUNTS *counts = xd->counts;
+  const int ctx = get_tx_size_context(xd);
+  const int tx_size_cat = max_tx_size - TX_8X8;
+  int tx_size = vp10_read_tree(r, vp10_tx_size_tree[tx_size_cat],
+                              cm->fc->tx_size_probs[tx_size_cat][ctx]);
+  if (counts)
+    ++counts->tx_size[tx_size_cat][ctx][tx_size];
   return (TX_SIZE)tx_size;
 }
 
 static TX_SIZE read_tx_size(VP10_COMMON *cm, MACROBLOCKD *xd,
-                            int allow_select, vpx_reader *r) {
+                            int allow_select, vp10_reader *r) {
   TX_MODE tx_mode = cm->tx_mode;
   BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
@@ -121,30 +348,20 @@
 
 static int read_intra_segment_id(VP10_COMMON *const cm, MACROBLOCKD *const xd,
                                  int mi_offset, int x_mis, int y_mis,
-                                 vpx_reader *r) {
+                                 vp10_reader *r) {
   struct segmentation *const seg = &cm->seg;
-#if CONFIG_MISC_FIXES
   FRAME_COUNTS *counts = xd->counts;
   struct segmentation_probs *const segp = &cm->fc->seg;
-#else
-  struct segmentation_probs *const segp = &cm->segp;
-#endif
   int segment_id;
 
-#if !CONFIG_MISC_FIXES
-  (void) xd;
-#endif
-
   if (!seg->enabled)
     return 0;  // Default for disabled segmentation
 
   assert(seg->update_map && !seg->temporal_update);
 
   segment_id = read_segment_id(r, segp);
-#if CONFIG_MISC_FIXES
   if (counts)
     ++counts->seg.tree_total[segment_id];
-#endif
   set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
   return segment_id;
 }
@@ -162,19 +379,15 @@
 }
 
 static int read_inter_segment_id(VP10_COMMON *const cm, MACROBLOCKD *const xd,
-                                 int mi_row, int mi_col, vpx_reader *r) {
+                                 int mi_row, int mi_col, vp10_reader *r) {
   struct segmentation *const seg = &cm->seg;
-#if CONFIG_MISC_FIXES
   FRAME_COUNTS *counts = xd->counts;
   struct segmentation_probs *const segp = &cm->fc->seg;
-#else
-  struct segmentation_probs *const segp = &cm->segp;
-#endif
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   int predicted_segment_id, segment_id;
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
-  const int bw = xd->plane[0].n4_w >> 1;
-  const int bh = xd->plane[0].n4_h >> 1;
+  const int bw = num_8x8_blocks_wide_lookup[mbmi->sb_type];
+  const int bh = num_8x8_blocks_high_lookup[mbmi->sb_type];
 
   // TODO(slavarnway): move x_mis, y_mis into xd ?????
   const int x_mis = VPXMIN(cm->mi_cols - mi_col, bw);
@@ -196,38 +409,32 @@
   if (seg->temporal_update) {
     const int ctx = vp10_get_pred_context_seg_id(xd);
     const vpx_prob pred_prob = segp->pred_probs[ctx];
-    mbmi->seg_id_predicted = vpx_read(r, pred_prob);
-#if CONFIG_MISC_FIXES
+    mbmi->seg_id_predicted = vp10_read(r, pred_prob);
     if (counts)
       ++counts->seg.pred[ctx][mbmi->seg_id_predicted];
-#endif
     if (mbmi->seg_id_predicted) {
       segment_id = predicted_segment_id;
     } else {
       segment_id = read_segment_id(r, segp);
-#if CONFIG_MISC_FIXES
       if (counts)
         ++counts->seg.tree_mispred[segment_id];
-#endif
     }
   } else {
     segment_id = read_segment_id(r, segp);
-#if CONFIG_MISC_FIXES
     if (counts)
       ++counts->seg.tree_total[segment_id];
-#endif
   }
   set_segment_id(cm, mi_offset, x_mis, y_mis, segment_id);
   return segment_id;
 }
 
 static int read_skip(VP10_COMMON *cm, const MACROBLOCKD *xd,
-                     int segment_id, vpx_reader *r) {
+                     int segment_id, vp10_reader *r) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
     const int ctx = vp10_get_skip_context(xd);
-    const int skip = vpx_read(r, cm->fc->skip_probs[ctx]);
+    const int skip = vp10_read(r, cm->fc->skip_probs[ctx]);
     FRAME_COUNTS *counts = xd->counts;
     if (counts)
       ++counts->skip[ctx][skip];
@@ -235,9 +442,125 @@
   }
 }
 
+static void read_palette_mode_info(VP10_COMMON *const cm,
+                                   MACROBLOCKD *const xd,
+                                   vp10_reader *r) {
+  MODE_INFO *const mi = xd->mi[0];
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi  = xd->left_mi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  int i, n, palette_ctx = 0;
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+
+  if (mbmi->mode == DC_PRED) {
+    if (above_mi)
+      palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+    if (left_mi)
+      palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+    if (vp10_read(r, vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
+                                                     [palette_ctx])) {
+      pmi->palette_size[0] =
+        vp10_read_tree(r, vp10_palette_size_tree,
+                      vp10_default_palette_y_size_prob[bsize - BLOCK_8X8]) + 2;
+      n = pmi->palette_size[0];
+      for (i = 0; i < n; ++i)
+        pmi->palette_colors[i] = vp10_read_literal(r, cm->bit_depth);
+
+      xd->plane[0].color_index_map[0] = read_uniform(r, n);
+      assert(xd->plane[0].color_index_map[0] < n);
+    }
+  }
+
+  if (mbmi->uv_mode == DC_PRED) {
+    if (vp10_read(r,
+                 vp10_default_palette_uv_mode_prob[pmi->palette_size[0] > 0])) {
+      pmi->palette_size[1] =
+          vp10_read_tree(r, vp10_palette_size_tree,
+                        vp10_default_palette_uv_size_prob[bsize - BLOCK_8X8])
+                        + 2;
+      n = pmi->palette_size[1];
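+      // U colors are stored at offset PALETTE_MAX_SIZE and V colors at
+      // 2 * PALETTE_MAX_SIZE within the shared palette_colors array.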
+      for (i = 0; i < n; ++i) {
+        pmi->palette_colors[PALETTE_MAX_SIZE + i] =
+            vp10_read_literal(r, cm->bit_depth);
+        pmi->palette_colors[2 * PALETTE_MAX_SIZE + i] =
+            vp10_read_literal(r, cm->bit_depth);
+      }
+      xd->plane[1].color_index_map[0] = read_uniform(r, n);
+      assert(xd->plane[1].color_index_map[0] < n);
+    }
+  }
+}
+
+#if CONFIG_EXT_INTRA
+static void read_ext_intra_mode_info(VP10_COMMON *const cm,
+                                     MACROBLOCKD *const xd, vp10_reader *r) {
+  MODE_INFO *const mi = xd->mi[0];
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  FRAME_COUNTS *counts = xd->counts;
+
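+  // When filter-intra modes are compiled out, no syntax is read here.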
+#if !ALLOW_FILTER_INTRA_MODES
+  return;
+#endif
+  if (mbmi->mode == DC_PRED &&
+      mbmi->palette_mode_info.palette_size[0] == 0) {
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[0] =
+        vp10_read(r, cm->fc->ext_intra_probs[0]);
+    if (mbmi->ext_intra_mode_info.use_ext_intra_mode[0]) {
+      mbmi->ext_intra_mode_info.ext_intra_mode[0] =
+          read_uniform(r, FILTER_INTRA_MODES);
+    }
+    if (counts)
+      ++counts->ext_intra[0][mbmi->ext_intra_mode_info.use_ext_intra_mode[0]];
+  }
+  if (mbmi->uv_mode == DC_PRED &&
+      mbmi->palette_mode_info.palette_size[1] == 0) {
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
+        vp10_read(r, cm->fc->ext_intra_probs[1]);
+    if (mbmi->ext_intra_mode_info.use_ext_intra_mode[1]) {
+      mbmi->ext_intra_mode_info.ext_intra_mode[1] =
+          read_uniform(r, FILTER_INTRA_MODES);
+    }
+    if (counts)
+      ++counts->ext_intra[1][mbmi->ext_intra_mode_info.use_ext_intra_mode[1]];
+  }
+}
+
+static void read_intra_angle_info(VP10_COMMON *const cm, MACROBLOCKD *const xd,
+                                  vp10_reader *r) {
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int ctx = vp10_get_pred_context_intra_interp(xd);
+  int p_angle;
+
+  if (bsize < BLOCK_8X8)
+    return;
+
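+  // For directional modes, an angle delta in [-MAX_ANGLE_DELTAS,
+  // MAX_ANGLE_DELTAS] is coded uniformly and applied in ANGLE_STEP units.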
+  if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED) {
+    mbmi->angle_delta[0] =
+        read_uniform(r, 2 * MAX_ANGLE_DELTAS + 1) - MAX_ANGLE_DELTAS;
+    p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+    if (vp10_is_intra_filter_switchable(p_angle)) {
+      FRAME_COUNTS *counts = xd->counts;
+      mbmi->intra_filter = vp10_read_tree(r, vp10_intra_filter_tree,
+                                          cm->fc->intra_filter_probs[ctx]);
+      if (counts)
+        ++counts->intra_filter[ctx][mbmi->intra_filter];
+    } else {
+      mbmi->intra_filter = INTRA_FILTER_LINEAR;
+    }
+  }
+
+  if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED) {
+    mbmi->angle_delta[1] =
+        read_uniform(r, 2 * MAX_ANGLE_DELTAS + 1) - MAX_ANGLE_DELTAS;
+  }
+}
+#endif  // CONFIG_EXT_INTRA
+
 static void read_intra_frame_mode_info(VP10_COMMON *const cm,
                                        MACROBLOCKD *const xd,
-                                       int mi_row, int mi_col, vpx_reader *r) {
+                                       int mi_row, int mi_col, vp10_reader *r) {
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const MODE_INFO *above_mi = xd->above_mi;
@@ -283,32 +606,67 @@
   }
 
   mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
+#if CONFIG_EXT_INTRA
+  read_intra_angle_info(cm, xd, r);
+#endif  // CONFIG_EXT_INTRA
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+  if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
+    read_palette_mode_info(cm, xd, r);
+#if CONFIG_EXT_INTRA
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+  if (bsize >= BLOCK_8X8)
+    read_ext_intra_mode_info(cm, xd, r);
+#endif  // CONFIG_EXT_INTRA
 
-  if (mbmi->tx_size < TX_32X32 &&
-      cm->base_qindex > 0 && !mbmi->skip &&
-      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-    FRAME_COUNTS *counts = xd->counts;
-    TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
-    mbmi->tx_type = vpx_read_tree(
-        r, vp10_ext_tx_tree,
-        cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom]);
-    if (counts)
-      ++counts->intra_ext_tx[mbmi->tx_size][tx_type_nom][mbmi->tx_type];
-  } else {
-    mbmi->tx_type = DCT_DCT;
+  if (!FIXED_TX_TYPE) {
+#if CONFIG_EXT_TX
+    if (get_ext_tx_types(mbmi->tx_size, mbmi->sb_type, 0) > 1 &&
+        cm->base_qindex > 0 && !mbmi->skip &&
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
+        ALLOW_INTRA_EXT_TX) {
+      FRAME_COUNTS *counts = xd->counts;
+      int eset = get_ext_tx_set(mbmi->tx_size, mbmi->sb_type, 0);
+      if (eset > 0) {
+        mbmi->tx_type = vp10_read_tree(
+            r, vp10_ext_tx_intra_tree[eset],
+            cm->fc->intra_ext_tx_prob[eset][mbmi->tx_size][mbmi->mode]);
+        if (counts)
+          ++counts->intra_ext_tx[eset][mbmi->tx_size][mbmi->mode]
+                                                     [mbmi->tx_type];
+      }
+    } else {
+      mbmi->tx_type = DCT_DCT;
+    }
+#else
+    if (mbmi->tx_size < TX_32X32 &&
+        cm->base_qindex > 0 && !mbmi->skip &&
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      FRAME_COUNTS *counts = xd->counts;
+      TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
+      mbmi->tx_type = vp10_read_tree(
+          r, vp10_ext_tx_tree,
+          cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom]);
+      if (counts)
+        ++counts->intra_ext_tx[mbmi->tx_size][tx_type_nom][mbmi->tx_type];
+    } else {
+      mbmi->tx_type = DCT_DCT;
+    }
+#endif  // CONFIG_EXT_TX
   }
 }
 
-static int read_mv_component(vpx_reader *r,
+static int read_mv_component(vp10_reader *r,
                              const nmv_component *mvcomp, int usehp) {
   int mag, d, fr, hp;
-  const int sign = vpx_read(r, mvcomp->sign);
-  const int mv_class = vpx_read_tree(r, vp10_mv_class_tree, mvcomp->classes);
+  const int sign = vp10_read(r, mvcomp->sign);
+  const int mv_class = vp10_read_tree(r, vp10_mv_class_tree, mvcomp->classes);
   const int class0 = mv_class == MV_CLASS_0;
 
   // Integer part
   if (class0) {
-    d = vpx_read_tree(r, vp10_mv_class0_tree, mvcomp->class0);
+    d = vp10_read_tree(r, vp10_mv_class0_tree, mvcomp->class0);
     mag = 0;
   } else {
     int i;
@@ -316,16 +674,16 @@
 
     d = 0;
     for (i = 0; i < n; ++i)
-      d |= vpx_read(r, mvcomp->bits[i]) << i;
+      d |= vp10_read(r, mvcomp->bits[i]) << i;
     mag = CLASS0_SIZE << (mv_class + 2);
   }
 
   // Fractional part
-  fr = vpx_read_tree(r, vp10_mv_fp_tree, class0 ? mvcomp->class0_fp[d]
+  fr = vp10_read_tree(r, vp10_mv_fp_tree, class0 ? mvcomp->class0_fp[d]
                                                : mvcomp->fp);
 
   // High-precision part (hp defaults to 1 when high precision is not used)
-  hp = usehp ? vpx_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp)
+  hp = usehp ? vp10_read(r, class0 ? mvcomp->class0_hp : mvcomp->hp)
              : 1;
 
   // Result
@@ -333,14 +691,38 @@
   return sign ? -mag : mag;
 }
 
-static INLINE void read_mv(vpx_reader *r, MV *mv, const MV *ref,
+static INLINE void read_mv(vp10_reader *r, MV *mv, const MV *ref,
+#if CONFIG_REF_MV
+                           int is_compound,
+#endif
                            const nmv_context *ctx,
                            nmv_context_counts *counts, int allow_hp) {
-  const MV_JOINT_TYPE joint_type =
-      (MV_JOINT_TYPE)vpx_read_tree(r, vp10_mv_joint_tree, ctx->joints);
+  MV_JOINT_TYPE joint_type;
   const int use_hp = allow_hp && vp10_use_mv_hp(ref);
   MV diff = {0, 0};
 
+#if CONFIG_REF_MV && !CONFIG_EXT_INTER
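+  // Compound blocks code an explicit zero-MV flag first; the usual joint
+  // type is only read when that flag is clear.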
+  if (is_compound) {
+    int is_zero_rmv = vp10_read(r, ctx->zero_rmv);
+    if (is_zero_rmv) {
+      joint_type = MV_JOINT_ZERO;
+    } else {
+      joint_type = (MV_JOINT_TYPE)vp10_read_tree(r, vp10_mv_joint_tree,
+                                                 ctx->joints);
+    }
+  } else {
+    joint_type = (MV_JOINT_TYPE)vp10_read_tree(r, vp10_mv_joint_tree,
+                                               ctx->joints);
+  }
+#else
+  joint_type = (MV_JOINT_TYPE)vp10_read_tree(r, vp10_mv_joint_tree,
+                                             ctx->joints);
+#endif
+
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+  (void)is_compound;
+#endif
+
   if (mv_joint_vertical(joint_type))
     diff.row = read_mv_component(r, &ctx->comps[0], use_hp);
 
@@ -355,11 +737,11 @@
 
 static REFERENCE_MODE read_block_reference_mode(VP10_COMMON *cm,
                                                 const MACROBLOCKD *xd,
-                                                vpx_reader *r) {
+                                                vp10_reader *r) {
   if (cm->reference_mode == REFERENCE_MODE_SELECT) {
     const int ctx = vp10_get_reference_mode_context(cm, xd);
     const REFERENCE_MODE mode =
-        (REFERENCE_MODE)vpx_read(r, cm->fc->comp_inter_prob[ctx]);
+        (REFERENCE_MODE)vp10_read(r, cm->fc->comp_inter_prob[ctx]);
     FRAME_COUNTS *counts = xd->counts;
     if (counts)
       ++counts->comp_inter[ctx][mode];
@@ -371,7 +753,7 @@
 
 // Read the reference frame
 static void read_ref_frames(VP10_COMMON *const cm, MACROBLOCKD *const xd,
-                            vpx_reader *r,
+                            vp10_reader *r,
                             int segment_id, MV_REFERENCE_FRAME ref_frame[2]) {
   FRAME_CONTEXT *const fc = cm->fc;
   FRAME_COUNTS *counts = xd->counts;
@@ -384,27 +766,93 @@
     const REFERENCE_MODE mode = read_block_reference_mode(cm, xd, r);
     // FIXME(rbultje) I'm pretty sure this breaks segmentation ref frame coding
     if (mode == COMPOUND_REFERENCE) {
+#if CONFIG_EXT_REFS
+      const int idx = cm->ref_frame_sign_bias[cm->comp_bwd_ref[0]];
+#else
       const int idx = cm->ref_frame_sign_bias[cm->comp_fixed_ref];
+#endif  // CONFIG_EXT_REFS
       const int ctx = vp10_get_pred_context_comp_ref_p(cm, xd);
-      const int bit = vpx_read(r, fc->comp_ref_prob[ctx]);
+      const int bit = vp10_read(r, fc->comp_ref_prob[ctx][0]);
+
       if (counts)
-        ++counts->comp_ref[ctx][bit];
-      ref_frame[idx] = cm->comp_fixed_ref;
+        ++counts->comp_ref[ctx][0][bit];
+
+#if CONFIG_EXT_REFS
+      // Decode forward references.
+      if (!bit) {
+        const int ctx1 = vp10_get_pred_context_comp_ref_p1(cm, xd);
+        const int bit1 = vp10_read(r, fc->comp_ref_prob[ctx1][1]);
+        if (counts)
+          ++counts->comp_ref[ctx1][1][bit1];
+        ref_frame[!idx] = cm->comp_fwd_ref[bit1 ? 0 : 1];
+      } else {
+        const int ctx2 = vp10_get_pred_context_comp_ref_p2(cm, xd);
+        const int bit2 = vp10_read(r, fc->comp_ref_prob[ctx2][2]);
+        if (counts)
+          ++counts->comp_ref[ctx2][2][bit2];
+        ref_frame[!idx] = cm->comp_fwd_ref[bit2 ? 3 : 2];
+      }
+
+      // Decode backward references.
+      {
+        const int ctx_bwd = vp10_get_pred_context_comp_bwdref_p(cm, xd);
+        const int bit_bwd = vp10_read(r, fc->comp_bwdref_prob[ctx_bwd][0]);
+        if (counts)
+          ++counts->comp_bwdref[ctx_bwd][0][bit_bwd];
+        ref_frame[idx] = cm->comp_bwd_ref[bit_bwd];
+      }
+#else
       ref_frame[!idx] = cm->comp_var_ref[bit];
+      ref_frame[idx] = cm->comp_fixed_ref;
+#endif  // CONFIG_EXT_REFS
     } else if (mode == SINGLE_REFERENCE) {
+#if CONFIG_EXT_REFS
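+      // Single-reference decision tree with extended refs:
+      //   bit0 == 1: {BWDREF, ALTREF}, resolved by bit1;
+      //   bit0 == 0: {LAST, LAST2, LAST3, GOLDEN}, split by bit2, then
+      //              refined by bit4 (LAST3/GOLDEN) or bit3 (LAST/LAST2).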
       const int ctx0 = vp10_get_pred_context_single_ref_p1(xd);
-      const int bit0 = vpx_read(r, fc->single_ref_prob[ctx0][0]);
+      const int bit0 = vp10_read(r, fc->single_ref_prob[ctx0][0]);
       if (counts)
         ++counts->single_ref[ctx0][0][bit0];
+
       if (bit0) {
         const int ctx1 = vp10_get_pred_context_single_ref_p2(xd);
-        const int bit1 = vpx_read(r, fc->single_ref_prob[ctx1][1]);
+        const int bit1 = vp10_read(r, fc->single_ref_prob[ctx1][1]);
+        if (counts)
+          ++counts->single_ref[ctx1][1][bit1];
+        ref_frame[0] = bit1 ? ALTREF_FRAME : BWDREF_FRAME;
+      } else {
+        const int ctx2 = vp10_get_pred_context_single_ref_p3(xd);
+        const int bit2 = vp10_read(r, fc->single_ref_prob[ctx2][2]);
+        if (counts)
+          ++counts->single_ref[ctx2][2][bit2];
+        if (bit2) {
+          const int ctx4 = vp10_get_pred_context_single_ref_p5(xd);
+          const int bit4 = vp10_read(r, fc->single_ref_prob[ctx4][4]);
+          if (counts)
+            ++counts->single_ref[ctx4][4][bit4];
+          ref_frame[0] = bit4 ? GOLDEN_FRAME : LAST3_FRAME;
+        } else {
+          const int ctx3 = vp10_get_pred_context_single_ref_p4(xd);
+          const int bit3 = vp10_read(r, fc->single_ref_prob[ctx3][3]);
+          if (counts)
+            ++counts->single_ref[ctx3][3][bit3];
+          ref_frame[0] = bit3 ? LAST2_FRAME : LAST_FRAME;
+        }
+      }
+#else
+      const int ctx0 = vp10_get_pred_context_single_ref_p1(xd);
+      const int bit0 = vp10_read(r, fc->single_ref_prob[ctx0][0]);
+      if (counts)
+        ++counts->single_ref[ctx0][0][bit0];
+
+      if (bit0) {
+        const int ctx1 = vp10_get_pred_context_single_ref_p2(xd);
+        const int bit1 = vp10_read(r, fc->single_ref_prob[ctx1][1]);
         if (counts)
           ++counts->single_ref[ctx1][1][bit1];
         ref_frame[0] = bit1 ? ALTREF_FRAME : GOLDEN_FRAME;
       } else {
         ref_frame[0] = LAST_FRAME;
       }
+#endif  // CONFIG_EXT_REFS
 
       ref_frame[1] = NONE;
     } else {
@@ -414,22 +862,55 @@
 }
 
 
-static INLINE INTERP_FILTER read_switchable_interp_filter(
-    VP10_COMMON *const cm, MACROBLOCKD *const xd,
-    vpx_reader *r) {
-  const int ctx = vp10_get_pred_context_switchable_interp(xd);
-  const INTERP_FILTER type =
-      (INTERP_FILTER)vpx_read_tree(r, vp10_switchable_interp_tree,
-                                   cm->fc->switchable_interp_prob[ctx]);
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+static MOTION_VARIATION read_motvar_block(
+    VP10_COMMON *const cm, MACROBLOCKD *const xd, vp10_reader *r) {
+  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
   FRAME_COUNTS *counts = xd->counts;
-  if (counts)
-    ++counts->switchable_interp[ctx][type];
-  return type;
+  MOTION_VARIATION motvar;
+
+  if (is_motvar_allowed(&xd->mi[0]->mbmi)) {
+    motvar = (MOTION_VARIATION)
+        vp10_read_tree(r, vp10_motvar_tree, cm->fc->motvar_prob[bsize]);
+    if (counts)
+      ++counts->motvar[bsize][motvar];
+    return motvar;
+  } else {
+    return SIMPLE_TRANSLATION;
+  }
+}
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+
+static INLINE INTERP_FILTER read_interp_filter(
+    VP10_COMMON *const cm, MACROBLOCKD *const xd,
+#if CONFIG_DUAL_FILTER
+    int dir,
+#endif
+    vp10_reader *r) {
+#if CONFIG_EXT_INTERP
+  if (!vp10_is_interp_needed(xd)) return EIGHTTAP_REGULAR;
+#endif
+  if (cm->interp_filter != SWITCHABLE) {
+    return cm->interp_filter;
+  } else {
+#if CONFIG_DUAL_FILTER
+    const int ctx = vp10_get_pred_context_switchable_interp(xd, dir);
+#else
+    const int ctx = vp10_get_pred_context_switchable_interp(xd);
+#endif
+    FRAME_COUNTS *counts = xd->counts;
+    const INTERP_FILTER type =
+      (INTERP_FILTER)vp10_read_tree(r, vp10_switchable_interp_tree,
+                                    cm->fc->switchable_interp_prob[ctx]);
+    if (counts)
+      ++counts->switchable_interp[ctx][type];
+    return type;
+  }
 }
 
 static void read_intra_block_mode_info(VP10_COMMON *const cm,
                                        MACROBLOCKD *const xd, MODE_INFO *mi,
-                                       vpx_reader *r) {
+                                       vp10_reader *r) {
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mi->mbmi.sb_type;
   int i;
@@ -460,6 +941,19 @@
   }
 
   mbmi->uv_mode = read_intra_mode_uv(cm, xd, r, mbmi->mode);
+#if CONFIG_EXT_INTRA
+  read_intra_angle_info(cm, xd, r);
+#endif  // CONFIG_EXT_INTRA
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+  if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
+    read_palette_mode_info(cm, xd, r);
+#if CONFIG_EXT_INTRA
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+  if (bsize >= BLOCK_8X8)
+    read_ext_intra_mode_info(cm, xd, r);
+#endif  // CONFIG_EXT_INTRA
 }
 
 static INLINE int is_mv_valid(const MV *mv) {
@@ -469,20 +963,50 @@
 
 static INLINE int assign_mv(VP10_COMMON *cm, MACROBLOCKD *xd,
                             PREDICTION_MODE mode,
+#if CONFIG_REF_MV
+                            int block,
+#endif
                             int_mv mv[2], int_mv ref_mv[2],
                             int_mv nearest_mv[2], int_mv near_mv[2],
-                            int is_compound, int allow_hp, vpx_reader *r) {
+                            int is_compound, int allow_hp, vp10_reader *r) {
   int i;
   int ret = 1;
+#if CONFIG_REF_MV
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  BLOCK_SIZE bsize = mbmi->sb_type;
+  int_mv *pred_mv = (bsize >= BLOCK_8X8) ?
+      mbmi->pred_mv : xd->mi[0]->bmi[block].pred_mv_s8;
+#endif
 
   switch (mode) {
+#if CONFIG_EXT_INTER
+    case NEWFROMNEARMV:
+#endif  // CONFIG_EXT_INTER
     case NEWMV: {
       FRAME_COUNTS *counts = xd->counts;
+#if !CONFIG_REF_MV
       nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+#endif
       for (i = 0; i < 1 + is_compound; ++i) {
+#if CONFIG_REF_MV
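+        // Each reference frame selects its own NMV context from the
+        // candidate MV stack built for that frame.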
+        int nmv_ctx = vp10_nmv_ctx(xd->ref_mv_count[mbmi->ref_frame[i]],
+                                   xd->ref_mv_stack[mbmi->ref_frame[i]]);
+        nmv_context_counts *const mv_counts =
+            counts ? &counts->mv[nmv_ctx] : NULL;
+        read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv,
+#if CONFIG_REF_MV
+                is_compound,
+#endif
+                &cm->fc->nmvc[nmv_ctx], mv_counts, allow_hp);
+#else
         read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc->nmvc, mv_counts,
                 allow_hp);
+#endif
         ret = ret && is_mv_valid(&mv[i].as_mv);
+
+#if CONFIG_REF_MV
+        pred_mv[i].as_int = ref_mv[i].as_int;
+#endif
       }
       break;
     }
@@ -490,20 +1014,176 @@
       mv[0].as_int = nearest_mv[0].as_int;
       if (is_compound)
         mv[1].as_int = nearest_mv[1].as_int;
+
+#if CONFIG_REF_MV
+      pred_mv[0].as_int = nearest_mv[0].as_int;
+      if (is_compound)
+        pred_mv[1].as_int = nearest_mv[1].as_int;
+#endif
       break;
     }
     case NEARMV: {
       mv[0].as_int = near_mv[0].as_int;
       if (is_compound)
         mv[1].as_int = near_mv[1].as_int;
+
+#if CONFIG_REF_MV
+      pred_mv[0].as_int = near_mv[0].as_int;
+      if (is_compound)
+        pred_mv[1].as_int = near_mv[1].as_int;
+#endif
       break;
     }
     case ZEROMV: {
       mv[0].as_int = 0;
       if (is_compound)
         mv[1].as_int = 0;
+
+#if CONFIG_REF_MV
+      pred_mv[0].as_int = 0;
+      if (is_compound)
+        pred_mv[1].as_int = 0;
+#endif
       break;
     }
+#if CONFIG_EXT_INTER
+    case NEW_NEWMV: {
+      FRAME_COUNTS *counts = xd->counts;
+#if !CONFIG_REF_MV
+      nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+#endif
+      assert(is_compound);
+      for (i = 0; i < 2; ++i) {
+#if CONFIG_REF_MV
+        int nmv_ctx = vp10_nmv_ctx(xd->ref_mv_count[mbmi->ref_frame[i]],
+                                   xd->ref_mv_stack[mbmi->ref_frame[i]]);
+        nmv_context_counts *const mv_counts =
+            counts ? &counts->mv[nmv_ctx] : NULL;
+        read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, is_compound,
+                &cm->fc->nmvc[nmv_ctx], mv_counts,
+                allow_hp);
+#else
+        read_mv(r, &mv[i].as_mv, &ref_mv[i].as_mv, &cm->fc->nmvc, mv_counts,
+                allow_hp);
+#endif
+        ret = ret && is_mv_valid(&mv[i].as_mv);
+      }
+      break;
+    }
+    case NEAREST_NEARESTMV: {
+      assert(is_compound);
+      mv[0].as_int = nearest_mv[0].as_int;
+      mv[1].as_int = nearest_mv[1].as_int;
+      break;
+    }
+    case NEAREST_NEARMV: {
+      assert(is_compound);
+      mv[0].as_int = nearest_mv[0].as_int;
+      mv[1].as_int = near_mv[1].as_int;
+      break;
+    }
+    case NEAR_NEARESTMV: {
+      assert(is_compound);
+      mv[0].as_int = near_mv[0].as_int;
+      mv[1].as_int = nearest_mv[1].as_int;
+      break;
+    }
+    case NEAR_NEARMV: {
+      assert(is_compound);
+      mv[0].as_int = near_mv[0].as_int;
+      mv[1].as_int = near_mv[1].as_int;
+      break;
+    }
+    case NEW_NEARESTMV: {
+      FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_REF_MV
+      int nmv_ctx = vp10_nmv_ctx(xd->ref_mv_count[mbmi->ref_frame[0]],
+                                 xd->ref_mv_stack[mbmi->ref_frame[0]]);
+      nmv_context_counts *const mv_counts =
+          counts ? &counts->mv[nmv_ctx] : NULL;
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, is_compound,
+              &cm->fc->nmvc[nmv_ctx], mv_counts,
+              allow_hp);
+#else
+      nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, &cm->fc->nmvc, mv_counts,
+              allow_hp);
+#endif
+      assert(is_compound);
+      ret = ret && is_mv_valid(&mv[0].as_mv);
+      mv[1].as_int = nearest_mv[1].as_int;
+      break;
+    }
+    case NEAREST_NEWMV: {
+      FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_REF_MV
+      int nmv_ctx = vp10_nmv_ctx(xd->ref_mv_count[mbmi->ref_frame[1]],
+                                 xd->ref_mv_stack[mbmi->ref_frame[1]]);
+      nmv_context_counts *const mv_counts =
+          counts ? &counts->mv[nmv_ctx] : NULL;
+      mv[0].as_int = nearest_mv[0].as_int;
+      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, is_compound,
+              &cm->fc->nmvc[nmv_ctx], mv_counts,
+              allow_hp);
+#else
+      nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+      mv[0].as_int = nearest_mv[0].as_int;
+      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, &cm->fc->nmvc, mv_counts,
+              allow_hp);
+#endif
+      assert(is_compound);
+      ret = ret && is_mv_valid(&mv[1].as_mv);
+      break;
+    }
+    case NEAR_NEWMV: {
+      FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_REF_MV
+      int nmv_ctx = vp10_nmv_ctx(xd->ref_mv_count[mbmi->ref_frame[1]],
+                                 xd->ref_mv_stack[mbmi->ref_frame[1]]);
+      nmv_context_counts *const mv_counts =
+          counts ? &counts->mv[nmv_ctx] : NULL;
+      mv[0].as_int = near_mv[0].as_int;
+      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, is_compound,
+              &cm->fc->nmvc[nmv_ctx], mv_counts,
+              allow_hp);
+#else
+      nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+      mv[0].as_int = near_mv[0].as_int;
+      read_mv(r, &mv[1].as_mv, &ref_mv[1].as_mv, &cm->fc->nmvc, mv_counts,
+              allow_hp);
+#endif
+      assert(is_compound);
+
+      ret = ret && is_mv_valid(&mv[1].as_mv);
+      break;
+    }
+    case NEW_NEARMV: {
+      FRAME_COUNTS *counts = xd->counts;
+#if CONFIG_REF_MV
+      int nmv_ctx = vp10_nmv_ctx(xd->ref_mv_count[mbmi->ref_frame[0]],
+                                 xd->ref_mv_stack[mbmi->ref_frame[0]]);
+      nmv_context_counts *const mv_counts =
+          counts ? &counts->mv[nmv_ctx] : NULL;
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, is_compound,
+              &cm->fc->nmvc[nmv_ctx], mv_counts,
+              allow_hp);
+#else
+      nmv_context_counts *const mv_counts = counts ? &counts->mv : NULL;
+      read_mv(r, &mv[0].as_mv, &ref_mv[0].as_mv, &cm->fc->nmvc, mv_counts,
+              allow_hp);
+#endif
+      assert(is_compound);
+      ret = ret && is_mv_valid(&mv[0].as_mv);
+      mv[1].as_int = near_mv[1].as_int;
+      break;
+    }
+    case ZERO_ZEROMV: {
+      assert(is_compound);
+      mv[0].as_int = 0;
+      mv[1].as_int = 0;
+      break;
+    }
+#endif  // CONFIG_EXT_INTER
     default: {
       return 0;
     }
@@ -512,12 +1192,12 @@
 }
 
 static int read_is_inter_block(VP10_COMMON *const cm, MACROBLOCKD *const xd,
-                               int segment_id, vpx_reader *r) {
+                               int segment_id, vp10_reader *r) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_REF_FRAME)) {
     return get_segdata(&cm->seg, segment_id, SEG_LVL_REF_FRAME) != INTRA_FRAME;
   } else {
     const int ctx = vp10_get_intra_inter_context(xd);
-    const int is_inter = vpx_read(r, cm->fc->intra_inter_prob[ctx]);
+    const int is_inter = vp10_read(r, cm->fc->intra_inter_prob[ctx]);
     FRAME_COUNTS *counts = xd->counts;
     if (counts)
       ++counts->intra_inter[ctx][is_inter];
@@ -528,27 +1208,43 @@
 static void fpm_sync(void *const data, int mi_row) {
   VP10Decoder *const pbi = (VP10Decoder *)data;
   vp10_frameworker_wait(pbi->frame_worker_owner, pbi->common.prev_frame,
-                       mi_row << MI_BLOCK_SIZE_LOG2);
+                       mi_row << pbi->common.mib_size_log2);
 }
 
 static void read_inter_block_mode_info(VP10Decoder *const pbi,
                                        MACROBLOCKD *const xd,
                                        MODE_INFO *const mi,
-                                       int mi_row, int mi_col, vpx_reader *r) {
+#if (CONFIG_OBMC || CONFIG_EXT_INTER) && CONFIG_SUPERTX
+                                       int mi_row, int mi_col, vp10_reader *r,
+                                       int supertx_enabled) {
+#else
+                                       int mi_row, int mi_col, vp10_reader *r) {
+#endif  // (CONFIG_OBMC || CONFIG_EXT_INTER) && CONFIG_SUPERTX
   VP10_COMMON *const cm = &pbi->common;
   MB_MODE_INFO *const mbmi = &mi->mbmi;
   const BLOCK_SIZE bsize = mbmi->sb_type;
   const int allow_hp = cm->allow_high_precision_mv;
   int_mv nearestmv[2], nearmv[2];
-  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
+  int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
+#if CONFIG_EXT_INTER
+  int mv_idx;
+#endif  // CONFIG_EXT_INTER
   int ref, is_compound;
-  uint8_t inter_mode_ctx[MAX_REF_FRAMES];
+  int16_t inter_mode_ctx[MODE_CTX_REF_FRAMES];
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+  int16_t compound_inter_mode_ctx[MODE_CTX_REF_FRAMES];
+#endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
+  int16_t mode_ctx = 0;
+  MV_REFERENCE_FRAME ref_frame;
+
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
 
   read_ref_frames(cm, xd, r, mbmi->segment_id, mbmi->ref_frame);
   is_compound = has_second_ref(mbmi);
 
   for (ref = 0; ref < 1 + is_compound; ++ref) {
-    const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+    MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
     RefBuffer *ref_buf = &cm->frame_refs[frame - LAST_FRAME];
 
     xd->block_refs[ref] = ref_buf;
@@ -557,10 +1253,34 @@
                          "Reference frame has invalid dimensions");
     vp10_setup_pre_planes(xd, ref, ref_buf->buf, mi_row, mi_col,
                          &ref_buf->sf);
-    vp10_find_mv_refs(cm, xd, mi, frame, ref_mvs[frame],
-                     mi_row, mi_col, fpm_sync, (void *)pbi, inter_mode_ctx);
   }
 
+  for (ref_frame = LAST_FRAME; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
+    vp10_find_mv_refs(cm, xd, mi, ref_frame,
+#if CONFIG_REF_MV
+                      &xd->ref_mv_count[ref_frame],
+                      xd->ref_mv_stack[ref_frame],
+#if CONFIG_EXT_INTER
+                      compound_inter_mode_ctx,
+#endif  // CONFIG_EXT_INTER
+#endif
+                      ref_mvs[ref_frame],
+                      mi_row, mi_col, fpm_sync, (void *)pbi, inter_mode_ctx);
+  }
+
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+  if (is_compound)
+    mode_ctx = compound_inter_mode_ctx[mbmi->ref_frame[0]];
+  else
+#endif  // CONFIG_EXT_INTER
+  mode_ctx = vp10_mode_context_analyzer(inter_mode_ctx,
+                                        mbmi->ref_frame, bsize, -1);
+  mbmi->ref_mv_idx = 0;
+#else
+  mode_ctx = inter_mode_ctx[mbmi->ref_frame[0]];
+#endif
+
   if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
     mbmi->mode = ZEROMV;
     if (bsize < BLOCK_8X8) {
@@ -569,21 +1289,105 @@
         return;
     }
   } else {
-    if (bsize >= BLOCK_8X8)
-      mbmi->mode = read_inter_mode(cm, xd, r,
-                                   inter_mode_ctx[mbmi->ref_frame[0]]);
+    if (bsize >= BLOCK_8X8) {
+#if CONFIG_EXT_INTER
+      if (is_compound)
+        mbmi->mode = read_inter_compound_mode(cm, xd, r, mode_ctx);
+      else
+#endif  // CONFIG_EXT_INTER
+      mbmi->mode = read_inter_mode(cm, xd,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+                                   mbmi,
+#endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
+                                   r, mode_ctx);
+#if CONFIG_REF_MV
+      if (mbmi->mode == NEARMV || mbmi->mode == NEWMV)
+        read_drl_idx(cm, xd, mbmi, r);
+#endif
+    }
   }
 
+#if CONFIG_EXT_INTER
+  if (bsize < BLOCK_8X8 ||
+      (mbmi->mode != ZEROMV && mbmi->mode != ZERO_ZEROMV)) {
+#else
   if (bsize < BLOCK_8X8 || mbmi->mode != ZEROMV) {
+#endif  // CONFIG_EXT_INTER
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       vp10_find_best_ref_mvs(allow_hp, ref_mvs[mbmi->ref_frame[ref]],
                              &nearestmv[ref], &nearmv[ref]);
     }
   }
 
-  mbmi->interp_filter = (cm->interp_filter == SWITCHABLE)
-                      ? read_switchable_interp_filter(cm, xd, r)
-                      : cm->interp_filter;
+#if CONFIG_REF_MV
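+  // A non-zero ref_mv_idx points NEARMV at a lower-ranked candidate in the
+  // reference MV stack.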
+  if (mbmi->ref_mv_idx > 0) {
+    int_mv cur_mv =
+        xd->ref_mv_stack[mbmi->ref_frame[0]][1 + mbmi->ref_mv_idx].this_mv;
+    nearmv[0] = cur_mv;
+  }
+
+#if CONFIG_EXT_INTER
+  if (is_compound && bsize >= BLOCK_8X8 && mbmi->mode != ZERO_ZEROMV) {
+#else
+  if (is_compound && bsize >= BLOCK_8X8 && mbmi->mode != NEWMV &&
+      mbmi->mode != ZEROMV) {
+#endif  // CONFIG_EXT_INTER
+    uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
+
+#if CONFIG_EXT_INTER
+    if (xd->ref_mv_count[ref_frame_type] > 0) {
+#else
+    if (xd->ref_mv_count[ref_frame_type] == 1 && mbmi->mode == NEARESTMV) {
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_EXT_INTER
+      if (mbmi->mode == NEAREST_NEARESTMV) {
+#endif  // CONFIG_EXT_INTER
+        nearestmv[0] = xd->ref_mv_stack[ref_frame_type][0].this_mv;
+        nearestmv[1] = xd->ref_mv_stack[ref_frame_type][0].comp_mv;
+        lower_mv_precision(&nearestmv[0].as_mv, allow_hp);
+        lower_mv_precision(&nearestmv[1].as_mv, allow_hp);
+#if CONFIG_EXT_INTER
+      } else if (mbmi->mode == NEAREST_NEWMV || mbmi->mode == NEAREST_NEARMV) {
+        nearestmv[0] = xd->ref_mv_stack[ref_frame_type][0].this_mv;
+        lower_mv_precision(&nearestmv[0].as_mv, allow_hp);
+      } else if (mbmi->mode == NEW_NEARESTMV || mbmi->mode == NEAR_NEARESTMV) {
+        nearestmv[1] = xd->ref_mv_stack[ref_frame_type][0].comp_mv;
+        lower_mv_precision(&nearestmv[1].as_mv, allow_hp);
+      }
+#endif  // CONFIG_EXT_INTER
+    }
+
+#if CONFIG_EXT_INTER
+    if (xd->ref_mv_count[ref_frame_type] > 1) {
+      if (mbmi->mode == NEAR_NEWMV ||
+          mbmi->mode == NEAR_NEARESTMV ||
+          mbmi->mode == NEAR_NEARMV) {
+        nearmv[0] = xd->ref_mv_stack[ref_frame_type][1].this_mv;
+        lower_mv_precision(&nearmv[0].as_mv, allow_hp);
+      }
+
+      if (mbmi->mode == NEW_NEARMV ||
+          mbmi->mode == NEAREST_NEARMV ||
+          mbmi->mode == NEAR_NEARMV) {
+        nearmv[1] = xd->ref_mv_stack[ref_frame_type][1].comp_mv;
+        lower_mv_precision(&nearmv[1].as_mv, allow_hp);
+      }
+    }
+#else
+    if (xd->ref_mv_count[ref_frame_type] > 1) {
+      int ref_mv_idx = 1 + mbmi->ref_mv_idx;
+      nearestmv[0] = xd->ref_mv_stack[ref_frame_type][0].this_mv;
+      nearestmv[1] = xd->ref_mv_stack[ref_frame_type][0].comp_mv;
+      nearmv[0] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+      nearmv[1] = xd->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+    }
+#endif  // CONFIG_EXT_INTER
+  }
+#endif
+
+#if !CONFIG_EXT_INTERP && !CONFIG_DUAL_FILTER
+  mbmi->interp_filter = read_interp_filter(cm, xd, r);
+#endif  // !CONFIG_EXT_INTERP && !CONFIG_DUAL_FILTER
 
   if (bsize < BLOCK_8X8) {
     const int num_4x4_w = 1 << xd->bmode_blocks_wl;
@@ -591,22 +1395,93 @@
     int idx, idy;
     PREDICTION_MODE b_mode;
     int_mv nearest_sub8x8[2], near_sub8x8[2];
+#if CONFIG_EXT_INTER
+    int_mv ref_mv[2][2];
+#endif  // CONFIG_EXT_INTER
     for (idy = 0; idy < 2; idy += num_4x4_h) {
       for (idx = 0; idx < 2; idx += num_4x4_w) {
         int_mv block[2];
         const int j = idy * 2 + idx;
-        b_mode = read_inter_mode(cm, xd, r, inter_mode_ctx[mbmi->ref_frame[0]]);
+        int_mv ref_mv_s8[2];
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+        if (!is_compound)
+#endif  // CONFIG_EXT_INTER
+        mode_ctx = vp10_mode_context_analyzer(inter_mode_ctx, mbmi->ref_frame,
+                                              bsize, j);
+#endif
+#if CONFIG_EXT_INTER
+        if (is_compound)
+          b_mode = read_inter_compound_mode(cm, xd, r, mode_ctx);
+        else
+#endif  // CONFIG_EXT_INTER
+        b_mode = read_inter_mode(cm, xd,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+                                 mbmi,
+#endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
+                                 r, mode_ctx);
 
-        if (b_mode == NEARESTMV || b_mode == NEARMV) {
-          uint8_t dummy_mode_ctx[MAX_REF_FRAMES];
+#if CONFIG_EXT_INTER
+        mv_idx = (b_mode == NEWFROMNEARMV) ? 1 : 0;
+
+        if (b_mode != ZEROMV && b_mode != ZERO_ZEROMV) {
+#else
+        if (b_mode != ZEROMV) {
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_REF_MV
+          CANDIDATE_MV ref_mv_stack[2][MAX_REF_MV_STACK_SIZE];
+          uint8_t ref_mv_count[2];
+#endif
           for (ref = 0; ref < 1 + is_compound; ++ref)
+#if CONFIG_EXT_INTER
+          {
+            int_mv mv_ref_list[MAX_MV_REF_CANDIDATES];
+            vp10_update_mv_context(xd, mi, mbmi->ref_frame[ref],
+                                   mv_ref_list, j, mi_row, mi_col, NULL);
+#endif  // CONFIG_EXT_INTER
             vp10_append_sub8x8_mvs_for_idx(cm, xd, j, ref, mi_row, mi_col,
-                                          &nearest_sub8x8[ref],
-                                          &near_sub8x8[ref],
-                                          dummy_mode_ctx);
+#if CONFIG_REF_MV
+                                           ref_mv_stack[ref],
+                                           &ref_mv_count[ref],
+#endif
+#if CONFIG_EXT_INTER
+                                           mv_ref_list,
+#endif  // CONFIG_EXT_INTER
+                                           &nearest_sub8x8[ref],
+                                           &near_sub8x8[ref]);
+#if CONFIG_EXT_INTER
+            if (have_newmv_in_inter_mode(b_mode)) {
+              mv_ref_list[0].as_int = nearest_sub8x8[ref].as_int;
+              mv_ref_list[1].as_int = near_sub8x8[ref].as_int;
+              vp10_find_best_ref_mvs(allow_hp, mv_ref_list,
+                                     &ref_mv[0][ref], &ref_mv[1][ref]);
+            }
+          }
+#endif  // CONFIG_EXT_INTER
         }
 
-        if (!assign_mv(cm, xd, b_mode, block, nearestmv,
+        for (ref = 0; ref < 1 + is_compound && b_mode != ZEROMV; ++ref) {
+#if CONFIG_REF_MV
+          ref_mv_s8[ref] = nearest_sub8x8[ref];
+          lower_mv_precision(&ref_mv_s8[ref].as_mv, allow_hp);
+#else
+          ref_mv_s8[ref] = nearestmv[ref];
+#endif
+        }
+#if CONFIG_EXT_INTER
+        (void)ref_mv_s8;
+#endif
+
+        if (!assign_mv(cm, xd, b_mode,
+#if CONFIG_REF_MV
+                       j,
+#endif
+                       block,
+#if CONFIG_EXT_INTER
+                       ref_mv[mv_idx],
+#else
+                       ref_mv_s8,
+#endif  // CONFIG_EXT_INTER
                        nearest_sub8x8, near_sub8x8,
                        is_compound, allow_hp, r)) {
           xd->corrupted |= 1;
@@ -624,62 +1499,295 @@
       }
     }
 
+#if CONFIG_REF_MV
+    mbmi->pred_mv[0].as_int = mi->bmi[3].pred_mv_s8[0].as_int;
+    mbmi->pred_mv[1].as_int = mi->bmi[3].pred_mv_s8[1].as_int;
+#endif
     mi->mbmi.mode = b_mode;
 
     mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
     mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
   } else {
-    xd->corrupted |= !assign_mv(cm, xd, mbmi->mode, mbmi->mv, nearestmv,
+    int ref;
+    int_mv ref_mv[2];
+    ref_mv[0] = nearestmv[0];
+    ref_mv[1] = nearestmv[1];
+
+    for (ref = 0; ref < 1 + is_compound && mbmi->mode == NEWMV; ++ref) {
+#if CONFIG_REF_MV
+      uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
+      if (xd->ref_mv_count[ref_frame_type] > 1) {
+        ref_mv[ref] = (ref == 0) ?
+            xd->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx].this_mv :
+            xd->ref_mv_stack[ref_frame_type][mbmi->ref_mv_idx].comp_mv;
+        clamp_mv_ref(&ref_mv[ref].as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
+      }
+#endif
+      nearestmv[ref] = ref_mv[ref];
+    }
+
+    xd->corrupted |= !assign_mv(cm, xd, mbmi->mode,
+#if CONFIG_REF_MV
+                                0,
+#endif
+                                mbmi->mv,
+#if CONFIG_EXT_INTER
+                                mbmi->mode == NEWFROMNEARMV ?
+                                              nearmv : nearestmv,
+#else
+                                ref_mv,
+#endif  // CONFIG_EXT_INTER
                                 nearestmv, nearmv, is_compound, allow_hp, r);
   }
+
+#if CONFIG_EXT_INTER
+  mbmi->use_wedge_interintra = 0;
+  if (cm->reference_mode != COMPOUND_REFERENCE &&
+#if CONFIG_SUPERTX
+      !supertx_enabled &&
+#endif
+      is_interintra_allowed(mbmi)) {
+    const int bsize_group = size_group_lookup[bsize];
+    const int interintra = vp10_read(r, cm->fc->interintra_prob[bsize_group]);
+    if (xd->counts)
+      xd->counts->interintra[bsize_group][interintra]++;
+    assert(mbmi->ref_frame[1] == NONE);
+    if (interintra) {
+      const INTERINTRA_MODE interintra_mode =
+          read_interintra_mode(cm, xd, r, bsize_group);
+      mbmi->ref_frame[1] = INTRA_FRAME;
+      mbmi->interintra_mode = interintra_mode;
+#if CONFIG_EXT_INTRA
+      mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+      mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+      mbmi->angle_delta[0] = 0;
+      mbmi->angle_delta[1] = 0;
+      mbmi->intra_filter = INTRA_FILTER_LINEAR;
+#endif  // CONFIG_EXT_INTRA
+      if (is_interintra_wedge_used(bsize)) {
+        mbmi->use_wedge_interintra =
+            vp10_read(r, cm->fc->wedge_interintra_prob[bsize]);
+        if (xd->counts)
+          xd->counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
+        if (mbmi->use_wedge_interintra) {
+          mbmi->interintra_wedge_index =
+              vp10_read_literal(r, get_wedge_bits_lookup(bsize));
+          mbmi->interintra_wedge_sign = 0;
+        }
+      }
+    }
+  }
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+  mbmi->motion_variation = SIMPLE_TRANSLATION;
+#if CONFIG_SUPERTX
+  if (!supertx_enabled)
+#endif  // CONFIG_SUPERTX
+#if CONFIG_EXT_INTER
+    if (mbmi->ref_frame[1] != INTRA_FRAME)
+#endif  // CONFIG_EXT_INTER
+    mbmi->motion_variation = read_motvar_block(cm, xd, r);
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+
+#if CONFIG_EXT_INTER
+  mbmi->use_wedge_interinter = 0;
+  if (cm->reference_mode != SINGLE_REFERENCE &&
+      is_inter_compound_mode(mbmi->mode) &&
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+      !(is_motvar_allowed(mbmi) &&
+        mbmi->motion_variation != SIMPLE_TRANSLATION) &&
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+      is_interinter_wedge_used(bsize)) {
+    mbmi->use_wedge_interinter =
+        vp10_read(r, cm->fc->wedge_interinter_prob[bsize]);
+    if (xd->counts)
+      xd->counts->wedge_interinter[bsize][mbmi->use_wedge_interinter]++;
+    if (mbmi->use_wedge_interinter) {
+      mbmi->interinter_wedge_index =
+          vp10_read_literal(r, get_wedge_bits_lookup(bsize));
+      mbmi->interinter_wedge_sign = vp10_read_bit(r);
+    }
+  }
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_DUAL_FILTER
+  for (ref = 0; ref < 2; ++ref) {
+    mbmi->interp_filter[ref] = (cm->interp_filter == SWITCHABLE) ?
+        EIGHTTAP_REGULAR : cm->interp_filter;
+
+    if (has_subpel_mv_component(xd->mi[0], xd, ref) ||
+        (mbmi->ref_frame[1] > INTRA_FRAME &&
+         has_subpel_mv_component(xd->mi[0], xd, ref + 2)))
+      mbmi->interp_filter[ref] = read_interp_filter(cm, xd, ref, r);
+  }
+  // The index system works as follows:
+  // (0, 1) -> (vertical, horizontal) filter types for the first ref frame.
+  // (2, 3) -> (vertical, horizontal) filter types for the second ref frame.
+  mbmi->interp_filter[2] = mbmi->interp_filter[0];
+  mbmi->interp_filter[3] = mbmi->interp_filter[1];
+#else
+#if CONFIG_EXT_INTERP
+  mbmi->interp_filter = read_interp_filter(cm, xd, r);
+#endif  // CONFIG_EXT_INTERP
+#endif  // CONFIG_DUAL_FILTER
 }
 
 static void read_inter_frame_mode_info(VP10Decoder *const pbi,
                                        MACROBLOCKD *const xd,
-                                       int mi_row, int mi_col, vpx_reader *r) {
+#if CONFIG_SUPERTX
+                                       int supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                                       int mi_row, int mi_col, vp10_reader *r) {
   VP10_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
   MB_MODE_INFO *const mbmi = &mi->mbmi;
-  int inter_block;
+  int inter_block = 1;
+#if CONFIG_VAR_TX
+  BLOCK_SIZE bsize = mbmi->sb_type;
+#endif  // CONFIG_VAR_TX
 
   mbmi->mv[0].as_int = 0;
   mbmi->mv[1].as_int = 0;
   mbmi->segment_id = read_inter_segment_id(cm, xd, mi_row, mi_col, r);
-  mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
-  inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
-  mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r);
+#if CONFIG_SUPERTX
+  if (!supertx_enabled) {
+#endif  // CONFIG_SUPERTX
+    mbmi->skip = read_skip(cm, xd, mbmi->segment_id, r);
+    inter_block = read_is_inter_block(cm, xd, mbmi->segment_id, r);
+
+#if CONFIG_VAR_TX
+    xd->above_txfm_context = cm->above_txfm_context + mi_col;
+    xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
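+    // With VAR_TX, inter blocks code a recursive transform-size tree for
+    // each max-tx-size unit instead of a single tx_size for the whole block.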
+    if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
+        !mbmi->skip && inter_block) {
+      const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+      const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+      const int bs = num_4x4_blocks_wide_lookup[txb_size];
+      const int width  = num_4x4_blocks_wide_lookup[bsize];
+      const int height = num_4x4_blocks_high_lookup[bsize];
+      int idx, idy;
+      for (idy = 0; idy < height; idy += bs)
+        for (idx = 0; idx < width; idx += bs)
+          read_tx_size_inter(cm, xd, mbmi, xd->counts, max_tx_size,
+                             idy, idx, r);
+      if (xd->counts) {
+        const int ctx = get_tx_size_context(xd);
+        ++xd->counts->tx_size[max_tx_size - TX_8X8][ctx][mbmi->tx_size];
+      }
+    } else {
+      mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r);
+      if (inter_block) {
+        const int width  = num_4x4_blocks_wide_lookup[bsize];
+        const int height = num_4x4_blocks_high_lookup[bsize];
+        int idx, idy;
+        for (idy = 0; idy < height; ++idy)
+          for (idx = 0; idx < width; ++idx)
+            mbmi->inter_tx_size[idy >> 1][idx >> 1] = mbmi->tx_size;
+      }
+
+      set_txfm_ctx(xd->left_txfm_context, mbmi->tx_size, xd->n8_h);
+      set_txfm_ctx(xd->above_txfm_context, mbmi->tx_size, xd->n8_w);
+    }
+#else
+    mbmi->tx_size = read_tx_size(cm, xd, !mbmi->skip || !inter_block, r);
+#endif  // CONFIG_VAR_TX
+#if CONFIG_SUPERTX
+  }
+#if CONFIG_VAR_TX
+  else if (inter_block) {
+    const int width  = num_4x4_blocks_wide_lookup[bsize];
+    const int height = num_4x4_blocks_high_lookup[bsize];
+    int idx, idy;
+    xd->mi[0]->mbmi.tx_size = xd->supertx_size;
+    for (idy = 0; idy < height; ++idy)
+      for (idx = 0; idx < width; ++idx)
+        xd->mi[0]->mbmi.inter_tx_size[idy >> 1][idx >> 1] =
+            xd->supertx_size;
+  }
+#endif  // CONFIG_VAR_TX
+#endif  // CONFIG_SUPERTX
 
   if (inter_block)
-    read_inter_block_mode_info(pbi, xd, mi, mi_row, mi_col, r);
+    read_inter_block_mode_info(pbi, xd,
+#if (CONFIG_OBMC || CONFIG_EXT_INTER) && CONFIG_SUPERTX
+                               mi, mi_row, mi_col, r, supertx_enabled);
+#else
+                               mi, mi_row, mi_col, r);
+#endif  // (CONFIG_OBMC || CONFIG_EXT_INTER) && CONFIG_SUPERTX
   else
     read_intra_block_mode_info(cm, xd, mi, r);
 
-  if (mbmi->tx_size < TX_32X32 &&
-      cm->base_qindex > 0 && !mbmi->skip &&
-      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-    FRAME_COUNTS *counts = xd->counts;
-    if (inter_block) {
-      mbmi->tx_type = vpx_read_tree(
-          r, vp10_ext_tx_tree,
-          cm->fc->inter_ext_tx_prob[mbmi->tx_size]);
-      if (counts)
-        ++counts->inter_ext_tx[mbmi->tx_size][mbmi->tx_type];
+  if (!FIXED_TX_TYPE) {
+#if CONFIG_EXT_TX
+    if (get_ext_tx_types(mbmi->tx_size, mbmi->sb_type, inter_block) > 1 &&
+        cm->base_qindex > 0 && !mbmi->skip &&
+#if CONFIG_SUPERTX
+        !supertx_enabled &&
+#endif  // CONFIG_SUPERTX
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      int eset = get_ext_tx_set(mbmi->tx_size, mbmi->sb_type,
+                                inter_block);
+      FRAME_COUNTS *counts = xd->counts;
+
+      if (inter_block) {
+        if (eset > 0) {
+          mbmi->tx_type =
+              vp10_read_tree(r, vp10_ext_tx_inter_tree[eset],
+                            cm->fc->inter_ext_tx_prob[eset][mbmi->tx_size]);
+          if (counts)
+            ++counts->inter_ext_tx[eset][mbmi->tx_size][mbmi->tx_type];
+        }
+      } else if (ALLOW_INTRA_EXT_TX) {
+        if (eset > 0) {
+          mbmi->tx_type = vp10_read_tree(r, vp10_ext_tx_intra_tree[eset],
+                                        cm->fc->intra_ext_tx_prob[eset]
+                                                [mbmi->tx_size][mbmi->mode]);
+          if (counts)
+            ++counts->intra_ext_tx[eset][mbmi->tx_size]
+                                         [mbmi->mode][mbmi->tx_type];
+        }
+      }
     } else {
-      const TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
-      mbmi->tx_type = vpx_read_tree(
-          r, vp10_ext_tx_tree,
-          cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom]);
-      if (counts)
-        ++counts->intra_ext_tx[mbmi->tx_size][tx_type_nom][mbmi->tx_type];
+      mbmi->tx_type = DCT_DCT;
     }
-  } else {
-    mbmi->tx_type = DCT_DCT;
+#else
+    if (mbmi->tx_size < TX_32X32 &&
+        cm->base_qindex > 0 && !mbmi->skip &&
+#if CONFIG_SUPERTX
+        !supertx_enabled &&
+#endif  // CONFIG_SUPERTX
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      FRAME_COUNTS *counts = xd->counts;
+      if (inter_block) {
+        mbmi->tx_type = vp10_read_tree(
+            r, vp10_ext_tx_tree,
+            cm->fc->inter_ext_tx_prob[mbmi->tx_size]);
+        if (counts)
+          ++counts->inter_ext_tx[mbmi->tx_size][mbmi->tx_type];
+      } else {
+        const TX_TYPE tx_type_nom = intra_mode_to_tx_type_context[mbmi->mode];
+        mbmi->tx_type = vp10_read_tree(
+            r, vp10_ext_tx_tree,
+            cm->fc->intra_ext_tx_prob[mbmi->tx_size][tx_type_nom]);
+        if (counts)
+          ++counts->intra_ext_tx[mbmi->tx_size][tx_type_nom][mbmi->tx_type];
+      }
+    } else {
+      mbmi->tx_type = DCT_DCT;
+    }
+#endif  // CONFIG_EXT_TX
   }
 }
 
 void vp10_read_mode_info(VP10Decoder *const pbi, MACROBLOCKD *xd,
-                        int mi_row, int mi_col, vpx_reader *r,
-                        int x_mis, int y_mis) {
+#if CONFIG_SUPERTX
+                         int supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                         int mi_row, int mi_col, vp10_reader *r,
+                         int x_mis, int y_mis) {
   VP10_COMMON *const cm = &pbi->common;
   MODE_INFO *const mi = xd->mi[0];
   MV_REF* frame_mvs = cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
@@ -687,9 +1795,22 @@
 
   if (frame_is_intra_only(cm)) {
     read_intra_frame_mode_info(cm, xd, mi_row, mi_col, r);
+#if CONFIG_REF_MV
+    for (h = 0; h < y_mis; ++h) {
+      MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+      for (w = 0; w < x_mis; ++w) {
+        MV_REF *const mv = frame_mv + w;
+        mv->ref_frame[0] = NONE;
+        mv->ref_frame[1] = NONE;
+      }
+    }
+#endif
   } else {
-    read_inter_frame_mode_info(pbi, xd, mi_row, mi_col, r);
-
+    read_inter_frame_mode_info(pbi, xd,
+#if CONFIG_SUPERTX
+                               supertx_enabled,
+#endif  // CONFIG_SUPERTX
+                               mi_row, mi_col, r);
     for (h = 0; h < y_mis; ++h) {
       MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
       for (w = 0; w < x_mis; ++w) {
diff --git a/vp10/decoder/decodemv.h b/vp10/decoder/decodemv.h
index 6653be5..c10c6bf 100644
--- a/vp10/decoder/decodemv.h
+++ b/vp10/decoder/decodemv.h
@@ -11,7 +11,7 @@
 #ifndef VP10_DECODER_DECODEMV_H_
 #define VP10_DECODER_DECODEMV_H_
 
-#include "vpx_dsp/bitreader.h"
+#include "vp10/decoder/bitreader.h"
 
 #include "vp10/decoder/decoder.h"
 
@@ -20,8 +20,12 @@
 #endif
 
 void vp10_read_mode_info(VP10Decoder *const pbi, MACROBLOCKD *xd,
-                        int mi_row, int mi_col, vpx_reader *r,
-                        int x_mis, int y_mis);
+#if CONFIG_SUPERTX
+                         int supertx_enabled,
+#endif
+                         int mi_row, int mi_col, vp10_reader *r,
+                         int x_mis, int y_mis);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/decoder/decoder.c b/vp10/decoder/decoder.c
index d8864d2..d1e7104 100644
--- a/vp10/decoder/decoder.c
+++ b/vp10/decoder/decoder.c
@@ -30,6 +30,7 @@
 #include "vp10/common/postproc.h"
 #endif
 #include "vp10/common/quant_common.h"
+#include "vp10/common/reconinter.h"
 #include "vp10/common/reconintra.h"
 
 #include "vp10/decoder/decodeframe.h"
@@ -44,6 +45,9 @@
     vpx_dsp_rtcd();
     vpx_scale_rtcd();
     vp10_init_intra_predictors();
+#if CONFIG_EXT_INTER
+    vp10_init_wedge_masks();
+#endif  // CONFIG_EXT_INTER
     init_done = 1;
   }
 }
@@ -115,6 +119,9 @@
   cm->setup_mi = vp10_dec_setup_mi;
 
   vp10_loop_filter_init(cm);
+#if CONFIG_LOOP_RESTORATION
+  vp10_loop_restoration_precal();
+#endif  // CONFIG_LOOP_RESTORATION
 
   cm->error.setjmp = 0;
 
@@ -187,44 +194,70 @@
 vpx_codec_err_t vp10_set_reference_dec(VP10_COMMON *cm,
                                       VP9_REFFRAME ref_frame_flag,
                                       YV12_BUFFER_CONFIG *sd) {
-  RefBuffer *ref_buf = NULL;
-  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+  int idx;
+  YV12_BUFFER_CONFIG *ref_buf = NULL;
 
   // TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
   // encoder is using the frame buffers for. This is just a stub to keep the
   // vpxenc --test-decode functionality working, and will be replaced in a
   // later commit that adds VP9-specific controls for this functionality.
+
+  // (Yunqing) The set_reference control depends on the following settings in
+  // the encoder:
+  //   cpi->lst_fb_idx = 0;
+  // #if CONFIG_EXT_REFS
+  //   cpi->lst2_fb_idx = 1;
+  //   cpi->lst3_fb_idx = 2;
+  //   cpi->gld_fb_idx = 3;
+  //   cpi->bwd_fb_idx = 4;
+  //   cpi->alt_fb_idx = 5;
+  // #else  // CONFIG_EXT_REFS
+  //   cpi->gld_fb_idx = 1;
+  //   cpi->alt_fb_idx = 2;
+  // #endif  // CONFIG_EXT_REFS
+
+  // TODO(zoeliu): Revisit the following code and reconsider what assumptions
+  // we may make about the reference frame buffer virtual indexes.
   if (ref_frame_flag == VP9_LAST_FLAG) {
-    ref_buf = &cm->frame_refs[0];
+    idx = cm->ref_frame_map[0];
+#if CONFIG_EXT_REFS
+  } else if (ref_frame_flag == VP9_LAST2_FLAG) {
+    idx = cm->ref_frame_map[1];
+  } else if (ref_frame_flag == VP9_LAST3_FLAG) {
+    idx = cm->ref_frame_map[2];
   } else if (ref_frame_flag == VP9_GOLD_FLAG) {
-    ref_buf = &cm->frame_refs[1];
+    idx = cm->ref_frame_map[3];
+  } else if (ref_frame_flag == VP9_BWD_FLAG) {
+    idx = cm->ref_frame_map[4];
   } else if (ref_frame_flag == VP9_ALT_FLAG) {
-    ref_buf = &cm->frame_refs[2];
+    idx = cm->ref_frame_map[5];
+#else
+  } else if (ref_frame_flag == VP9_GOLD_FLAG) {
+    idx = cm->ref_frame_map[1];
+  } else if (ref_frame_flag == VP9_ALT_FLAG) {
+    idx = cm->ref_frame_map[2];
+#endif  // CONFIG_EXT_REFS
   } else {
     vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                        "Invalid reference frame");
     return cm->error.error_code;
   }
 
-  if (!equal_dimensions(ref_buf->buf, sd)) {
+  if (idx < 0 || idx >= FRAME_BUFFERS) {
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                       "Invalid reference frame map");
+    return cm->error.error_code;
+  }
+
+  // Get the destination reference buffer.
+  ref_buf = &cm->buffer_pool->frame_bufs[idx].buf;
+
+  if (!equal_dimensions(ref_buf, sd)) {
     vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                        "Incorrect buffer dimensions");
   } else {
-    int *ref_fb_ptr = &ref_buf->idx;
-
-    // Find an empty frame buffer.
-    const int free_fb = get_free_fb(cm);
-    if (cm->new_fb_idx == INVALID_IDX)
-      return VPX_CODEC_MEM_ERROR;
-
-    // Decrease ref_count since it will be increased again in
-    // ref_cnt_fb() below.
-    --frame_bufs[free_fb].ref_count;
-
-    // Manage the reference counters and copy image.
-    ref_cnt_fb(frame_bufs, ref_fb_ptr, free_fb);
-    ref_buf->buf = &frame_bufs[*ref_fb_ptr].buf;
-    vp8_yv12_copy_frame(sd, ref_buf->buf);
+    // Overwrite the reference frame buffer.
+    vp8_yv12_copy_frame(sd, ref_buf);
   }
 
   return cm->error.error_code;
@@ -243,10 +276,10 @@
     // Current thread releases the holding of reference frame.
     decrease_ref_count(old_idx, frame_bufs, pool);
 
-    // Release the reference frame in reference map.
-    if ((mask & 1) && old_idx >= 0) {
+    // Release the hold on the reference frame in the reference map before
+    // decoding the next frame.
+    if (mask & 1)
       decrease_ref_count(old_idx, frame_bufs, pool);
-    }
     cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
     ++ref_index;
   }
@@ -257,10 +290,13 @@
     decrease_ref_count(old_idx, frame_bufs, pool);
     cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
   }
+
   unlock_buffer_pool(pool);
   pbi->hold_ref_buf = 0;
   cm->frame_to_show = get_frame_new_buffer(cm);
 
+  // TODO(zoeliu): Fix the ref frame buffer update for the scenario of
+  //               cm->frame_parallel_decode == 1
   if (!cm->frame_parallel_decode || !cm->show_frame) {
     lock_buffer_pool(pool);
     --frame_bufs[cm->new_fb_idx].ref_count;
@@ -268,8 +304,10 @@
   }
 
   // Invalidate these references until the next frame starts.
-  for (ref_index = 0; ref_index < 3; ref_index++)
-    cm->frame_refs[ref_index].idx = -1;
+  for (ref_index = 0; ref_index < REFS_PER_FRAME; ref_index++) {
+    cm->frame_refs[ref_index].idx = INVALID_IDX;
+    cm->frame_refs[ref_index].buf = NULL;
+  }
 }
 
 int vp10_receive_compressed_data(VP10Decoder *pbi,
@@ -298,12 +336,16 @@
 
   pbi->ready_for_new_data = 0;
 
+  // Find a free buffer for the new frame, releasing the reference previously
+  // held.
+
   // Check if the previous frame was a frame without any references to it.
   // Release frame buffer if not decoding in frame parallel mode.
   if (!cm->frame_parallel_decode && cm->new_fb_idx >= 0
       && frame_bufs[cm->new_fb_idx].ref_count == 0)
     pool->release_fb_cb(pool->cb_priv,
                         &frame_bufs[cm->new_fb_idx].raw_frame_buffer);
+
   // Find a free frame buffer. Return error if can not find any.
   cm->new_fb_idx = get_free_fb(cm);
   if (cm->new_fb_idx == INVALID_IDX)
@@ -326,7 +368,6 @@
     pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
   }
 
-
   if (setjmp(cm->error.jmp)) {
     const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
     int i;
@@ -350,10 +391,10 @@
         // Current thread releases the holding of reference frame.
         decrease_ref_count(old_idx, frame_bufs, pool);
 
-        // Release the reference frame in reference map.
-        if ((mask & 1) && old_idx >= 0) {
+        // Release the hold on the reference frame in the reference map so the
+        // buffer can be reused when decoding the next frame.
+        if (mask & 1)
           decrease_ref_count(old_idx, frame_bufs, pool);
-        }
         ++ref_index;
       }
 
@@ -377,11 +418,25 @@
 
   swap_frame_buffers(pbi);
 
+#if CONFIG_EXT_TILE
+  // For now, we only extend the frame borders when the whole frame is decoded.
+  // Later, if needed, extend the borders only for decoded tiles that lie on
+  // the frame boundary.
+  if (pbi->dec_tile_row == -1 && pbi->dec_tile_col == -1)
+#endif  // CONFIG_EXT_TILE
+    vpx_extend_frame_inner_borders(cm->frame_to_show);
+
   vpx_clear_system_state();
 
   if (!cm->show_existing_frame) {
     cm->last_show_frame = cm->show_frame;
-    cm->prev_frame = cm->cur_frame;
+
+#if CONFIG_EXT_REFS
+    // NOTE: prev_frame should never point to a frame that is not used as a
+    //       reference.
+    if (cm->is_reference_frame)
+#endif  // CONFIG_EXT_REFS
+      cm->prev_frame = cm->cur_frame;
+
     if (cm->seg.enabled && !cm->frame_parallel_decode)
       vp10_swap_current_and_last_seg_map(cm);
   }
@@ -447,6 +502,17 @@
   return ret;
 }
 
+int vp10_get_frame_to_show(VP10Decoder *pbi,
+                           YV12_BUFFER_CONFIG *frame) {
+  VP10_COMMON *const cm = &pbi->common;
+
+  if (!cm->show_frame || !cm->frame_to_show)
+    return -1;
+
+  *frame = *cm->frame_to_show;
+  return 0;
+}
+
 vpx_codec_err_t vp10_parse_superframe_index(const uint8_t *data,
                                            size_t data_sz,
                                            uint32_t sizes[8], int *count,
@@ -459,9 +525,7 @@
   // an invalid bitstream and need to return an error.
 
   uint8_t marker;
-#if CONFIG_MISC_FIXES
   size_t frame_sz_sum = 0;
-#endif
 
   assert(data_sz);
   marker = read_marker(decrypt_cb, decrypt_state, data + data_sz - 1);
@@ -470,7 +534,7 @@
   if ((marker & 0xe0) == 0xc0) {
     const uint32_t frames = (marker & 0x7) + 1;
     const uint32_t mag = ((marker >> 3) & 0x3) + 1;
-    const size_t index_sz = 2 + mag * (frames - CONFIG_MISC_FIXES);
+    const size_t index_sz = 2 + mag * (frames - 1);
 
     // This chunk is marked as having a superframe index but doesn't have
     // enough data for it, thus it's an invalid superframe index.
@@ -501,20 +565,16 @@
         x = clear_buffer;
       }
 
-      for (i = 0; i < frames - CONFIG_MISC_FIXES; ++i) {
+      for (i = 0; i < frames - 1; ++i) {
         uint32_t this_sz = 0;
 
         for (j = 0; j < mag; ++j)
           this_sz |= (*x++) << (j * 8);
-        this_sz += CONFIG_MISC_FIXES;
+        this_sz += 1;
         sizes[i] = this_sz;
-#if CONFIG_MISC_FIXES
         frame_sz_sum += this_sz;
-#endif
       }
-#if CONFIG_MISC_FIXES
-      sizes[i] = data_sz - index_sz - frame_sz_sum;
-#endif
+      sizes[i] = (uint32_t)(data_sz - index_sz - frame_sz_sum);
       *count = frames;
     }
   }
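As a worked example of the index parsing above (values chosen for illustration): a trailing marker byte of 0xd1 satisfies (marker & 0xe0) == 0xc0, giving frames = (0xd1 & 0x7) + 1 = 2 and mag = ((0xd1 >> 3) & 0x3) + 1 = 3, so index_sz = 2 + 3 * (2 - 1) = 5. One 3-byte little-endian size (plus the +1 bias) is read for the first frame, and the last frame's size is inferred as sizes[1] = data_sz - index_sz - sizes[0], which is why frame_sz_sum is now accumulated unconditionally.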
diff --git a/vp10/decoder/decoder.h b/vp10/decoder/decoder.h
index 72a6310..e4be2f4 100644
--- a/vp10/decoder/decoder.h
+++ b/vp10/decoder/decoder.h
@@ -14,7 +14,7 @@
 #include "./vpx_config.h"
 
 #include "vpx/vpx_codec.h"
-#include "vpx_dsp/bitreader.h"
+#include "vp10/decoder/bitreader.h"
 #include "vpx_scale/yv12config.h"
 #include "vpx_util/vpx_thread.h"
 
@@ -30,24 +30,32 @@
 // TODO(hkuang): combine this with TileWorkerData.
 typedef struct TileData {
   VP10_COMMON *cm;
-  vpx_reader bit_reader;
+  vp10_reader bit_reader;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
   /* dqcoeff are shared by all the planes. So planes must be decoded serially */
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
-  DECLARE_ALIGNED(16, uint8_t, color_index_map[2][64 * 64]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]);
 } TileData;
 
 typedef struct TileWorkerData {
   struct VP10Decoder *pbi;
-  vpx_reader bit_reader;
+  vp10_reader bit_reader;
   FRAME_COUNTS counts;
   DECLARE_ALIGNED(16, MACROBLOCKD, xd);
   /* dqcoeff are shared by all the planes. So planes must be decoded serially */
-  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[32 * 32]);
-  DECLARE_ALIGNED(16, uint8_t, color_index_map[2][64 * 64]);
+  DECLARE_ALIGNED(16, tran_low_t, dqcoeff[MAX_TX_SQUARE]);
+  DECLARE_ALIGNED(16, uint8_t, color_index_map[2][MAX_SB_SQUARE]);
   struct vpx_internal_error_info error_info;
 } TileWorkerData;
 
+typedef struct TileBufferDec {
+  const uint8_t *data;
+  size_t size;
+  const uint8_t *raw_data_end;  // The end of the raw tile buffer in the
+                                // bit stream.
+  int col;  // only used with multi-threaded decoding
+} TileBufferDec;
+
 typedef struct VP10Decoder {
   DECLARE_ALIGNED(16, MACROBLOCKD, mb);
 
@@ -69,7 +77,9 @@
   int num_tile_workers;
 
   TileData *tile_data;
-  int total_tiles;
+  int allocated_tiles;
+
+  TileBufferDec tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
 
   VP9LfSync lf_row_sync;
 
@@ -80,6 +90,12 @@
   int inv_tile_order;
   int need_resync;  // wait for key/intra-only frame.
   int hold_ref_buf;  // hold the reference buffer.
+
+  int tile_size_bytes;
+#if CONFIG_EXT_TILE
+  int tile_col_size_bytes;
+  int dec_tile_row, dec_tile_col;
+#endif  // CONFIG_EXT_TILE
 } VP10Decoder;
 
 int vp10_receive_compressed_data(struct VP10Decoder *pbi,
@@ -88,6 +104,8 @@
 int vp10_get_raw_frame(struct VP10Decoder *pbi, YV12_BUFFER_CONFIG *sd,
                       vp10_ppflags_t *flags);
 
+int vp10_get_frame_to_show(struct VP10Decoder *pbi, YV12_BUFFER_CONFIG *frame);
+
 vpx_codec_err_t vp10_copy_reference_dec(struct VP10Decoder *pbi,
                                        VP9_REFFRAME ref_frame_flag,
                                        YV12_BUFFER_CONFIG *sd);
@@ -134,6 +152,21 @@
   }
 }
 
+#if CONFIG_EXT_REFS
+static INLINE int dec_is_ref_frame_buf(VP10Decoder *const pbi,
+                                       RefCntBuffer *frame_buf) {
+  VP10_COMMON *const cm = &pbi->common;
+  int i;
+  for (i = 0; i < REFS_PER_FRAME; ++i) {
+    RefBuffer *const ref_frame = &cm->frame_refs[i];
+    if (ref_frame->idx == INVALID_IDX) continue;
+    if (frame_buf == &cm->buffer_pool->frame_bufs[ref_frame->idx])
+      break;
+  }
+  return (i < REFS_PER_FRAME);
+}
+#endif  // CONFIG_EXT_REFS
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
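A minimal usage sketch (not part of the patch) for the dec_is_ref_frame_buf() helper above, assuming pbi, cm and pool are in scope as they are in decoder.c; it only illustrates the intended guard:

    RefCntBuffer *const buf = &cm->buffer_pool->frame_bufs[i];
    /* Never recycle a buffer that one of the REFS_PER_FRAME active
     * references still points at. */
    if (buf->ref_count == 0 && !dec_is_ref_frame_buf(pbi, buf))
      pool->release_fb_cb(pool->cb_priv, &buf->raw_frame_buffer);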
diff --git a/vp10/decoder/detokenize.c b/vp10/decoder/detokenize.c
index d39e3dc..cc3b18b 100644
--- a/vp10/decoder/detokenize.c
+++ b/vp10/decoder/detokenize.c
@@ -11,12 +11,11 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
+#include "vp10/common/ans.h"
 #include "vp10/common/blockd.h"
 #include "vp10/common/common.h"
 #include "vp10/common/entropy.h"
-#if CONFIG_COEFFICIENT_RANGE_CHECKING
 #include "vp10/common/idct.h"
-#endif
 
 #include "vp10/decoder/detokenize.h"
 
@@ -38,18 +37,23 @@
        ++coef_counts[band][ctx][token];                     \
   } while (0)
 
-static INLINE int read_coeff(const vpx_prob *probs, int n, vpx_reader *r) {
+#if !CONFIG_ANS
+static INLINE int read_coeff(const vpx_prob *probs, int n, vp10_reader *r) {
   int i, val = 0;
   for (i = 0; i < n; ++i)
-    val = (val << 1) | vpx_read(r, probs[i]);
+    val = (val << 1) | vp10_read(r, probs[i]);
   return val;
 }
 
 static int decode_coefs(const MACROBLOCKD *xd,
                         PLANE_TYPE type,
-                        tran_low_t *dqcoeff, TX_SIZE tx_size, const int16_t *dq,
+                        tran_low_t *dqcoeff, TX_SIZE tx_size, TX_TYPE tx_type,
+                        const int16_t *dq,
+#if CONFIG_NEW_QUANT
+                        dequant_val_type_nuq *dq_val,
+#endif  // CONFIG_NEW_QUANT
                         int ctx, const int16_t *scan, const int16_t *nb,
-                        vpx_reader *r) {
+                        vp10_reader *r) {
   FRAME_COUNTS *counts = xd->counts;
   const int max_eob = 16 << (tx_size << 1);
   const FRAME_CONTEXT *const fc = xd->fc;
@@ -60,11 +64,14 @@
   const vpx_prob *prob;
   unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
   unsigned int (*eob_branch_count)[COEFF_CONTEXTS];
-  uint8_t token_cache[32 * 32];
+  uint8_t token_cache[MAX_TX_SQUARE];
   const uint8_t *band_translate = get_band_translate(tx_size);
-  const int dq_shift = (tx_size == TX_32X32);
+  int dq_shift;
   int v, token;
   int16_t dqv = dq[0];
+#if CONFIG_NEW_QUANT
+  const tran_low_t *dqv_val = &dq_val[0][0];
+#endif  // CONFIG_NEW_QUANT
   const uint8_t *cat1_prob;
   const uint8_t *cat2_prob;
   const uint8_t *cat3_prob;
@@ -111,18 +118,24 @@
   cat6_prob = vp10_cat6_prob;
 #endif
 
+  dq_shift = get_tx_scale(xd, tx_type, tx_size);
+
   while (c < max_eob) {
     int val = -1;
     band = *band_translate++;
     prob = coef_probs[band][ctx];
     if (counts)
       ++eob_branch_count[band][ctx];
-    if (!vpx_read(r, prob[EOB_CONTEXT_NODE])) {
+    if (!vp10_read(r, prob[EOB_CONTEXT_NODE])) {
       INCREMENT_COUNT(EOB_MODEL_TOKEN);
       break;
     }
 
-    while (!vpx_read(r, prob[ZERO_CONTEXT_NODE])) {
+#if CONFIG_NEW_QUANT
+    dqv_val = &dq_val[band][0];
+#endif  // CONFIG_NEW_QUANT
+
+    while (!vp10_read(r, prob[ZERO_CONTEXT_NODE])) {
       INCREMENT_COUNT(ZERO_TOKEN);
       dqv = dq[1];
       token_cache[scan[c]] = 0;
@@ -132,15 +145,18 @@
       ctx = get_coef_context(nb, token_cache, c);
       band = *band_translate++;
       prob = coef_probs[band][ctx];
+#if CONFIG_NEW_QUANT
+      dqv_val = &dq_val[band][0];
+#endif  // CONFIG_NEW_QUANT
     }
 
-    if (!vpx_read(r, prob[ONE_CONTEXT_NODE])) {
+    if (!vp10_read(r, prob[ONE_CONTEXT_NODE])) {
       INCREMENT_COUNT(ONE_TOKEN);
       token = ONE_TOKEN;
       val = 1;
     } else {
       INCREMENT_COUNT(TWO_TOKEN);
-      token = vpx_read_tree(r, vp10_coef_con_tree,
+      token = vp10_read_tree(r, vp10_coef_con_tree,
                             vp10_pareto8_full[prob[PIVOT_NODE] - 1]);
       switch (token) {
         case TWO_TOKEN:
@@ -164,11 +180,7 @@
           val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, r);
           break;
         case CATEGORY6_TOKEN: {
-#if CONFIG_MISC_FIXES
           const int skip_bits = TX_SIZES - 1 - tx_size;
-#else
-          const int skip_bits = 0;
-#endif
           const uint8_t *cat6p = cat6_prob + skip_bits;
 #if CONFIG_VP9_HIGHBITDEPTH
           switch (xd->bd) {
@@ -192,16 +204,22 @@
         }
       }
     }
+#if CONFIG_NEW_QUANT
+    v = vp10_dequant_abscoeff_nuq(val, dqv, dqv_val);
+    v = dq_shift ? ROUND_POWER_OF_TWO(v, dq_shift) : v;
+#else
     v = (val * dqv) >> dq_shift;
+#endif  // CONFIG_NEW_QUANT
+
 #if CONFIG_COEFFICIENT_RANGE_CHECKING
 #if CONFIG_VP9_HIGHBITDEPTH
-    dqcoeff[scan[c]] = highbd_check_range((vpx_read_bit(r) ? -v : v),
+    dqcoeff[scan[c]] = highbd_check_range((vp10_read_bit(r) ? -v : v),
                                           xd->bd);
 #else
-    dqcoeff[scan[c]] = check_range(vpx_read_bit(r) ? -v : v);
+    dqcoeff[scan[c]] = check_range(vp10_read_bit(r) ? -v : v);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 #else
-    dqcoeff[scan[c]] = vpx_read_bit(r) ? -v : v;
+    dqcoeff[scan[c]] = vp10_read_bit(r) ? -v : v;
 #endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
     token_cache[scan[c]] = vp10_pt_energy_class[token];
     ++c;
@@ -211,6 +229,193 @@
 
   return c;
 }
+#else  // !CONFIG_ANS
+static INLINE int read_coeff(const vpx_prob *const probs, int n,
+                             struct AnsDecoder *const ans) {
+  int i, val = 0;
+  for (i = 0; i < n; ++i)
+    val = (val << 1) | uabs_read(ans, probs[i]);
+  return val;
+}
+
+static int decode_coefs_ans(const MACROBLOCKD *const xd,
+                            PLANE_TYPE type,
+                            tran_low_t *dqcoeff, TX_SIZE tx_size,
+                            TX_TYPE tx_type,
+                            const int16_t *dq,
+#if CONFIG_NEW_QUANT
+                            dequant_val_type_nuq *dq_val,
+#endif  // CONFIG_NEW_QUANT
+                            int ctx, const int16_t *scan, const int16_t *nb,
+                            struct AnsDecoder *const ans) {
+  FRAME_COUNTS *counts = xd->counts;
+  const int max_eob = 16 << (tx_size << 1);
+  const FRAME_CONTEXT *const fc = xd->fc;
+  const int ref = is_inter_block(&xd->mi[0]->mbmi);
+  int band, c = 0;
+  int skip_eob = 0;
+  const vpx_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+      fc->coef_probs[tx_size][type][ref];
+  const rans_dec_lut(*coef_cdfs)[COEFF_CONTEXTS] =
+      fc->coef_cdfs[tx_size][type][ref];
+  const vpx_prob *prob;
+  const rans_dec_lut *cdf;
+  unsigned int (*coef_counts)[COEFF_CONTEXTS][UNCONSTRAINED_NODES + 1];
+  unsigned int (*eob_branch_count)[COEFF_CONTEXTS];
+  uint8_t token_cache[MAX_TX_SQUARE];
+  const uint8_t *band_translate = get_band_translate(tx_size);
+  int dq_shift;
+  int v, token;
+  int16_t dqv = dq[0];
+#if CONFIG_NEW_QUANT
+  const tran_low_t *dqv_val = &dq_val[0][0];
+#endif  // CONFIG_NEW_QUANT
+  const uint8_t *cat1_prob;
+  const uint8_t *cat2_prob;
+  const uint8_t *cat3_prob;
+  const uint8_t *cat4_prob;
+  const uint8_t *cat5_prob;
+  const uint8_t *cat6_prob;
+
+  dq_shift = get_tx_scale(xd, tx_type, tx_size);
+
+  if (counts) {
+    coef_counts = counts->coef[tx_size][type][ref];
+    eob_branch_count = counts->eob_branch[tx_size][type][ref];
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->bd > VPX_BITS_8) {
+    if (xd->bd == VPX_BITS_10) {
+      cat1_prob = vp10_cat1_prob_high10;
+      cat2_prob = vp10_cat2_prob_high10;
+      cat3_prob = vp10_cat3_prob_high10;
+      cat4_prob = vp10_cat4_prob_high10;
+      cat5_prob = vp10_cat5_prob_high10;
+      cat6_prob = vp10_cat6_prob_high10;
+    } else {
+      cat1_prob = vp10_cat1_prob_high12;
+      cat2_prob = vp10_cat2_prob_high12;
+      cat3_prob = vp10_cat3_prob_high12;
+      cat4_prob = vp10_cat4_prob_high12;
+      cat5_prob = vp10_cat5_prob_high12;
+      cat6_prob = vp10_cat6_prob_high12;
+    }
+  } else {
+    cat1_prob = vp10_cat1_prob;
+    cat2_prob = vp10_cat2_prob;
+    cat3_prob = vp10_cat3_prob;
+    cat4_prob = vp10_cat4_prob;
+    cat5_prob = vp10_cat5_prob;
+    cat6_prob = vp10_cat6_prob;
+  }
+#else
+  cat1_prob = vp10_cat1_prob;
+  cat2_prob = vp10_cat2_prob;
+  cat3_prob = vp10_cat3_prob;
+  cat4_prob = vp10_cat4_prob;
+  cat5_prob = vp10_cat5_prob;
+  cat6_prob = vp10_cat6_prob;
+#endif
+
+  while (c < max_eob) {
+    int val = -1;
+    band = *band_translate++;
+    prob = coef_probs[band][ctx];
+    if (!skip_eob) {
+      if (counts)
+        ++eob_branch_count[band][ctx];
+      if (!uabs_read(ans, prob[EOB_CONTEXT_NODE])) {
+        INCREMENT_COUNT(EOB_MODEL_TOKEN);
+        break;
+      }
+    }
+
+#if CONFIG_NEW_QUANT
+    dqv_val = &dq_val[band][0];
+#endif  // CONFIG_NEW_QUANT
+
+    cdf = &coef_cdfs[band][ctx];
+    token = ZERO_TOKEN + rans_read(ans, *cdf);
+    if (token == ZERO_TOKEN) {
+      INCREMENT_COUNT(ZERO_TOKEN);
+      token_cache[scan[c]] = 0;
+      skip_eob = 1;
+    } else {
+      INCREMENT_COUNT(ONE_TOKEN + (token > ONE_TOKEN));
+      switch (token) {
+        case ONE_TOKEN:
+        case TWO_TOKEN:
+        case THREE_TOKEN:
+        case FOUR_TOKEN:
+          val = token;
+          break;
+        case CATEGORY1_TOKEN:
+          val = CAT1_MIN_VAL + read_coeff(cat1_prob, 1, ans);
+          break;
+        case CATEGORY2_TOKEN:
+          val = CAT2_MIN_VAL + read_coeff(cat2_prob, 2, ans);
+          break;
+        case CATEGORY3_TOKEN:
+          val = CAT3_MIN_VAL + read_coeff(cat3_prob, 3, ans);
+          break;
+        case CATEGORY4_TOKEN:
+          val = CAT4_MIN_VAL + read_coeff(cat4_prob, 4, ans);
+          break;
+        case CATEGORY5_TOKEN:
+          val = CAT5_MIN_VAL + read_coeff(cat5_prob, 5, ans);
+          break;
+        case CATEGORY6_TOKEN: {
+          const int skip_bits = TX_SIZES - 1 - tx_size;
+          const uint8_t *cat6p = cat6_prob + skip_bits;
+#if CONFIG_VP9_HIGHBITDEPTH
+          switch (xd->bd) {
+            case VPX_BITS_8:
+              val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, ans);
+              break;
+            case VPX_BITS_10:
+              val = CAT6_MIN_VAL + read_coeff(cat6p, 16 - skip_bits, ans);
+              break;
+            case VPX_BITS_12:
+              val = CAT6_MIN_VAL + read_coeff(cat6p, 18 - skip_bits, ans);
+              break;
+            default:
+              assert(0);
+              return -1;
+          }
+#else
+          val = CAT6_MIN_VAL + read_coeff(cat6p, 14 - skip_bits, ans);
+#endif
+        } break;
+      }
+#if CONFIG_NEW_QUANT
+      v = vp10_dequant_abscoeff_nuq(val, dqv, dqv_val);
+      v = dq_shift ? ROUND_POWER_OF_TWO(v, dq_shift) : v;
+#else
+      v = (val * dqv) >> dq_shift;
+#endif  // CONFIG_NEW_QUANT
+
+#if CONFIG_COEFFICIENT_RANGE_CHECKING
+#if CONFIG_VP9_HIGHBITDEPTH
+      dqcoeff[scan[c]] =
+          highbd_check_range((uabs_read_bit(ans) ? -v : v), xd->bd);
+#else
+      dqcoeff[scan[c]] = check_range(uabs_read_bit(ans) ? -v : v);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#else
+      dqcoeff[scan[c]] = uabs_read_bit(ans) ? -v : v;
+#endif  // CONFIG_COEFFICIENT_RANGE_CHECKING
+      token_cache[scan[c]] = vp10_pt_energy_class[token];
+      skip_eob = 0;
+    }
+    ++c;
+    ctx = get_coef_context(nb, token_cache, c);
+    dqv = dq[1];
+  }
+
+  return c;
+}
+#endif  // !CONFIG_ANS
 
 // TODO(slavarnway): Decode version of vp10_set_context.  Modify vp10_set_context
 // after testing is complete, then delete this version.
@@ -257,18 +462,71 @@
   }
 }
 
-int vp10_decode_block_tokens(MACROBLOCKD *xd,
-                            int plane, const scan_order *sc,
-                            int x, int y,
-                            TX_SIZE tx_size, vpx_reader *r,
-                            int seg_id) {
+void vp10_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
+                                vp10_reader *r) {
+  MODE_INFO *const mi = xd->mi[0];
+  MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
+      (xd->plane[plane != 0].subsampling_y);
+  const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
+      (xd->plane[plane != 0].subsampling_x);
+  int color_idx, color_ctx, color_order[PALETTE_MAX_SIZE];
+  int n = mbmi->palette_mode_info.palette_size[plane != 0];
+  int i, j;
+  uint8_t *color_map = xd->plane[plane != 0].color_index_map;
+  const vpx_prob (* const prob)[PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] =
+      plane ? vp10_default_palette_uv_color_prob :
+          vp10_default_palette_y_color_prob;
+
+  for (i = 0; i < rows; ++i) {
+    for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
+      color_ctx = vp10_get_palette_color_context(color_map, cols, i, j, n,
+                                                 color_order);
+      color_idx = vp10_read_tree(r, vp10_palette_color_tree[n - 2],
+                                prob[n - 2][color_ctx]);
+      assert(color_idx >= 0 && color_idx < n);
+      color_map[i * cols + j] = color_order[color_idx];
+    }
+  }
+}
+
+int vp10_decode_block_tokens(MACROBLOCKD *const xd,
+                             int plane, const scan_order *sc,
+                             int x, int y,
+                             TX_SIZE tx_size,
+                             TX_TYPE tx_type,
+#if CONFIG_ANS
+                             struct AnsDecoder *const r,
+#else
+                             vp10_reader *r,
+#endif  // CONFIG_ANS
+                             int seg_id) {
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int16_t *const dequant = pd->seg_dequant[seg_id];
   const int ctx = get_entropy_context(tx_size, pd->above_context + x,
                                                pd->left_context + y);
+#if CONFIG_NEW_QUANT
+  int dq = get_dq_profile_from_ctx(ctx);
+#endif  //  CONFIG_NEW_QUANT
+
+#if !CONFIG_ANS
   const int eob = decode_coefs(xd, pd->plane_type,
-                               pd->dqcoeff, tx_size,
-                               dequant, ctx, sc->scan, sc->neighbors, r);
+                               pd->dqcoeff, tx_size, tx_type,
+                               dequant,
+#if CONFIG_NEW_QUANT
+                               pd->seg_dequant_nuq[seg_id][dq],
+#endif  // CONFIG_NEW_QUANT
+                               ctx, sc->scan, sc->neighbors, r);
+#else
+  const int eob = decode_coefs_ans(xd, pd->plane_type,
+                                   pd->dqcoeff, tx_size, tx_type,
+                                   dequant,
+#if CONFIG_NEW_QUANT
+                                   pd->seg_dequant_nuq[seg_id][dq],
+#endif  // CONFIG_NEW_QUANT
+                                   ctx, sc->scan, sc->neighbors, r);
+#endif  // !CONFIG_ANS
   dec_set_contexts(xd, pd, tx_size, eob > 0, x, y);
   return eob;
 }
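In the default quantization path above, reconstruction is v = (val * dqv) >> dq_shift, where dq_shift now comes from get_tx_scale() instead of being hard-wired to (tx_size == TX_32X32). As a worked example, a decoded magnitude val = 3 with dqv = 48 and dq_shift = 1 reconstructs as (3 * 48) >> 1 = 72, after which a single vp10_read_bit() (or uabs_read_bit() under CONFIG_ANS) supplies the sign.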
diff --git a/vp10/decoder/detokenize.h b/vp10/decoder/detokenize.h
index c3fd90a..7b25b41 100644
--- a/vp10/decoder/detokenize.h
+++ b/vp10/decoder/detokenize.h
@@ -12,19 +12,27 @@
 #ifndef VP10_DECODER_DETOKENIZE_H_
 #define VP10_DECODER_DETOKENIZE_H_
 
-#include "vpx_dsp/bitreader.h"
 #include "vp10/decoder/decoder.h"
+#include "vp10/common/ans.h"
 #include "vp10/common/scan.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-int vp10_decode_block_tokens(MACROBLOCKD *xd,
-                            int plane, const scan_order *sc,
-                            int x, int y,
-                            TX_SIZE tx_size, vpx_reader *r,
-                            int seg_id);
+void vp10_decode_palette_tokens(MACROBLOCKD *const xd, int plane,
+                                vp10_reader *r);
+int vp10_decode_block_tokens(MACROBLOCKD *const xd,
+                             int plane, const scan_order *sc,
+                             int x, int y,
+                             TX_SIZE tx_size,
+                             TX_TYPE tx_type,
+#if CONFIG_ANS
+                             struct AnsDecoder *const r,
+#else
+                             vp10_reader *r,
+#endif  // CONFIG_ANS
+                             int seg_id);
 
 #ifdef __cplusplus
 }  // extern "C"
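The prototype above selects the reader type at compile time; a caller sketch under the same CONFIG_ANS switch (reader initialization elided, all names as declared in this header):

    #if CONFIG_ANS
      struct AnsDecoder r;  /* initialized from the tile bitstream */
    #else
      vp10_reader r;        /* initialized from the tile bitstream */
    #endif
      const int eob = vp10_decode_block_tokens(xd, plane, sc, x, y, tx_size,
                                               tx_type, &r, seg_id);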
diff --git a/vp10/decoder/dsubexp.c b/vp10/decoder/dsubexp.c
index 36c1917..4d53e12 100644
--- a/vp10/decoder/dsubexp.c
+++ b/vp10/decoder/dsubexp.c
@@ -21,15 +21,15 @@
   return (v & 1) ? m - ((v + 1) >> 1) : m + (v >> 1);
 }
 
-static int decode_uniform(vpx_reader *r) {
+static int decode_uniform(vp10_reader *r) {
   const int l = 8;
-  const int m = (1 << l) - 191 + CONFIG_MISC_FIXES;
-  const int v = vpx_read_literal(r, l - 1);
-  return v < m ?  v : (v << 1) - m + vpx_read_bit(r);
+  const int m = (1 << l) - 190;
+  const int v = vp10_read_literal(r, l - 1);
+  return v < m ?  v : (v << 1) - m + vp10_read_bit(r);
 }
 
 static int inv_remap_prob(int v, int m) {
-  static uint8_t inv_map_table[MAX_PROB - CONFIG_MISC_FIXES] = {
+  static uint8_t inv_map_table[MAX_PROB - 1] = {
       7,  20,  33,  46,  59,  72,  85,  98, 111, 124, 137, 150, 163, 176, 189,
     202, 215, 228, 241, 254,   1,   2,   3,   4,   5,   6,   8,   9,  10,  11,
      12,  13,  14,  15,  16,  17,  18,  19,  21,  22,  23,  24,  25,  26,  27,
@@ -47,9 +47,6 @@
     207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221, 222,
     223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238,
     239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253,
-#if !CONFIG_MISC_FIXES
-    253
-#endif
   };
   assert(v < (int)(sizeof(inv_map_table) / sizeof(inv_map_table[0])));
   v = inv_map_table[v];
@@ -61,18 +58,18 @@
   }
 }
 
-static int decode_term_subexp(vpx_reader *r) {
-  if (!vpx_read_bit(r))
-    return vpx_read_literal(r, 4);
-  if (!vpx_read_bit(r))
-    return vpx_read_literal(r, 4) + 16;
-  if (!vpx_read_bit(r))
-    return vpx_read_literal(r, 5) + 32;
+static int decode_term_subexp(vp10_reader *r) {
+  if (!vp10_read_bit(r))
+    return vp10_read_literal(r, 4);
+  if (!vp10_read_bit(r))
+    return vp10_read_literal(r, 4) + 16;
+  if (!vp10_read_bit(r))
+    return vp10_read_literal(r, 5) + 32;
   return decode_uniform(r) + 64;
 }
 
-void vp10_diff_update_prob(vpx_reader *r, vpx_prob* p) {
-  if (vpx_read(r, DIFF_UPDATE_PROB)) {
+void vp10_diff_update_prob(vp10_reader *r, vpx_prob* p) {
+  if (vp10_read(r, DIFF_UPDATE_PROB)) {
     const int delp = decode_term_subexp(r);
     *p = (vpx_prob)inv_remap_prob(delp, *p);
   }
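The quasi-uniform code in decode_uniform() above now targets an alphabet of 190 symbols: with l = 8, m = (1 << 8) - 190 = 66, so symbols 0..65 are sent in 7 bits, while v >= 66 is extended by one more bit to cover (66 << 1) - 66 + 0 = 66 up through (127 << 1) - 66 + 1 = 189. decode_term_subexp() then stitches the ranges together: 4 bits for 0..15, 4 bits for 16..31, 5 bits for 32..63, and the 190-symbol uniform code (offset by 64) for everything above that.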
diff --git a/vp10/decoder/dsubexp.h b/vp10/decoder/dsubexp.h
index 1a7ed99..c05ec6e 100644
--- a/vp10/decoder/dsubexp.h
+++ b/vp10/decoder/dsubexp.h
@@ -12,13 +12,13 @@
 #ifndef VP10_DECODER_DSUBEXP_H_
 #define VP10_DECODER_DSUBEXP_H_
 
-#include "vpx_dsp/bitreader.h"
+#include "vp10/decoder/bitreader.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void vp10_diff_update_prob(vpx_reader *r, vpx_prob* p);
+void vp10_diff_update_prob(vp10_reader *r, vpx_prob* p);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/decoder/dthread.c b/vp10/decoder/dthread.c
index 4206adc..f0cb98d 100644
--- a/vp10/decoder/dthread.c
+++ b/vp10/decoder/dthread.c
@@ -159,6 +159,9 @@
 #if CONFIG_VP9_HIGHBITDEPTH
   dst_cm->use_highbitdepth = src_cm->use_highbitdepth;
 #endif
+#if CONFIG_EXT_REFS
+  // TODO(zoeliu): To handle parallel decoding
+#endif  // CONFIG_EXT_REFS
   dst_cm->prev_frame = src_cm->show_existing_frame ?
                        src_cm->prev_frame : src_cm->cur_frame;
   dst_cm->last_width = !src_cm->show_existing_frame ?
diff --git a/vp10/encoder/aq_complexity.c b/vp10/encoder/aq_complexity.c
index 2506a4e..a4c38d1 100644
--- a/vp10/encoder/aq_complexity.c
+++ b/vp10/encoder/aq_complexity.c
@@ -116,8 +116,6 @@
   VP10_COMMON *const cm = &cpi->common;
 
   const int mi_offset = mi_row * cm->mi_cols + mi_col;
-  const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
-  const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
   const int xmis = VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[bs]);
   const int ymis = VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[bs]);
   int x, y;
@@ -130,7 +128,7 @@
     // Rate depends on fraction of a SB64 in frame (xmis * ymis / bw * bh).
     // It is converted to bits * 256 units.
     const int target_rate = (cpi->rc.sb64_target_rate * xmis * ymis * 256) /
-                            (bw * bh);
+                            (cm->mib_size * cm->mib_size);
     double logvar;
     double low_var_thresh;
     const int aq_strength = get_aq_c_strength(cm->base_qindex, cm->bit_depth);
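With the change above, the per-block rate target scales with the actual superblock geometry rather than a hard-coded 64x64: for a superblock fully inside the frame, xmis * ymis == cm->mib_size * cm->mib_size and the expression reduces to cpi->rc.sb64_target_rate * 256, while a superblock clipped at the frame edge gets a proportionally smaller target. For example, with cm->mib_size = 8 and a half-width edge block (xmis = 4, ymis = 8), the target is sb64_target_rate * 4 * 8 * 256 / 64 = sb64_target_rate * 128.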
diff --git a/vp10/encoder/aq_cyclicrefresh.c b/vp10/encoder/aq_cyclicrefresh.c
index 4d7b7d9..a018a4f 100644
--- a/vp10/encoder/aq_cyclicrefresh.c
+++ b/vp10/encoder/aq_cyclicrefresh.c
@@ -267,9 +267,17 @@
       // don't update the map for them. For cases where motion is non-zero or
       // the reference frame isn't the previous frame, the previous value in
       // the map for this spatial location is not entirely correct.
-      if (!is_inter_block(mbmi) || !skip)
+      if ((!is_inter_block(mbmi) || !skip) &&
+          mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
         cr->last_coded_q_map[map_offset] = clamp(
             cm->base_qindex + cr->qindex_delta[mbmi->segment_id], 0, MAXQ);
+      } else if (is_inter_block(mbmi) && skip &&
+                 mbmi->segment_id <= CR_SEGMENT_ID_BOOST2) {
+        cr->last_coded_q_map[map_offset] =
+            VPXMIN(clamp(cm->base_qindex + cr->qindex_delta[mbmi->segment_id],
+                         0, MAXQ),
+                   cr->last_coded_q_map[map_offset]);
+      }
     }
 }
 
@@ -316,13 +324,15 @@
   double fraction_low = 0.0;
   int low_content_frame = 0;
 
-  MODE_INFO **mi = cm->mi_grid_visible;
+  MODE_INFO **mi;
   RATE_CONTROL *const rc = &cpi->rc;
   const int rows = cm->mi_rows, cols = cm->mi_cols;
   int cnt1 = 0, cnt2 = 0;
   int force_gf_refresh = 0;
 
   for (mi_row = 0; mi_row < rows; mi_row++) {
+    mi = cm->mi_grid_visible + mi_row * cm->mi_stride;
+
     for (mi_col = 0; mi_col < cols; mi_col++) {
       int16_t abs_mvr = mi[0]->mbmi.mv[0].as_mv.row >= 0 ?
           mi[0]->mbmi.mv[0].as_mv.row : -1 * mi[0]->mbmi.mv[0].as_mv.row;
@@ -341,7 +351,6 @@
       if (cr->map[mi_row * cols + mi_col] < 1)
         low_content_frame++;
     }
-    mi += 8;
   }
 
   // For video conference clips, if the background has high motion in current
@@ -388,8 +397,8 @@
   int i, block_count, bl_index, sb_rows, sb_cols, sbs_in_frame;
   int xmis, ymis, x, y;
   memset(seg_map, CR_SEGMENT_ID_BASE, cm->mi_rows * cm->mi_cols);
-  sb_cols = (cm->mi_cols + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
-  sb_rows = (cm->mi_rows + MI_BLOCK_SIZE - 1) / MI_BLOCK_SIZE;
+  sb_cols = (cm->mi_cols + cm->mib_size - 1) / cm->mib_size;
+  sb_rows = (cm->mi_rows + cm->mib_size - 1) / cm->mib_size;
   sbs_in_frame = sb_cols * sb_rows;
   // Number of target blocks to get the q delta (segment 1).
   block_count = cr->percent_refresh * cm->mi_rows * cm->mi_cols / 100;
@@ -404,8 +413,8 @@
     // Get the mi_row/mi_col corresponding to superblock index i.
     int sb_row_index = (i / sb_cols);
     int sb_col_index = i - sb_row_index * sb_cols;
-    int mi_row = sb_row_index * MI_BLOCK_SIZE;
-    int mi_col = sb_col_index * MI_BLOCK_SIZE;
+    int mi_row = sb_row_index * cm->mib_size;
+    int mi_col = sb_col_index * cm->mib_size;
     int qindex_thresh =
         cpi->oxcf.content == VP9E_CONTENT_SCREEN
             ? vp10_get_qindex(&cm->seg, CR_SEGMENT_ID_BOOST2, cm->base_qindex)
@@ -413,11 +422,9 @@
     assert(mi_row >= 0 && mi_row < cm->mi_rows);
     assert(mi_col >= 0 && mi_col < cm->mi_cols);
     bl_index = mi_row * cm->mi_cols + mi_col;
-    // Loop through all 8x8 blocks in superblock and update map.
-    xmis =
-        VPXMIN(cm->mi_cols - mi_col, num_8x8_blocks_wide_lookup[BLOCK_64X64]);
-    ymis =
-        VPXMIN(cm->mi_rows - mi_row, num_8x8_blocks_high_lookup[BLOCK_64X64]);
+    // Loop through all MI blocks in superblock and update map.
+    xmis = VPXMIN(cm->mi_cols - mi_col, cm->mib_size);
+    ymis = VPXMIN(cm->mi_rows - mi_row, cm->mib_size);
     for (y = 0; y < ymis; y++) {
       for (x = 0; x < xmis; x++) {
         const int bl_index2 = bl_index + y * cm->mi_cols + x;
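The refresh scan above walks superblocks by index; as a worked example of the mapping, take cm->mi_cols = 45 and cm->mib_size = 8: sb_cols = (45 + 7) / 8 = 6, so superblock i = 13 sits at sb_row_index = 2, sb_col_index = 1, i.e. mi_row = 16, mi_col = 8, and for the right-most column (sb_col_index = 5, mi_col = 40) xmis clips to VPXMIN(45 - 40, 8) = 5.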
diff --git a/vp10/encoder/aq_variance.c b/vp10/encoder/aq_variance.c
index bed5162..e99310f 100644
--- a/vp10/encoder/aq_variance.c
+++ b/vp10/encoder/aq_variance.c
@@ -32,9 +32,11 @@
 
 #define SEGMENT_ID(i) segment_id[(i) - ENERGY_MIN]
 
-DECLARE_ALIGNED(16, static const uint8_t, vp10_64_zeros[64]) = {0};
+DECLARE_ALIGNED(16, static const uint8_t,
+                vp10_all_zeros[MAX_SB_SIZE]) = {0};
 #if CONFIG_VP9_HIGHBITDEPTH
-DECLARE_ALIGNED(16, static const uint16_t, vp10_highbd_64_zeros[64]) = {0};
+DECLARE_ALIGNED(16, static const uint16_t,
+                vp10_highbd_all_zeros[MAX_SB_SIZE]) = {0};
 #endif
 
 unsigned int vp10_vaq_segment_id(int energy) {
@@ -50,6 +52,8 @@
   if (frame_is_intra_only(cm) || cm->error_resilient_mode ||
       cpi->refresh_alt_ref_frame ||
       (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+    cpi->vaq_refresh = 1;
+
     vp10_enable_segmentation(seg);
     vp10_clearall_segfeatures(seg);
 
@@ -153,17 +157,17 @@
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       aq_highbd_8_variance(x->plane[0].src.buf, x->plane[0].src.stride,
-                           CONVERT_TO_BYTEPTR(vp10_highbd_64_zeros), 0, bw, bh,
+                           CONVERT_TO_BYTEPTR(vp10_highbd_all_zeros), 0, bw, bh,
                            &sse, &avg);
       sse >>= 2 * (xd->bd - 8);
       avg >>= (xd->bd - 8);
     } else {
       aq_variance(x->plane[0].src.buf, x->plane[0].src.stride,
-                  vp10_64_zeros, 0, bw, bh, &sse, &avg);
+                  vp10_all_zeros, 0, bw, bh, &sse, &avg);
     }
 #else
     aq_variance(x->plane[0].src.buf, x->plane[0].src.stride,
-                vp10_64_zeros, 0, bw, bh, &sse, &avg);
+                vp10_all_zeros, 0, bw, bh, &sse, &avg);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     var = sse - (((int64_t)avg * avg) / (bw * bh));
     return (256 * var) / (bw * bh);
@@ -172,17 +176,17 @@
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
                                x->plane[0].src.stride,
-                               CONVERT_TO_BYTEPTR(vp10_highbd_64_zeros),
+                               CONVERT_TO_BYTEPTR(vp10_highbd_all_zeros),
                                0, &sse);
     } else {
       var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
                                x->plane[0].src.stride,
-                               vp10_64_zeros, 0, &sse);
+                               vp10_all_zeros, 0, &sse);
     }
 #else
     var = cpi->fn_ptr[bs].vf(x->plane[0].src.buf,
                              x->plane[0].src.stride,
-                             vp10_64_zeros, 0, &sse);
+                             vp10_all_zeros, 0, &sse);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     return (256 * var) >> num_pels_log2_lookup[bs];
   }
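Both branches above compute variance against an all-zero reference, so sse is the raw energy of the source block and avg its sample sum; var = sse - avg * avg / N is then N times the sample variance. As a worked example with an 8x8 block (N = 64), sse = 5120 and avg = 320 give var = 5120 - (320 * 320) / 64 = 3520, which the low-variance path scales to (256 * 3520) / 64 = 14080.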
diff --git a/vp10/encoder/bitstream.c b/vp10/encoder/bitstream.c
index 04ce61d..3999c94 100644
--- a/vp10/encoder/bitstream.c
+++ b/vp10/encoder/bitstream.c
@@ -24,9 +24,13 @@
 #include "vp10/common/entropymv.h"
 #include "vp10/common/mvref_common.h"
 #include "vp10/common/pred_common.h"
+#include "vp10/common/reconinter.h"
 #include "vp10/common/seg_common.h"
 #include "vp10/common/tile_common.h"
 
+#if CONFIG_ANS
+#include "vp10/encoder/buf_ans.h"
+#endif  // CONFIG_ANS
 #include "vp10/encoder/cost.h"
 #include "vp10/encoder/bitstream.h"
 #include "vp10/encoder/encodemv.h"
@@ -38,30 +42,231 @@
 static const struct vp10_token intra_mode_encodings[INTRA_MODES] = {
   {0, 1}, {6, 3}, {28, 5}, {30, 5}, {58, 6}, {59, 6}, {126, 7}, {127, 7},
   {62, 6}, {2, 2}};
+#if CONFIG_EXT_INTERP
+static const struct vp10_token switchable_interp_encodings[SWITCHABLE_FILTERS] =
+  {{0, 1}, {4, 3}, {6, 3}, {5, 3}, {7, 3}};
+#else
 static const struct vp10_token switchable_interp_encodings[SWITCHABLE_FILTERS] =
   {{0, 1}, {2, 2}, {3, 2}};
+#endif  // CONFIG_EXT_INTERP
+#if CONFIG_EXT_PARTITION_TYPES
+static const struct vp10_token ext_partition_encodings[EXT_PARTITION_TYPES] =
+  {{0, 1}, {4, 3}, {12, 4}, {7, 3}, {10, 4}, {11, 4}, {26, 5}, {27, 5}};
+#endif
 static const struct vp10_token partition_encodings[PARTITION_TYPES] =
   {{0, 1}, {2, 2}, {6, 3}, {7, 3}};
+#if !CONFIG_REF_MV
 static const struct vp10_token inter_mode_encodings[INTER_MODES] =
+#if CONFIG_EXT_INTER
+  {{2, 2}, {6, 3}, {0, 1}, {14, 4}, {15, 4}};
+#else
   {{2, 2}, {6, 3}, {0, 1}, {7, 3}};
+#endif  // CONFIG_EXT_INTER
+#endif
+#if CONFIG_EXT_INTER
+static const struct vp10_token inter_compound_mode_encodings
+                               [INTER_COMPOUND_MODES] = {
+  {2, 2}, {50, 6}, {51, 6}, {24, 5}, {52, 6}, {53, 6},
+  {54, 6}, {55, 6}, {0, 1}, {7, 3}
+};
+#endif  // CONFIG_EXT_INTER
+static const struct vp10_token palette_size_encodings[] = {
+    {0, 1}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {63, 6},
+};
+static const struct vp10_token
+palette_color_encodings[PALETTE_MAX_SIZE - 1][PALETTE_MAX_SIZE] = {
+    {{0, 1}, {1, 1}},  // 2 colors
+    {{0, 1}, {2, 2}, {3, 2}},  // 3 colors
+    {{0, 1}, {2, 2}, {6, 3}, {7, 3}},  // 4 colors
+    {{0, 1}, {2, 2}, {6, 3}, {14, 4}, {15, 4}},  // 5 colors
+    {{0, 1}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {31, 5}},  // 6 colors
+    {{0, 1}, {2, 2}, {6, 3}, {14, 4}, {30, 5}, {62, 6}, {63, 6}},  // 7 colors
+    {{0, 1}, {2, 2}, {6, 3}, {14, 4},
+        {30, 5}, {62, 6}, {126, 7}, {127, 7}},  // 8 colors
+};
 
-static struct vp10_token ext_tx_encodings[TX_TYPES];
+static const struct vp10_token
+tx_size_encodings[TX_SIZES - 1][TX_SIZES] = {
+    {{0, 1}, {1, 1}},  // Max tx_size is 8X8
+    {{0, 1}, {2, 2}, {3, 2}},  // Max tx_size is 16X16
+    {{0, 1}, {2, 2}, {6, 3}, {7, 3}},  // Max tx_size is 32X32
+};
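For instance, with the tx_size_encodings table above, a block whose maximum transform is 32X32 codes TX_4X4 as the 1-bit value 0, TX_8X8 as the 2-bit value 2 (binary 10), TX_16X16 as the 3-bit value 6 (binary 110), and TX_32X32 as the 3-bit value 7 (binary 111).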
 
-void vp10_encode_token_init() {
-  vp10_tokens_from_tree(ext_tx_encodings, vp10_ext_tx_tree);
+static INLINE void write_uniform(vp10_writer *w, int n, int v) {
+  int l = get_unsigned_bits(n);
+  int m = (1 << l) - n;
+  if (l == 0)
+    return;
+  if (v < m) {
+    vp10_write_literal(w, v, l - 1);
+  } else {
+    vp10_write_literal(w, m + ((v - m) >> 1), l - 1);
+    vp10_write_literal(w, (v - m) & 1, 1);
+  }
 }
 
-static void write_intra_mode(vpx_writer *w, PREDICTION_MODE mode,
+#if CONFIG_EXT_TX
+static struct vp10_token ext_tx_inter_encodings[EXT_TX_SETS_INTER][TX_TYPES];
+static struct vp10_token ext_tx_intra_encodings[EXT_TX_SETS_INTRA][TX_TYPES];
+#else
+static struct vp10_token ext_tx_encodings[TX_TYPES];
+#endif  // CONFIG_EXT_TX
+#if CONFIG_EXT_INTRA
+static struct vp10_token intra_filter_encodings[INTRA_FILTERS];
+#endif  // CONFIG_EXT_INTRA
+#if CONFIG_EXT_INTER
+static struct vp10_token interintra_mode_encodings[INTERINTRA_MODES];
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+static struct vp10_token motvar_encodings[MOTION_VARIATIONS];
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+
+void vp10_encode_token_init(void) {
+#if CONFIG_EXT_TX
+  int s;
+  for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+    vp10_tokens_from_tree(ext_tx_inter_encodings[s], vp10_ext_tx_inter_tree[s]);
+  }
+  for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+    vp10_tokens_from_tree(ext_tx_intra_encodings[s], vp10_ext_tx_intra_tree[s]);
+  }
+#else
+  vp10_tokens_from_tree(ext_tx_encodings, vp10_ext_tx_tree);
+#endif  // CONFIG_EXT_TX
+#if CONFIG_EXT_INTRA
+  vp10_tokens_from_tree(intra_filter_encodings, vp10_intra_filter_tree);
+#endif  // CONFIG_EXT_INTRA
+#if CONFIG_EXT_INTER
+  vp10_tokens_from_tree(interintra_mode_encodings, vp10_interintra_mode_tree);
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+  vp10_tokens_from_tree(motvar_encodings, vp10_motvar_tree);
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+}
+
+static void write_intra_mode(vp10_writer *w, PREDICTION_MODE mode,
                              const vpx_prob *probs) {
   vp10_write_token(w, vp10_intra_mode_tree, probs, &intra_mode_encodings[mode]);
 }
 
-static void write_inter_mode(vpx_writer *w, PREDICTION_MODE mode,
-                             const vpx_prob *probs) {
-  assert(is_inter_mode(mode));
-  vp10_write_token(w, vp10_inter_mode_tree, probs,
-                  &inter_mode_encodings[INTER_OFFSET(mode)]);
+#if CONFIG_EXT_INTER
+static void write_interintra_mode(vp10_writer *w, INTERINTRA_MODE mode,
+                                  const vpx_prob *probs) {
+  vp10_write_token(w, vp10_interintra_mode_tree, probs,
+                   &interintra_mode_encodings[mode]);
 }
+#endif  // CONFIG_EXT_INTER
+
+static void write_inter_mode(VP10_COMMON *cm,
+                             vp10_writer *w, PREDICTION_MODE mode,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+                             int is_compound,
+#endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
+                             const int16_t mode_ctx) {
+#if CONFIG_REF_MV
+  const int16_t newmv_ctx = mode_ctx & NEWMV_CTX_MASK;
+  const vpx_prob newmv_prob = cm->fc->newmv_prob[newmv_ctx];
+#if CONFIG_EXT_INTER
+  vp10_write(w, mode != NEWMV && mode != NEWFROMNEARMV, newmv_prob);
+
+  if (!is_compound && (mode == NEWMV || mode == NEWFROMNEARMV))
+    vp10_write(w, mode == NEWFROMNEARMV, cm->fc->new2mv_prob);
+
+  if (mode != NEWMV && mode != NEWFROMNEARMV) {
+#else
+  vp10_write(w, mode != NEWMV, newmv_prob);
+
+  if (mode != NEWMV) {
+#endif  // CONFIG_EXT_INTER
+    const int16_t zeromv_ctx = (mode_ctx >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
+    const vpx_prob zeromv_prob = cm->fc->zeromv_prob[zeromv_ctx];
+
+    if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET)) {
+      assert(mode == ZEROMV);
+      return;
+    }
+
+    vp10_write(w, mode != ZEROMV, zeromv_prob);
+
+    if (mode != ZEROMV) {
+      int16_t refmv_ctx = (mode_ctx >> REFMV_OFFSET) & REFMV_CTX_MASK;
+      vpx_prob refmv_prob;
+
+      if (mode_ctx & (1 << SKIP_NEARESTMV_OFFSET))
+        refmv_ctx = 6;
+      if (mode_ctx & (1 << SKIP_NEARMV_OFFSET))
+        refmv_ctx = 7;
+      if (mode_ctx & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET))
+        refmv_ctx = 8;
+
+      refmv_prob = cm->fc->refmv_prob[refmv_ctx];
+      vp10_write(w, mode != NEARESTMV, refmv_prob);
+    }
+  }
+#else
+  const vpx_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
+  assert(is_inter_mode(mode));
+  vp10_write_token(w, vp10_inter_mode_tree, inter_probs,
+                  &inter_mode_encodings[INTER_OFFSET(mode)]);
+#endif
+}
+
+#if CONFIG_REF_MV
+static void write_drl_idx(const VP10_COMMON *cm,
+                          const MB_MODE_INFO *mbmi,
+                          const MB_MODE_INFO_EXT *mbmi_ext,
+                          vp10_writer *w) {
+  uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
+
+  assert(mbmi->ref_mv_idx < 3);
+
+  if (mbmi->mode == NEWMV) {
+    int idx;
+    for (idx = 0; idx < 2; ++idx) {
+      if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+        uint8_t drl_ctx =
+            vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+        vpx_prob drl_prob = cm->fc->drl_prob[drl_ctx];
+
+        vp10_write(w, mbmi->ref_mv_idx != idx, drl_prob);
+        if (mbmi->ref_mv_idx == idx)
+          return;
+      }
+    }
+    return;
+  }
+
+  if (mbmi->mode == NEARMV) {
+    int idx;
+    // TODO(jingning): Temporary solution to compensate for the NEARESTMV
+    //                 offset.
+    for (idx = 1; idx < 3; ++idx) {
+      if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+        uint8_t drl_ctx =
+            vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+        vpx_prob drl_prob = cm->fc->drl_prob[drl_ctx];
+
+        vp10_write(w, mbmi->ref_mv_idx != (idx - 1), drl_prob);
+        if (mbmi->ref_mv_idx == (idx - 1))
+          return;
+      }
+    }
+    return;
+  }
+}
+#endif
+
+#if CONFIG_EXT_INTER
+static void write_inter_compound_mode(VP10_COMMON *cm, vp10_writer *w,
+                                      PREDICTION_MODE mode,
+                                      const int16_t mode_ctx) {
+  const vpx_prob *const inter_compound_probs =
+                        cm->fc->inter_compound_mode_probs[mode_ctx];
+
+  assert(is_inter_compound_mode(mode));
+  vp10_write_token(w, vp10_inter_compound_mode_tree, inter_compound_probs,
+                  &inter_compound_mode_encodings[INTER_COMPOUND_OFFSET(mode)]);
+}
+#endif  // CONFIG_EXT_INTER
 
 static void encode_unsigned_max(struct vpx_write_bit_buffer *wb,
                                 int data, int max) {
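write_uniform() above is the encoder-side dual of the decoder's quasi-uniform code. A roundtrip sketch for n = 190 (so l = 8, m = 66): v = 10 < m is written as the 7-bit literal 10; v = 100 >= m is written as the 7-bit literal m + ((100 - m) >> 1) = 83 followed by the parity bit (100 - 66) & 1 = 0, and the decoder recovers (83 << 1) - 66 + 0 = 100.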
@@ -71,7 +276,7 @@
 static void prob_diff_update(const vpx_tree_index *tree,
                              vpx_prob probs[/*n - 1*/],
                              const unsigned int counts[/*n - 1*/],
-                             int n, vpx_writer *w) {
+                             int n, vp10_writer *w) {
   int i;
   unsigned int branch_ct[32][2];
 
@@ -101,33 +306,136 @@
   return savings;
 }
 
-static void write_selected_tx_size(const VP10_COMMON *cm,
-                                   const MACROBLOCKD *xd, vpx_writer *w) {
-  TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size;
-  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
-  const vpx_prob *const tx_probs = get_tx_probs2(max_tx_size, xd,
-                                                 &cm->fc->tx_probs);
-  vpx_write(w, tx_size != TX_4X4, tx_probs[0]);
-  if (tx_size != TX_4X4 && max_tx_size >= TX_16X16) {
-    vpx_write(w, tx_size != TX_8X8, tx_probs[1]);
-    if (tx_size != TX_8X8 && max_tx_size >= TX_32X32)
-      vpx_write(w, tx_size != TX_16X16, tx_probs[2]);
+#if CONFIG_VAR_TX
+static void write_tx_size_inter(const VP10_COMMON *cm,
+                                const MACROBLOCKD *xd,
+                                const MB_MODE_INFO *mbmi,
+                                TX_SIZE tx_size, int blk_row, int blk_col,
+                                vp10_writer *w) {
+  const int tx_row = blk_row >> 1;
+  const int tx_col = blk_col >> 1;
+  int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
+                                   xd->left_txfm_context + tx_row,
+                                   tx_size);
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> 5;
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> 5;
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  if (tx_size == mbmi->inter_tx_size[tx_row][tx_col]) {
+    vp10_write(w, 0, cm->fc->txfm_partition_prob[ctx]);
+    txfm_partition_update(xd->above_txfm_context + tx_col,
+                          xd->left_txfm_context + tx_row, tx_size);
+  } else {
+    const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int bsl = b_width_log2_lookup[bsize];
+    int i;
+    vp10_write(w, 1, cm->fc->txfm_partition_prob[ctx]);
+
+    if (tx_size == TX_8X8) {
+      txfm_partition_update(xd->above_txfm_context + tx_col,
+                            xd->left_txfm_context + tx_row, TX_4X4);
+      return;
+    }
+
+    assert(bsl > 0);
+    --bsl;
+    for (i = 0; i < 4; ++i) {
+      int offsetr = blk_row + ((i >> 1) << bsl);
+      int offsetc = blk_col + ((i & 0x01) << bsl);
+      write_tx_size_inter(cm, xd, mbmi, tx_size - 1, offsetr, offsetc, w);
+    }
   }
 }
 
+static void update_txfm_partition_probs(VP10_COMMON *cm, vp10_writer *w,
+                                        FRAME_COUNTS *counts) {
+  int k;
+  for (k = 0; k < TXFM_PARTITION_CONTEXTS; ++k)
+    vp10_cond_prob_diff_update(w, &cm->fc->txfm_partition_prob[k],
+                               counts->txfm_partition[k]);
+}
+#endif
+
+static void write_selected_tx_size(const VP10_COMMON *cm,
+                                   const MACROBLOCKD *xd, vp10_writer *w) {
+  TX_SIZE tx_size = xd->mi[0]->mbmi.tx_size;
+  BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+  if (max_tx_size > TX_4X4) {
+    vp10_write_token(w, vp10_tx_size_tree[max_tx_size - TX_8X8],
+                     cm->fc->tx_size_probs[max_tx_size - TX_8X8]
+                                          [get_tx_size_context(xd)],
+                     &tx_size_encodings[max_tx_size - TX_8X8][tx_size]);
+  }
+}
+
+#if CONFIG_REF_MV
+static void update_inter_mode_probs(VP10_COMMON *cm, vp10_writer *w,
+                                    FRAME_COUNTS *counts) {
+  int i;
+  for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i)
+    vp10_cond_prob_diff_update(w, &cm->fc->newmv_prob[i],
+                               counts->newmv_mode[i]);
+  for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i)
+    vp10_cond_prob_diff_update(w, &cm->fc->zeromv_prob[i],
+                               counts->zeromv_mode[i]);
+  for (i = 0; i < REFMV_MODE_CONTEXTS; ++i)
+    vp10_cond_prob_diff_update(w, &cm->fc->refmv_prob[i],
+                               counts->refmv_mode[i]);
+  for (i = 0; i < DRL_MODE_CONTEXTS; ++i)
+    vp10_cond_prob_diff_update(w, &cm->fc->drl_prob[i],
+                               counts->drl_mode[i]);
+#if CONFIG_EXT_INTER
+  vp10_cond_prob_diff_update(w, &cm->fc->new2mv_prob, counts->new2mv_mode);
+#endif  // CONFIG_EXT_INTER
+}
+#endif
+
+#if CONFIG_EXT_INTER
+static void update_inter_compound_mode_probs(VP10_COMMON *cm, vp10_writer *w) {
+  const int savings_thresh = vp10_cost_one(GROUP_DIFF_UPDATE_PROB) -
+                             vp10_cost_zero(GROUP_DIFF_UPDATE_PROB);
+  int i;
+  int savings = 0;
+  int do_update = 0;
+  for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
+    savings += prob_diff_update_savings(vp10_inter_compound_mode_tree,
+                                        cm->fc->inter_compound_mode_probs[i],
+                                        cm->counts.inter_compound_mode[i],
+                                        INTER_COMPOUND_MODES);
+  }
+  do_update = savings > savings_thresh;
+  vp10_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+  if (do_update) {
+    for (i = 0; i < INTER_MODE_CONTEXTS; ++i) {
+      prob_diff_update(vp10_inter_compound_mode_tree,
+                       cm->fc->inter_compound_mode_probs[i],
+                       cm->counts.inter_compound_mode[i],
+                       INTER_COMPOUND_MODES, w);
+    }
+  }
+}
+#endif  // CONFIG_EXT_INTER
+
 static int write_skip(const VP10_COMMON *cm, const MACROBLOCKD *xd,
-                      int segment_id, const MODE_INFO *mi, vpx_writer *w) {
+                      int segment_id, const MODE_INFO *mi, vp10_writer *w) {
   if (segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP)) {
     return 1;
   } else {
     const int skip = mi->mbmi.skip;
-    vpx_write(w, skip, vp10_get_skip_prob(cm, xd));
+    vp10_write(w, skip, vp10_get_skip_prob(cm, xd));
     return skip;
   }
 }
 
-static void update_skip_probs(VP10_COMMON *cm, vpx_writer *w,
+static void update_skip_probs(VP10_COMMON *cm, vp10_writer *w,
                               FRAME_COUNTS *counts) {
   int k;
 
@@ -135,7 +443,7 @@
     vp10_cond_prob_diff_update(w, &cm->fc->skip_probs[k], counts->skip[k]);
 }
 
-static void update_switchable_interp_probs(VP10_COMMON *cm, vpx_writer *w,
+static void update_switchable_interp_probs(VP10_COMMON *cm, vp10_writer *w,
                                            FRAME_COUNTS *counts) {
   int j;
   for (j = 0; j < SWITCHABLE_FILTER_CONTEXTS; ++j)
@@ -144,7 +452,63 @@
                      counts->switchable_interp[j], SWITCHABLE_FILTERS, w);
 }
 
-static void update_ext_tx_probs(VP10_COMMON *cm, vpx_writer *w) {
+#if CONFIG_EXT_TX
+static void update_ext_tx_probs(VP10_COMMON *cm, vp10_writer *w) {
+  const int savings_thresh = vp10_cost_one(GROUP_DIFF_UPDATE_PROB) -
+                             vp10_cost_zero(GROUP_DIFF_UPDATE_PROB);
+  int i, j;
+  int s;
+  for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+    int savings = 0;
+    int do_update = 0;
+    for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+      if (!use_inter_ext_tx_for_txsize[s][i]) continue;
+      savings += prob_diff_update_savings(
+          vp10_ext_tx_inter_tree[s], cm->fc->inter_ext_tx_prob[s][i],
+          cm->counts.inter_ext_tx[s][i], num_ext_tx_set_inter[s]);
+    }
+    do_update = savings > savings_thresh;
+    vp10_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+    if (do_update) {
+      for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+        if (!use_inter_ext_tx_for_txsize[s][i]) continue;
+        prob_diff_update(vp10_ext_tx_inter_tree[s],
+                         cm->fc->inter_ext_tx_prob[s][i],
+                         cm->counts.inter_ext_tx[s][i],
+                         num_ext_tx_set_inter[s], w);
+      }
+    }
+  }
+
+  for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+    int savings = 0;
+    int do_update = 0;
+    for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+      if (!use_intra_ext_tx_for_txsize[s][i]) continue;
+      for (j = 0; j < INTRA_MODES; ++j)
+        savings += prob_diff_update_savings(
+            vp10_ext_tx_intra_tree[s], cm->fc->intra_ext_tx_prob[s][i][j],
+            cm->counts.intra_ext_tx[s][i][j], num_ext_tx_set_intra[s]);
+    }
+    do_update = savings > savings_thresh;
+    vp10_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+    if (do_update) {
+      for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+        if (!use_intra_ext_tx_for_txsize[s][i]) continue;
+        for (j = 0; j < INTRA_MODES; ++j)
+          prob_diff_update(vp10_ext_tx_intra_tree[s],
+                           cm->fc->intra_ext_tx_prob[s][i][j],
+                           cm->counts.intra_ext_tx[s][i][j],
+                           num_ext_tx_set_intra[s], w);
+      }
+    }
+  }
+}
+
+#else
+
+static void update_ext_tx_probs(VP10_COMMON *cm, vp10_writer *w) {
   const int savings_thresh = vp10_cost_one(GROUP_DIFF_UPDATE_PROB) -
                              vp10_cost_zero(GROUP_DIFF_UPDATE_PROB);
   int i, j;
@@ -158,7 +522,7 @@
           cm->counts.intra_ext_tx[i][j], TX_TYPES);
   }
   do_update = savings > savings_thresh;
-  vpx_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+  vp10_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
   if (do_update) {
     for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
       for (j = 0; j < TX_TYPES; ++j)
@@ -176,7 +540,7 @@
         cm->counts.inter_ext_tx[i], TX_TYPES);
   }
   do_update = savings > savings_thresh;
-  vpx_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+  vp10_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
   if (do_update) {
     for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
       prob_diff_update(vp10_ext_tx_tree,
@@ -186,19 +550,61 @@
     }
   }
 }
+#endif  // CONFIG_EXT_TX
 
-static void pack_mb_tokens(vpx_writer *w,
-                           TOKENEXTRA **tp, const TOKENEXTRA *const stop,
+static void pack_palette_tokens(vp10_writer *w, const TOKENEXTRA **tp,
+                                int n, int num) {
+  int i;
+  const TOKENEXTRA *p = *tp;
+
+  for (i = 0; i < num; ++i) {
+    vp10_write_token(w, vp10_palette_color_tree[n - 2], p->context_tree,
+                     &palette_color_encodings[n - 2][p->token]);
+    ++p;
+  }
+
+  *tp = p;
+}
+
+#if CONFIG_SUPERTX
+static void update_supertx_probs(VP10_COMMON *cm, vp10_writer *w) {
+  const int savings_thresh = vp10_cost_one(GROUP_DIFF_UPDATE_PROB) -
+                             vp10_cost_zero(GROUP_DIFF_UPDATE_PROB);
+  int i, j;
+  int savings = 0;
+  int do_update = 0;
+  for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
+    for (j = 1; j < TX_SIZES; ++j) {
+      savings += vp10_cond_prob_diff_update_savings(&cm->fc->supertx_prob[i][j],
+                                                    cm->counts.supertx[i][j]);
+    }
+  }
+  do_update = savings > savings_thresh;
+  vp10_write(w, do_update, GROUP_DIFF_UPDATE_PROB);
+  if (do_update) {
+    for (i = 0; i < PARTITION_SUPERTX_CONTEXTS; ++i) {
+      for (j = 1; j < TX_SIZES; ++j) {
+        vp10_cond_prob_diff_update(w, &cm->fc->supertx_prob[i][j],
+                                   cm->counts.supertx[i][j]);
+      }
+    }
+  }
+}
+#endif  // CONFIG_SUPERTX
+
+#if !CONFIG_ANS
+static void pack_mb_tokens(vp10_writer *w,
+                           const TOKENEXTRA **tp, const TOKENEXTRA *const stop,
                            vpx_bit_depth_t bit_depth, const TX_SIZE tx) {
-  TOKENEXTRA *p = *tp;
-#if !CONFIG_MISC_FIXES
-  (void) tx;
+  const TOKENEXTRA *p = *tp;
+#if CONFIG_VAR_TX
+  int count = 0;
+  const int seg_eob = 16 << (tx << 1);
 #endif
 
   while (p < stop && p->token != EOSB_TOKEN) {
     const int t = p->token;
     const struct vp10_token *const a = &vp10_coef_encodings[t];
-    int i = 0;
     int v = a->value;
     int n = a->len;
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -215,38 +621,30 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     /* skip one or two nodes */
-    if (p->skip_eob_node) {
+    if (p->skip_eob_node)
       n -= p->skip_eob_node;
-      i = 2 * p->skip_eob_node;
-    }
+    else
+      vp10_write(w, t != EOB_TOKEN, p->context_tree[0]);
 
-    // TODO(jbb): expanding this can lead to big gains.  It allows
-    // much better branch prediction and would enable us to avoid numerous
-    // lookups and compares.
+    if (t != EOB_TOKEN) {
+      vp10_write(w, t != ZERO_TOKEN, p->context_tree[1]);
 
-    // If we have a token that's in the constrained set, the coefficient tree
-    // is split into two treed writes.  The first treed write takes care of the
-    // unconstrained nodes.  The second treed write takes care of the
-    // constrained nodes.
-    if (t >= TWO_TOKEN && t < EOB_TOKEN) {
-      int len = UNCONSTRAINED_NODES - p->skip_eob_node;
-      int bits = v >> (n - len);
-      vp10_write_tree(w, vp10_coef_tree, p->context_tree, bits, len, i);
-      vp10_write_tree(w, vp10_coef_con_tree,
-                     vp10_pareto8_full[p->context_tree[PIVOT_NODE] - 1],
-                     v, n - len, 0);
-    } else {
-      vp10_write_tree(w, vp10_coef_tree, p->context_tree, v, n, i);
+      if (t != ZERO_TOKEN) {
+        vp10_write(w, t != ONE_TOKEN, p->context_tree[2]);
+
+        if (t != ONE_TOKEN) {
+          int len = UNCONSTRAINED_NODES - p->skip_eob_node;
+          vp10_write_tree(w, vp10_coef_con_tree,
+                          vp10_pareto8_full[p->context_tree[PIVOT_NODE] - 1],
+                          v, n - len, 0);
+        }
+      }
     }
 
     if (b->base_val) {
       const int e = p->extra, l = b->len;
-#if CONFIG_MISC_FIXES
       int skip_bits =
           (b->base_val == CAT6_MIN_VAL) ? TX_SIZES - 1 - tx : 0;
-#else
-      int skip_bits = 0;
-#endif
 
       if (l) {
         const unsigned char *pb = b->prob;
@@ -260,21 +658,156 @@
             skip_bits--;
             assert(!bb);
           } else {
-            vpx_write(w, bb, pb[i >> 1]);
+            vp10_write(w, bb, pb[i >> 1]);
           }
           i = b->tree[i + bb];
         } while (n);
       }
 
-      vpx_write_bit(w, e & 1);
+      vp10_write_bit(w, e & 1);
     }
     ++p;
+
+#if CONFIG_VAR_TX
+    ++count;
+    if (t == EOB_TOKEN || count == seg_eob)
+      break;
+#endif
   }
 
   *tp = p;
 }
+#else
+// This function serializes the tokens in forward order using a buffered
+// ANS coder.
+static void pack_mb_tokens(struct BufAnsCoder *ans,
+                           const TOKENEXTRA **tp,
+                           const TOKENEXTRA *const stop,
+                           vpx_bit_depth_t bit_depth,
+                           const TX_SIZE tx) {
+  const TOKENEXTRA *p = *tp;
+#if CONFIG_VAR_TX
+  int count = 0;
+  const int seg_eob = 16 << (tx << 1);
+#endif  // CONFIG_VAR_TX
 
-static void write_segment_id(vpx_writer *w, const struct segmentation *seg,
+  while (p < stop && p->token != EOSB_TOKEN) {
+    const int t = p->token;
+#if CONFIG_VP9_HIGHBITDEPTH
+    const vp10_extra_bit *b;
+    if (bit_depth == VPX_BITS_12)
+      b = &vp10_extra_bits_high12[t];
+    else if (bit_depth == VPX_BITS_10)
+      b = &vp10_extra_bits_high10[t];
+    else
+      b = &vp10_extra_bits[t];
+#else
+    const vp10_extra_bit *const b = &vp10_extra_bits[t];
+    (void)bit_depth;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    /* skip one or two nodes */
+    if (!p->skip_eob_node)
+      buf_uabs_write(ans, t != EOB_TOKEN, p->context_tree[0]);
+
+    if (t != EOB_TOKEN) {
+      struct rans_sym s;
+      const rans_dec_lut *token_cdf = p->token_cdf;
+      assert(token_cdf);
+      s.cum_prob = (*token_cdf)[t - ZERO_TOKEN];
+      s.prob = (*token_cdf)[t - ZERO_TOKEN + 1] - s.cum_prob;
+      buf_rans_write(ans, &s);
+
+      if (b->base_val) {
+        const int e = p->extra, l = b->len;
+        int skip_bits = (b->base_val == CAT6_MIN_VAL) ? TX_SIZES - 1 - tx : 0;
+
+        if (l) {
+          const unsigned char *pb = b->prob;
+          int v = e >> 1;
+          int n = l; /* number of bits in v, assumed nonzero */
+          int i = 0;
+
+          do {
+            const int bb = (v >> --n) & 1;
+            if (skip_bits) {
+              skip_bits--;
+              assert(!bb);
+            } else {
+              buf_uabs_write(ans, bb, pb[i >> 1]);
+            }
+            i = b->tree[i + bb];
+          } while (n);
+        }
+
+        buf_uabs_write(ans, e & 1, 128);
+      }
+    }
+    ++p;
+
+#if CONFIG_VAR_TX
+    ++count;
+    if (t == EOB_TOKEN || count == seg_eob) break;
+#endif  // CONFIG_VAR_TX
+  }
+
+  *tp = p;
+}
+#endif  // !CONFIG_ANS
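
In the ANS path above, each token maps to an interval of a cumulative distribution: s.cum_prob is where the symbol's range starts and s.prob is its width, both read from adjacent entries of token_cdf. A standalone sketch of that lookup against a toy inclusive-prefix CDF (the table values are illustrative, not real vp10 token statistics):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* A symbol for a range/ANS coder: start of its interval and its width. */
    struct sym { uint16_t cum_prob, prob; };

    /* Derive (cum_prob, prob) for symbol t from adjacent CDF entries, the
     * way pack_mb_tokens reads token_cdf above. cdf[i] is the total mass of
     * all symbols below i; the last entry is the scale (1024 in this toy). */
    static struct sym lookup(const uint16_t *cdf, int t) {
      struct sym s;
      s.cum_prob = cdf[t];
      s.prob = (uint16_t)(cdf[t + 1] - cdf[t]);
      assert(s.prob > 0);  /* every codable symbol needs nonzero mass */
      return s;
    }

    int main(void) {
      static const uint16_t cdf[5] = { 0, 512, 768, 896, 1024 };
      int t;
      for (t = 0; t < 4; ++t) {
        const struct sym s = lookup(cdf, t);
        printf("symbol %d: cum_prob=%u prob=%u\n", t, s.cum_prob, s.prob);
      }
      return 0;
    }
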
+
+#if CONFIG_VAR_TX
+static void pack_txb_tokens(vp10_writer *w,
+                           const TOKENEXTRA **tp,
+                           const TOKENEXTRA *const tok_end,
+                           MACROBLOCKD *xd, MB_MODE_INFO *mbmi, int plane,
+                           BLOCK_SIZE plane_bsize,
+                           vpx_bit_depth_t bit_depth,
+                           int block,
+                           int blk_row, int blk_col, TX_SIZE tx_size) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  const TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_row][tx_col], bsize, 0, 0) :
+      mbmi->inter_tx_size[tx_row][tx_col];
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  if (tx_size == plane_tx_size) {
+    pack_mb_tokens(w, tp, tok_end, bit_depth, tx_size);
+  } else {
+    int bsl = b_width_log2_lookup[bsize];
+    int i;
+
+    assert(bsl > 0);
+    --bsl;
+
+    for (i = 0; i < 4; ++i) {
+      const int offsetr = blk_row + ((i >> 1) << bsl);
+      const int offsetc = blk_col + ((i & 0x01) << bsl);
+      int step = 1 << (2 * (tx_size - 1));
+
+      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide)
+        continue;
+
+      pack_txb_tokens(w, tp, tok_end, xd, mbmi, plane,
+                      plane_bsize, bit_depth, block + i * step,
+                      offsetr, offsetc, tx_size - 1);
+    }
+  }
+}
+#endif
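
pack_txb_tokens above walks a recursive transform partition: if the coded transform size matches the size at the current depth it emits the block's tokens, otherwise it descends into four half-size quadrants, skipping any quadrant past the frame edge. The same traversal in isolation (toy units of 4x4 blocks, hypothetical names):

    #include <stdio.h>

    /* Recursive transform-block walk in the style of pack_txb_tokens: emit
     * a block when the current depth matches the coded size, otherwise
     * descend into four half-size quadrants. Units are 4x4 blocks;
     * max_r/max_c clip quadrants that overhang the frame edge. */
    static void visit(int row, int col, int size, int coded_size,
                      int max_r, int max_c) {
      if (row >= max_r || col >= max_c) return;  /* fully outside the frame */
      if (size == coded_size) {
        printf("tokens for block (%d,%d), size %d\n", row, col, size);
      } else {
        const int half = size >> 1;
        visit(row,        col,        half, coded_size, max_r, max_c);
        visit(row,        col + half, half, coded_size, max_r, max_c);
        visit(row + half, col,        half, coded_size, max_r, max_c);
        visit(row + half, col + half, half, coded_size, max_r, max_c);
      }
    }

    int main(void) {
      /* A 32x32 area (8 units) coded with 8x8 transforms (2 units), with
       * the frame edge clipping the last column of blocks. */
      visit(0, 0, 8, 2, 8, 6);
      return 0;
    }
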
+
+static void write_segment_id(vp10_writer *w, const struct segmentation *seg,
                              const struct segmentation_probs *segp,
                              int segment_id) {
   if (seg->enabled && seg->update_map)
@@ -283,7 +816,7 @@
 
 // This function encodes the reference frame
 static void write_ref_frames(const VP10_COMMON *cm, const MACROBLOCKD *xd,
-                             vpx_writer *w) {
+                             vp10_writer *w) {
   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const int is_compound = has_second_ref(mbmi);
   const int segment_id = mbmi->segment_id;
@@ -298,37 +831,236 @@
     // does the feature use compound prediction or not
     // (if not specified at the frame/segment level)
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
-      vpx_write(w, is_compound, vp10_get_reference_mode_prob(cm, xd));
+      vp10_write(w, is_compound, vp10_get_reference_mode_prob(cm, xd));
     } else {
-      assert(!is_compound == (cm->reference_mode == SINGLE_REFERENCE));
+      assert((!is_compound) == (cm->reference_mode == SINGLE_REFERENCE));
     }
 
     if (is_compound) {
-      vpx_write(w, mbmi->ref_frame[0] == GOLDEN_FRAME,
-                vp10_get_pred_prob_comp_ref_p(cm, xd));
+#if CONFIG_EXT_REFS
+      const int bit = (mbmi->ref_frame[0] == GOLDEN_FRAME ||
+                       mbmi->ref_frame[0] == LAST3_FRAME);
+      const int bit_bwd = mbmi->ref_frame[1] == ALTREF_FRAME;
+#else  // CONFIG_EXT_REFS
+      const int bit = mbmi->ref_frame[0] == GOLDEN_FRAME;
+#endif  // CONFIG_EXT_REFS
+
+      vp10_write(w, bit, vp10_get_pred_prob_comp_ref_p(cm, xd));
+
+#if CONFIG_EXT_REFS
+      if (!bit) {
+        const int bit1 = mbmi->ref_frame[0] == LAST_FRAME;
+        vp10_write(w, bit1, vp10_get_pred_prob_comp_ref_p1(cm, xd));
+      } else {
+        const int bit2 = mbmi->ref_frame[0] == GOLDEN_FRAME;
+        vp10_write(w, bit2, vp10_get_pred_prob_comp_ref_p2(cm, xd));
+      }
+      vp10_write(w, bit_bwd, vp10_get_pred_prob_comp_bwdref_p(cm, xd));
+#endif  // CONFIG_EXT_REFS
     } else {
+#if CONFIG_EXT_REFS
+      const int bit0 = (mbmi->ref_frame[0] == ALTREF_FRAME ||
+                        mbmi->ref_frame[0] == BWDREF_FRAME);
+      vp10_write(w, bit0, vp10_get_pred_prob_single_ref_p1(cm, xd));
+
+      if (bit0) {
+        const int bit1 = mbmi->ref_frame[0] == ALTREF_FRAME;
+        vp10_write(w, bit1, vp10_get_pred_prob_single_ref_p2(cm, xd));
+      } else {
+        const int bit2 = (mbmi->ref_frame[0] == LAST3_FRAME ||
+                          mbmi->ref_frame[0] == GOLDEN_FRAME);
+        vp10_write(w, bit2, vp10_get_pred_prob_single_ref_p3(cm, xd));
+
+        if (!bit2) {
+          const int bit3 = mbmi->ref_frame[0] != LAST_FRAME;
+          vp10_write(w, bit3, vp10_get_pred_prob_single_ref_p4(cm, xd));
+        } else {
+          const int bit4 = mbmi->ref_frame[0] != LAST3_FRAME;
+          vp10_write(w, bit4, vp10_get_pred_prob_single_ref_p5(cm, xd));
+        }
+      }
+#else  // CONFIG_EXT_REFS
       const int bit0 = mbmi->ref_frame[0] != LAST_FRAME;
-      vpx_write(w, bit0, vp10_get_pred_prob_single_ref_p1(cm, xd));
+      vp10_write(w, bit0, vp10_get_pred_prob_single_ref_p1(cm, xd));
+
       if (bit0) {
         const int bit1 = mbmi->ref_frame[0] != GOLDEN_FRAME;
-        vpx_write(w, bit1, vp10_get_pred_prob_single_ref_p2(cm, xd));
+        vp10_write(w, bit1, vp10_get_pred_prob_single_ref_p2(cm, xd));
       }
+#endif  // CONFIG_EXT_REFS
+    }
+  }
+}
+
+#if CONFIG_EXT_INTRA
+static void write_ext_intra_mode_info(const VP10_COMMON *const cm,
+                                      const MB_MODE_INFO *const mbmi,
+                                      vp10_writer *w) {
+#if !ALLOW_FILTER_INTRA_MODES
+  return;
+#endif
+  if (mbmi->mode == DC_PRED &&
+      mbmi->palette_mode_info.palette_size[0] == 0) {
+    vp10_write(w, mbmi->ext_intra_mode_info.use_ext_intra_mode[0],
+               cm->fc->ext_intra_probs[0]);
+    if (mbmi->ext_intra_mode_info.use_ext_intra_mode[0]) {
+      EXT_INTRA_MODE mode = mbmi->ext_intra_mode_info.ext_intra_mode[0];
+      write_uniform(w, FILTER_INTRA_MODES, mode);
+    }
+  }
+
+  if (mbmi->uv_mode == DC_PRED &&
+      mbmi->palette_mode_info.palette_size[1] == 0) {
+    vp10_write(w, mbmi->ext_intra_mode_info.use_ext_intra_mode[1],
+               cm->fc->ext_intra_probs[1]);
+    if (mbmi->ext_intra_mode_info.use_ext_intra_mode[1]) {
+      EXT_INTRA_MODE mode = mbmi->ext_intra_mode_info.ext_intra_mode[1];
+      write_uniform(w, FILTER_INTRA_MODES, mode);
+    }
+  }
+}
+
+static void write_intra_angle_info(const VP10_COMMON *cm, const MACROBLOCKD *xd,
+                                   vp10_writer *w) {
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int intra_filter_ctx = vp10_get_pred_context_intra_interp(xd);
+  int p_angle;
+
+  if (bsize < BLOCK_8X8)
+    return;
+
+  if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED) {
+    write_uniform(w, 2 * MAX_ANGLE_DELTAS + 1,
+                  MAX_ANGLE_DELTAS + mbmi->angle_delta[0]);
+    p_angle = mode_to_angle_map[mbmi->mode] + mbmi->angle_delta[0] * ANGLE_STEP;
+    if (vp10_is_intra_filter_switchable(p_angle)) {
+      vp10_write_token(w, vp10_intra_filter_tree,
+                       cm->fc->intra_filter_probs[intra_filter_ctx],
+                       &intra_filter_encodings[mbmi->intra_filter]);
+    }
+  }
+
+  if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED) {
+    write_uniform(w, 2 * MAX_ANGLE_DELTAS + 1,
+                  MAX_ANGLE_DELTAS + mbmi->angle_delta[1]);
+  }
+}
+#endif  // CONFIG_EXT_INTRA
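
The angle deltas and filter-intra modes above are coded with write_uniform, a truncated binary code over [0, n): with l = ceil(log2(n)) and m = 2^l - n, the first m values take l - 1 bits and the rest take l bits. A sketch of the encoder side under that reading (the exact split is my reading of the helper, so treat it as an assumption):

    #include <stdio.h>

    /* Smallest l with 2^l >= n (0 when n <= 1). */
    static int bits_for(int n) {
      int l = 0;
      while ((1 << l) < n) ++l;
      return l;
    }

    /* Truncated binary code over [0, n): the first m = 2^l - n values cost
     * l - 1 bits, the remaining ones cost l bits. */
    static void encode_uniform(int n, int v) {
      const int l = bits_for(n);
      const int m = (1 << l) - n;
      if (l == 0) return;  /* n == 1: the value is implicit */
      if (v < m)
        printf("v=%d -> %d bits, literal %d\n", v, l - 1, v);
      else
        printf("v=%d -> %d bits, literal %d, extra bit %d\n",
               v, l, m + ((v - m) >> 1), (v - m) & 1);
    }

    int main(void) {
      int v;
      for (v = 0; v < 5; ++v) encode_uniform(5, v);  /* n = 5 */
      return 0;
    }
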
+
+static void write_switchable_interp_filter(VP10_COMP *cpi,
+                                           const MACROBLOCKD *xd,
+                                           vp10_writer *w) {
+  VP10_COMMON *const cm = &cpi->common;
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_DUAL_FILTER
+  int dir;
+#endif
+  if (cm->interp_filter == SWITCHABLE) {
+#if CONFIG_EXT_INTERP
+    if (!vp10_is_interp_needed(xd)) {
+#if CONFIG_DUAL_FILTER
+      assert(mbmi->interp_filter[0] == EIGHTTAP_REGULAR);
+#else
+      assert(mbmi->interp_filter == EIGHTTAP_REGULAR);
+#endif  // CONFIG_DUAL_FILTER
+      return;
+    }
+#endif  // CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+    for (dir = 0; dir < 2; ++dir) {
+      if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
+          (mbmi->ref_frame[1] > INTRA_FRAME &&
+           has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
+        const int ctx = vp10_get_pred_context_switchable_interp(xd, dir);
+        vp10_write_token(w, vp10_switchable_interp_tree,
+              cm->fc->switchable_interp_prob[ctx],
+              &switchable_interp_encodings[mbmi->interp_filter[dir]]);
+        ++cpi->interp_filter_selected[0][mbmi->interp_filter[dir]];
+      }
+    }
+#else
+    {
+      const int ctx = vp10_get_pred_context_switchable_interp(xd);
+      vp10_write_token(w, vp10_switchable_interp_tree,
+                       cm->fc->switchable_interp_prob[ctx],
+                       &switchable_interp_encodings[mbmi->interp_filter]);
+      ++cpi->interp_filter_selected[0][mbmi->interp_filter];
+    }
+#endif
+  }
+}
+
+static void write_palette_mode_info(const VP10_COMMON *cm,
+                                    const MACROBLOCKD *xd,
+                                    const MODE_INFO *const mi,
+                                    vp10_writer *w) {
+  const MB_MODE_INFO *const mbmi = &mi->mbmi;
+  const MODE_INFO *const above_mi = xd->above_mi;
+  const MODE_INFO *const left_mi = xd->left_mi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  int palette_ctx = 0;
+  int n, i;
+
+  if (mbmi->mode == DC_PRED) {
+    n = pmi->palette_size[0];
+    if (above_mi)
+      palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+    if (left_mi)
+      palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+    vp10_write(w, n > 0,
+              vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8][palette_ctx]);
+    if (n > 0) {
+      vp10_write_token(w, vp10_palette_size_tree,
+                       vp10_default_palette_y_size_prob[bsize - BLOCK_8X8],
+                       &palette_size_encodings[n - 2]);
+      for (i = 0; i < n; ++i)
+        vp10_write_literal(w, pmi->palette_colors[i], cm->bit_depth);
+      write_uniform(w, n, pmi->palette_first_color_idx[0]);
+    }
+  }
+
+  if (mbmi->uv_mode == DC_PRED) {
+    n = pmi->palette_size[1];
+    vp10_write(w, n > 0,
+              vp10_default_palette_uv_mode_prob[pmi->palette_size[0] > 0]);
+    if (n > 0) {
+      vp10_write_token(w, vp10_palette_size_tree,
+                       vp10_default_palette_uv_size_prob[bsize - BLOCK_8X8],
+                       &palette_size_encodings[n - 2]);
+      for (i = 0; i < n; ++i) {
+        vp10_write_literal(w, pmi->palette_colors[PALETTE_MAX_SIZE + i],
+                          cm->bit_depth);
+        vp10_write_literal(w, pmi->palette_colors[2 * PALETTE_MAX_SIZE + i],
+                          cm->bit_depth);
+      }
+      write_uniform(w, n, pmi->palette_first_color_idx[1]);
     }
   }
 }
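
write_palette_mode_info derives the context of the "use palette" gate from the two causal neighbours: each of the above and left blocks that itself coded a nonzero luma palette bumps the context, giving three probabilities for the flag. In isolation:

    #include <stdio.h>

    /* Context for the "use palette" gate, as derived above: one increment
     * per causal neighbour that itself coded a nonzero Y palette. */
    static int palette_gate_context(int have_above, int above_palette_size,
                                    int have_left, int left_palette_size) {
      int ctx = 0;
      if (have_above) ctx += (above_palette_size > 0);
      if (have_left) ctx += (left_palette_size > 0);
      return ctx;  /* 0, 1 or 2: indexes the gate probability */
    }

    int main(void) {
      printf("ctx=%d\n", palette_gate_context(1, 4, 1, 0));  /* above only */
      printf("ctx=%d\n", palette_gate_context(1, 8, 1, 5));  /* both */
      return 0;
    }
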
 
 static void pack_inter_mode_mvs(VP10_COMP *cpi, const MODE_INFO *mi,
-                                vpx_writer *w) {
-  VP10_COMMON *const cm = &cpi->common;
-  const nmv_context *nmvc = &cm->fc->nmvc;
-  const MACROBLOCK *const x = &cpi->td.mb;
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  const struct segmentation *const seg = &cm->seg;
-#if CONFIG_MISC_FIXES
-  const struct segmentation_probs *const segp = &cm->fc->seg;
-#else
-  const struct segmentation_probs *const segp = &cm->segp;
+#if CONFIG_SUPERTX
+                                int supertx_enabled,
 #endif
+                                vp10_writer *w) {
+  VP10_COMMON *const cm = &cpi->common;
+#if !CONFIG_REF_MV
+  const nmv_context *nmvc = &cm->fc->nmvc;
+#endif
+  const MACROBLOCK *x = &cpi->td.mb;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  const struct segmentation *const seg = &cm->seg;
+  const struct segmentation_probs *const segp = &cm->fc->seg;
   const MB_MODE_INFO *const mbmi = &mi->mbmi;
   const MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const PREDICTION_MODE mode = mbmi->mode;
@@ -343,7 +1075,7 @@
     if (seg->temporal_update) {
       const int pred_flag = mbmi->seg_id_predicted;
       vpx_prob pred_prob = vp10_get_pred_prob_seg_id(segp, xd);
-      vpx_write(w, pred_flag, pred_prob);
+      vp10_write(w, pred_flag, pred_prob);
       if (!pred_flag)
         write_segment_id(w, seg, segp, segment_id);
     } else {
@@ -351,14 +1083,49 @@
     }
   }
 
+#if CONFIG_SUPERTX
+  if (supertx_enabled)
+    skip = mbmi->skip;
+  else
+    skip = write_skip(cm, xd, segment_id, mi, w);
+#else
   skip = write_skip(cm, xd, segment_id, mi, w);
+#endif  // CONFIG_SUPERTX
 
-  if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
-    vpx_write(w, is_inter, vp10_get_intra_inter_prob(cm, xd));
+#if CONFIG_SUPERTX
+  if (!supertx_enabled)
+#endif  // CONFIG_SUPERTX
+    if (!segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME))
+      vp10_write(w, is_inter, vp10_get_intra_inter_prob(cm, xd));
 
   if (bsize >= BLOCK_8X8 && cm->tx_mode == TX_MODE_SELECT &&
+#if CONFIG_SUPERTX
+      !supertx_enabled &&
+#endif  // CONFIG_SUPERTX
       !(is_inter && skip) && !xd->lossless[segment_id]) {
-    write_selected_tx_size(cm, xd, w);
+#if CONFIG_VAR_TX
+    if (is_inter) {  // This implies skip flag is 0.
+      const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+      const int txb_size = txsize_to_bsize[max_tx_size];
+      const int bs = num_4x4_blocks_wide_lookup[txb_size];
+      const int width  = num_4x4_blocks_wide_lookup[bsize];
+      const int height = num_4x4_blocks_high_lookup[bsize];
+      int idx, idy;
+      for (idy = 0; idy < height; idy += bs)
+        for (idx = 0; idx < width; idx += bs)
+          write_tx_size_inter(cm, xd, mbmi, max_tx_size, idy, idx, w);
+    } else {
+      set_txfm_ctx(xd->left_txfm_context, mbmi->tx_size, xd->n8_h);
+      set_txfm_ctx(xd->above_txfm_context, mbmi->tx_size, xd->n8_w);
+
+      write_selected_tx_size(cm, xd, w);
+    }
+  } else {
+    set_txfm_ctx(xd->left_txfm_context, mbmi->tx_size, xd->n8_h);
+    set_txfm_ctx(xd->above_txfm_context, mbmi->tx_size, xd->n8_w);
+#else
+  write_selected_tx_size(cm, xd, w);
+#endif
   }
 
   if (!is_inter) {
@@ -376,27 +1143,53 @@
       }
     }
     write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mode]);
+#if CONFIG_EXT_INTRA
+    write_intra_angle_info(cm, xd, w);
+#endif  // CONFIG_EXT_INTRA
+    if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
+      write_palette_mode_info(cm, xd, mi, w);
+#if CONFIG_EXT_INTRA
+    if (bsize >= BLOCK_8X8)
+      write_ext_intra_mode_info(cm, mbmi, w);
+#endif  // CONFIG_EXT_INTRA
   } else {
-    const int mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
-    const vpx_prob *const inter_probs = cm->fc->inter_mode_probs[mode_ctx];
+    int16_t mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
     write_ref_frames(cm, xd, w);
 
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+    if (is_compound)
+      mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+    else
+#endif  // CONFIG_EXT_INTER
+    mode_ctx = vp10_mode_context_analyzer(mbmi_ext->mode_context,
+                                          mbmi->ref_frame, bsize, -1);
+#endif
+
     // If segment skip is not enabled code the mode.
     if (!segfeature_active(seg, segment_id, SEG_LVL_SKIP)) {
       if (bsize >= BLOCK_8X8) {
-        write_inter_mode(w, mode, inter_probs);
+#if CONFIG_EXT_INTER
+        if (is_inter_compound_mode(mode))
+          write_inter_compound_mode(cm, w, mode, mode_ctx);
+        else if (is_inter_singleref_mode(mode))
+#endif  // CONFIG_EXT_INTER
+        write_inter_mode(cm, w, mode,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+                         is_compound,
+#endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
+                         mode_ctx);
+
+#if CONFIG_REF_MV
+        if (mode == NEARMV || mode == NEWMV)
+          write_drl_idx(cm, mbmi, mbmi_ext, w);
+#endif
       }
     }
 
-    if (cm->interp_filter == SWITCHABLE) {
-      const int ctx = vp10_get_pred_context_switchable_interp(xd);
-      vp10_write_token(w, vp10_switchable_interp_tree,
-                      cm->fc->switchable_interp_prob[ctx],
-                      &switchable_interp_encodings[mbmi->interp_filter]);
-      ++cpi->interp_filter_selected[0][mbmi->interp_filter];
-    } else {
-      assert(mbmi->interp_filter == cm->interp_filter);
-    }
+#if !CONFIG_EXT_INTERP && !CONFIG_DUAL_FILTER
+    write_switchable_interp_filter(cpi, xd, w);
+#endif  // !CONFIG_EXT_INTERP && !CONFIG_DUAL_FILTER
 
     if (bsize < BLOCK_8X8) {
       const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
@@ -406,53 +1199,271 @@
         for (idx = 0; idx < 2; idx += num_4x4_w) {
           const int j = idy * 2 + idx;
           const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
-          write_inter_mode(w, b_mode, inter_probs);
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+          if (!is_compound)
+#endif  // CONFIG_EXT_INTER
+            mode_ctx = vp10_mode_context_analyzer(mbmi_ext->mode_context,
+                                                  mbmi->ref_frame, bsize, j);
+#endif
+#if CONFIG_EXT_INTER
+          if (is_inter_compound_mode(b_mode))
+            write_inter_compound_mode(cm, w, b_mode, mode_ctx);
+          else if (is_inter_singleref_mode(b_mode))
+#endif  // CONFIG_EXT_INTER
+          write_inter_mode(cm, w, b_mode,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+                           has_second_ref(mbmi),
+#endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
+                           mode_ctx);
+
+#if CONFIG_EXT_INTER
+          if (b_mode == NEWMV || b_mode == NEWFROMNEARMV ||
+              b_mode == NEW_NEWMV) {
+#else
           if (b_mode == NEWMV) {
-            for (ref = 0; ref < 1 + is_compound; ++ref)
+#endif  // CONFIG_EXT_INTER
+            for (ref = 0; ref < 1 + is_compound; ++ref) {
+#if CONFIG_REF_MV
+              int nmv_ctx =
+                  vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[ref]],
+                               mbmi_ext->ref_mv_stack[mbmi->ref_frame[ref]]);
+              const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+#endif
               vp10_encode_mv(cpi, w, &mi->bmi[j].as_mv[ref].as_mv,
-                            &mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0].as_mv,
-                            nmvc, allow_hp);
+#if CONFIG_EXT_INTER
+                             &mi->bmi[j].ref_mv[ref].as_mv,
+#if CONFIG_REF_MV
+                             is_compound,
+#endif
+#else
+#if CONFIG_REF_MV
+                             &mi->bmi[j].pred_mv_s8[ref].as_mv,
+                             is_compound,
+#else
+                             &mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0].as_mv,
+#endif  // CONFIG_REF_MV
+#endif  // CONFIG_EXT_INTER
+                             nmvc, allow_hp);
+            }
           }
+#if CONFIG_EXT_INTER
+          else if (b_mode == NEAREST_NEWMV || b_mode == NEAR_NEWMV) {
+#if CONFIG_REF_MV
+            int nmv_ctx =
+                vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[1]],
+                             mbmi_ext->ref_mv_stack[mbmi->ref_frame[1]]);
+            const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+#endif
+            vp10_encode_mv(cpi, w, &mi->bmi[j].as_mv[1].as_mv,
+                           &mi->bmi[j].ref_mv[1].as_mv,
+#if CONFIG_REF_MV
+                           is_compound,
+#endif
+                           nmvc, allow_hp);
+          } else if (b_mode == NEW_NEARESTMV || b_mode == NEW_NEARMV) {
+#if CONFIG_REF_MV
+            int nmv_ctx =
+                vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[0]],
+                             mbmi_ext->ref_mv_stack[mbmi->ref_frame[0]]);
+            const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+#endif
+            vp10_encode_mv(cpi, w, &mi->bmi[j].as_mv[0].as_mv,
+                           &mi->bmi[j].ref_mv[0].as_mv,
+#if CONFIG_REF_MV
+                           is_compound,
+#endif
+                           nmvc, allow_hp);
+          }
+#endif  // CONFIG_EXT_INTER
         }
       }
     } else {
+#if CONFIG_EXT_INTER
+      if (mode == NEWMV || mode == NEWFROMNEARMV || mode == NEW_NEWMV) {
+#else
       if (mode == NEWMV) {
-        for (ref = 0; ref < 1 + is_compound; ++ref)
+#endif  // CONFIG_EXT_INTER
+        int_mv ref_mv;
+        for (ref = 0; ref < 1 + is_compound; ++ref) {
+#if CONFIG_REF_MV
+          int nmv_ctx =
+              vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[ref]],
+                           mbmi_ext->ref_mv_stack[mbmi->ref_frame[ref]]);
+          const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+#endif
+          ref_mv = mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0];
+#if CONFIG_EXT_INTER
+          if (mode == NEWFROMNEARMV)
+            vp10_encode_mv(cpi, w, &mbmi->mv[ref].as_mv,
+                           &mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][1].as_mv,
+#if CONFIG_REF_MV
+                           is_compound,
+#endif
+                           nmvc, allow_hp);
+          else
+#endif  // CONFIG_EXT_INTER
           vp10_encode_mv(cpi, w, &mbmi->mv[ref].as_mv,
-                        &mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0].as_mv, nmvc,
-                        allow_hp);
+                         &ref_mv.as_mv,
+#if CONFIG_REF_MV
+                         is_compound,
+#endif
+                         nmvc, allow_hp);
+        }
+#if CONFIG_EXT_INTER
+      } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
+#if CONFIG_REF_MV
+        int nmv_ctx =
+            vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[1]],
+                         mbmi_ext->ref_mv_stack[mbmi->ref_frame[1]]);
+        const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+#endif
+        vp10_encode_mv(cpi, w, &mbmi->mv[1].as_mv,
+                       &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv,
+#if CONFIG_REF_MV
+                       is_compound,
+#endif
+                       nmvc, allow_hp);
+      } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+#if CONFIG_REF_MV
+        int nmv_ctx =
+            vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[0]],
+                         mbmi_ext->ref_mv_stack[mbmi->ref_frame[0]]);
+        const nmv_context *nmvc = &cm->fc->nmvc[nmv_ctx];
+#endif
+        vp10_encode_mv(cpi, w, &mbmi->mv[0].as_mv,
+                       &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv,
+#if CONFIG_REF_MV
+                       is_compound,
+#endif
+                       nmvc, allow_hp);
+#endif  // CONFIG_EXT_INTER
       }
     }
-  }
-  if (mbmi->tx_size < TX_32X32 &&
-      cm->base_qindex > 0 && !mbmi->skip &&
-      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-    if (is_inter) {
-      vp10_write_token(
-          w, vp10_ext_tx_tree,
-          cm->fc->inter_ext_tx_prob[mbmi->tx_size],
-          &ext_tx_encodings[mbmi->tx_type]);
-    } else {
-      vp10_write_token(
-          w, vp10_ext_tx_tree,
-          cm->fc->intra_ext_tx_prob[mbmi->tx_size]
-                                   [intra_mode_to_tx_type_context[mbmi->mode]],
-          &ext_tx_encodings[mbmi->tx_type]);
+
+#if CONFIG_EXT_INTER
+    if (cpi->common.reference_mode != COMPOUND_REFERENCE &&
+#if CONFIG_SUPERTX
+        !supertx_enabled &&
+#endif  // CONFIG_SUPERTX
+        is_interintra_allowed(mbmi)) {
+      const int interintra = mbmi->ref_frame[1] == INTRA_FRAME;
+      const int bsize_group = size_group_lookup[bsize];
+      vp10_write(w, interintra, cm->fc->interintra_prob[bsize_group]);
+      if (interintra) {
+        write_interintra_mode(
+            w, mbmi->interintra_mode,
+            cm->fc->interintra_mode_prob[bsize_group]);
+        if (is_interintra_wedge_used(bsize)) {
+          vp10_write(w, mbmi->use_wedge_interintra,
+                     cm->fc->wedge_interintra_prob[bsize]);
+          if (mbmi->use_wedge_interintra) {
+            vp10_write_literal(w, mbmi->interintra_wedge_index,
+                              get_wedge_bits_lookup(bsize));
+            assert(mbmi->interintra_wedge_sign == 0);
+          }
+        }
+      }
     }
-  } else {
-    if (!mbmi->skip)
-      assert(mbmi->tx_type == DCT_DCT);
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+#if CONFIG_SUPERTX
+    if (!supertx_enabled)
+#endif  // CONFIG_SUPERTX
+#if CONFIG_EXT_INTER
+      if (mbmi->ref_frame[1] != INTRA_FRAME)
+#endif  // CONFIG_EXT_INTER
+      if (is_motvar_allowed(mbmi)) {
+        // TODO(debargha): Might want to only emit this if SEG_LVL_SKIP
+        // is not active, and assume SIMPLE_TRANSLATION in the decoder if
+        // it is active.
+        assert(mbmi->motion_variation < MOTION_VARIATIONS);
+        vp10_write_token(w, vp10_motvar_tree, cm->fc->motvar_prob[bsize],
+                         &motvar_encodings[mbmi->motion_variation]);
+      }
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+
+#if CONFIG_EXT_INTER
+    if (cpi->common.reference_mode != SINGLE_REFERENCE &&
+        is_inter_compound_mode(mbmi->mode) &&
+#if CONFIG_OBMC
+        !(is_motvar_allowed(mbmi) &&
+          mbmi->motion_variation != SIMPLE_TRANSLATION) &&
+#endif  // CONFIG_OBMC
+        is_interinter_wedge_used(bsize)) {
+      vp10_write(w, mbmi->use_wedge_interinter,
+                 cm->fc->wedge_interinter_prob[bsize]);
+      if (mbmi->use_wedge_interinter) {
+        vp10_write_literal(w, mbmi->interinter_wedge_index,
+                           get_wedge_bits_lookup(bsize));
+        vp10_write_bit(w, mbmi->interinter_wedge_sign);
+      }
+    }
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_EXT_INTERP || CONFIG_DUAL_FILTER
+    write_switchable_interp_filter(cpi, xd, w);
+#endif  // CONFIG_EXT_INTERP || CONFIG_DUAL_FILTER
   }
+
+  if (!FIXED_TX_TYPE) {
+#if CONFIG_EXT_TX
+    if (get_ext_tx_types(mbmi->tx_size, bsize, is_inter) > 1 &&
+        cm->base_qindex > 0 && !mbmi->skip &&
+#if CONFIG_SUPERTX
+        !supertx_enabled &&
+#endif  // CONFIG_SUPERTX
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      int eset = get_ext_tx_set(mbmi->tx_size, bsize, is_inter);
+      if (is_inter) {
+        if (eset > 0)
+          vp10_write_token(w, vp10_ext_tx_inter_tree[eset],
+                           cm->fc->inter_ext_tx_prob[eset][mbmi->tx_size],
+                           &ext_tx_inter_encodings[eset][mbmi->tx_type]);
+      } else if (ALLOW_INTRA_EXT_TX) {
+        if (eset > 0)
+          vp10_write_token(
+              w, vp10_ext_tx_intra_tree[eset],
+              cm->fc->intra_ext_tx_prob[eset][mbmi->tx_size][mbmi->mode],
+              &ext_tx_intra_encodings[eset][mbmi->tx_type]);
+      }
+    }
+#else
+    if (mbmi->tx_size < TX_32X32 &&
+        cm->base_qindex > 0 && !mbmi->skip &&
+#if CONFIG_SUPERTX
+        !supertx_enabled &&
+#endif  // CONFIG_SUPERTX
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      if (is_inter) {
+        vp10_write_token(
+            w, vp10_ext_tx_tree,
+            cm->fc->inter_ext_tx_prob[mbmi->tx_size],
+            &ext_tx_encodings[mbmi->tx_type]);
+      } else {
+        vp10_write_token(
+            w, vp10_ext_tx_tree,
+            cm->fc->intra_ext_tx_prob[mbmi->tx_size]
+                [intra_mode_to_tx_type_context[mbmi->mode]],
+            &ext_tx_encodings[mbmi->tx_type]);
+      }
+    } else {
+      if (!mbmi->skip) {
+#if CONFIG_SUPERTX
+        if (!supertx_enabled)
+#endif  // CONFIG_SUPERTX
+          assert(mbmi->tx_type == DCT_DCT);
+      }
+    }
+#endif  // CONFIG_EXT_TX
+  }
 }
 
 static void write_mb_modes_kf(const VP10_COMMON *cm, const MACROBLOCKD *xd,
-                              MODE_INFO **mi_8x8, vpx_writer *w) {
+                              MODE_INFO **mi_8x8, vp10_writer *w) {
   const struct segmentation *const seg = &cm->seg;
-#if CONFIG_MISC_FIXES
   const struct segmentation_probs *const segp = &cm->fc->seg;
-#else
-  const struct segmentation_probs *const segp = &cm->segp;
-#endif
   const MODE_INFO *const mi = mi_8x8[0];
   const MODE_INFO *const above_mi = xd->above_mi;
   const MODE_INFO *const left_mi = xd->left_mi;
@@ -486,48 +1497,187 @@
   }
 
   write_intra_mode(w, mbmi->uv_mode, cm->fc->uv_mode_prob[mbmi->mode]);
+#if CONFIG_EXT_INTRA
+  write_intra_angle_info(cm, xd, w);
+#endif  // CONFIG_EXT_INTRA
+  if (bsize >= BLOCK_8X8 && cm->allow_screen_content_tools)
+    write_palette_mode_info(cm, xd, mi, w);
+#if CONFIG_EXT_INTRA
+  if (bsize >= BLOCK_8X8)
+    write_ext_intra_mode_info(cm, mbmi, w);
+#endif  // CONFIG_EXT_INTRA
 
-  if (mbmi->tx_size < TX_32X32 &&
-      cm->base_qindex > 0 && !mbmi->skip &&
-      !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-    vp10_write_token(
-        w, vp10_ext_tx_tree,
-        cm->fc->intra_ext_tx_prob[mbmi->tx_size]
-                                 [intra_mode_to_tx_type_context[mbmi->mode]],
-        &ext_tx_encodings[mbmi->tx_type]);
+  if (!FIXED_TX_TYPE) {
+#if CONFIG_EXT_TX
+    if (get_ext_tx_types(mbmi->tx_size, bsize, 0) > 1 &&
+        cm->base_qindex > 0 && !mbmi->skip &&
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP) &&
+        ALLOW_INTRA_EXT_TX) {
+      int eset = get_ext_tx_set(mbmi->tx_size, bsize, 0);
+      if (eset > 0)
+        vp10_write_token(
+            w, vp10_ext_tx_intra_tree[eset],
+            cm->fc->intra_ext_tx_prob[eset][mbmi->tx_size][mbmi->mode],
+            &ext_tx_intra_encodings[eset][mbmi->tx_type]);
+    }
+#else
+    if (mbmi->tx_size < TX_32X32 &&
+        cm->base_qindex > 0 && !mbmi->skip &&
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      vp10_write_token(
+          w, vp10_ext_tx_tree,
+          cm->fc->intra_ext_tx_prob[mbmi->tx_size]
+              [intra_mode_to_tx_type_context[mbmi->mode]],
+          &ext_tx_encodings[mbmi->tx_type]);
+    }
+#endif  // CONFIG_EXT_TX
   }
 }
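
Both EXT_TX paths above share one rule: the transform type is only coded when the (tx_size, block size) pair selects a set with more than one member, and set 0 is DCT-only, so nothing is written for it either. A toy sketch of that gating (the set sizes below are illustrative, not the real vp10 tables):

    #include <stdio.h>

    /* Illustrative set sizes only: in vp10 the set is chosen per
     * (tx_size, block size) pair and set 0 is DCT-only. */
    static int ext_tx_set_size(int tx_size, int is_inter) {
      if (tx_size >= 3) return 1;  /* large transforms: DCT_DCT only */
      return is_inter ? 16 : 7;
    }

    static void maybe_code_tx_type(int tx_size, int is_inter, int skip) {
      const int n = ext_tx_set_size(tx_size, is_inter);
      if (n > 1 && !skip)
        printf("tx_size %d: code tx_type among %d candidates\n", tx_size, n);
      else
        printf("tx_size %d: tx_type implicit (DCT_DCT)\n", tx_size);
    }

    int main(void) {
      maybe_code_tx_type(1, 1, 0);  /* small inter block: coded */
      maybe_code_tx_type(3, 0, 0);  /* 32x32 intra: implicit */
      return 0;
    }
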
 
+#if CONFIG_SUPERTX
+#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end,      \
+                              supertx_enabled, mi_row, mi_col) \
+  write_modes_b(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col)
+#else
+#define write_modes_b_wrapper(cpi, tile, w, tok, tok_end,      \
+                              supertx_enabled, mi_row, mi_col) \
+  write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col)
+#endif  // CONFIG_SUPERTX
+
 static void write_modes_b(VP10_COMP *cpi, const TileInfo *const tile,
-                          vpx_writer *w, TOKENEXTRA **tok,
+                          vp10_writer *w,
+                          const TOKENEXTRA **tok,
                           const TOKENEXTRA *const tok_end,
+#if CONFIG_SUPERTX
+                          int supertx_enabled,
+#endif
                           int mi_row, int mi_col) {
-  const VP10_COMMON *const cm = &cpi->common;
+  VP10_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
   MODE_INFO *m;
   int plane;
+  int bh, bw;
+#if CONFIG_ANS
+  (void) tok;
+  (void) tok_end;
+  (void) plane;
+#endif  // CONFIG_ANS
 
   xd->mi = cm->mi_grid_visible + (mi_row * cm->mi_stride + mi_col);
   m = xd->mi[0];
 
+  assert(m->mbmi.sb_type <= cm->sb_size);
+
+  bh = num_8x8_blocks_high_lookup[m->mbmi.sb_type];
+  bw = num_8x8_blocks_wide_lookup[m->mbmi.sb_type];
+
   cpi->td.mb.mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
 
-  set_mi_row_col(xd, tile,
-                 mi_row, num_8x8_blocks_high_lookup[m->mbmi.sb_type],
-                 mi_col, num_8x8_blocks_wide_lookup[m->mbmi.sb_type],
-                 cm->mi_rows, cm->mi_cols);
+  set_mi_row_col(xd, tile, mi_row, bh, mi_col, bw, cm->mi_rows, cm->mi_cols);
   if (frame_is_intra_only(cm)) {
     write_mb_modes_kf(cm, xd, xd->mi, w);
   } else {
-    pack_inter_mode_mvs(cpi, m, w);
+#if CONFIG_VAR_TX
+    xd->above_txfm_context = cm->above_txfm_context + mi_col;
+    xd->left_txfm_context =
+      xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+#endif
+#if CONFIG_EXT_INTERP
+    // vp10_is_interp_needed needs the ref frame buffers set up so it can
+    // check whether they are scaled. vp10_is_interp_needed is in turn
+    // needed by write_switchable_interp_filter, which is called by
+    // pack_inter_mode_mvs.
+    set_ref_ptrs(cm, xd, m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]);
+#endif  // CONFIG_EXT_INTERP
+#if 0
+    // NOTE(zoeliu): For debug
+    if (cm->current_video_frame == FRAME_TO_CHECK && cm->show_frame == 1) {
+      const PREDICTION_MODE mode = m->mbmi.mode;
+      const int segment_id = m->mbmi.segment_id;
+      const BLOCK_SIZE bsize = m->mbmi.sb_type;
+
+      // For sub8x8, simply dump out the first sub8x8 block info
+      const PREDICTION_MODE b_mode =
+          (bsize < BLOCK_8X8) ? m->bmi[0].as_mode : -1;
+      const int mv_row = (bsize < BLOCK_8X8) ?
+          m->bmi[0].as_mv[0].as_mv.row : m->mbmi.mv[0].as_mv.row;
+      const int mv_col = (bsize < BLOCK_8X8) ?
+          m->bmi[0].as_mv[0].as_mv.col : m->mbmi.mv[0].as_mv.col;
+
+      printf("Before pack_inter_mode_mvs(): "
+             "Frame=%d, (mi_row,mi_col)=(%d,%d), "
+             "mode=%d, segment_id=%d, bsize=%d, b_mode=%d, "
+             "mv[0]=(%d, %d), ref[0]=%d, ref[1]=%d\n",
+             cm->current_video_frame, mi_row, mi_col,
+             mode, segment_id, bsize, b_mode, mv_row, mv_col,
+             m->mbmi.ref_frame[0], m->mbmi.ref_frame[1]);
+    }
+#endif  // 0
+    pack_inter_mode_mvs(cpi, m,
+#if CONFIG_SUPERTX
+                        supertx_enabled,
+#endif
+                        w);
   }
 
+  for (plane = 0; plane <= 1; ++plane) {
+    if (m->mbmi.palette_mode_info.palette_size[plane] > 0) {
+      const int rows = (4 * num_4x4_blocks_high_lookup[m->mbmi.sb_type]) >>
+          (xd->plane[plane].subsampling_y);
+      const int cols = (4 * num_4x4_blocks_wide_lookup[m->mbmi.sb_type]) >>
+          (xd->plane[plane].subsampling_x);
+      assert(*tok < tok_end);
+      pack_palette_tokens(w, tok, m->mbmi.palette_mode_info.palette_size[plane],
+                          rows * cols - 1);
+      assert(*tok < tok_end + m->mbmi.skip);
+    }
+  }
+
+#if CONFIG_SUPERTX
+  if (supertx_enabled) return;
+#endif  // CONFIG_SUPERTX
+
   if (!m->mbmi.skip) {
     assert(*tok < tok_end);
     for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_VAR_TX
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+      MB_MODE_INFO *mbmi = &m->mbmi;
+      BLOCK_SIZE bsize = mbmi->sb_type;
+      const BLOCK_SIZE plane_bsize =
+          get_plane_block_size(VPXMAX(bsize, BLOCK_8X8), pd);
+
+      const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
+      const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
+      int row, col;
+
+      if (is_inter_block(mbmi)) {
+        const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+        const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+        int bw = num_4x4_blocks_wide_lookup[txb_size];
+        int block = 0;
+        const int step = 1 << (max_tx_size << 1);
+        for (row = 0; row < num_4x4_h; row += bw) {
+          for (col = 0; col < num_4x4_w; col += bw) {
+            pack_txb_tokens(w, tok, tok_end, xd, mbmi, plane, plane_bsize,
+                            cm->bit_depth, block, row, col, max_tx_size);
+            block += step;
+          }
+        }
+      } else {
+        TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
+                           : m->mbmi.tx_size;
+        BLOCK_SIZE txb_size = txsize_to_bsize[tx];
+        int bw = num_4x4_blocks_wide_lookup[txb_size];
+
+        for (row = 0; row < num_4x4_h; row += bw)
+          for (col = 0; col < num_4x4_w; col += bw)
+            pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+      }
+#else
       TX_SIZE tx = plane ? get_uv_tx_size(&m->mbmi, &xd->plane[plane])
                          : m->mbmi.tx_size;
       pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+#endif  // CONFIG_VAR_TX
       assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
       (*tok)++;
     }
@@ -537,96 +1687,248 @@
 static void write_partition(const VP10_COMMON *const cm,
                             const MACROBLOCKD *const xd,
                             int hbs, int mi_row, int mi_col,
-                            PARTITION_TYPE p, BLOCK_SIZE bsize, vpx_writer *w) {
+                            PARTITION_TYPE p, BLOCK_SIZE bsize,
+                            vp10_writer *w) {
   const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
   const vpx_prob *const probs = cm->fc->partition_prob[ctx];
   const int has_rows = (mi_row + hbs) < cm->mi_rows;
   const int has_cols = (mi_col + hbs) < cm->mi_cols;
 
   if (has_rows && has_cols) {
+#if CONFIG_EXT_PARTITION_TYPES
+    if (bsize <= BLOCK_8X8)
+      vp10_write_token(w, vp10_partition_tree, probs, &partition_encodings[p]);
+    else
+      vp10_write_token(w, vp10_ext_partition_tree, probs,
+                       &ext_partition_encodings[p]);
+#else
     vp10_write_token(w, vp10_partition_tree, probs, &partition_encodings[p]);
+#endif  // CONFIG_EXT_PARTITION_TYPES
   } else if (!has_rows && has_cols) {
     assert(p == PARTITION_SPLIT || p == PARTITION_HORZ);
-    vpx_write(w, p == PARTITION_SPLIT, probs[1]);
+    vp10_write(w, p == PARTITION_SPLIT, probs[1]);
   } else if (has_rows && !has_cols) {
     assert(p == PARTITION_SPLIT || p == PARTITION_VERT);
-    vpx_write(w, p == PARTITION_SPLIT, probs[2]);
+    vp10_write(w, p == PARTITION_SPLIT, probs[2]);
   } else {
     assert(p == PARTITION_SPLIT);
   }
 }
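
write_partition shows how frame edges shrink the partition alphabet: an interior block codes a full tree symbol, a block on the bottom or right edge has only two legal choices and costs one bit, and a corner block must split, so nothing is coded at all. The same branch structure in isolation:

    #include <assert.h>
    #include <stdio.h>

    enum { PART_NONE, PART_HORZ, PART_VERT, PART_SPLIT };

    /* Mirror of the branches in write_partition: the frame edge determines
     * which partitions are even representable. */
    static void code_partition(int has_rows, int has_cols, int p) {
      if (has_rows && has_cols) {
        printf("interior: full tree symbol, p=%d\n", p);
      } else if (!has_rows && has_cols) {
        assert(p == PART_SPLIT || p == PART_HORZ);
        printf("bottom edge: one bit, split=%d\n", p == PART_SPLIT);
      } else if (has_rows && !has_cols) {
        assert(p == PART_SPLIT || p == PART_VERT);
        printf("right edge: one bit, split=%d\n", p == PART_SPLIT);
      } else {
        assert(p == PART_SPLIT);
        printf("corner: forced split, zero bits\n");
      }
    }

    int main(void) {
      code_partition(1, 1, PART_VERT);   /* fully inside the frame */
      code_partition(0, 1, PART_HORZ);   /* sticks out at the bottom */
      code_partition(0, 0, PART_SPLIT);  /* bottom-right corner */
      return 0;
    }
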
 
-static void write_modes_sb(VP10_COMP *cpi,
-                           const TileInfo *const tile, vpx_writer *w,
-                           TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
+#if CONFIG_SUPERTX
+#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end,                    \
+                               supertx_enabled, mi_row, mi_col, bsize)        \
+  write_modes_sb(cpi, tile, w, tok, tok_end, supertx_enabled, mi_row, mi_col, \
+                 bsize)
+#else
+#define write_modes_sb_wrapper(cpi, tile, w, tok, tok_end,             \
+                               supertx_enabled, mi_row, mi_col, bsize) \
+  write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, bsize)
+#endif  // CONFIG_SUPERTX
+
+static void write_modes_sb(VP10_COMP *const cpi,
+                           const TileInfo *const tile,
+                           vp10_writer *const w,
+                           const TOKENEXTRA **tok,
+                           const TOKENEXTRA *const tok_end,
+#if CONFIG_SUPERTX
+                           int supertx_enabled,
+#endif
                            int mi_row, int mi_col, BLOCK_SIZE bsize) {
   const VP10_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
-
-  const int bsl = b_width_log2_lookup[bsize];
-  const int bs = (1 << bsl) / 4;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize;
-  const MODE_INFO *m = NULL;
+  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_SUPERTX
+  const int mi_offset = mi_row * cm->mi_stride + mi_col;
+  MB_MODE_INFO *mbmi;
+  const int pack_token = !supertx_enabled;
+  TX_SIZE supertx_size;
+  int plane;
+#endif
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  m = cm->mi_grid_visible[mi_row * cm->mi_stride + mi_col];
-
-  partition = partition_lookup[bsl][m->mbmi.sb_type];
-  write_partition(cm, xd, bs, mi_row, mi_col, partition, bsize, w);
-  subsize = get_subsize(bsize, partition);
+  write_partition(cm, xd, hbs, mi_row, mi_col, partition, bsize, w);
+#if CONFIG_SUPERTX
+  mbmi = &cm->mi_grid_visible[mi_offset]->mbmi;
+  xd->mi = cm->mi_grid_visible + mi_offset;
+  set_mi_row_col(xd, tile,
+                 mi_row, num_8x8_blocks_high_lookup[bsize],
+                 mi_col, num_8x8_blocks_wide_lookup[bsize],
+                 cm->mi_rows, cm->mi_cols);
+  if (!supertx_enabled &&
+      !frame_is_intra_only(cm) &&
+      partition != PARTITION_NONE && bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+      !xd->lossless[0]) {
+    vpx_prob prob;
+    supertx_size = max_txsize_lookup[bsize];
+    prob = cm->fc->supertx_prob[partition_supertx_context_lookup[partition]]
+                               [supertx_size];
+    supertx_enabled = (xd->mi[0]->mbmi.tx_size == supertx_size);
+    vp10_write(w, supertx_enabled, prob);
+  }
+#endif  // CONFIG_SUPERTX
   if (subsize < BLOCK_8X8) {
-    write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+    write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                          mi_row, mi_col);
   } else {
     switch (partition) {
       case PARTITION_NONE:
-        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
         break;
       case PARTITION_HORZ:
-        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-        if (mi_row + bs < cm->mi_rows)
-          write_modes_b(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        if (mi_row + hbs < cm->mi_rows)
+          write_modes_b_wrapper(cpi, tile, w, tok, tok_end,
+                                supertx_enabled, mi_row + hbs, mi_col);
         break;
       case PARTITION_VERT:
-        write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col);
-        if (mi_col + bs < cm->mi_cols)
-          write_modes_b(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        if (mi_col + hbs < cm->mi_cols)
+          write_modes_b_wrapper(cpi, tile, w, tok, tok_end,
+                                supertx_enabled, mi_row, mi_col + hbs);
         break;
       case PARTITION_SPLIT:
-        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col, subsize);
-        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col + bs,
-                       subsize);
-        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col,
-                       subsize);
-        write_modes_sb(cpi, tile, w, tok, tok_end, mi_row + bs, mi_col + bs,
-                       subsize);
+        write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                               mi_row, mi_col, subsize);
+        write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                               mi_row, mi_col + hbs, subsize);
+        write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                               mi_row + hbs, mi_col, subsize);
+        write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                               mi_row + hbs, mi_col + hbs, subsize);
         break;
+#if CONFIG_EXT_PARTITION_TYPES
+      case PARTITION_HORZ_A:
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col + hbs);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row + hbs, mi_col);
+        break;
+      case PARTITION_HORZ_B:
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row + hbs, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row + hbs, mi_col + hbs);
+        break;
+      case PARTITION_VERT_A:
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row + hbs, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col + hbs);
+        break;
+      case PARTITION_VERT_B:
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row, mi_col + hbs);
+        write_modes_b_wrapper(cpi, tile, w, tok, tok_end, supertx_enabled,
+                              mi_row + hbs, mi_col + hbs);
+        break;
+#endif  // CONFIG_EXT_PARTITION_TYPES
       default:
         assert(0);
     }
   }
+#if CONFIG_SUPERTX
+  if (partition != PARTITION_NONE && supertx_enabled && pack_token) {
+    int skip;
+    xd->mi = cm->mi_grid_visible + mi_offset;
+    supertx_size = mbmi->tx_size;
+    set_mi_row_col(xd, tile,
+                   mi_row, num_8x8_blocks_high_lookup[bsize],
+                   mi_col, num_8x8_blocks_wide_lookup[bsize],
+                   cm->mi_rows, cm->mi_cols);
+
+    assert(IMPLIES(!cm->seg.enabled, mbmi->segment_id_supertx == 0));
+    assert(mbmi->segment_id_supertx < MAX_SEGMENTS);
+
+    skip = write_skip(cm, xd, mbmi->segment_id_supertx, xd->mi[0], w);
+#if CONFIG_EXT_TX
+    if (get_ext_tx_types(supertx_size, bsize, 1) > 1 && !skip) {
+      int eset = get_ext_tx_set(supertx_size, bsize, 1);
+      if (eset > 0) {
+        vp10_write_token(
+            w, vp10_ext_tx_inter_tree[eset],
+            cm->fc->inter_ext_tx_prob[eset][supertx_size],
+            &ext_tx_inter_encodings[eset][mbmi->tx_type]);
+      }
+    }
+#else
+    if (supertx_size < TX_32X32 && !skip) {
+      vp10_write_token(
+          w, vp10_ext_tx_tree,
+          cm->fc->inter_ext_tx_prob[supertx_size],
+          &ext_tx_encodings[mbmi->tx_type]);
+    }
+#endif  // CONFIG_EXT_TX
+
+    if (!skip) {
+      assert(*tok < tok_end);
+      for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+        const int mbmi_txb_size = txsize_to_bsize[mbmi->tx_size];
+        const int num_4x4_w = num_4x4_blocks_wide_lookup[mbmi_txb_size];
+        const int num_4x4_h = num_4x4_blocks_high_lookup[mbmi_txb_size];
+        int row, col;
+        TX_SIZE tx = plane ? get_uv_tx_size(mbmi, &xd->plane[plane])
+                           : mbmi->tx_size;
+        BLOCK_SIZE txb_size = txsize_to_bsize[tx];
+        int bw = num_4x4_blocks_wide_lookup[txb_size];
+
+        for (row = 0; row < num_4x4_h; row += bw)
+          for (col = 0; col < num_4x4_w; col += bw)
+            pack_mb_tokens(w, tok, tok_end, cm->bit_depth, tx);
+        assert(*tok < tok_end && (*tok)->token == EOSB_TOKEN);
+        (*tok)++;
+      }
+    }
+  }
+#endif  // CONFIG_SUPERTX
 
   // update partition context
+#if CONFIG_EXT_PARTITION_TYPES
+  update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+#else
   if (bsize >= BLOCK_8X8 &&
       (bsize == BLOCK_8X8 || partition != PARTITION_SPLIT))
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
-static void write_modes(VP10_COMP *cpi,
-                        const TileInfo *const tile, vpx_writer *w,
-                        TOKENEXTRA **tok, const TOKENEXTRA *const tok_end) {
+static void write_modes(VP10_COMP *const cpi,
+                        const TileInfo *const tile,
+                        vp10_writer *const w,
+                        const TOKENEXTRA **tok,
+                        const TOKENEXTRA *const tok_end) {
+  VP10_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+  const int mi_row_start = tile->mi_row_start;
+  const int mi_row_end = tile->mi_row_end;
+  const int mi_col_start = tile->mi_col_start;
+  const int mi_col_end = tile->mi_col_end;
   int mi_row, mi_col;
 
-  for (mi_row = tile->mi_row_start; mi_row < tile->mi_row_end;
-       mi_row += MI_BLOCK_SIZE) {
-    vp10_zero(xd->left_seg_context);
-    for (mi_col = tile->mi_col_start; mi_col < tile->mi_col_end;
-         mi_col += MI_BLOCK_SIZE)
-      write_modes_sb(cpi, tile, w, tok, tok_end, mi_row, mi_col,
-                     BLOCK_64X64);
+  vp10_zero_above_context(cm, mi_col_start, mi_col_end);
+
+  for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += cm->mib_size) {
+    vp10_zero_left_context(xd);
+
+    for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += cm->mib_size) {
+      write_modes_sb_wrapper(cpi, tile, w, tok, tok_end, 0,
+                             mi_row, mi_col, cm->sb_size);
+    }
   }
 }
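
write_modes fixes the context-reset discipline for a tile: above context is cleared once for the tile's column range, left context once per superblock row, and superblocks are visited in raster order with a stride of mib_size mode-info units. A minimal sketch:

    #include <stdio.h>

    /* Raster-order superblock walk over one tile, as in write_modes: above
     * context is cleared once per tile, left context once per superblock
     * row. mib_size is the superblock size in mode-info (8x8) units. */
    static void walk_tile(int mi_row_start, int mi_row_end,
                          int mi_col_start, int mi_col_end, int mib_size) {
      int mi_row, mi_col;
      printf("zero above context for cols [%d,%d)\n",
             mi_col_start, mi_col_end);
      for (mi_row = mi_row_start; mi_row < mi_row_end; mi_row += mib_size) {
        printf("zero left context at row %d\n", mi_row);
        for (mi_col = mi_col_start; mi_col < mi_col_end; mi_col += mib_size)
          printf("  code superblock (%d,%d)\n", mi_row, mi_col);
      }
    }

    int main(void) {
      walk_tile(0, 16, 8, 24, 8);  /* a tile of 2x2 64x64 superblocks */
      return 0;
    }
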
 
@@ -657,7 +1959,7 @@
   }
 }
 
-static void update_coef_probs_common(vpx_writer* const bc, VP10_COMP *cpi,
+static void update_coef_probs_common(vp10_writer* const bc, VP10_COMP *cpi,
                                      TX_SIZE tx_size,
                                      vp10_coeff_stats *frame_branch_ct,
                                      vp10_coeff_probs_model *new_coef_probs) {
@@ -701,13 +2003,12 @@
         }
       }
 
-      // printf("Update %d %d, savings %d\n", update[0], update[1], savings);
       /* Is coef updated at all */
       if (update[1] == 0 || savings < 0) {
-        vpx_write_bit(bc, 0);
+        vp10_write_bit(bc, 0);
         return;
       }
-      vpx_write_bit(bc, 1);
+      vp10_write_bit(bc, 1);
       for (i = 0; i < PLANE_TYPES; ++i) {
         for (j = 0; j < REF_TYPES; ++j) {
           for (k = 0; k < COEF_BANDS; ++k) {
@@ -729,7 +2030,7 @@
                       *oldp, &newp, upd);
                 if (s > 0 && newp != *oldp)
                   u = 1;
-                vpx_write(bc, u, upd);
+                vp10_write(bc, u, upd);
                 if (u) {
                   /* send/use new probability */
                   vp10_write_prob_diff_update(bc, newp, *oldp);
@@ -777,11 +2078,11 @@
                 if (u == 1 && updates == 1) {
                   int v;
                   // first update
-                  vpx_write_bit(bc, 1);
+                  vp10_write_bit(bc, 1);
                   for (v = 0; v < noupdates_before_first; ++v)
-                    vpx_write(bc, 0, upd);
+                    vp10_write(bc, 0, upd);
                 }
-                vpx_write(bc, u, upd);
+                vp10_write(bc, u, upd);
                 if (u) {
                   /* send/use new probability */
                   vp10_write_prob_diff_update(bc, newp, *oldp);
@@ -793,7 +2094,7 @@
         }
       }
       if (updates == 0) {
-        vpx_write_bit(bc, 0);  // no updates
+        vp10_write_bit(bc, 0);  // no updates
       }
       return;
     }
@@ -802,28 +2103,356 @@
   }
 }
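
The TWO_LOOP branch above is a dry-run-then-emit pattern: the first pass only totals the estimated savings of every candidate probability update, and the second pass, which re-walks exactly the same entries, runs only when the total is positive and the update gate bit has been written. Schematically:

    #include <stdio.h>

    /* The TWO_LOOP shape from update_coef_probs_common: a dry pass totals
     * the savings; the emitting pass re-walks the same entries only when
     * the total is worth the gate bit. gain[i] > 0 marks a useful update. */
    static void two_loop_update(const int *gain, int n) {
      int i, savings = 0, updates = 0;
      for (i = 0; i < n; ++i) {          /* pass 1: dry run, nothing written */
        if (gain[i] > 0) { savings += gain[i]; ++updates; }
        else --savings;                  /* rate cost of coding "no update" */
      }
      if (updates == 0 || savings < 0) {
        printf("gate bit 0: keep all old probabilities\n");
        return;
      }
      printf("gate bit 1\n");
      for (i = 0; i < n; ++i)            /* pass 2: emit the decisions */
        printf("  entry %d: %s\n", i, gain[i] > 0 ? "update" : "no update");
    }

    int main(void) {
      const int gain[4] = { 30, -2, 0, 11 };
      two_loop_update(gain, 4);
      return 0;
    }
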
 
-static void update_coef_probs(VP10_COMP *cpi, vpx_writer* w) {
+#if CONFIG_ENTROPY
+// Calculate the token counts between subsequent subframe updates.
+static void get_coef_counts_diff(VP10_COMP *cpi, int index,
+                                 vp10_coeff_count
+                                 coef_counts[TX_SIZES][PLANE_TYPES],
+                                 unsigned int eob_counts[TX_SIZES]
+                                 [PLANE_TYPES][REF_TYPES][COEF_BANDS]
+                                 [COEFF_CONTEXTS]) {
+  int i, j, k, l, m, tx_size, val;
+  const int max_idx = cpi->common.coef_probs_update_idx;
+  const TX_MODE tx_mode = cpi->common.tx_mode;
+  const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
+  const SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats;
+
+  assert(max_idx < COEF_PROBS_BUFS);
+
+  for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+    for (i = 0; i < PLANE_TYPES; ++i)
+      for (j = 0; j < REF_TYPES; ++j)
+        for (k = 0; k < COEF_BANDS; ++k)
+          for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+            if (index == max_idx) {
+              val = cpi->common.counts.eob_branch[tx_size][i][j][k][l] -
+                  subframe_stats->eob_counts_buf[max_idx][tx_size][i][j][k][l];
+            } else {
+              val = subframe_stats->eob_counts_buf[index + 1][tx_size]
+                                                             [i][j][k][l] -
+                  subframe_stats->eob_counts_buf[index][tx_size][i][j][k][l];
+            }
+            assert(val >= 0);
+            eob_counts[tx_size][i][j][k][l] = val;
+
+            for (m = 0; m < ENTROPY_TOKENS; ++m) {
+              if (index == max_idx) {
+                val = cpi->td.rd_counts.coef_counts[tx_size][i][j][k][l][m] -
+                    subframe_stats->coef_counts_buf[max_idx][tx_size]
+                                                            [i][j][k][l][m];
+              } else {
+                val = subframe_stats->coef_counts_buf[index + 1]
+                                                     [tx_size][i][j][k][l][m] -
+                      subframe_stats->coef_counts_buf[index][tx_size]
+                                                            [i][j][k][l][m];
+              }
+              assert(val >= 0);
+              coef_counts[tx_size][i][j][k][l][m] = val;
+            }
+          }
+}
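
get_coef_counts_diff reconstructs per-subframe token counts by differencing snapshots of monotonically growing counters; the last interval is closed against the live counter rather than a stored snapshot. The same idea reduced to one counter:

    #include <assert.h>
    #include <stdio.h>

    /* Per-interval counts from cumulative snapshots, as above: snap[i] is
     * the running total when subframe i started; the final interval ends
     * at the live counter instead of a stored snapshot. */
    static int interval_count(const int *snap, int n_snaps, int live,
                              int idx) {
      const int d = (idx == n_snaps - 1) ? live - snap[idx]
                                         : snap[idx + 1] - snap[idx];
      assert(d >= 0);  /* cumulative counters never shrink */
      return d;
    }

    int main(void) {
      static const int snap[3] = { 0, 40, 90 };
      const int live = 130;
      int i;
      for (i = 0; i < 3; ++i)
        printf("subframe %d: %d tokens\n",
               i, interval_count(snap, 3, live, i));
      return 0;
    }
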
+
+static void update_coef_probs_subframe(vp10_writer* const bc, VP10_COMP *cpi,
+                                       TX_SIZE tx_size,
+                                       vp10_coeff_stats
+                                       branch_ct[COEF_PROBS_BUFS][TX_SIZES]
+                                                                 [PLANE_TYPES],
+                                     vp10_coeff_probs_model *new_coef_probs) {
+  vp10_coeff_probs_model *old_coef_probs = cpi->common.fc->coef_probs[tx_size];
+  const vpx_prob upd = DIFF_UPDATE_PROB;
+  const int entropy_nodes_update = UNCONSTRAINED_NODES;
+  int i, j, k, l, t;
+  int stepsize = cpi->sf.coeff_prob_appx_step;
+  const int max_idx = cpi->common.coef_probs_update_idx;
+  int idx;
+  unsigned int this_branch_ct[ENTROPY_NODES][COEF_PROBS_BUFS][2];
+
+  switch (cpi->sf.use_fast_coef_updates) {
+    case TWO_LOOP: {
+      /* dry run to see if any update is needed at all */
+      int savings = 0;
+      int update[2] = {0, 0};
+      for (i = 0; i < PLANE_TYPES; ++i) {
+        for (j = 0; j < REF_TYPES; ++j) {
+          for (k = 0; k < COEF_BANDS; ++k) {
+            for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+              for (t = 0; t < ENTROPY_NODES; ++t) {
+                for (idx = 0; idx <= max_idx; ++idx) {
+                  memcpy(this_branch_ct[t][idx],
+                         branch_ct[idx][tx_size][i][j][k][l][t],
+                         2 * sizeof(this_branch_ct[t][idx][0]));
+                }
+              }
+              for (t = 0; t < entropy_nodes_update; ++t) {
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                const vpx_prob oldp = old_coef_probs[i][j][k][l][t];
+                int s, u = 0;
+
+                if (t == PIVOT_NODE)
+                  s = vp10_prob_update_search_model_subframe(this_branch_ct,
+                                      old_coef_probs[i][j][k][l], &newp, upd,
+                                      stepsize, max_idx);
+                else
+                  s = vp10_prob_update_search_subframe(this_branch_ct[t],
+                                                       oldp, &newp, upd,
+                                                       max_idx);
+                if (s > 0 && newp != oldp)
+                  u = 1;
+                if (u)
+                  savings += s - (int)(vp10_cost_zero(upd));
+                else
+                  savings -= (int)(vp10_cost_zero(upd));
+                update[u]++;
+              }
+            }
+          }
+        }
+      }
+
+      /* Were any coefficients updated at all? */
+      if (update[1] == 0 || savings < 0) {
+        vp10_write_bit(bc, 0);
+        return;
+      }
+      vp10_write_bit(bc, 1);
+      for (i = 0; i < PLANE_TYPES; ++i) {
+        for (j = 0; j < REF_TYPES; ++j) {
+          for (k = 0; k < COEF_BANDS; ++k) {
+            for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+              for (t = 0; t < ENTROPY_NODES; ++t) {
+                for (idx = 0; idx <= max_idx; ++idx) {
+                  memcpy(this_branch_ct[t][idx],
+                         branch_ct[idx][tx_size][i][j][k][l][t],
+                         2 * sizeof(this_branch_ct[t][idx][0]));
+                }
+              }
+              for (t = 0; t < entropy_nodes_update; ++t) {
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
+                const vpx_prob upd = DIFF_UPDATE_PROB;
+                int s;
+                int u = 0;
+
+                if (t == PIVOT_NODE)
+                  s = vp10_prob_update_search_model_subframe(this_branch_ct,
+                                     old_coef_probs[i][j][k][l], &newp, upd,
+                                     stepsize, max_idx);
+                else
+                  s = vp10_prob_update_search_subframe(this_branch_ct[t],
+                                                       *oldp, &newp, upd,
+                                                       max_idx);
+                if (s > 0 && newp != *oldp)
+                  u = 1;
+                vp10_write(bc, u, upd);
+                if (u) {
+                  /* send/use new probability */
+                  vp10_write_prob_diff_update(bc, newp, *oldp);
+                  *oldp = newp;
+                }
+              }
+            }
+          }
+        }
+      }
+      return;
+    }
+
+    case ONE_LOOP_REDUCED: {
+      int updates = 0;
+      int noupdates_before_first = 0;
+      for (i = 0; i < PLANE_TYPES; ++i) {
+        for (j = 0; j < REF_TYPES; ++j) {
+          for (k = 0; k < COEF_BANDS; ++k) {
+            for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+              for (t = 0; t < ENTROPY_NODES; ++t) {
+                for (idx = 0; idx <= max_idx; ++idx) {
+                  memcpy(this_branch_ct[t][idx],
+                         branch_ct[idx][tx_size][i][j][k][l][t],
+                         2 * sizeof(this_branch_ct[t][idx][0]));
+                }
+              }
+              for (t = 0; t < entropy_nodes_update; ++t) {
+                vpx_prob newp = new_coef_probs[i][j][k][l][t];
+                vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
+                int s;
+                int u = 0;
+
+                if (t == PIVOT_NODE)
+                  s = vp10_prob_update_search_model_subframe(this_branch_ct,
+                                      old_coef_probs[i][j][k][l], &newp, upd,
+                                      stepsize, max_idx);
+                else
+                  s = vp10_prob_update_search_subframe(this_branch_ct[t],
+                                                       *oldp, &newp, upd,
+                                                       max_idx);
+                if (s > 0 && newp != *oldp)
+                  u = 1;
+                updates += u;
+                if (u == 0 && updates == 0) {
+                  noupdates_before_first++;
+                  continue;
+                }
+                if (u == 1 && updates == 1) {
+                  int v;
+                  // first update
+                  vp10_write_bit(bc, 1);
+                  for (v = 0; v < noupdates_before_first; ++v)
+                    vp10_write(bc, 0, upd);
+                }
+                vp10_write(bc, u, upd);
+                if (u) {
+                  /* send/use new probability */
+                  vp10_write_prob_diff_update(bc, newp, *oldp);
+                  *oldp = newp;
+                }
+              }
+            }
+          }
+        }
+      }
+      if (updates == 0) {
+        vp10_write_bit(bc, 0);  // no updates
+      }
+      return;
+    }
+    default:
+      assert(0);
+  }
+}
+#endif  // CONFIG_ENTROPY
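
Both update_coef_probs_subframe above and the pre-existing
update_coef_probs_common follow the same TWO_LOOP discipline: a dry run totals
the rate savings of every candidate update against the per-symbol flag cost,
and the whole group is skipped with a single bit when the total is not
positive. A reduced sketch of that decision structure, with hypothetical
search() and put_diff() callbacks standing in for the codec's search and
diff-update routines:

    /* Pass 1 measures total savings; pass 2 re-runs the search and emits a
     * per-symbol update flag plus the new probability when the flag is set.
     * A single leading bit gates the whole group. */
    static void two_loop_update(int n, int (*search)(int i, int *newp),
                                void (*put_bit)(int b),
                                void (*put_diff)(int i, int newp),
                                int cost_zero) {
      int i, newp, savings = 0, updates = 0;
      for (i = 0; i < n; ++i) {                 /* pass 1: dry run */
        const int s = search(i, &newp);
        savings += (s > 0) ? s - cost_zero : -cost_zero;
        updates += (s > 0);
      }
      if (updates == 0 || savings < 0) {
        put_bit(0);                             /* skip the whole group */
        return;
      }
      put_bit(1);
      for (i = 0; i < n; ++i) {                 /* pass 2: actually write */
        const int u = search(i, &newp) > 0;
        put_bit(u);                             /* per-symbol update flag */
        if (u)
          put_diff(i, newp);                    /* prob diff payload */
      }
    }
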
+
+static void update_coef_probs(VP10_COMP *cpi, vp10_writer* w) {
   const TX_MODE tx_mode = cpi->common.tx_mode;
   const TX_SIZE max_tx_size = tx_mode_to_biggest_tx_size[tx_mode];
   TX_SIZE tx_size;
+#if CONFIG_ANS
+  int update = 0;
+#endif  // CONFIG_ANS
+#if CONFIG_ENTROPY
+  VP10_COMMON *cm = &cpi->common;
+  SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats;
+  unsigned int eob_counts_copy[TX_SIZES][PLANE_TYPES][REF_TYPES]
+                              [COEF_BANDS][COEFF_CONTEXTS];
+  int i;
+  vp10_coeff_probs_model dummy_frame_coef_probs[PLANE_TYPES];
+
+  if (cm->do_subframe_update &&
+      cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+    vp10_copy(cpi->common.fc->coef_probs,
+              subframe_stats->enc_starting_coef_probs);
+    for (i = 0; i <= cpi->common.coef_probs_update_idx; ++i) {
+      get_coef_counts_diff(cpi, i,
+                           cpi->wholeframe_stats.coef_counts_buf[i],
+                           cpi->wholeframe_stats.eob_counts_buf[i]);
+    }
+  }
+#endif  // CONFIG_ENTROPY
+
   for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size) {
     vp10_coeff_stats frame_branch_ct[PLANE_TYPES];
     vp10_coeff_probs_model frame_coef_probs[PLANE_TYPES];
-    if (cpi->td.counts->tx.tx_totals[tx_size] <= 20 ||
+    if (cpi->td.counts->tx_size_totals[tx_size] <= 20 ||
         (tx_size >= TX_16X16 && cpi->sf.tx_size_search_method == USE_TX_8X8)) {
-      vpx_write_bit(w, 0);
+      vp10_write_bit(w, 0);
     } else {
-      build_tree_distribution(cpi, tx_size, frame_branch_ct,
-                              frame_coef_probs);
-      update_coef_probs_common(w, cpi, tx_size, frame_branch_ct,
-                               frame_coef_probs);
+#if CONFIG_ENTROPY
+      if (cm->do_subframe_update &&
+          cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+        unsigned int eob_counts_copy[PLANE_TYPES][REF_TYPES]
+                                                 [COEF_BANDS][COEFF_CONTEXTS];
+        vp10_coeff_count coef_counts_copy[PLANE_TYPES];
+        vp10_copy(eob_counts_copy, cpi->common.counts.eob_branch[tx_size]);
+        vp10_copy(coef_counts_copy, cpi->td.rd_counts.coef_counts[tx_size]);
+        build_tree_distribution(cpi, tx_size, frame_branch_ct,
+                                frame_coef_probs);
+        for (i = 0; i <= cpi->common.coef_probs_update_idx; ++i) {
+          vp10_copy(cpi->common.counts.eob_branch[tx_size],
+                    cpi->wholeframe_stats.eob_counts_buf[i][tx_size]);
+          vp10_copy(cpi->td.rd_counts.coef_counts[tx_size],
+                    cpi->wholeframe_stats.coef_counts_buf[i][tx_size]);
+          build_tree_distribution(cpi, tx_size,
+                                  cpi->branch_ct_buf[i][tx_size],
+                                  dummy_frame_coef_probs);
+        }
+        vp10_copy(cpi->common.counts.eob_branch[tx_size], eob_counts_copy);
+        vp10_copy(cpi->td.rd_counts.coef_counts[tx_size], coef_counts_copy);
+
+        update_coef_probs_subframe(w, cpi, tx_size, cpi->branch_ct_buf,
+                                   frame_coef_probs);
+#if CONFIG_ANS
+        update = 1;
+#endif  // CONFIG_ANS
+      } else {
+#endif  // CONFIG_ENTROPY
+        build_tree_distribution(cpi, tx_size, frame_branch_ct,
+                                frame_coef_probs);
+        update_coef_probs_common(w, cpi, tx_size, frame_branch_ct,
+                                 frame_coef_probs);
+#if CONFIG_ANS
+        update = 1;
+#endif  // CONFIG_ANS
+#if CONFIG_ENTROPY
+      }
+#endif  // CONFIG_ENTROPY
+    }
+  }
+
+#if CONFIG_ENTROPY
+  vp10_copy(cm->starting_coef_probs, cm->fc->coef_probs);
+  vp10_copy(subframe_stats->coef_probs_buf[0], cm->fc->coef_probs);
+  if (cm->do_subframe_update &&
+      cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+    vp10_copy(eob_counts_copy, cm->counts.eob_branch);
+    for (i = 1; i <= cpi->common.coef_probs_update_idx; ++i) {
+      for (tx_size = TX_4X4; tx_size <= max_tx_size; ++tx_size)
+        vp10_full_to_model_counts(cm->counts.coef[tx_size],
+                                  subframe_stats->coef_counts_buf[i][tx_size]);
+      vp10_copy(cm->counts.eob_branch, subframe_stats->eob_counts_buf[i]);
+      vp10_partial_adapt_probs(cm, 0, 0);
+      vp10_copy(subframe_stats->coef_probs_buf[i], cm->fc->coef_probs);
+    }
+    vp10_copy(cm->fc->coef_probs, subframe_stats->coef_probs_buf[0]);
+    vp10_copy(cm->counts.eob_branch, eob_counts_copy);
+  }
+#endif  // CONFIG_ENTROPY
+#if CONFIG_ANS
+  if (update) vp10_coef_pareto_cdfs(cpi->common.fc);
+#endif  // CONFIG_ANS
+}
+
+#if CONFIG_LOOP_RESTORATION
+static void encode_restoration(VP10_COMMON *cm,
+                               struct vpx_write_bit_buffer *wb) {
+  RestorationInfo *rst = &cm->rst_info;
+  vpx_wb_write_bit(wb, rst->restoration_type != RESTORE_NONE);
+  if (rst->restoration_type != RESTORE_NONE) {
+    if (rst->restoration_type == RESTORE_BILATERAL) {
+      vpx_wb_write_bit(wb, 1);
+      vpx_wb_write_literal(wb, rst->restoration_level,
+                           vp10_restoration_level_bits(cm));
+    } else {
+      vpx_wb_write_bit(wb, 0);
+      vpx_wb_write_literal(
+          wb, rst->vfilter[0] - WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_BITS);
+      vpx_wb_write_literal(
+          wb, rst->vfilter[1] - WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_BITS);
+      vpx_wb_write_literal(
+          wb, rst->vfilter[2] - WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_BITS);
+      vpx_wb_write_literal(
+          wb, rst->hfilter[0] - WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_BITS);
+      vpx_wb_write_literal(
+          wb, rst->hfilter[1] - WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_BITS);
+      vpx_wb_write_literal(
+          wb, rst->hfilter[2] - WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_BITS);
     }
   }
 }
+#endif  // CONFIG_LOOP_RESTORATION
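
The Wiener taps in encode_restoration are written as fixed-width literals after
subtracting each tap's minimum legal value, the usual way to pack a bounded
range into the fewest bits. A small sketch of the offset-literal idea and its
decoder-side inverse (the MINV/BITS constants are the codec's; the helper names
here are hypothetical):

    #include <assert.h>

    /* Pack a value known to lie in [minv, minv + (1 << bits) - 1] into
     * `bits` bits; the decoder adds minv back after reading the literal. */
    static unsigned pack_offset_literal(int value, int minv, int bits) {
      const unsigned raw = (unsigned)(value - minv);
      assert(raw < (1u << bits));     /* value was within its legal range */
      return raw;                     /* what the write_literal call emits */
    }

    static int unpack_offset_literal(unsigned raw, int minv) {
      return (int)raw + minv;         /* decoder side */
    }
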
 
-static void encode_loopfilter(struct loopfilter *lf,
+static void encode_loopfilter(VP10_COMMON *cm,
                               struct vpx_write_bit_buffer *wb) {
   int i;
+  struct loopfilter *lf = &cm->lf;
 
   // Encode the loop filter level and type
   vpx_wb_write_literal(wb, lf->filter_level, 6);
@@ -862,7 +2491,7 @@
 static void write_delta_q(struct vpx_write_bit_buffer *wb, int delta_q) {
   if (delta_q != 0) {
     vpx_wb_write_bit(wb, 1);
-    vpx_wb_write_inv_signed_literal(wb, delta_q, CONFIG_MISC_FIXES ? 6 : 4);
+    vpx_wb_write_inv_signed_literal(wb, delta_q, 6);
   } else {
     vpx_wb_write_bit(wb, 0);
   }
@@ -879,11 +2508,7 @@
 static void encode_segmentation(VP10_COMMON *cm, MACROBLOCKD *xd,
                                 struct vpx_write_bit_buffer *wb) {
   int i, j;
-
   const struct segmentation *seg = &cm->seg;
-#if !CONFIG_MISC_FIXES
-  const struct segmentation_probs *segp = &cm->segp;
-#endif
 
   vpx_wb_write_bit(wb, seg->enabled);
   if (!seg->enabled)
@@ -898,16 +2523,6 @@
   if (seg->update_map) {
     // Select the coding strategy (temporal or spatial)
     vp10_choose_segmap_coding_method(cm, xd);
-#if !CONFIG_MISC_FIXES
-    // Write out probabilities used to decode unpredicted  macro-block segments
-    for (i = 0; i < SEG_TREE_PROBS; i++) {
-      const int prob = segp->tree_probs[i];
-      const int update = prob != MAX_PROB;
-      vpx_wb_write_bit(wb, update);
-      if (update)
-        vpx_wb_write_literal(wb, prob, 8);
-    }
-#endif
 
     // Write out the chosen coding method.
     if (!frame_is_intra_only(cm) && !cm->error_resilient_mode) {
@@ -915,18 +2530,6 @@
     } else {
       assert(seg->temporal_update == 0);
     }
-
-#if !CONFIG_MISC_FIXES
-    if (seg->temporal_update) {
-      for (i = 0; i < PREDICTION_PROBS; i++) {
-        const int prob = segp->pred_probs[i];
-        const int update = prob != MAX_PROB;
-        vpx_wb_write_bit(wb, update);
-        if (update)
-          vpx_wb_write_literal(wb, prob, 8);
-      }
-    }
-#endif
   }
 
   // Segmentation data
@@ -954,8 +2557,7 @@
   }
 }
 
-#if CONFIG_MISC_FIXES
-static void update_seg_probs(VP10_COMP *cpi, vpx_writer *w) {
+static void update_seg_probs(VP10_COMP *cpi, vp10_writer *w) {
   VP10_COMMON *cm = &cpi->common;
 
   if (!cpi->common.seg.enabled)
@@ -981,44 +2583,17 @@
   if (mode != TX_MODE_SELECT)
     vpx_wb_write_literal(wb, mode, 2);
 }
-#else
-static void write_txfm_mode(TX_MODE mode, struct vpx_writer *wb) {
-  vpx_write_literal(wb, VPXMIN(mode, ALLOW_32X32), 2);
-  if (mode >= ALLOW_32X32)
-    vpx_write_bit(wb, mode == TX_MODE_SELECT);
-}
-#endif
 
 
-static void update_txfm_probs(VP10_COMMON *cm, vpx_writer *w,
+static void update_txfm_probs(VP10_COMMON *cm, vp10_writer *w,
                               FRAME_COUNTS *counts) {
-
   if (cm->tx_mode == TX_MODE_SELECT) {
     int i, j;
-    unsigned int ct_8x8p[TX_SIZES - 3][2];
-    unsigned int ct_16x16p[TX_SIZES - 2][2];
-    unsigned int ct_32x32p[TX_SIZES - 1][2];
-
-
-    for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-      vp10_tx_counts_to_branch_counts_8x8(counts->tx.p8x8[i], ct_8x8p);
-      for (j = 0; j < TX_SIZES - 3; j++)
-        vp10_cond_prob_diff_update(w, &cm->fc->tx_probs.p8x8[i][j], ct_8x8p[j]);
-    }
-
-    for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-      vp10_tx_counts_to_branch_counts_16x16(counts->tx.p16x16[i], ct_16x16p);
-      for (j = 0; j < TX_SIZES - 2; j++)
-        vp10_cond_prob_diff_update(w, &cm->fc->tx_probs.p16x16[i][j],
-                                  ct_16x16p[j]);
-    }
-
-    for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
-      vp10_tx_counts_to_branch_counts_32x32(counts->tx.p32x32[i], ct_32x32p);
-      for (j = 0; j < TX_SIZES - 1; j++)
-        vp10_cond_prob_diff_update(w, &cm->fc->tx_probs.p32x32[i][j],
-                                  ct_32x32p[j]);
-    }
+    for (i = 0; i < TX_SIZES - 1; ++i)
+      for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+        prob_diff_update(vp10_tx_size_tree[i],
+                         cm->fc->tx_size_probs[i][j],
+                         counts->tx_size[i][j], i + 2, w);
   }
 }
 
@@ -1026,7 +2601,7 @@
                                 struct vpx_write_bit_buffer *wb) {
   vpx_wb_write_bit(wb, filter == SWITCHABLE);
   if (filter != SWITCHABLE)
-    vpx_wb_write_literal(wb, filter, 2);
+    vpx_wb_write_literal(wb, filter, 2 + CONFIG_EXT_INTERP);
 }
 
 static void fix_interp_filter(VP10_COMMON *cm, FRAME_COUNTS *counts) {
@@ -1054,6 +2629,31 @@
 
 static void write_tile_info(const VP10_COMMON *const cm,
                             struct vpx_write_bit_buffer *wb) {
+#if CONFIG_EXT_TILE
+  const int tile_width  =
+    ALIGN_POWER_OF_TWO(cm->tile_width, cm->mib_size_log2) >> cm->mib_size_log2;
+  const int tile_height =
+    ALIGN_POWER_OF_TWO(cm->tile_height, cm->mib_size_log2) >> cm->mib_size_log2;
+
+  assert(tile_width > 0);
+  assert(tile_height > 0);
+
+  // Write the tile sizes
+#if CONFIG_EXT_PARTITION
+  if (cm->sb_size == BLOCK_128X128) {
+    assert(tile_width <= 32);
+    assert(tile_height <= 32);
+    vpx_wb_write_literal(wb, tile_width - 1, 5);
+    vpx_wb_write_literal(wb, tile_height - 1, 5);
+  } else
+#endif  // CONFIG_EXT_PARTITION
+  {
+    assert(tile_width <= 64);
+    assert(tile_height <= 64);
+    vpx_wb_write_literal(wb, tile_width - 1, 6);
+    vpx_wb_write_literal(wb, tile_height - 1, 6);
+  }
+#else
   int min_log2_tile_cols, max_log2_tile_cols, ones;
   vp10_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
 
@@ -1069,9 +2669,29 @@
   vpx_wb_write_bit(wb, cm->log2_tile_rows != 0);
   if (cm->log2_tile_rows != 0)
     vpx_wb_write_bit(wb, cm->log2_tile_rows != 1);
+#endif  // CONFIG_EXT_TILE
 }
 
 static int get_refresh_mask(VP10_COMP *cpi) {
+  int refresh_mask = 0;
+
+#if CONFIG_EXT_REFS
+  // NOTE(zoeliu): When LAST_FRAME is to be refreshed, the decoder is told to
+  // refresh LAST3_FRAME instead, and the virtual indexes of all three LAST
+  // reference frames are then rotated accordingly, i.e.:
+  // (1) The original virtual index for LAST3_FRAME will become the new virtual
+  //     index for LAST_FRAME; and
+  // (2) The original virtual indexes for LAST_FRAME and LAST2_FRAME will be
+  //     shifted and become the new virtual indexes for LAST2_FRAME and
+  //     LAST3_FRAME.
+  refresh_mask |=
+      (cpi->refresh_last_frame << cpi->lst_fb_idxes[LAST_REF_FRAMES - 1]);
+
+  refresh_mask |= (cpi->refresh_bwd_ref_frame << cpi->bwd_fb_idx);
+#else
+  refresh_mask |= (cpi->refresh_last_frame << cpi->lst_fb_idx);
+#endif  // CONFIG_EXT_REFS
+
   if (vp10_preserve_existing_gf(cpi)) {
     // We have decided to preserve the previously existing golden frame as our
     // new ARF frame. However, in the short term we leave it in the GF slot and,
@@ -1083,68 +2703,237 @@
     // Note: This is highly specific to the use of ARF as a forward reference,
     // and this needs to be generalized as other uses are implemented
     // (like RTC/temporal scalability).
-    return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
-           (cpi->refresh_golden_frame << cpi->alt_fb_idx);
+    return refresh_mask | (cpi->refresh_golden_frame << cpi->alt_fb_idx);
   } else {
     int arf_idx = cpi->alt_fb_idx;
     if ((cpi->oxcf.pass == 2) && cpi->multi_arf_allowed) {
       const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
       arf_idx = gf_group->arf_update_idx[gf_group->index];
     }
-    return (cpi->refresh_last_frame << cpi->lst_fb_idx) |
+    return refresh_mask |
            (cpi->refresh_golden_frame << cpi->gld_fb_idx) |
            (cpi->refresh_alt_ref_frame << arf_idx);
   }
 }
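
To make the NOTE in get_refresh_mask concrete, here is a toy sketch of the
virtual-index rotation it describes, operating on a hypothetical three-entry
array ordered LAST, LAST2, LAST3 (this mirrors the described bookkeeping, not
the decoder's actual code):

    /* After the buffer in lst_fb_idxes[2] (the LAST3 slot) is overwritten
     * with the newly coded frame, rotate so it becomes the new LAST while
     * the old LAST and LAST2 shift down one position each. */
    static void rotate_last_refs(int lst_fb_idxes[3]) {
      const int refreshed = lst_fb_idxes[2];  /* slot that was just written */
      lst_fb_idxes[2] = lst_fb_idxes[1];      /* old LAST2 -> LAST3 */
      lst_fb_idxes[1] = lst_fb_idxes[0];      /* old LAST  -> LAST2 */
      lst_fb_idxes[0] = refreshed;            /* new frame -> LAST */
    }
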
 
-static size_t encode_tiles(VP10_COMP *cpi, uint8_t *data_ptr,
-                           unsigned int *max_tile_sz) {
-  VP10_COMMON *const cm = &cpi->common;
-  vpx_writer residual_bc;
+#if CONFIG_EXT_TILE
+static INLINE int find_identical_tile(
+    const int tile_row, const int tile_col,
+    TileBufferEnc (*const tile_buffers)[1024]) {
+  const MV32 candidate_offset[1] = {{1, 0}};
+  const uint8_t *const cur_tile_data =
+      tile_buffers[tile_row][tile_col].data + 4;
+  const unsigned int cur_tile_size = tile_buffers[tile_row][tile_col].size;
+
+  int i;
+
+  if (tile_row == 0)
+    return 0;
+
+  // TODO(yunqingwang): For now, only the above tile is checked and used.
+  // More candidates such as left tile can be added later.
+  for (i = 0; i < 1; i++) {
+    int row_offset = candidate_offset[0].row;
+    int col_offset = candidate_offset[0].col;
+    int row = tile_row - row_offset;
+    int col = tile_col - col_offset;
+    uint8_t tile_hdr;
+    const uint8_t *tile_data;
+    TileBufferEnc *candidate;
+
+    if (row < 0 || col < 0)
+      continue;
+
+    tile_hdr = *(tile_buffers[row][col].data);
+
+    // Read out the tile copy mode (tcm) bit
+    if ((tile_hdr >> 7) == 1) {
+      // The candidate is a copy tile itself
+      row_offset += tile_hdr & 0x7f;
+      row = tile_row - row_offset;
+    }
+
+    candidate = &tile_buffers[row][col];
+
+    if (row_offset >= 128 || candidate->size != cur_tile_size)
+      continue;
+
+    tile_data = candidate->data + 4;
+
+    if (memcmp(tile_data, cur_tile_data, cur_tile_size) != 0)
+      continue;
+
+    // Identical tile found
+    assert(row_offset > 0);
+    return row_offset;
+  }
+
+  // No identical tile found
+  return 0;
+}
+#endif  // CONFIG_EXT_TILE
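
The copy-tile convention assumed by find_identical_tile: the first header byte
carries a copy flag in its MSB, and when the flag is set the low 7 bits give
the row offset to the tile being copied, hence the 128-row limit checked above.
A minimal sketch of packing and unpacking that byte:

    #include <assert.h>
    #include <stdint.h>

    /* Copy-tile header byte: MSB = copy flag, low 7 bits = row offset. */
    static uint8_t make_copy_tile_hdr(int row_offset) {
      assert(row_offset > 0 && row_offset < 128);  /* must fit in 7 bits */
      return (uint8_t)(0x80 | row_offset);
    }

    static int copy_tile_row_offset(uint8_t tile_hdr) {
      return (tile_hdr >> 7) ? (tile_hdr & 0x7f) : 0;  /* 0: not a copy */
    }
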
+
+static uint32_t write_tiles(VP10_COMP *const cpi,
+                           uint8_t *const dst,
+                           unsigned int *max_tile_size,
+                           unsigned int *max_tile_col_size) {
+  const VP10_COMMON *const cm = &cpi->common;
+#if CONFIG_ANS
+  struct AnsCoder token_ans;
+#else
+  vp10_writer mode_bc;
+#endif  // CONFIG_ANS
   int tile_row, tile_col;
-  TOKENEXTRA *tok_end;
+  TOKENEXTRA *(*const tok_buffers)[MAX_TILE_COLS] = cpi->tile_tok;
+  TileBufferEnc (*const tile_buffers)[MAX_TILE_COLS] = cpi->tile_buffers;
   size_t total_size = 0;
-  const int tile_cols = 1 << cm->log2_tile_cols;
-  const int tile_rows = 1 << cm->log2_tile_rows;
-  unsigned int max_tile = 0;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
+#if CONFIG_EXT_TILE
+  const int have_tiles = tile_cols * tile_rows > 1;
+#endif  // CONFIG_EXT_TILE
+#if CONFIG_ANS
+  BufAnsCoder *buf_ans = &cpi->buf_ans;
+#endif  // CONFIG_ANS
 
-  memset(cm->above_seg_context, 0,
-         sizeof(*cm->above_seg_context) * mi_cols_aligned_to_sb(cm->mi_cols));
+  *max_tile_size = 0;
+  *max_tile_col_size = 0;
 
-  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
-    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
-      int tile_idx = tile_row * tile_cols + tile_col;
-      TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
+  // All tile size fields are output on 4 bytes. A call to remux_tiles will
+  // later compact the data if smaller headers are adequate.
 
-      tok_end = cpi->tile_tok[tile_row][tile_col] +
-          cpi->tok_count[tile_row][tile_col];
+#if CONFIG_EXT_TILE
+  for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+    TileInfo tile_info;
+    const int is_last_col = (tile_col == tile_cols - 1);
+    const size_t col_offset = total_size;
 
-      if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1)
-        vpx_start_encode(&residual_bc, data_ptr + total_size + 4);
-      else
-        vpx_start_encode(&residual_bc, data_ptr + total_size);
+    vp10_tile_set_col(&tile_info, cm, tile_col);
 
-      write_modes(cpi, &cpi->tile_data[tile_idx].tile_info,
-                  &residual_bc, &tok, tok_end);
+    // The last column does not have a column header
+    if (!is_last_col)
+      total_size += 4;
+
+    for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+      TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+      unsigned int tile_size;
+      const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
+      const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
+      const int data_offset = have_tiles ? 4 : 0;
+
+      vp10_tile_set_row(&tile_info, cm, tile_row);
+
+      buf->data = dst + total_size;
+
+      // If CONFIG_EXT_TILE = 1, every tile in the row has a header,
+      // even for the last one, unless no tiling is used at all.
+      total_size += data_offset;
+#if !CONFIG_ANS
+      vpx_start_encode(&mode_bc, buf->data + data_offset);
+      write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
       assert(tok == tok_end);
-      vpx_stop_encode(&residual_bc);
-      if (tile_col < tile_cols - 1 || tile_row < tile_rows - 1) {
-        unsigned int tile_sz;
+      vpx_stop_encode(&mode_bc);
+      tile_size = mode_bc.pos;
+#else
+      buf_ans_write_reset(buf_ans);
+      write_modes(cpi, &tile_info, buf_ans, &tok, tok_end);
+      assert(tok == tok_end);
+      ans_write_init(&token_ans, buf->data + data_offset);
+      buf_ans_flush(buf_ans, &token_ans);
+      tile_size = ans_write_end(&token_ans);
+#endif  // !CONFIG_ANS
 
-        // size of this tile
-        assert(residual_bc.pos > 0);
-        tile_sz = residual_bc.pos - CONFIG_MISC_FIXES;
-        mem_put_le32(data_ptr + total_size, tile_sz);
-        max_tile = max_tile > tile_sz ? max_tile : tile_sz;
-        total_size += 4;
+      buf->size = tile_size;
+
+      // Record the maximum tile size we see, so we can compact headers later.
+      *max_tile_size = VPXMAX(*max_tile_size, tile_size);
+
+      if (have_tiles) {
+        // tile header: size of this tile, or copy offset
+        uint32_t tile_header = tile_size;
+
+        // Check if this tile is a copy tile.
+        // Copy tiles are very unlikely on key frames, so skip the search
+        // there to avoid wasted work.
+        if (cm->frame_type != KEY_FRAME) {
+          const int identical_tile_offset =
+              find_identical_tile(tile_row, tile_col, tile_buffers);
+
+          if (identical_tile_offset > 0) {
+            tile_size = 0;
+            tile_header = identical_tile_offset | 0x80;
+            tile_header <<= 24;
+          }
+        }
+
+        mem_put_le32(buf->data, tile_header);
       }
 
-      total_size += residual_bc.pos;
+      total_size += tile_size;
+    }
+
+    if (!is_last_col) {
+      size_t col_size = total_size - col_offset - 4;
+      mem_put_le32(dst + col_offset, col_size);
+
+      // Record the maximum tile column size we see, so the size fields can
+      // be compacted later if smaller headers are adequate.
+      *max_tile_col_size = VPXMAX(*max_tile_col_size, col_size);
     }
   }
-  *max_tile_sz = max_tile;
+#else
+  for (tile_row = 0; tile_row < tile_rows; tile_row++) {
+    TileInfo tile_info;
+    const int is_last_row = (tile_row == tile_rows - 1);
 
-  return total_size;
+    vp10_tile_set_row(&tile_info, cm, tile_row);
+
+    for (tile_col = 0; tile_col < tile_cols; tile_col++) {
+      TileBufferEnc *const buf = &tile_buffers[tile_row][tile_col];
+      const int is_last_col = (tile_col == tile_cols - 1);
+      const int is_last_tile = is_last_col && is_last_row;
+      unsigned int tile_size;
+      const TOKENEXTRA *tok = tok_buffers[tile_row][tile_col];
+      const TOKENEXTRA *tok_end = tok + cpi->tok_count[tile_row][tile_col];
+
+      vp10_tile_set_col(&tile_info, cm, tile_col);
+
+      buf->data = dst + total_size;
+
+      // The last tile does not have a header.
+      if (!is_last_tile)
+        total_size += 4;
+
+#if !CONFIG_ANS
+      vpx_start_encode(&mode_bc, dst + total_size);
+      write_modes(cpi, &tile_info, &mode_bc, &tok, tok_end);
+      assert(tok == tok_end);
+      vpx_stop_encode(&mode_bc);
+      tile_size = mode_bc.pos;
+#else
+      buf_ans_write_reset(buf_ans);
+      write_modes(cpi, &tile_info, buf_ans, &tok, tok_end);
+      assert(tok == tok_end);
+      ans_write_init(&token_ans, dst + total_size);
+      buf_ans_flush(buf_ans, &token_ans);
+      tile_size = ans_write_end(&token_ans);
+#endif  // !CONFIG_ANS
+
+      assert(tile_size > 0);
+
+      buf->size = tile_size;
+
+      if (!is_last_tile) {
+        *max_tile_size = VPXMAX(*max_tile_size, tile_size);
+        // size of this tile
+        mem_put_le32(buf->data, tile_size);
+      }
+
+      total_size += tile_size;
+    }
+  }
+#endif  // CONFIG_EXT_TILE
+  return (uint32_t)total_size;
 }
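
The tile size fields in write_tiles all go through mem_put_le32 first and are
compacted by remux_tiles afterwards, because the right field width is only
known once the largest tile has been coded. For reference, a sketch of the
little-endian store/load these helpers are assumed to perform:

    #include <stdint.h>

    /* Little-endian 32-bit store/load, mirroring what mem_put_le32 and
     * mem_get_le32 are assumed to do for the tile size fields. */
    static void put_le32(uint8_t *dst, uint32_t v) {
      dst[0] = (uint8_t)(v & 0xff);
      dst[1] = (uint8_t)((v >> 8) & 0xff);
      dst[2] = (uint8_t)((v >> 16) & 0xff);
      dst[3] = (uint8_t)((v >> 24) & 0xff);
    }

    static uint32_t get_le32(const uint8_t *src) {
      return (uint32_t)src[0] | ((uint32_t)src[1] << 8) |
             ((uint32_t)src[2] << 16) | ((uint32_t)src[3] << 24);
    }
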
 
 static void write_render_size(const VP10_COMMON *cm,
@@ -1178,10 +2967,8 @@
     if (cfg != NULL) {
       found = cm->width == cfg->y_crop_width &&
               cm->height == cfg->y_crop_height;
-#if CONFIG_MISC_FIXES
       found &= cm->render_width == cfg->render_width &&
                cm->render_height == cfg->render_height;
-#endif
     }
     vpx_wb_write_bit(wb, found);
     if (found) {
@@ -1192,15 +2979,8 @@
   if (!found) {
     vpx_wb_write_literal(wb, cm->width - 1, 16);
     vpx_wb_write_literal(wb, cm->height - 1, 16);
-
-#if CONFIG_MISC_FIXES
     write_render_size(cm, wb);
-#endif
   }
-
-#if !CONFIG_MISC_FIXES
-  write_render_size(cm, wb);
-#endif
 }
 
 static void write_sync_code(struct vpx_write_bit_buffer *wb) {
@@ -1262,7 +3042,33 @@
 
   write_profile(cm->profile, wb);
 
-  vpx_wb_write_bit(wb, 0);  // show_existing_frame
+#if CONFIG_EXT_REFS
+  // NOTE: By default all coded frames to be used as a reference
+  cm->is_reference_frame = 1;
+
+  if (cm->show_existing_frame) {
+    RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+    const int frame_to_show =
+        cm->ref_frame_map[cpi->existing_fb_idx_to_show];
+
+    if (frame_to_show < 0 || frame_bufs[frame_to_show].ref_count < 1) {
+      vpx_internal_error(&cm->error, VPX_CODEC_UNSUP_BITSTREAM,
+                         "Buffer %d does not contain a reconstructed frame",
+                         frame_to_show);
+    }
+    ref_cnt_fb(frame_bufs, &cm->new_fb_idx, frame_to_show);
+
+    vpx_wb_write_bit(wb, 1);  // show_existing_frame
+    vpx_wb_write_literal(wb, cpi->existing_fb_idx_to_show, 3);
+
+    return;
+  } else {
+#endif  // CONFIG_EXT_REFS
+    vpx_wb_write_bit(wb, 0);  // show_existing_frame
+#if CONFIG_EXT_REFS
+  }
+#endif  // CONFIG_EXT_REFS
+
   vpx_wb_write_bit(wb, cm->frame_type);
   vpx_wb_write_bit(wb, cm->show_frame);
   vpx_wb_write_bit(wb, cm->error_resilient_mode);
@@ -1271,12 +3077,13 @@
     write_sync_code(wb);
     write_bitdepth_colorspace_sampling(cm, wb);
     write_frame_size(cm, wb);
+    if (frame_is_intra_only(cm))
+      vpx_wb_write_bit(wb, cm->allow_screen_content_tools);
   } else {
     if (!cm->show_frame)
       vpx_wb_write_bit(wb, cm->intra_only);
 
     if (!cm->error_resilient_mode) {
-#if CONFIG_MISC_FIXES
       if (cm->intra_only) {
         vpx_wb_write_bit(wb,
                          cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
@@ -1287,31 +3094,39 @@
           vpx_wb_write_bit(wb,
                            cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL);
       }
-#else
-      static const int reset_frame_context_conv_tbl[3] = { 0, 2, 3 };
-
-      vpx_wb_write_literal(wb,
-          reset_frame_context_conv_tbl[cm->reset_frame_context], 2);
-#endif
     }
 
+#if CONFIG_EXT_REFS
+    cpi->refresh_frame_mask = get_refresh_mask(cpi);
+#endif  // CONFIG_EXT_REFS
+
     if (cm->intra_only) {
       write_sync_code(wb);
-
-#if CONFIG_MISC_FIXES
       write_bitdepth_colorspace_sampling(cm, wb);
-#else
-      // Note for profile 0, 420 8bpp is assumed.
-      if (cm->profile > PROFILE_0) {
-        write_bitdepth_colorspace_sampling(cm, wb);
-      }
-#endif
 
+#if CONFIG_EXT_REFS
+      vpx_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+#else
       vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+#endif  // CONFIG_EXT_REFS
       write_frame_size(cm, wb);
     } else {
       MV_REFERENCE_FRAME ref_frame;
+
+#if CONFIG_EXT_REFS
+      vpx_wb_write_literal(wb, cpi->refresh_frame_mask, REF_FRAMES);
+#else
       vpx_wb_write_literal(wb, get_refresh_mask(cpi), REF_FRAMES);
+#endif  // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+      if (!cpi->refresh_frame_mask) {
+        // NOTE: "cpi->refresh_frame_mask == 0" indicates that the coded frame
+        //       will not be used as a reference
+        cm->is_reference_frame = 0;
+      }
+#endif  // CONFIG_EXT_REFS
+
       for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
         assert(get_ref_frame_map_idx(cpi, ref_frame) != INVALID_IDX);
         vpx_wb_write_literal(wb, get_ref_frame_map_idx(cpi, ref_frame),
@@ -1330,24 +3145,31 @@
 
   if (!cm->error_resilient_mode) {
     vpx_wb_write_bit(wb,
-                     cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF);
-#if CONFIG_MISC_FIXES
-    if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF)
-#endif
-      vpx_wb_write_bit(wb, cm->refresh_frame_context !=
-                               REFRESH_FRAME_CONTEXT_BACKWARD);
+        cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_FORWARD);
   }
 
   vpx_wb_write_literal(wb, cm->frame_context_idx, FRAME_CONTEXTS_LOG2);
 
-  encode_loopfilter(&cm->lf, wb);
+  assert(cm->mib_size == num_8x8_blocks_wide_lookup[cm->sb_size]);
+  assert(cm->mib_size == 1 << cm->mib_size_log2);
+#if CONFIG_EXT_PARTITION
+  assert(cm->sb_size == BLOCK_128X128 || cm->sb_size == BLOCK_64X64);
+  vpx_wb_write_bit(wb, cm->sb_size == BLOCK_128X128 ? 1 : 0);
+#else
+  assert(cm->sb_size == BLOCK_64X64);
+#endif  // CONFIG_EXT_PARTITION
+
+  encode_loopfilter(cm, wb);
+#if CONFIG_LOOP_RESTORATION
+  encode_restoration(cm, wb);
+#endif  // CONFIG_LOOP_RESTORATION
   encode_quantization(cm, wb);
   encode_segmentation(cm, xd, wb);
-#if CONFIG_MISC_FIXES
   if (!cm->seg.enabled && xd->lossless[0])
     cm->tx_mode = TX_4X4;
   else
     write_txfm_mode(cm->tx_mode, wb);
+
   if (cpi->allow_comp_inter_inter) {
     const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
     const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
@@ -1356,218 +3178,406 @@
     if (!use_hybrid_pred)
       vpx_wb_write_bit(wb, use_compound_pred);
   }
-#endif
 
   write_tile_info(cm, wb);
 }
 
-static size_t write_compressed_header(VP10_COMP *cpi, uint8_t *data) {
+static uint32_t write_compressed_header(VP10_COMP *cpi, uint8_t *data) {
   VP10_COMMON *const cm = &cpi->common;
+#if CONFIG_SUPERTX
+  MACROBLOCKD *const xd = &cpi->td.mb.e_mbd;
+#endif  // CONFIG_SUPERTX
   FRAME_CONTEXT *const fc = cm->fc;
   FRAME_COUNTS *counts = cpi->td.counts;
-  vpx_writer header_bc;
-  int i;
-#if CONFIG_MISC_FIXES
-  int j;
-#endif
+  vp10_writer *header_bc;
+  int i, j;
 
-  vpx_start_encode(&header_bc, data);
-
-#if !CONFIG_MISC_FIXES
-  if (cpi->td.mb.e_mbd.lossless[0]) {
-    cm->tx_mode = TX_4X4;
-  } else {
-    write_txfm_mode(cm->tx_mode, &header_bc);
-    update_txfm_probs(cm, &header_bc, counts);
-  }
+#if CONFIG_ANS
+  struct AnsCoder header_ans;
+  int header_size;
+  header_bc = &cpi->buf_ans;
+  buf_ans_write_reset(header_bc);
 #else
-  update_txfm_probs(cm, &header_bc, counts);
+  vp10_writer real_header_bc;
+  header_bc = &real_header_bc;
+  vpx_start_encode(header_bc, data);
 #endif
-  update_coef_probs(cpi, &header_bc);
-  update_skip_probs(cm, &header_bc, counts);
-#if CONFIG_MISC_FIXES
-  update_seg_probs(cpi, &header_bc);
+  update_txfm_probs(cm, header_bc, counts);
+  update_coef_probs(cpi, header_bc);
+
+#if CONFIG_VAR_TX
+  update_txfm_partition_probs(cm, header_bc, counts);
+#endif
+
+  update_skip_probs(cm, header_bc, counts);
+  update_seg_probs(cpi, header_bc);
 
   for (i = 0; i < INTRA_MODES; ++i)
     prob_diff_update(vp10_intra_mode_tree, fc->uv_mode_prob[i],
-                     counts->uv_mode[i], INTRA_MODES, &header_bc);
+                     counts->uv_mode[i], INTRA_MODES, header_bc);
 
+#if CONFIG_EXT_PARTITION_TYPES
+  prob_diff_update(vp10_partition_tree, fc->partition_prob[0],
+                   counts->partition[0], PARTITION_TYPES, header_bc);
+  for (i = 1; i < PARTITION_CONTEXTS; ++i)
+    prob_diff_update(vp10_ext_partition_tree, fc->partition_prob[i],
+                     counts->partition[i], EXT_PARTITION_TYPES,
+                     header_bc);
+#else
   for (i = 0; i < PARTITION_CONTEXTS; ++i)
     prob_diff_update(vp10_partition_tree, fc->partition_prob[i],
-                     counts->partition[i], PARTITION_TYPES, &header_bc);
-#endif
+                     counts->partition[i], PARTITION_TYPES, header_bc);
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
+#if CONFIG_EXT_INTRA
+  for (i = 0; i < INTRA_FILTERS + 1; ++i)
+    prob_diff_update(vp10_intra_filter_tree, fc->intra_filter_probs[i],
+                     counts->intra_filter[i], INTRA_FILTERS, header_bc);
+#endif  // CONFIG_EXT_INTRA
 
   if (frame_is_intra_only(cm)) {
     vp10_copy(cm->kf_y_prob, vp10_kf_y_mode_prob);
-#if CONFIG_MISC_FIXES
     for (i = 0; i < INTRA_MODES; ++i)
       for (j = 0; j < INTRA_MODES; ++j)
         prob_diff_update(vp10_intra_mode_tree, cm->kf_y_prob[i][j],
-                         counts->kf_y_mode[i][j], INTRA_MODES, &header_bc);
-#endif
+                         counts->kf_y_mode[i][j], INTRA_MODES, header_bc);
   } else {
+#if CONFIG_REF_MV
+    update_inter_mode_probs(cm, header_bc, counts);
+#else
     for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
       prob_diff_update(vp10_inter_mode_tree, cm->fc->inter_mode_probs[i],
-                       counts->inter_mode[i], INTER_MODES, &header_bc);
+                       counts->inter_mode[i], INTER_MODES, header_bc);
+#endif
+
+#if CONFIG_EXT_INTER
+    update_inter_compound_mode_probs(cm, header_bc);
+
+    if (cm->reference_mode != COMPOUND_REFERENCE) {
+      for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+        if (is_interintra_allowed_bsize_group(i)) {
+          vp10_cond_prob_diff_update(header_bc,
+                                     &fc->interintra_prob[i],
+                                     cm->counts.interintra[i]);
+        }
+      }
+      for (i = 0; i < BLOCK_SIZE_GROUPS; i++) {
+        prob_diff_update(vp10_interintra_mode_tree,
+                         cm->fc->interintra_mode_prob[i],
+                         counts->interintra_mode[i],
+                         INTERINTRA_MODES, header_bc);
+      }
+      for (i = 0; i < BLOCK_SIZES; i++) {
+        if (is_interintra_allowed_bsize(i) && is_interintra_wedge_used(i))
+          vp10_cond_prob_diff_update(header_bc,
+                                     &fc->wedge_interintra_prob[i],
+                                     cm->counts.wedge_interintra[i]);
+      }
+    }
+    if (cm->reference_mode != SINGLE_REFERENCE) {
+      for (i = 0; i < BLOCK_SIZES; i++)
+        if (is_interinter_wedge_used(i))
+          vp10_cond_prob_diff_update(header_bc,
+                                     &fc->wedge_interinter_prob[i],
+                                     cm->counts.wedge_interinter[i]);
+    }
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+    for (i = BLOCK_8X8; i < BLOCK_SIZES; ++i)
+      prob_diff_update(vp10_motvar_tree, fc->motvar_prob[i],
+                       counts->motvar[i], MOTION_VARIATIONS, header_bc);
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
 
     if (cm->interp_filter == SWITCHABLE)
-      update_switchable_interp_probs(cm, &header_bc, counts);
+      update_switchable_interp_probs(cm, header_bc, counts);
 
     for (i = 0; i < INTRA_INTER_CONTEXTS; i++)
-      vp10_cond_prob_diff_update(&header_bc, &fc->intra_inter_prob[i],
+      vp10_cond_prob_diff_update(header_bc, &fc->intra_inter_prob[i],
                                 counts->intra_inter[i]);
 
     if (cpi->allow_comp_inter_inter) {
       const int use_hybrid_pred = cm->reference_mode == REFERENCE_MODE_SELECT;
-#if !CONFIG_MISC_FIXES
-      const int use_compound_pred = cm->reference_mode != SINGLE_REFERENCE;
-
-      vpx_write_bit(&header_bc, use_compound_pred);
-      if (use_compound_pred) {
-        vpx_write_bit(&header_bc, use_hybrid_pred);
-        if (use_hybrid_pred)
-          for (i = 0; i < COMP_INTER_CONTEXTS; i++)
-            vp10_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
-                                      counts->comp_inter[i]);
-      }
-#else
       if (use_hybrid_pred)
         for (i = 0; i < COMP_INTER_CONTEXTS; i++)
-          vp10_cond_prob_diff_update(&header_bc, &fc->comp_inter_prob[i],
+          vp10_cond_prob_diff_update(header_bc, &fc->comp_inter_prob[i],
                                      counts->comp_inter[i]);
-#endif
     }
 
     if (cm->reference_mode != COMPOUND_REFERENCE) {
       for (i = 0; i < REF_CONTEXTS; i++) {
-        vp10_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][0],
-                                  counts->single_ref[i][0]);
-        vp10_cond_prob_diff_update(&header_bc, &fc->single_ref_prob[i][1],
-                                  counts->single_ref[i][1]);
+        for (j = 0; j < (SINGLE_REFS - 1); j ++) {
+          vp10_cond_prob_diff_update(header_bc, &fc->single_ref_prob[i][j],
+                                     counts->single_ref[i][j]);
+        }
       }
     }
 
-    if (cm->reference_mode != SINGLE_REFERENCE)
-      for (i = 0; i < REF_CONTEXTS; i++)
-        vp10_cond_prob_diff_update(&header_bc, &fc->comp_ref_prob[i],
-                                  counts->comp_ref[i]);
+    if (cm->reference_mode != SINGLE_REFERENCE) {
+      for (i = 0; i < REF_CONTEXTS; i++) {
+#if CONFIG_EXT_REFS
+        for (j = 0; j < (FWD_REFS - 1); j++) {
+          vp10_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j],
+                                     counts->comp_ref[i][j]);
+        }
+        for (j = 0; j < (BWD_REFS - 1); j++) {
+          vp10_cond_prob_diff_update(header_bc, &fc->comp_bwdref_prob[i][j],
+                                     counts->comp_bwdref[i][j]);
+        }
+#else
+        for (j = 0; j < (COMP_REFS - 1); j++) {
+          vp10_cond_prob_diff_update(header_bc, &fc->comp_ref_prob[i][j],
+                                     counts->comp_ref[i][j]);
+        }
+#endif  // CONFIG_EXT_REFS
+      }
+    }
 
     for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
       prob_diff_update(vp10_intra_mode_tree, cm->fc->y_mode_prob[i],
-                       counts->y_mode[i], INTRA_MODES, &header_bc);
+                       counts->y_mode[i], INTRA_MODES, header_bc);
 
-#if !CONFIG_MISC_FIXES
-    for (i = 0; i < PARTITION_CONTEXTS; ++i)
-      prob_diff_update(vp10_partition_tree, fc->partition_prob[i],
-                       counts->partition[i], PARTITION_TYPES, &header_bc);
+    vp10_write_nmv_probs(cm, cm->allow_high_precision_mv, header_bc,
+#if CONFIG_REF_MV
+                         counts->mv);
+#else
+                         &counts->mv);
 #endif
-
-    vp10_write_nmv_probs(cm, cm->allow_high_precision_mv, &header_bc,
-                        &counts->mv);
-    update_ext_tx_probs(cm, &header_bc);
+    update_ext_tx_probs(cm, header_bc);
+#if CONFIG_SUPERTX
+    if (!xd->lossless[0])
+      update_supertx_probs(cm, header_bc);
+#endif  // CONFIG_SUPERTX
   }
 
-  vpx_stop_encode(&header_bc);
-  assert(header_bc.pos <= 0xffff);
-
-  return header_bc.pos;
+#if CONFIG_ANS
+  ans_write_init(&header_ans, data);
+  buf_ans_flush(header_bc, &header_ans);
+  header_size = ans_write_end(&header_ans);
+  assert(header_size <= 0xffff);
+  return header_size;
+#else
+  vpx_stop_encode(header_bc);
+  assert(header_bc->pos <= 0xffff);
+  return header_bc->pos;
+#endif  // CONFIG_ANS
 }
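
A note on the CONFIG_ANS path above: ANS-family coders emit output in the
reverse of symbol order, which is why write_compressed_header records symbols
into a buffered coder and only flushes to the raw AnsCoder at the end. A toy
sketch of that buffer-then-reverse-flush pattern (hypothetical types; not the
BufAnsCoder implementation, and error handling is omitted):

    #include <stdlib.h>

    /* Symbols are buffered in coding order and handed to the LIFO coder in
     * reverse at flush time. */
    struct sym { int val; int prob; };
    struct buf_coder { struct sym *buf; int n, cap; };

    static void buf_put(struct buf_coder *c, int val, int prob) {
      if (c->n == c->cap) {  /* grow on demand; realloc failure unhandled */
        c->cap = c->cap ? 2 * c->cap : 64;
        c->buf = (struct sym *)realloc(c->buf, c->cap * sizeof(*c->buf));
      }
      c->buf[c->n].val = val;
      c->buf[c->n].prob = prob;
      c->n++;
    }

    static void buf_flush(struct buf_coder *c,
                          void (*raw_put)(int val, int prob)) {
      int i;
      for (i = c->n - 1; i >= 0; --i)  /* reverse order for the LIFO coder */
        raw_put(c->buf[i].val, c->buf[i].prob);
      c->n = 0;
    }
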
 
-#if CONFIG_MISC_FIXES
-static int remux_tiles(uint8_t *dest, const int sz,
-                       const int n_tiles, const int mag) {
-  int rpos = 0, wpos = 0, n;
+static int choose_size_bytes(uint32_t size, int spare_msbs) {
+  // Choose the number of bytes required to represent size, without
+  // using the 'spare_msbs' number of most significant bits.
 
-  for (n = 0; n < n_tiles; n++) {
-    int tile_sz;
+  // Make sure we will fit in 4 bytes to start with.
+  if (spare_msbs > 0 && size >> (32 - spare_msbs) != 0)
+    return -1;
 
-    if (n == n_tiles - 1) {
-      tile_sz = sz - rpos;
-    } else {
-      tile_sz = mem_get_le32(&dest[rpos]) + 1;
-      rpos += 4;
-      switch (mag) {
-        case 0:
-          dest[wpos] = tile_sz - 1;
-          break;
-        case 1:
-          mem_put_le16(&dest[wpos], tile_sz - 1);
-          break;
-        case 2:
-          mem_put_le24(&dest[wpos], tile_sz - 1);
-          break;
-        case 3:  // remuxing should only happen if mag < 3
-        default:
-          assert("Invalid value for tile size magnitude" && 0);
+  // Normalise to 32 bits
+  size <<= spare_msbs;
+
+  if (size >> 24 != 0)
+    return 4;
+  else if (size >> 16 != 0)
+    return 3;
+  else if (size >> 8 != 0)
+    return 2;
+  else
+    return 1;
+}
+
+static void mem_put_varsize(uint8_t *const dst, const int sz, const int val) {
+  switch (sz) {
+    case 1:
+      dst[0] = (uint8_t)(val & 0xff);
+      break;
+    case 2:
+      mem_put_le16(dst, val);
+      break;
+    case 3:
+      mem_put_le24(dst, val);
+      break;
+    case 4:
+      mem_put_le32(dst, val);
+      break;
+    default:
+      assert("Invalid size" && 0);
+      break;
+  }
+}
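
Taken together, choose_size_bytes and mem_put_varsize implement "smallest byte
width that holds the maximum". A usage sketch, assuming the two functions
above:

    #include <stdint.h>

    static void size_field_example(void) {
      uint8_t buf[4];
      const uint32_t max_tile_size = 70000;  /* largest tile coded */
      /* 70000 >> 16 != 0 but 70000 >> 24 == 0, so 3 bytes suffice... */
      const int tsb = choose_size_bytes(max_tile_size, 0);  /* returns 3 */
      /* ...and every size field is then stored on 3 little-endian bytes. */
      mem_put_varsize(buf, tsb, (int)max_tile_size);
      (void)buf;
    }
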
+
+static int remux_tiles(const VP10_COMMON *const cm,
+                       uint8_t *dst,
+                       const uint32_t data_size,
+                       const uint32_t max_tile_size,
+                       const uint32_t max_tile_col_size,
+                       int *const tile_size_bytes,
+                       int *const tile_col_size_bytes) {
+  // Choose the tile size bytes (tsb) and tile column size bytes (tcsb)
+#if CONFIG_EXT_TILE
+  // The top bit in the tile size field indicates tile copy mode, so we
+  // have one less bit available to code the tile size.
+  const int tsb = choose_size_bytes(max_tile_size, 1);
+  const int tcsb = choose_size_bytes(max_tile_col_size, 0);
+#else
+  const int tsb = choose_size_bytes(max_tile_size, 0);
+  const int tcsb = 4;  // This is ignored
+  (void) max_tile_col_size;
+#endif  // CONFIG_EXT_TILE
+
+  assert(tsb > 0);
+  assert(tcsb > 0);
+
+  *tile_size_bytes = tsb;
+  *tile_col_size_bytes = tcsb;
+
+  if (tsb == 4 && tcsb == 4) {
+    return data_size;
+  } else {
+    uint32_t wpos = 0;
+    uint32_t rpos = 0;
+
+#if CONFIG_EXT_TILE
+    int tile_row;
+    int tile_col;
+
+    for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
+      // All but the last column have a column header.
+      if (tile_col < cm->tile_cols - 1) {
+        uint32_t tile_col_size = mem_get_le32(dst + rpos);
+        rpos += 4;
+
+        // Adjust the tile column size by the number of bytes removed
+        // from the tile size fields.
+        tile_col_size -= (4 - tsb) * cm->tile_rows;
+
+        mem_put_varsize(dst + wpos, tcsb, tile_col_size);
+        wpos += tcsb;
       }
-      wpos += mag + 1;
+
+      for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+        // Every row, including the last, has a header.
+        uint32_t tile_header = mem_get_le32(dst + rpos);
+        rpos += 4;
+
+        // If this is a copy tile, shift the copy flag from bit 31 down to
+        // the top bit of the narrower size field; there is no data to move.
+        if (tile_header >> 31 != 0) {
+          if (tsb < 4)
+            tile_header >>= 32 - 8 * tsb;
+          mem_put_varsize(dst + wpos, tsb, tile_header);
+          wpos += tsb;
+        } else {
+          mem_put_varsize(dst + wpos, tsb, tile_header);
+          wpos += tsb;
+
+          memmove(dst + wpos, dst + rpos, tile_header);
+          rpos += tile_header;
+          wpos += tile_header;
+        }
+      }
     }
+#else
+    const int n_tiles = cm->tile_cols * cm->tile_rows;
+    int n;
 
-    memmove(&dest[wpos], &dest[rpos], tile_sz);
-    wpos += tile_sz;
-    rpos += tile_sz;
+    for (n = 0; n < n_tiles; n++) {
+      int tile_size;
+
+      if (n == n_tiles - 1) {
+        tile_size = data_size - rpos;
+      } else {
+        tile_size = mem_get_le32(dst + rpos);
+        rpos += 4;
+        mem_put_varsize(dst + wpos, tsb, tile_size);
+        wpos += tsb;
+      }
+
+      memmove(dst + wpos, dst + rpos, tile_size);
+
+      rpos += tile_size;
+      wpos += tile_size;
+    }
+#endif  // CONFIG_EXT_TILE
+
+    assert(rpos > wpos);
+    assert(rpos == data_size);
+
+    return wpos;
   }
-
-  assert(rpos > wpos);
-  assert(rpos == sz);
-
-  return wpos;
 }
-#endif
 
-void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dest, size_t *size) {
-  uint8_t *data = dest;
-  size_t first_part_size, uncompressed_hdr_size, data_sz;
+void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dst, size_t *size) {
+  uint8_t *data = dst;
+  uint32_t compressed_header_size;
+  uint32_t uncompressed_header_size;
+  uint32_t data_size;
   struct vpx_write_bit_buffer wb = {data, 0};
   struct vpx_write_bit_buffer saved_wb;
-  unsigned int max_tile;
-#if CONFIG_MISC_FIXES
+  unsigned int max_tile_size;
+  unsigned int max_tile_col_size;
+  int tile_size_bytes;
+  int tile_col_size_bytes;
+
   VP10_COMMON *const cm = &cpi->common;
-  const int n_log2_tiles = cm->log2_tile_rows + cm->log2_tile_cols;
-  const int have_tiles = n_log2_tiles > 0;
-#else
-  const int have_tiles = 0;  // we have tiles, but we don't want to write a
-                             // tile size marker in the header
-#endif
+  const int have_tiles = cm->tile_cols * cm->tile_rows > 1;
 
+  // Write the uncompressed header
   write_uncompressed_header(cpi, &wb);
-  saved_wb = wb;
-  // don't know in advance first part. size
-  vpx_wb_write_literal(&wb, 0, 16 + have_tiles * 2);
 
-  uncompressed_hdr_size = vpx_wb_bytes_written(&wb);
-  data += uncompressed_hdr_size;
+#if CONFIG_EXT_REFS
+  if (cm->show_existing_frame) {
+    *size = vpx_wb_bytes_written(&wb);
+    return;
+  }
+#endif  // CONFIG_EXT_REFS
+
+  // We do not know these sizes in advance; output placeholder bits for now.
+  saved_wb = wb;
+  // Write tile size magnitudes
+  if (have_tiles) {
+    // Note that the last item in the uncompressed header is the data
+    // describing tile configuration.
+#if CONFIG_EXT_TILE
+    // Number of bytes in tile column size - 1
+    vpx_wb_write_literal(&wb, 0, 2);
+#endif  // CONFIG_EXT_TILE
+    // Number of bytes in tile size - 1
+    vpx_wb_write_literal(&wb, 0, 2);
+  }
+  // Size of compressed header
+  vpx_wb_write_literal(&wb, 0, 16);
+
+  uncompressed_header_size = (uint32_t)vpx_wb_bytes_written(&wb);
+  data += uncompressed_header_size;
 
   vpx_clear_system_state();
 
-  first_part_size = write_compressed_header(cpi, data);
-  data += first_part_size;
+  // Write the compressed header
+  compressed_header_size = write_compressed_header(cpi, data);
+  data += compressed_header_size;
 
-  data_sz = encode_tiles(cpi, data, &max_tile);
-#if CONFIG_MISC_FIXES
-  if (max_tile > 0) {
-    int mag;
-    unsigned int mask;
+  // Write the encoded tile data
+  data_size = write_tiles(cpi, data, &max_tile_size, &max_tile_col_size);
 
-    // Choose the (tile size) magnitude
-    for (mag = 0, mask = 0xff; mag < 4; mag++) {
-      if (max_tile <= mask)
-        break;
-      mask <<= 8;
-      mask |= 0xff;
-    }
-    assert(n_log2_tiles > 0);
-    vpx_wb_write_literal(&saved_wb, mag, 2);
-    if (mag < 3)
-      data_sz = remux_tiles(data, (int)data_sz, 1 << n_log2_tiles, mag);
-  } else {
-    assert(n_log2_tiles == 0);
+  if (have_tiles) {
+    data_size = remux_tiles(cm, data, data_size,
+                            max_tile_size, max_tile_col_size,
+                            &tile_size_bytes, &tile_col_size_bytes);
   }
-#endif
-  data += data_sz;
 
-  // TODO(jbb): Figure out what to do if first_part_size > 16 bits.
-  vpx_wb_write_literal(&saved_wb, (int)first_part_size, 16);
+  data += data_size;
 
-  *size = data - dest;
+  // Now fill in the gaps in the uncompressed header.
+  if (have_tiles) {
+#if CONFIG_EXT_TILE
+    assert(tile_col_size_bytes >= 1 && tile_col_size_bytes <= 4);
+    vpx_wb_write_literal(&saved_wb, tile_col_size_bytes - 1, 2);
+#endif  // CONFIG_EXT_TILE
+    assert(tile_size_bytes >= 1 && tile_size_bytes <= 4);
+    vpx_wb_write_literal(&saved_wb, tile_size_bytes - 1, 2);
+  }
+  // TODO(jbb): Figure out what to do if compressed_header_size > 16 bits.
+  assert(compressed_header_size <= 0xffff);
+  vpx_wb_write_literal(&saved_wb, compressed_header_size, 16);
+
+  *size = data - dst;
 }
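
The placeholder-and-patch idiom in vp10_pack_bitstream relies on the write bit
buffer being a plain struct copyable by value: saved_wb snapshots the bit
position before the unknown fields, zeros reserve the bits, and the saved copy
later overwrites them in place. A reduced, self-contained sketch of the idiom
(a toy bit buffer, not the vpx_dsp one):

    #include <stdint.h>

    struct bitbuf { uint8_t *base; unsigned bit_offset; };

    /* Write `bits` bits of `val`, MSB first -- a toy stand-in for
     * vpx_wb_write_literal(). */
    static void put_bits(struct bitbuf *b, uint32_t val, int bits) {
      while (bits--) {
        const int bit = (val >> bits) & 1;
        uint8_t *p = b->base + (b->bit_offset >> 3);
        const int shift = 7 - (int)(b->bit_offset & 7);
        *p = (uint8_t)((*p & ~(1 << shift)) | (bit << shift));
        b->bit_offset++;
      }
    }

    static void header_example(uint8_t *dst, uint32_t payload_size) {
      struct bitbuf wb = { dst, 0 };
      struct bitbuf saved;
      put_bits(&wb, 3, 2);         /* some fields known up front */
      saved = wb;                  /* snapshot before the unknown field */
      put_bits(&wb, 0, 16);        /* 16-bit placeholder */
      /* ... code the payload after the header, learn payload_size ... */
      put_bits(&saved, payload_size, 16);  /* patch the reserved bits */
    }
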
diff --git a/vp10/encoder/bitstream.h b/vp10/encoder/bitstream.h
index b1da89f..cacdb43 100644
--- a/vp10/encoder/bitstream.h
+++ b/vp10/encoder/bitstream.h
@@ -18,9 +18,10 @@
 
 #include "vp10/encoder/encoder.h"
 
-void vp10_encode_token_init();
 void vp10_pack_bitstream(VP10_COMP *const cpi, uint8_t *dest, size_t *size);
 
+void vp10_encode_token_init(void);
+
 static INLINE int vp10_preserve_existing_gf(VP10_COMP *cpi) {
   return !cpi->multi_arf_allowed && cpi->refresh_golden_frame &&
          cpi->rc.is_src_frame_alt_ref;
diff --git a/vp10/encoder/bitwriter.h b/vp10/encoder/bitwriter.h
new file mode 100644
index 0000000..f53a132
--- /dev/null
+++ b/vp10/encoder/bitwriter.h
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+/* The purpose of this header is to provide compile time pluggable bit writer
+ * implementations with a common interface. */
+
+#ifndef VPX10_ENCODER_BITWRITER_H_
+#define VPX10_ENCODER_BITWRITER_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/prob.h"
+
+#if CONFIG_ANS
+typedef struct BufAnsCoder BufAnsCoder;
+#include "vp10/encoder/buf_ans.h"
+#define vp10_writer BufAnsCoder
+#define vp10_write buf_uabs_write
+#define vp10_write_bit buf_uabs_write_bit
+#define vp10_write_literal buf_uabs_write_literal
+#else
+#include "vpx_dsp/bitwriter.h"
+#define vp10_writer vpx_writer
+#define vp10_write vpx_write
+#define vp10_write_bit vpx_write_bit
+#define vp10_write_literal vpx_write_literal
+#endif
+
+#endif  // VPX10_ENCODER_BITWRITER_H_
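
The macro aliasing in bitwriter.h is the entire abstraction: call sites are
written once against vp10_writer / vp10_write_bit and get either the boolean
coder or the buffered ANS coder, chosen at compile time with no runtime
dispatch. A self-contained sketch of the same pattern with two toy backends
(all names here hypothetical):

    /* Two toy backends sharing one call shape. */
    struct CoderA { int nbits; };
    struct CoderB { int nbits; };
    static void a_put_bit(struct CoderA *c, int b) { (void)b; c->nbits++; }
    static void b_put_bit(struct CoderB *c, int b) { (void)b; c->nbits++; }

    /* The bitwriter.h trick: one alias selected by the build configuration. */
    #if USE_CODER_A
    #define Writer struct CoderA
    #define writer_put_bit a_put_bit
    #else
    #define Writer struct CoderB
    #define writer_put_bit b_put_bit
    #endif

    /* Call sites compile against the alias and never name a backend. */
    static void write_flag(Writer *w, int flag) {
      writer_put_bit(w, flag != 0);
    }
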
diff --git a/vp10/encoder/block.h b/vp10/encoder/block.h
index ab0252b..4c9f8a5 100644
--- a/vp10/encoder/block.h
+++ b/vp10/encoder/block.h
@@ -13,6 +13,9 @@
 
 #include "vp10/common/entropymv.h"
 #include "vp10/common/entropy.h"
+#if CONFIG_REF_MV
+#include "vp10/common/mvref_common.h"
+#endif
 
 #ifdef __cplusplus
 extern "C" {
@@ -24,23 +27,26 @@
   unsigned int var;
 } diff;
 
-struct macroblock_plane {
-  DECLARE_ALIGNED(16, int16_t, src_diff[64 * 64]);
+typedef struct macroblock_plane {
+  DECLARE_ALIGNED(16, int16_t, src_diff[MAX_SB_SQUARE]);
   tran_low_t *qcoeff;
   tran_low_t *coeff;
   uint16_t *eobs;
   struct buf_2d src;
 
   // Quantizer settings
-  int16_t *quant_fp;
-  int16_t *round_fp;
-  int16_t *quant;
-  int16_t *quant_shift;
-  int16_t *zbin;
-  int16_t *round;
+  const int16_t *quant_fp;
+  const int16_t *round_fp;
+  const int16_t *quant;
+  const int16_t *quant_shift;
+  const int16_t *zbin;
+  const int16_t *round;
+#if CONFIG_NEW_QUANT
+  cuml_bins_type_nuq *cuml_bins_nuq[QUANT_PROFILES];
+#endif  // CONFIG_NEW_QUANT
 
   int64_t quant_thred[2];
-};
+} MACROBLOCK_PLANE;
 
 /* The [2] dimension is for whether we skip the EOB node (i.e. if previous
  * coefficient in this block was zero) or not. */
@@ -48,10 +54,24 @@
                                    [COEFF_CONTEXTS][ENTROPY_TOKENS];
 
 typedef struct {
-  int_mv ref_mvs[MAX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
-  uint8_t mode_context[MAX_REF_FRAMES];
+  int_mv ref_mvs[MODE_CTX_REF_FRAMES][MAX_MV_REF_CANDIDATES];
+  int16_t mode_context[MODE_CTX_REF_FRAMES];
+#if CONFIG_REF_MV
+  uint8_t ref_mv_count[MODE_CTX_REF_FRAMES];
+  CANDIDATE_MV ref_mv_stack[MODE_CTX_REF_FRAMES][MAX_REF_MV_STACK_SIZE];
+#if CONFIG_EXT_INTER
+  int16_t compound_mode_context[MODE_CTX_REF_FRAMES];
+#endif  // CONFIG_EXT_INTER
+#endif
 } MB_MODE_INFO_EXT;
 
+typedef struct {
+  uint8_t best_palette_color_map[MAX_SB_SQUARE];
+  float kmeans_data_buf[2 * MAX_SB_SQUARE];
+  uint8_t kmeans_indices_buf[MAX_SB_SQUARE];
+  uint8_t kmeans_pre_indices_buf[MAX_SB_SQUARE];
+} PALETTE_BUFFER;
+
 typedef struct macroblock MACROBLOCK;
 struct macroblock {
   struct macroblock_plane plane[MAX_MB_PLANE];
@@ -60,12 +80,17 @@
   MB_MODE_INFO_EXT *mbmi_ext;
   int skip_block;
   int select_tx_size;
-  int skip_recode;
   int skip_optimize;
   int q_index;
 
+  // The equivalent error at the current rdmult of one whole bit (not one
+  // bit-cost unit).
   int errorperbit;
+  // The equivalent SAD error of one (whole) bit at the current quantizer
+  // for large blocks.
   int sadperbit16;
+  // The equivalent SAD error of one (whole) bit at the current quantizer
+  // for sub-8x8 blocks.
   int sadperbit4;
   int rddiv;
   int rdmult;
@@ -81,19 +106,33 @@
   int mv_best_ref_index[MAX_REF_FRAMES];
   unsigned int max_mv_context[MAX_REF_FRAMES];
   unsigned int source_variance;
+  unsigned int recon_variance;
   unsigned int pred_sse[MAX_REF_FRAMES];
   int pred_mv_sad[MAX_REF_FRAMES];
 
+#if CONFIG_REF_MV
+  int *nmvjointcost;
+  int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS];
+  int *nmvcost[NMV_CONTEXTS][2];
+  int *nmvcost_hp[NMV_CONTEXTS][2];
+  int **mv_cost_stack[NMV_CONTEXTS];
+  int *nmvjointsadcost;
+  int zero_rmv_cost[NMV_CONTEXTS][2];
+  int comp_rmv_cost[2];
+#else
   int nmvjointcost[MV_JOINTS];
   int *nmvcost[2];
   int *nmvcost_hp[2];
-  int **mvcost;
-
   int nmvjointsadcost[MV_JOINTS];
+#endif
+
+  int **mvcost;
   int *nmvsadcost[2];
   int *nmvsadcost_hp[2];
   int **mvsadcost;
 
+  PALETTE_BUFFER *palette_buffer;
+
   // These define limits to motion vector components to prevent them
   // from extending outside the UMV borders
   int mv_col_min;
@@ -103,7 +142,13 @@
 
   // Notes transform blocks where no coefficients are coded.
   // Set during mode selection. Read during block encoding.
-  uint8_t zcoeff_blk[TX_SIZES][256];
+  uint8_t zcoeff_blk[TX_SIZES][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
+#if CONFIG_VAR_TX
+  uint8_t blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
+#if CONFIG_REF_MV
+  uint8_t blk_skip_drl[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
+#endif
+#endif
 
   int skip;
 
@@ -117,23 +162,20 @@
   // indicate if it is in the rd search loop or encoding process
   int use_lp32x32fdct;
 
-  // use fast quantization process
-  int quant_fp;
-
-  // skip forward transform and quantization
-  uint8_t skip_txfm[MAX_MB_PLANE << 2];
-  #define SKIP_TXFM_NONE 0
-  #define SKIP_TXFM_AC_DC 1
-  #define SKIP_TXFM_AC_ONLY 2
-
-  int64_t bsse[MAX_MB_PLANE << 2];
-
   // Used to store sub partition's choices.
   MV pred_mv[MAX_REF_FRAMES];
 
+  // Store the best motion vector during motion search
+  int_mv best_mv;
+
   // Strong color activity detection. Used in RTC coding mode to enhance
   // the visual quality at the boundary of moving color objects.
   uint8_t color_sensitivity[2];
+
+  // use default transform and skip transform type search for intra modes
+  int use_default_intra_tx_type;
+  // use default transform and skip transform type search for inter modes
+  int use_default_inter_tx_type;
 };
 
 #ifdef __cplusplus
diff --git a/vp10/encoder/buf_ans.c b/vp10/encoder/buf_ans.c
new file mode 100644
index 0000000..31cd227
--- /dev/null
+++ b/vp10/encoder/buf_ans.c
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+
+#include "vp10/common/common.h"
+#include "vp10/encoder/buf_ans.h"
+#include "vp10/encoder/encoder.h"
+#include "vpx_mem/vpx_mem.h"
+
+void vp10_buf_ans_alloc(struct BufAnsCoder *c, struct VP10Common *cm,
+                        int size_hint) {
+  c->cm = cm;
+  c->size = size_hint;
+  CHECK_MEM_ERROR(cm, c->buf, vpx_malloc(c->size * sizeof(*c->buf)));
+  // Initialize to overfull to trigger the assert in write.
+  c->offset = c->size + 1;
+}
+
+void vp10_buf_ans_free(struct BufAnsCoder *c) {
+  vpx_free(c->buf);
+  c->buf = NULL;
+  c->size = 0;
+}
+
+void vp10_buf_ans_grow(struct BufAnsCoder *c) {
+  struct buffered_ans_symbol *new_buf = NULL;
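+  // Double the capacity so that repeated appends stay amortized O(1).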
+  int new_size = c->size * 2;
+  CHECK_MEM_ERROR(c->cm, new_buf, vpx_malloc(new_size * sizeof(*new_buf)));
+  memcpy(new_buf, c->buf, c->size * sizeof(*c->buf));
+  vpx_free(c->buf);
+  c->buf = new_buf;
+  c->size = new_size;
+}
diff --git a/vp10/encoder/buf_ans.h b/vp10/encoder/buf_ans.h
new file mode 100644
index 0000000..8697ee4
--- /dev/null
+++ b/vp10/encoder/buf_ans.h
@@ -0,0 +1,109 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_BUF_ANS_H_
+#define VP10_ENCODER_BUF_ANS_H_
+// Buffered forward ANS writer.
+// Symbols are written to the writer in forward (decode) order and serialized
+// backwards due to ANS's stack-like behavior.
+
+#include <assert.h>
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vp10/common/ans.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif  // __cplusplus
+
+#define ANS_METHOD_UABS 0
+#define ANS_METHOD_RANS 1
+
+struct buffered_ans_symbol {
+  uint8_t method;    // one of ANS_METHOD_UABS or ANS_METHOD_RANS
+  // TODO(aconverse): Should be possible to write this in terms of start for ABS
+  AnsP10 val_start;  // Boolean value for ABS, start in symbol cycle for Rans
+  AnsP10 prob;  // Probability of this symbol
+};
+
+struct BufAnsCoder {
+  struct VP10Common *cm;
+  struct buffered_ans_symbol *buf;
+  int size;
+  int offset;
+};
+
+void vp10_buf_ans_alloc(struct BufAnsCoder *c, struct VP10Common *cm,
+                        int size_hint);
+
+void vp10_buf_ans_free(struct BufAnsCoder *c);
+
+void vp10_buf_ans_grow(struct BufAnsCoder *c);
+
+static INLINE void buf_ans_write_reset(struct BufAnsCoder *const c) {
+  c->offset = 0;
+}
+
+static INLINE void buf_uabs_write(struct BufAnsCoder *const c,
+                                  uint8_t val, AnsP8 prob) {
+  assert(c->offset <= c->size);
+  if (c->offset == c->size) {
+    vp10_buf_ans_grow(c);
+  }
+  c->buf[c->offset].method = ANS_METHOD_UABS;
+  c->buf[c->offset].val_start = val;
+  c->buf[c->offset].prob = prob;
+  ++c->offset;
+}
+
+static INLINE void buf_rans_write(struct BufAnsCoder *const c,
+                                  const struct rans_sym *const sym) {
+  assert(c->offset <= c->size);
+  if (c->offset == c->size) {
+    vp10_buf_ans_grow(c);
+  }
+  c->buf[c->offset].method = ANS_METHOD_RANS;
+  c->buf[c->offset].val_start = sym->cum_prob;
+  c->buf[c->offset].prob = sym->prob;
+  ++c->offset;
+}
+
+static INLINE void buf_ans_flush(const struct BufAnsCoder *const c,
+                                 struct AnsCoder *ans) {
+  int offset;
+  for (offset = c->offset - 1; offset >= 0; --offset) {
+    if (c->buf[offset].method == ANS_METHOD_RANS) {
+      struct rans_sym sym;
+      sym.prob = c->buf[offset].prob;
+      sym.cum_prob = c->buf[offset].val_start;
+      rans_write(ans, &sym);
+    } else {
+      uabs_write(ans, (uint8_t)c->buf[offset].val_start,
+                 (AnsP8)c->buf[offset].prob);
+    }
+  }
+}
+
+static INLINE void buf_uabs_write_bit(struct BufAnsCoder *c, int bit) {
+  buf_uabs_write(c, bit, 128);
+}
+
+static INLINE void buf_uabs_write_literal(struct BufAnsCoder *c,
+                                          int literal, int bits) {
+  int bit;
+
+  assert(bits < 31);
+  for (bit = bits - 1; bit >= 0; bit--)
+    buf_uabs_write_bit(c, 1 & (literal >> bit));
+}
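+
+/* Illustrative lifecycle sketch (hypothetical caller; assumes an initialized
+ * VP10Common *cm and an AnsCoder ans already pointed at an output buffer):
+ *
+ *   struct BufAnsCoder c;
+ *   vp10_buf_ans_alloc(&c, cm, 1024);  // 1024 is an arbitrary size hint
+ *   buf_ans_write_reset(&c);
+ *   buf_uabs_write(&c, 1, 200);        // symbols pushed in decode order
+ *   buf_uabs_write_bit(&c, 0);
+ *   buf_ans_flush(&c, &ans);           // replayed last-to-first into ans
+ *   vp10_buf_ans_free(&c);
+ */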
+#ifdef __cplusplus
+}  // extern "C"
+#endif  // __cplusplus
+#endif  // VP10_ENCODER_BUF_ANS_H_
diff --git a/vp10/encoder/context_tree.c b/vp10/encoder/context_tree.c
index 6c056d2..41155c9 100644
--- a/vp10/encoder/context_tree.c
+++ b/vp10/encoder/context_tree.c
@@ -11,23 +11,36 @@
 #include "vp10/encoder/context_tree.h"
 #include "vp10/encoder/encoder.h"
 
-static const BLOCK_SIZE square[] = {
+static const BLOCK_SIZE square[MAX_SB_SIZE_LOG2 - 2] = {
   BLOCK_8X8,
   BLOCK_16X16,
   BLOCK_32X32,
   BLOCK_64X64,
+#if CONFIG_EXT_PARTITION
+  BLOCK_128X128,
+#endif  // CONFIG_EXT_PARTITION
 };
 
 static void alloc_mode_context(VP10_COMMON *cm, int num_4x4_blk,
+#if CONFIG_EXT_PARTITION_TYPES
+                               PARTITION_TYPE partition,
+#endif
                                PICK_MODE_CONTEXT *ctx) {
   const int num_blk = (num_4x4_blk < 4 ? 4 : num_4x4_blk);
   const int num_pix = num_blk << 4;
   int i, k;
   ctx->num_4x4_blk = num_blk;
+#if CONFIG_EXT_PARTITION_TYPES
+  ctx->partition = partition;
+#endif
 
   CHECK_MEM_ERROR(cm, ctx->zcoeff_blk,
                   vpx_calloc(num_blk, sizeof(uint8_t)));
   for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VAR_TX
+    CHECK_MEM_ERROR(cm, ctx->blk_skip[i],
+                    vpx_calloc(num_blk, sizeof(uint8_t)));
+#endif
     for (k = 0; k < 3; ++k) {
       CHECK_MEM_ERROR(cm, ctx->coeff[i][k],
                       vpx_memalign(32, num_pix * sizeof(*ctx->coeff[i][k])));
@@ -43,6 +56,14 @@
       ctx->eobs_pbuf[i][k]    = ctx->eobs[i][k];
     }
   }
+
+  if (cm->allow_screen_content_tools) {
+    for (i = 0;  i < 2; ++i) {
+      CHECK_MEM_ERROR(cm, ctx->color_index_map[i],
+                    vpx_memalign(32,
+                                 num_pix * sizeof(*ctx->color_index_map[i])));
+    }
+  }
 }
 
 static void free_mode_context(PICK_MODE_CONTEXT *ctx) {
@@ -50,6 +71,10 @@
   vpx_free(ctx->zcoeff_blk);
   ctx->zcoeff_blk = 0;
   for (i = 0; i < MAX_MB_PLANE; ++i) {
+#if CONFIG_VAR_TX
+    vpx_free(ctx->blk_skip[i]);
+    ctx->blk_skip[i] = 0;
+#endif
     for (k = 0; k < 3; ++k) {
       vpx_free(ctx->coeff[i][k]);
       ctx->coeff[i][k] = 0;
@@ -70,9 +95,54 @@
 
 static void alloc_tree_contexts(VP10_COMMON *cm, PC_TREE *tree,
                                 int num_4x4_blk) {
+#if CONFIG_EXT_PARTITION_TYPES
+  alloc_mode_context(cm, num_4x4_blk, PARTITION_NONE, &tree->none);
+  alloc_mode_context(cm, num_4x4_blk/2, PARTITION_HORZ, &tree->horizontal[0]);
+  alloc_mode_context(cm, num_4x4_blk/2, PARTITION_VERT, &tree->vertical[0]);
+  alloc_mode_context(cm, num_4x4_blk/2, PARTITION_HORZ, &tree->horizontal[1]);
+  alloc_mode_context(cm, num_4x4_blk/2, PARTITION_VERT, &tree->vertical[1]);
+
+  alloc_mode_context(cm, num_4x4_blk/4, PARTITION_HORZ_A,
+                     &tree->horizontala[0]);
+  alloc_mode_context(cm, num_4x4_blk/4, PARTITION_HORZ_A,
+                     &tree->horizontala[1]);
+  alloc_mode_context(cm, num_4x4_blk/2, PARTITION_HORZ_A,
+                     &tree->horizontala[2]);
+  alloc_mode_context(cm, num_4x4_blk/2, PARTITION_HORZ_B,
+                     &tree->horizontalb[0]);
+  alloc_mode_context(cm, num_4x4_blk/4, PARTITION_HORZ_B,
+                     &tree->horizontalb[1]);
+  alloc_mode_context(cm, num_4x4_blk/4, PARTITION_HORZ_B,
+                     &tree->horizontalb[2]);
+  alloc_mode_context(cm, num_4x4_blk/4, PARTITION_VERT_A, &tree->verticala[0]);
+  alloc_mode_context(cm, num_4x4_blk/4, PARTITION_VERT_A, &tree->verticala[1]);
+  alloc_mode_context(cm, num_4x4_blk/2, PARTITION_VERT_A, &tree->verticala[2]);
+  alloc_mode_context(cm, num_4x4_blk/2, PARTITION_VERT_B, &tree->verticalb[0]);
+  alloc_mode_context(cm, num_4x4_blk/4, PARTITION_VERT_B, &tree->verticalb[1]);
+  alloc_mode_context(cm, num_4x4_blk/4, PARTITION_VERT_B, &tree->verticalb[2]);
+#if CONFIG_SUPERTX
+  alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ,
+                     &tree->horizontal_supertx);
+  alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT, &tree->vertical_supertx);
+  alloc_mode_context(cm, num_4x4_blk, PARTITION_SPLIT, &tree->split_supertx);
+  alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ_A,
+                     &tree->horizontala_supertx);
+  alloc_mode_context(cm, num_4x4_blk, PARTITION_HORZ_B,
+                     &tree->horizontalb_supertx);
+  alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT_A,
+                     &tree->verticala_supertx);
+  alloc_mode_context(cm, num_4x4_blk, PARTITION_VERT_B,
+                     &tree->verticalb_supertx);
+#endif  // CONFIG_SUPERTX
+#else
   alloc_mode_context(cm, num_4x4_blk, &tree->none);
   alloc_mode_context(cm, num_4x4_blk/2, &tree->horizontal[0]);
   alloc_mode_context(cm, num_4x4_blk/2, &tree->vertical[0]);
+#if CONFIG_SUPERTX
+  alloc_mode_context(cm, num_4x4_blk, &tree->horizontal_supertx);
+  alloc_mode_context(cm, num_4x4_blk, &tree->vertical_supertx);
+  alloc_mode_context(cm, num_4x4_blk, &tree->split_supertx);
+#endif
 
   if (num_4x4_blk > 4) {
     alloc_mode_context(cm, num_4x4_blk/2, &tree->horizontal[1]);
@@ -81,14 +151,35 @@
     memset(&tree->horizontal[1], 0, sizeof(tree->horizontal[1]));
     memset(&tree->vertical[1], 0, sizeof(tree->vertical[1]));
   }
+#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
 static void free_tree_contexts(PC_TREE *tree) {
+#if CONFIG_EXT_PARTITION_TYPES
+  int i;
+  for (i = 0; i < 3; ++i) {
+    free_mode_context(&tree->horizontala[i]);
+    free_mode_context(&tree->horizontalb[i]);
+    free_mode_context(&tree->verticala[i]);
+    free_mode_context(&tree->verticalb[i]);
+  }
+#endif  // CONFIG_EXT_PARTITION_TYPES
   free_mode_context(&tree->none);
   free_mode_context(&tree->horizontal[0]);
   free_mode_context(&tree->horizontal[1]);
   free_mode_context(&tree->vertical[0]);
   free_mode_context(&tree->vertical[1]);
+#if CONFIG_SUPERTX
+  free_mode_context(&tree->horizontal_supertx);
+  free_mode_context(&tree->vertical_supertx);
+  free_mode_context(&tree->split_supertx);
+#if CONFIG_EXT_PARTITION_TYPES
+  free_mode_context(&tree->horizontala_supertx);
+  free_mode_context(&tree->horizontalb_supertx);
+  free_mode_context(&tree->verticala_supertx);
+  free_mode_context(&tree->verticalb_supertx);
+#endif  // CONFIG_EXT_PARTITION_TYPES
+#endif  // CONFIG_SUPERTX
 }
 
 // This function sets up a tree of contexts such that at each square
@@ -97,8 +188,13 @@
 // represents the state of our search.
 void vp10_setup_pc_tree(VP10_COMMON *cm, ThreadData *td) {
   int i, j;
+#if CONFIG_EXT_PARTITION
+  const int leaf_nodes = 256;
+  const int tree_nodes = 256 + 64 + 16 + 4 + 1;
+#else
   const int leaf_nodes = 64;
   const int tree_nodes = 64 + 16 + 4 + 1;
+#endif  // CONFIG_EXT_PARTITION
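+  // Each level of the tree has a quarter as many nodes as the level below,
+  // so tree_nodes sums leaf_nodes + leaf_nodes / 4 + ... + 1.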
   int pc_tree_index = 0;
   PC_TREE *this_pc;
   PICK_MODE_CONTEXT *this_leaf;
@@ -117,8 +213,13 @@
 
   // 4x4 blocks smaller than 8x8 but in the same 8x8 block share the same
   // context so we only need to allocate 1 for each 8x8 block.
-  for (i = 0; i < leaf_nodes; ++i)
+  for (i = 0; i < leaf_nodes; ++i) {
+#if CONFIG_EXT_PARTITION_TYPES
+    alloc_mode_context(cm, 1, PARTITION_NONE, &td->leaf_tree[i]);
+#else
     alloc_mode_context(cm, 1, &td->leaf_tree[i]);
+#endif
+  }
 
   // Sets up all the leaf nodes in the tree.
   for (pc_tree_index = 0; pc_tree_index < leaf_nodes; ++pc_tree_index) {
@@ -132,7 +233,7 @@
 
   // Each node has 4 leaf nodes, fill each block_size level of the tree
   // from leafs to the root.
-  for (nodes = 16; nodes > 0; nodes >>= 2) {
+  for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
     for (i = 0; i < nodes; ++i) {
       PC_TREE *const tree = &td->pc_tree[pc_tree_index];
       alloc_tree_contexts(cm, tree, 4 << (2 * square_index));
@@ -143,16 +244,30 @@
     }
     ++square_index;
   }
-  td->pc_root = &td->pc_tree[tree_nodes - 1];
-  td->pc_root[0].none.best_mode_index = 2;
+
+  // Set up the root node for the largest superblock size
+  i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
+  td->pc_root[i] = &td->pc_tree[tree_nodes - 1];
+  td->pc_root[i]->none.best_mode_index = 2;
+  // Set up the root nodes for the rest of the possible superblock sizes
+  while (--i >= 0) {
+    td->pc_root[i] = td->pc_root[i+1]->split[0];
+    td->pc_root[i]->none.best_mode_index = 2;
+  }
 }
 
 void vp10_free_pc_tree(ThreadData *td) {
+#if CONFIG_EXT_PARTITION
+  const int leaf_nodes = 256;
+  const int tree_nodes = 256 + 64 + 16 + 4 + 1;
+#else
+  const int leaf_nodes = 64;
   const int tree_nodes = 64 + 16 + 4 + 1;
+#endif  // CONFIG_EXT_PARTITION
   int i;
 
   // Free all 4x4 mode contexts
-  for (i = 0; i < 64; ++i)
+  for (i = 0; i < leaf_nodes; ++i)
     free_mode_context(&td->leaf_tree[i]);
 
   // Free all the PC_TREE nodes.
diff --git a/vp10/encoder/context_tree.h b/vp10/encoder/context_tree.h
index 2a0fffb..7b49354 100644
--- a/vp10/encoder/context_tree.h
+++ b/vp10/encoder/context_tree.h
@@ -28,6 +28,9 @@
   MB_MODE_INFO_EXT mbmi_ext;
   uint8_t *zcoeff_blk;
   uint8_t *color_index_map[2];
+#if CONFIG_VAR_TX
+  uint8_t *blk_skip[MAX_MB_PLANE];
+#endif
   tran_low_t *coeff[MAX_MB_PLANE][3];
   tran_low_t *qcoeff[MAX_MB_PLANE][3];
   tran_low_t *dqcoeff[MAX_MB_PLANE][3];
@@ -46,12 +49,10 @@
   // For current partition, only if all Y, U, and V transform blocks'
   // coefficients are quantized to 0, skippable is set to 0.
   int skippable;
-  uint8_t skip_txfm[MAX_MB_PLANE << 2];
   int best_mode_index;
   int hybrid_pred_diff;
   int comp_pred_diff;
   int single_pred_diff;
-  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
 
   // TODO(jingning) Use RD_COST struct here instead. This involves a broader
   // scope of refactoring.
@@ -71,6 +72,9 @@
   // search loop
   MV pred_mv[MAX_REF_FRAMES];
   INTERP_FILTER pred_interp_filter;
+#if CONFIG_EXT_PARTITION_TYPES
+  PARTITION_TYPE partition;
+#endif
 } PICK_MODE_CONTEXT;
 
 typedef struct PC_TREE {
@@ -80,10 +84,27 @@
   PICK_MODE_CONTEXT none;
   PICK_MODE_CONTEXT horizontal[2];
   PICK_MODE_CONTEXT vertical[2];
+#if CONFIG_EXT_PARTITION_TYPES
+  PICK_MODE_CONTEXT horizontala[3];
+  PICK_MODE_CONTEXT horizontalb[3];
+  PICK_MODE_CONTEXT verticala[3];
+  PICK_MODE_CONTEXT verticalb[3];
+#endif
   union {
     struct PC_TREE *split[4];
     PICK_MODE_CONTEXT *leaf_split[4];
   };
+#if CONFIG_SUPERTX
+  PICK_MODE_CONTEXT horizontal_supertx;
+  PICK_MODE_CONTEXT vertical_supertx;
+  PICK_MODE_CONTEXT split_supertx;
+#if CONFIG_EXT_PARTITION_TYPES
+  PICK_MODE_CONTEXT horizontala_supertx;
+  PICK_MODE_CONTEXT horizontalb_supertx;
+  PICK_MODE_CONTEXT verticala_supertx;
+  PICK_MODE_CONTEXT verticalb_supertx;
+#endif
+#endif
 } PC_TREE;
 
 void vp10_setup_pc_tree(struct VP10Common *cm, struct ThreadData *td);
diff --git a/vp10/encoder/cost.c b/vp10/encoder/cost.c
index aab8263..6053d2e 100644
--- a/vp10/encoder/cost.c
+++ b/vp10/encoder/cost.c
@@ -10,36 +10,134 @@
 #include <assert.h>
 
 #include "vp10/encoder/cost.h"
+#if CONFIG_ANS
+#include "vp10/common/ans.h"
+#endif  // CONFIG_ANS
+#include "vp10/common/entropy.h"
 
-const unsigned int vp10_prob_cost[256] = {
-  2047, 2047, 1791, 1641, 1535, 1452, 1385, 1328, 1279, 1235, 1196, 1161,
-  1129, 1099, 1072, 1046, 1023, 1000, 979,  959,  940,  922,  905,  889,
-  873,  858,  843,  829,  816,  803,  790,  778,  767,  755,  744,  733,
-  723,  713,  703,  693,  684,  675,  666,  657,  649,  641,  633,  625,
-  617,  609,  602,  594,  587,  580,  573,  567,  560,  553,  547,  541,
-  534,  528,  522,  516,  511,  505,  499,  494,  488,  483,  477,  472,
-  467,  462,  457,  452,  447,  442,  437,  433,  428,  424,  419,  415,
-  410,  406,  401,  397,  393,  389,  385,  381,  377,  373,  369,  365,
-  361,  357,  353,  349,  346,  342,  338,  335,  331,  328,  324,  321,
-  317,  314,  311,  307,  304,  301,  297,  294,  291,  288,  285,  281,
-  278,  275,  272,  269,  266,  263,  260,  257,  255,  252,  249,  246,
-  243,  240,  238,  235,  232,  229,  227,  224,  221,  219,  216,  214,
-  211,  208,  206,  203,  201,  198,  196,  194,  191,  189,  186,  184,
-  181,  179,  177,  174,  172,  170,  168,  165,  163,  161,  159,  156,
-  154,  152,  150,  148,  145,  143,  141,  139,  137,  135,  133,  131,
-  129,  127,  125,  123,  121,  119,  117,  115,  113,  111,  109,  107,
-  105,  103,  101,  99,   97,   95,   93,   92,   90,   88,   86,   84,
-  82,   81,   79,   77,   75,   73,   72,   70,   68,   66,   65,   63,
-  61,   60,   58,   56,   55,   53,   51,   50,   48,   46,   45,   43,
-  41,   40,   38,   37,   35,   33,   32,   30,   29,   27,   25,   24,
-  22,   21,   19,   18,   16,   15,   13,   12,   10,   9,    7,    6,
-  4,    3,    1,    1};
+/* round(-log2(i/256.) * (1 << VP9_PROB_COST_SHIFT))
+   Begins with a bogus entry for simpler addressing. */
+const uint16_t vp10_prob_cost[256] = {
+    4096, 4096, 3584, 3284, 3072, 2907, 2772, 2659, 2560, 2473, 2395, 2325,
+    2260, 2201, 2147, 2096, 2048, 2003, 1961, 1921, 1883, 1847, 1813, 1780,
+    1748, 1718, 1689, 1661, 1635, 1609, 1584, 1559, 1536, 1513, 1491, 1470,
+    1449, 1429, 1409, 1390, 1371, 1353, 1335, 1318, 1301, 1284, 1268, 1252,
+    1236, 1221, 1206, 1192, 1177, 1163, 1149, 1136, 1123, 1110, 1097, 1084,
+    1072, 1059, 1047, 1036, 1024, 1013, 1001, 990,  979,  968,  958,  947,
+    937,  927,  917,  907,  897,  887,  878,  868,  859,  850,  841,  832,
+    823,  814,  806,  797,  789,  780,  772,  764,  756,  748,  740,  732,
+    724,  717,  709,  702,  694,  687,  680,  673,  665,  658,  651,  644,
+    637,  631,  624,  617,  611,  604,  598,  591,  585,  578,  572,  566,
+    560,  554,  547,  541,  535,  530,  524,  518,  512,  506,  501,  495,
+    489,  484,  478,  473,  467,  462,  456,  451,  446,  441,  435,  430,
+    425,  420,  415,  410,  405,  400,  395,  390,  385,  380,  375,  371,
+    366,  361,  356,  352,  347,  343,  338,  333,  329,  324,  320,  316,
+    311,  307,  302,  298,  294,  289,  285,  281,  277,  273,  268,  264,
+    260,  256,  252,  248,  244,  240,  236,  232,  228,  224,  220,  216,
+    212,  209,  205,  201,  197,  194,  190,  186,  182,  179,  175,  171,
+    168,  164,  161,  157,  153,  150,  146,  143,  139,  136,  132,  129,
+    125,  122,  119,  115,  112,  109,  105,  102,  99,   95,   92,   89,
+    86,   82,   79,   76,   73,   70,   66,   63,   60,   57,   54,   51,
+    48,   45,   42,   38,   35,   32,   29,   26,   23,   20,   18,   15,
+    12,   9,    6,    3};
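+
+/* Worked example: p = 128 represents probability 128/256 = 1/2, and
+ * round(-log2(0.5) * (1 << 9)) = 512 = vp10_prob_cost[128], i.e. coding a
+ * bit with a 50% probability costs exactly one bit (512 units). */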
+
+#if CONFIG_ANS
+// round(-log2(i/1024.) * (1 << VP9_PROB_COST_SHIFT))
+static const uint16_t vp10_prob_cost10[1024] = {
+    5120, 5120, 4608, 4308, 4096, 3931, 3796, 3683, 3584, 3497, 3419, 3349,
+    3284, 3225, 3171, 3120, 3072, 3027, 2985, 2945, 2907, 2871, 2837, 2804,
+    2772, 2742, 2713, 2685, 2659, 2633, 2608, 2583, 2560, 2537, 2515, 2494,
+    2473, 2453, 2433, 2414, 2395, 2377, 2359, 2342, 2325, 2308, 2292, 2276,
+    2260, 2245, 2230, 2216, 2201, 2187, 2173, 2160, 2147, 2134, 2121, 2108,
+    2096, 2083, 2071, 2060, 2048, 2037, 2025, 2014, 2003, 1992, 1982, 1971,
+    1961, 1951, 1941, 1931, 1921, 1911, 1902, 1892, 1883, 1874, 1865, 1856,
+    1847, 1838, 1830, 1821, 1813, 1804, 1796, 1788, 1780, 1772, 1764, 1756,
+    1748, 1741, 1733, 1726, 1718, 1711, 1704, 1697, 1689, 1682, 1675, 1668,
+    1661, 1655, 1648, 1641, 1635, 1628, 1622, 1615, 1609, 1602, 1596, 1590,
+    1584, 1578, 1571, 1565, 1559, 1554, 1548, 1542, 1536, 1530, 1525, 1519,
+    1513, 1508, 1502, 1497, 1491, 1486, 1480, 1475, 1470, 1465, 1459, 1454,
+    1449, 1444, 1439, 1434, 1429, 1424, 1419, 1414, 1409, 1404, 1399, 1395,
+    1390, 1385, 1380, 1376, 1371, 1367, 1362, 1357, 1353, 1348, 1344, 1340,
+    1335, 1331, 1326, 1322, 1318, 1313, 1309, 1305, 1301, 1297, 1292, 1288,
+    1284, 1280, 1276, 1272, 1268, 1264, 1260, 1256, 1252, 1248, 1244, 1240,
+    1236, 1233, 1229, 1225, 1221, 1218, 1214, 1210, 1206, 1203, 1199, 1195,
+    1192, 1188, 1185, 1181, 1177, 1174, 1170, 1167, 1163, 1160, 1156, 1153,
+    1149, 1146, 1143, 1139, 1136, 1133, 1129, 1126, 1123, 1119, 1116, 1113,
+    1110, 1106, 1103, 1100, 1097, 1094, 1090, 1087, 1084, 1081, 1078, 1075,
+    1072, 1069, 1066, 1062, 1059, 1056, 1053, 1050, 1047, 1044, 1042, 1039,
+    1036, 1033, 1030, 1027, 1024, 1021, 1018, 1015, 1013, 1010, 1007, 1004,
+    1001, 998,  996,  993,  990,  987,  985,  982,  979,  977,  974,  971,
+    968,  966,  963,  960,  958,  955,  953,  950,  947,  945,  942,  940,
+    937,  934,  932,  929,  927,  924,  922,  919,  917,  914,  912,  909,
+    907,  904,  902,  899,  897,  895,  892,  890,  887,  885,  883,  880,
+    878,  876,  873,  871,  868,  866,  864,  861,  859,  857,  855,  852,
+    850,  848,  845,  843,  841,  839,  836,  834,  832,  830,  828,  825,
+    823,  821,  819,  817,  814,  812,  810,  808,  806,  804,  801,  799,
+    797,  795,  793,  791,  789,  787,  785,  783,  780,  778,  776,  774,
+    772,  770,  768,  766,  764,  762,  760,  758,  756,  754,  752,  750,
+    748,  746,  744,  742,  740,  738,  736,  734,  732,  730,  728,  726,
+    724,  723,  721,  719,  717,  715,  713,  711,  709,  707,  706,  704,
+    702,  700,  698,  696,  694,  693,  691,  689,  687,  685,  683,  682,
+    680,  678,  676,  674,  673,  671,  669,  667,  665,  664,  662,  660,
+    658,  657,  655,  653,  651,  650,  648,  646,  644,  643,  641,  639,
+    637,  636,  634,  632,  631,  629,  627,  626,  624,  622,  621,  619,
+    617,  616,  614,  612,  611,  609,  607,  606,  604,  602,  601,  599,
+    598,  596,  594,  593,  591,  590,  588,  586,  585,  583,  582,  580,
+    578,  577,  575,  574,  572,  571,  569,  567,  566,  564,  563,  561,
+    560,  558,  557,  555,  554,  552,  550,  549,  547,  546,  544,  543,
+    541,  540,  538,  537,  535,  534,  532,  531,  530,  528,  527,  525,
+    524,  522,  521,  519,  518,  516,  515,  513,  512,  511,  509,  508,
+    506,  505,  503,  502,  501,  499,  498,  496,  495,  493,  492,  491,
+    489,  488,  486,  485,  484,  482,  481,  480,  478,  477,  475,  474,
+    473,  471,  470,  469,  467,  466,  465,  463,  462,  460,  459,  458,
+    456,  455,  454,  452,  451,  450,  448,  447,  446,  444,  443,  442,
+    441,  439,  438,  437,  435,  434,  433,  431,  430,  429,  428,  426,
+    425,  424,  422,  421,  420,  419,  417,  416,  415,  414,  412,  411,
+    410,  409,  407,  406,  405,  404,  402,  401,  400,  399,  397,  396,
+    395,  394,  392,  391,  390,  389,  387,  386,  385,  384,  383,  381,
+    380,  379,  378,  377,  375,  374,  373,  372,  371,  369,  368,  367,
+    366,  365,  364,  362,  361,  360,  359,  358,  356,  355,  354,  353,
+    352,  351,  349,  348,  347,  346,  345,  344,  343,  341,  340,  339,
+    338,  337,  336,  335,  333,  332,  331,  330,  329,  328,  327,  326,
+    324,  323,  322,  321,  320,  319,  318,  317,  316,  314,  313,  312,
+    311,  310,  309,  308,  307,  306,  305,  303,  302,  301,  300,  299,
+    298,  297,  296,  295,  294,  293,  292,  291,  289,  288,  287,  286,
+    285,  284,  283,  282,  281,  280,  279,  278,  277,  276,  275,  274,
+    273,  272,  271,  269,  268,  267,  266,  265,  264,  263,  262,  261,
+    260,  259,  258,  257,  256,  255,  254,  253,  252,  251,  250,  249,
+    248,  247,  246,  245,  244,  243,  242,  241,  240,  239,  238,  237,
+    236,  235,  234,  233,  232,  231,  230,  229,  228,  227,  226,  225,
+    224,  223,  222,  221,  220,  219,  218,  217,  216,  215,  214,  213,
+    212,  212,  211,  210,  209,  208,  207,  206,  205,  204,  203,  202,
+    201,  200,  199,  198,  197,  196,  195,  194,  194,  193,  192,  191,
+    190,  189,  188,  187,  186,  185,  184,  183,  182,  181,  181,  180,
+    179,  178,  177,  176,  175,  174,  173,  172,  171,  170,  170,  169,
+    168,  167,  166,  165,  164,  163,  162,  161,  161,  160,  159,  158,
+    157,  156,  155,  154,  153,  152,  152,  151,  150,  149,  148,  147,
+    146,  145,  145,  144,  143,  142,  141,  140,  139,  138,  138,  137,
+    136,  135,  134,  133,  132,  132,  131,  130,  129,  128,  127,  126,
+    125,  125,  124,  123,  122,  121,  120,  120,  119,  118,  117,  116,
+    115,  114,  114,  113,  112,  111,  110,  109,  109,  108,  107,  106,
+    105,  104,  104,  103,  102,  101,  100,  99,   99,   98,   97,   96,
+    95,   95,   94,   93,   92,   91,   90,   90,   89,   88,   87,   86,
+    86,   85,   84,   83,   82,   82,   81,   80,   79,   78,   78,   77,
+    76,   75,   74,   74,   73,   72,   71,   70,   70,   69,   68,   67,
+    66,   66,   65,   64,   63,   62,   62,   61,   60,   59,   59,   58,
+    57,   56,   55,   55,   54,   53,   52,   52,   51,   50,   49,   48,
+    48,   47,   46,   45,   45,   44,   43,   42,   42,   41,   40,   39,
+    38,   38,   37,   36,   35,   35,   34,   33,   32,   32,   31,   30,
+    29,   29,   28,   27,   26,   26,   25,   24,   23,   23,   22,   21,
+    20,   20,   19,   18,   18,   17,   16,   15,   15,   14,   13,   12,
+    12,   11,   10,   9,    9,    8,    7,    7,    6,    5,    4,    4,
+    3,    2,    1,    1};
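+
+/* As above, but for 10-bit probabilities: entry p costs
+ * round(-log2(p / 1024.) * (1 << 9)) units. vp10_cost_tokens_ans() below
+ * indexes this table with CDF gaps, token_cdf[i + 1] - token_cdf[i]. */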
+#endif  // CONFIG_ANS
 
 static void cost(int *costs, vpx_tree tree, const vpx_prob *probs,
                  int i, int c) {
   const vpx_prob prob = probs[i / 2];
   int b;
 
+  assert(prob != 0);
   for (b = 0; b <= 1; ++b) {
     const int cc = c + vp10_cost_bit(prob, b);
     const vpx_tree_index ii = tree[i + b];
@@ -51,6 +149,21 @@
   }
 }
 
+#if CONFIG_ANS
+void vp10_cost_tokens_ans(int *costs, const vpx_prob *tree_probs,
+                          const rans_dec_lut token_cdf, int skip_eob) {
+  int c_tree = 0;  // Cost of the "tree" nodes EOB and ZERO.
+  int i;
+  costs[EOB_TOKEN] = vp10_cost_bit(tree_probs[0], 0);
+  if (!skip_eob)
+    c_tree = vp10_cost_bit(tree_probs[0], 1);
+  for (i = ZERO_TOKEN; i <= CATEGORY6_TOKEN; ++i) {
+    const int p = token_cdf[i + 1] - token_cdf[i];
+    costs[i] = c_tree + vp10_prob_cost10[p];
+  }
+}
+#endif  // CONFIG_ANS
+
 void vp10_cost_tokens(int *costs, const vpx_prob *probs, vpx_tree tree) {
   cost(costs, tree, probs, 0, 0);
 }
diff --git a/vp10/encoder/cost.h b/vp10/encoder/cost.h
index b9619c6..bfd0be0 100644
--- a/vp10/encoder/cost.h
+++ b/vp10/encoder/cost.h
@@ -12,20 +12,31 @@
 #define VP10_ENCODER_COST_H_
 
 #include "vpx_dsp/prob.h"
+#include "vpx/vpx_integer.h"
+#if CONFIG_ANS
+#include "vp10/common/ans.h"
+#endif  // CONFIG_ANS
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-extern const unsigned int vp10_prob_cost[256];
+extern const uint16_t vp10_prob_cost[256];
+
+// The factor to scale from cost in bits to cost in vp10_prob_cost units.
+#define VP9_PROB_COST_SHIFT 9
 
 #define vp10_cost_zero(prob) (vp10_prob_cost[prob])
 
-#define vp10_cost_one(prob) vp10_cost_zero(vpx_complement(prob))
+#define vp10_cost_one(prob) vp10_cost_zero(256 - (prob))
 
-#define vp10_cost_bit(prob, bit) vp10_cost_zero((bit) ? vpx_complement(prob) \
+#define vp10_cost_bit(prob, bit) vp10_cost_zero((bit) ? 256 - (prob) \
                                                     : (prob))
 
+// Cost of coding an n-bit literal, using 128 (i.e. 50%) probability
+// for each bit.
+#define vp10_cost_literal(n) ((n) * (1 << VP9_PROB_COST_SHIFT))
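+// e.g. vp10_cost_literal(3) == 3 << VP9_PROB_COST_SHIFT == 1536, the cost
+// of three full bits.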
+
 static INLINE unsigned int cost_branch256(const unsigned int ct[2],
                                           vpx_prob p) {
   return ct[0] * vp10_cost_zero(p) + ct[1] * vp10_cost_one(p);
@@ -48,6 +59,11 @@
 void vp10_cost_tokens(int *costs, const vpx_prob *probs, vpx_tree tree);
 void vp10_cost_tokens_skip(int *costs, const vpx_prob *probs, vpx_tree tree);
 
+#if CONFIG_ANS
+void vp10_cost_tokens_ans(int *costs, const vpx_prob *tree_probs,
+                          const rans_dec_lut token_cdf, int skip_eob);
+#endif
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/encoder/dct.c b/vp10/encoder/dct.c
index 132a141..11d4a8e 100644
--- a/vp10/encoder/dct.c
+++ b/vp10/encoder/dct.c
@@ -14,7 +14,6 @@
 #include "./vp10_rtcd.h"
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
-
 #include "vp10/common/blockd.h"
 #include "vp10/common/idct.h"
 #include "vpx_dsp/fwd_txfm.h"
@@ -325,7 +324,7 @@
   range_check(output, 16, 16);
 }
 
-/* TODO(angiebird): Unify this with vp10_fwd_txfm.c: vp10_fdct32
+#if CONFIG_EXT_TX
 static void fdct32(const tran_low_t *input, tran_low_t *output) {
   tran_high_t temp;
   tran_low_t step[32];
@@ -723,7 +722,7 @@
 
   range_check(output, 32, 18);
 }
-*/
+#endif  // CONFIG_EXT_TX
 
 static void fadst4(const tran_low_t *input, tran_low_t *output) {
   tran_high_t x0, x1, x2, x3;
@@ -999,29 +998,231 @@
   output[15] = (tran_low_t)-x1;
 }
 
+#if CONFIG_EXT_TX
+static void fidtx4(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  for (i = 0; i < 4; ++i)
+    output[i] = (tran_low_t)fdct_round_shift(input[i] * Sqrt2);
+}
+
+static void fidtx8(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  for (i = 0; i < 8; ++i)
+    output[i] = input[i] * 2;
+}
+
+static void fidtx16(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  for (i = 0; i < 16; ++i)
+    output[i] = (tran_low_t)fdct_round_shift(input[i] * 2 * Sqrt2);
+}
+
+static void fidtx32(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  for (i = 0; i < 32; ++i)
+    output[i] = input[i] * 4;
+}
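+
+// The identity transforms above scale by sqrt(2), 2, 2*sqrt(2) and 4 for
+// sizes 4, 8, 16 and 32, i.e. by sqrt(n/2), matching the gain of the
+// same-size fdct so that mixed 2-D transforms stay consistently scaled.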
+
+// For use in lieu of ADST
+static void fhalfright32(const tran_low_t *input, tran_low_t *output) {
+  int i;
+  tran_low_t inputhalf[16];
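+  // The first half of the input, scaled by 4, becomes the second half of
+  // the output.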
+  for (i = 0; i < 16; ++i) {
+    output[16 + i] = input[i] * 4;
+  }
+  // Multiply the second half of the input by sqrt(2)
+  for (i = 0; i < 16; ++i) {
+    inputhalf[i] = (tran_low_t)fdct_round_shift(input[i + 16] * Sqrt2);
+  }
+  fdct16(inputhalf, output);
+  // Note overall scaling factor is 4 times orthogonal
+}
+
+static void copy_block(const int16_t *src, int src_stride, int l,
+                       int16_t *dest, int dest_stride) {
+  int i;
+  for (i = 0; i < l; ++i) {
+    memcpy(dest + dest_stride * i, src + src_stride * i,
+           l * sizeof(int16_t));
+  }
+}
+
+static void fliplr(int16_t *dest, int stride, int l) {
+  int i, j;
+  for (i = 0; i < l; ++i) {
+    for (j = 0; j < l / 2; ++j) {
+      const int16_t tmp = dest[i * stride + j];
+      dest[i * stride + j] = dest[i * stride + l - 1 - j];
+      dest[i * stride + l - 1 - j] = tmp;
+    }
+  }
+}
+
+static void flipud(int16_t *dest, int stride, int l) {
+  int i, j;
+  for (j = 0; j < l; ++j) {
+    for (i = 0; i < l / 2; ++i) {
+      const int16_t tmp = dest[i * stride + j];
+      dest[i * stride + j] = dest[(l - 1 - i) * stride + j];
+      dest[(l - 1 - i) * stride + j] = tmp;
+    }
+  }
+}
+
+static void fliplrud(int16_t *dest, int stride, int l) {
+  int i, j;
+  for (i = 0; i < l / 2; ++i) {
+    for (j = 0; j < l; ++j) {
+      const int16_t tmp = dest[i * stride + j];
+      dest[i * stride + j] = dest[(l - 1 - i) * stride + l - 1 - j];
+      dest[(l - 1 - i) * stride + l - 1 - j] = tmp;
+    }
+  }
+}
+
+static void copy_fliplr(const int16_t *src, int src_stride, int l,
+                          int16_t *dest, int dest_stride) {
+  copy_block(src, src_stride, l, dest, dest_stride);
+  fliplr(dest, dest_stride, l);
+}
+
+static void copy_flipud(const int16_t *src, int src_stride, int l,
+                          int16_t *dest, int dest_stride) {
+  copy_block(src, src_stride, l, dest, dest_stride);
+  flipud(dest, dest_stride, l);
+}
+
+static void copy_fliplrud(const int16_t *src, int src_stride, int l,
+                            int16_t *dest, int dest_stride) {
+  copy_block(src, src_stride, l, dest, dest_stride);
+  fliplrud(dest, dest_stride, l);
+}
+
+static void maybe_flip_input(const int16_t **src, int *src_stride, int l,
+                             int16_t *buff, int tx_type) {
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case IDTX:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+      break;
+    case FLIPADST_DCT:
+    case FLIPADST_ADST:
+    case V_FLIPADST:
+      copy_flipud(*src, *src_stride, l, buff, l);
+      *src = buff;
+      *src_stride = l;
+      break;
+    case DCT_FLIPADST:
+    case ADST_FLIPADST:
+    case H_FLIPADST:
+      copy_fliplr(*src, *src_stride, l, buff, l);
+      *src = buff;
+      *src_stride = l;
+      break;
+    case FLIPADST_FLIPADST:
+      copy_fliplrud(*src, *src_stride, l, buff, l);
+      *src = buff;
+      *src_stride = l;
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
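+
+/* For example, FLIPADST_DCT reuses the ADST_DCT kernel pair from the FHT
+ * tables below; the up-down flip applied here to the residue is what turns
+ * the column ADST into a flipped ADST. */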
+#endif  // CONFIG_EXT_TX
+
 static const transform_2d FHT_4[] = {
-  { fdct4,  fdct4  },  // DCT_DCT  = 0
-  { fadst4, fdct4  },  // ADST_DCT = 1
-  { fdct4,  fadst4 },  // DCT_ADST = 2
-  { fadst4, fadst4 }   // ADST_ADST = 3
+  { fdct4,  fdct4  },  // DCT_DCT
+  { fadst4, fdct4  },  // ADST_DCT
+  { fdct4,  fadst4 },  // DCT_ADST
+  { fadst4, fadst4 },  // ADST_ADST
+#if CONFIG_EXT_TX
+  { fadst4, fdct4  },  // FLIPADST_DCT
+  { fdct4,  fadst4 },  // DCT_FLIPADST
+  { fadst4, fadst4 },  // FLIPADST_FLIPADST
+  { fadst4, fadst4 },  // ADST_FLIPADST
+  { fadst4, fadst4 },  // FLIPADST_ADST
+  { fidtx4, fidtx4 },  // IDTX
+  { fdct4,  fidtx4 },  // V_DCT
+  { fidtx4, fdct4  },  // H_DCT
+  { fadst4, fidtx4 },  // V_ADST
+  { fidtx4, fadst4 },  // H_ADST
+  { fadst4, fidtx4 },  // V_FLIPADST
+  { fidtx4, fadst4 },  // H_FLIPADST
+#endif  // CONFIG_EXT_TX
 };
 
 static const transform_2d FHT_8[] = {
-  { fdct8,  fdct8  },  // DCT_DCT  = 0
-  { fadst8, fdct8  },  // ADST_DCT = 1
-  { fdct8,  fadst8 },  // DCT_ADST = 2
-  { fadst8, fadst8 }   // ADST_ADST = 3
+  { fdct8,  fdct8  },  // DCT_DCT
+  { fadst8, fdct8  },  // ADST_DCT
+  { fdct8,  fadst8 },  // DCT_ADST
+  { fadst8, fadst8 },  // ADST_ADST
+#if CONFIG_EXT_TX
+  { fadst8, fdct8  },  // FLIPADST_DCT
+  { fdct8,  fadst8 },  // DCT_FLIPADST
+  { fadst8, fadst8 },  // FLIPADST_FLIPADST
+  { fadst8, fadst8 },  // ADST_FLIPADST
+  { fadst8, fadst8 },  // FLIPADST_ADST
+  { fidtx8, fidtx8 },  // IDTX
+  { fdct8,  fidtx8 },  // V_DCT
+  { fidtx8, fdct8  },  // H_DCT
+  { fadst8, fidtx8 },  // V_ADST
+  { fidtx8, fadst8 },  // H_ADST
+  { fadst8, fidtx8 },  // V_FLIPADST
+  { fidtx8, fadst8 },  // H_FLIPADST
+#endif  // CONFIG_EXT_TX
 };
 
 static const transform_2d FHT_16[] = {
-  { fdct16,  fdct16  },  // DCT_DCT  = 0
-  { fadst16, fdct16  },  // ADST_DCT = 1
-  { fdct16,  fadst16 },  // DCT_ADST = 2
-  { fadst16, fadst16 }   // ADST_ADST = 3
+  { fdct16,  fdct16  },  // DCT_DCT
+  { fadst16, fdct16  },  // ADST_DCT
+  { fdct16,  fadst16 },  // DCT_ADST
+  { fadst16, fadst16 },  // ADST_ADST
+#if CONFIG_EXT_TX
+  { fadst16, fdct16  },  // FLIPADST_DCT
+  { fdct16,  fadst16 },  // DCT_FLIPADST
+  { fadst16, fadst16 },  // FLIPADST_FLIPADST
+  { fadst16, fadst16 },  // ADST_FLIPADST
+  { fadst16, fadst16 },  // FLIPADST_ADST
+  { fidtx16, fidtx16 },  // IDTX
+  { fdct16,  fidtx16 },  // V_DCT
+  { fidtx16, fdct16  },  // H_DCT
+  { fadst16, fidtx16 },  // V_ADST
+  { fidtx16, fadst16 },  // H_ADST
+  { fadst16, fidtx16 },  // V_FLIPADST
+  { fidtx16, fadst16 },  // H_FLIPADST
+#endif  // CONFIG_EXT_TX
 };
 
+#if CONFIG_EXT_TX
+static const transform_2d FHT_32[] = {
+  { fdct32,  fdct32  },                // DCT_DCT
+  { fhalfright32, fdct32  },           // ADST_DCT
+  { fdct32,  fhalfright32 },           // DCT_ADST
+  { fhalfright32, fhalfright32 },      // ADST_ADST
+  { fhalfright32, fdct32  },           // FLIPADST_DCT
+  { fdct32,  fhalfright32 },           // DCT_FLIPADST
+  { fhalfright32, fhalfright32 },      // FLIPADST_FLIPADST
+  { fhalfright32, fhalfright32 },      // ADST_FLIPADST
+  { fhalfright32, fhalfright32 },      // FLIPADST_ADST
+  { fidtx32, fidtx32 },                // IDTX
+  { fdct32,  fidtx32 },                // V_DCT
+  { fidtx32, fdct32  },                // H_DCT
+  { fhalfright32, fidtx32 },           // V_ADST
+  { fidtx32, fhalfright32 },           // H_ADST
+  { fhalfright32, fidtx32 },           // V_FLIPADST
+  { fidtx32, fhalfright32 },           // H_FLIPADST
+};
+#endif  // CONFIG_EXT_TX
+
 void vp10_fht4x4_c(const int16_t *input, tran_low_t *output,
-                  int stride, int tx_type) {
+                   int stride, int tx_type) {
   if (tx_type == DCT_DCT) {
     vpx_fdct4x4_c(input, output, stride);
   } else {
@@ -1030,6 +1231,11 @@
     tran_low_t temp_in[4], temp_out[4];
     const transform_2d ht = FHT_4[tx_type];
 
+#if CONFIG_EXT_TX
+    int16_t flipped_input[4 * 4];
+    maybe_flip_input(&input, &stride, 4, flipped_input, tx_type);
+#endif
+
     // Columns
     for (i = 0; i < 4; ++i) {
       for (j = 0; j < 4; ++j)
@@ -1053,15 +1259,15 @@
 }
 
 void vp10_fdct8x8_quant_c(const int16_t *input, int stride,
-                         tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         int skip_block,
-                         const int16_t *zbin_ptr, const int16_t *round_ptr,
-                         const int16_t *quant_ptr,
-                         const int16_t *quant_shift_ptr,
-                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
-                         const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr,
-                         const int16_t *scan, const int16_t *iscan) {
+                          tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                          int skip_block,
+                          const int16_t *zbin_ptr, const int16_t *round_ptr,
+                          const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr,
+                          uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
   int eob = -1;
 
   int i, j;
@@ -1165,7 +1371,7 @@
 }
 
 void vp10_fht8x8_c(const int16_t *input, tran_low_t *output,
-                  int stride, int tx_type) {
+                   int stride, int tx_type) {
   if (tx_type == DCT_DCT) {
     vpx_fdct8x8_c(input, output, stride);
   } else {
@@ -1174,6 +1380,11 @@
     tran_low_t temp_in[8], temp_out[8];
     const transform_2d ht = FHT_8[tx_type];
 
+#if CONFIG_EXT_TX
+    int16_t flipped_input[8 * 8];
+    maybe_flip_input(&input, &stride, 8, flipped_input, tx_type);
+#endif
+
     // Columns
     for (i = 0; i < 8; ++i) {
       for (j = 0; j < 8; ++j)
@@ -1251,7 +1462,7 @@
 }
 
 void vp10_fht16x16_c(const int16_t *input, tran_low_t *output,
-                    int stride, int tx_type) {
+                     int stride, int tx_type) {
   if (tx_type == DCT_DCT) {
     vpx_fdct16x16_c(input, output, stride);
   } else {
@@ -1260,6 +1471,11 @@
     tran_low_t temp_in[16], temp_out[16];
     const transform_2d ht = FHT_16[tx_type];
 
+#if CONFIG_EXT_TX
+    int16_t flipped_input[16 * 16];
+    maybe_flip_input(&input, &stride, 16, flipped_input, tx_type);
+#endif
+
     // Columns
     for (i = 0; i < 16; ++i) {
       for (j = 0; j < 16; ++j)
@@ -1301,3 +1517,61 @@
   vp10_fht16x16_c(input, output, stride, tx_type);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_EXT_TX
+void vp10_fht32x32_c(const int16_t *input, tran_low_t *output,
+                     int stride, int tx_type) {
+  if (tx_type == DCT_DCT) {
+    vpx_fdct32x32_c(input, output, stride);
+  } else {
+    tran_low_t out[1024];
+    int i, j;
+    tran_low_t temp_in[32], temp_out[32];
+    const transform_2d ht = FHT_32[tx_type];
+
+    int16_t flipped_input[32 * 32];
+    maybe_flip_input(&input, &stride, 32, flipped_input, tx_type);
+
+    // Columns
+    for (i = 0; i < 32; ++i) {
+      for (j = 0; j < 32; ++j)
+        temp_in[j] = input[j * stride + i] * 4;
+      ht.cols(temp_in, temp_out);
+      for (j = 0; j < 32; ++j)
+        out[j * 32 + i] = (temp_out[j] + 1 + (temp_out[j] > 0)) >> 2;
+    }
+
+    // Rows
+    for (i = 0; i < 32; ++i) {
+      for (j = 0; j < 32; ++j)
+        temp_in[j] = out[j + i * 32];
+      ht.rows(temp_in, temp_out);
+      for (j = 0; j < 32; ++j)
+        output[j + i * 32] =
+            (tran_low_t)((temp_out[j] + 1 + (temp_out[j] < 0)) >> 2);
+    }
+  }
+}
+
+// Forward identity transform.
+void vp10_fwd_idtx_c(const int16_t *src_diff,
+                     tran_low_t *coeff, int stride,
+                     int bs, int tx_type) {
+  int r, c;
+  const int shift = bs < 32 ? 3 : 2;
+  if (tx_type == IDTX) {
+    for (r = 0; r < bs; ++r) {
+      for (c = 0; c < bs; ++c) coeff[c] = src_diff[c] << shift;
+      src_diff += stride;
+      coeff += bs;
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_fht32x32_c(const int16_t *input, tran_low_t *output,
+                            int stride, int tx_type) {
+  vp10_fht32x32_c(input, output, stride, tx_type);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_EXT_TX
diff --git a/vp10/encoder/denoiser.c b/vp10/encoder/denoiser.c
index e5d8157..1ff854d 100644
--- a/vp10/encoder/denoiser.c
+++ b/vp10/encoder/denoiser.c
@@ -189,7 +189,7 @@
 
 static uint8_t *block_start(uint8_t *framebuf, int stride,
                             int mi_row, int mi_col) {
-  return framebuf + (stride * mi_row * 8) + (mi_col * 8);
+  return framebuf + (stride * mi_row * MI_SIZE) + (mi_col * MI_SIZE);
 }
 
 static VP9_DENOISER_DECISION perform_motion_compensation(VP9_DENOISER *denoiser,
@@ -230,9 +230,19 @@
     frame = ctx->best_zeromv_reference_frame;
 
     mbmi->ref_frame[0] = ctx->best_zeromv_reference_frame;
+#if CONFIG_EXT_INTER
+    if (has_second_ref(mbmi))
+      mbmi->mode = ZERO_ZEROMV;
+    else
+#endif  // CONFIG_EXT_INTER
     mbmi->mode = ZEROMV;
     mbmi->mv[0].as_int = 0;
 
+#if CONFIG_EXT_INTER
+    if (has_second_ref(mbmi))
+      ctx->best_sse_inter_mode = ZERO_ZEROMV;
+    else
+#endif  // CONFIG_EXT_INTER
     ctx->best_sse_inter_mode = ZEROMV;
     ctx->best_sse_mv.as_int = 0;
     ctx->newmv_sse = ctx->zeromv_sse;
@@ -377,9 +387,12 @@
 void vp10_denoiser_update_frame_info(VP9_DENOISER *denoiser,
                                     YV12_BUFFER_CONFIG src,
                                     FRAME_TYPE frame_type,
+                                    int refresh_last_frame,
+#if CONFIG_EXT_REFS
+                                    int refresh_bwd_ref_frame,
+#endif  // CONFIG_EXT_REFS
                                     int refresh_alt_ref_frame,
-                                    int refresh_golden_frame,
-                                    int refresh_last_frame) {
+                                    int refresh_golden_frame) {
   if (frame_type == KEY_FRAME) {
     int i;
     // Start at 1 so as not to overwrite the INTRA_FRAME
@@ -397,10 +410,18 @@
     swap_frame_buffer(&denoiser->running_avg_y[GOLDEN_FRAME],
                       &denoiser->running_avg_y[INTRA_FRAME]);
   }
+  // TODO(zoeliu): Explore whether the case of show_existing_frame == 1
+  //               should be handled differently.
   if (refresh_last_frame) {
     swap_frame_buffer(&denoiser->running_avg_y[LAST_FRAME],
                       &denoiser->running_avg_y[INTRA_FRAME]);
   }
+#if CONFIG_EXT_REFS
+  if (refresh_bwd_ref_frame) {
+    swap_frame_buffer(&denoiser->running_avg_y[BWDREF_FRAME],
+                      &denoiser->running_avg_y[INTRA_FRAME]);
+  }
+#endif  // CONFIG_EXT_REFS
 }
 
 void vp10_denoiser_reset_frame_stats(PICK_MODE_CONTEXT *ctx) {
diff --git a/vp10/encoder/denoiser.h b/vp10/encoder/denoiser.h
index e543fb0..d7fdf0c 100644
--- a/vp10/encoder/denoiser.h
+++ b/vp10/encoder/denoiser.h
@@ -35,9 +35,12 @@
 void vp10_denoiser_update_frame_info(VP9_DENOISER *denoiser,
                                     YV12_BUFFER_CONFIG src,
                                     FRAME_TYPE frame_type,
+                                    int refresh_last_frame,
+#if CONFIG_EXT_REFS
+                                    int refresh_bwd_ref_frame,
+#endif  // CONFIG_EXT_REFS
                                     int refresh_alt_ref_frame,
-                                    int refresh_golden_frame,
-                                    int refresh_last_frame);
+                                    int refresh_golden_frame);
 
 void vp10_denoiser_denoise(VP9_DENOISER *denoiser, MACROBLOCK *mb,
                           int mi_row, int mi_col, BLOCK_SIZE bs,
diff --git a/vp10/encoder/encodeframe.c b/vp10/encoder/encodeframe.c
index 26ce5a1..ff1ee6b 100644
--- a/vp10/encoder/encodeframe.c
+++ b/vp10/encoder/encodeframe.c
@@ -36,6 +36,9 @@
 #include "vp10/encoder/aq_complexity.h"
 #include "vp10/encoder/aq_cyclicrefresh.h"
 #include "vp10/encoder/aq_variance.h"
+#if CONFIG_SUPERTX
+#include "vp10/encoder/cost.h"
+#endif
 #include "vp10/encoder/encodeframe.h"
 #include "vp10/encoder/encodemb.h"
 #include "vp10/encoder/encodemv.h"
@@ -46,16 +49,66 @@
 #include "vp10/encoder/segmentation.h"
 #include "vp10/encoder/tokenize.h"
 
+#if CONFIG_VP9_HIGHBITDEPTH
+# define IF_HBD(...) __VA_ARGS__
+#else
+# define IF_HBD(...)
+#endif  // CONFIG_VP9_HIGHBITDEPTH
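+// IF_HBD(...) expands to its arguments only in high-bitdepth builds, so
+// optional parameters and statements can be spliced into shared code.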
+
 static void encode_superblock(VP10_COMP *cpi, ThreadData * td,
                               TOKENEXTRA **t, int output_enabled,
                               int mi_row, int mi_col, BLOCK_SIZE bsize,
                               PICK_MODE_CONTEXT *ctx);
 
+#if CONFIG_SUPERTX
+static int check_intra_b(PICK_MODE_CONTEXT *ctx);
+
+static int check_intra_sb(VP10_COMP *cpi, const TileInfo *const tile,
+                          int mi_row, int mi_col, BLOCK_SIZE bsize,
+                          PC_TREE *pc_tree);
+static void predict_superblock(VP10_COMP *cpi, ThreadData *td,
+#if CONFIG_EXT_INTER
+                               int mi_row_ori, int mi_col_ori,
+#endif  // CONFIG_EXT_INTER
+                               int mi_row_pred, int mi_col_pred,
+                               BLOCK_SIZE bsize_pred, int b_sub8x8, int block);
+static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
+                            PC_TREE *pc_tree);
+static void predict_sb_complex(VP10_COMP *cpi, ThreadData *td,
+                               const TileInfo *const tile,
+                               int mi_row, int mi_col,
+                               int mi_row_ori, int mi_col_ori,
+                               int output_enabled, BLOCK_SIZE bsize,
+                               BLOCK_SIZE top_bsize,
+                               uint8_t *dst_buf[3], int dst_stride[3],
+                               PC_TREE *pc_tree);
+static void update_state_sb_supertx(VP10_COMP *cpi, ThreadData *td,
+                                    const TileInfo *const tile,
+                                    int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize,
+                                    int output_enabled, PC_TREE *pc_tree);
+static void rd_supertx_sb(VP10_COMP *cpi, ThreadData *td,
+                          const TileInfo *const tile,
+                          int mi_row, int mi_col, BLOCK_SIZE bsize,
+                          int *tmp_rate, int64_t *tmp_dist,
+                          TX_TYPE *best_tx,
+                          PC_TREE *pc_tree);
+#endif  // CONFIG_SUPERTX
+
 // This is used as a reference when computing the source variance for the
 //  purposes of activity masking.
 // Eventually this should be replaced by custom no-reference routines,
 //  which will be faster.
-static const uint8_t VP9_VAR_OFFS[64] = {
+static const uint8_t VP10_VAR_OFFS[MAX_SB_SIZE] = {
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+#if CONFIG_EXT_PARTITION
     128, 128, 128, 128, 128, 128, 128, 128,
     128, 128, 128, 128, 128, 128, 128, 128,
     128, 128, 128, 128, 128, 128, 128, 128,
@@ -64,10 +117,20 @@
     128, 128, 128, 128, 128, 128, 128, 128,
     128, 128, 128, 128, 128, 128, 128, 128,
     128, 128, 128, 128, 128, 128, 128, 128
+#endif  // CONFIG_EXT_PARTITION
 };
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static const uint16_t VP9_HIGH_VAR_OFFS_8[64] = {
+static const uint16_t VP10_HIGH_VAR_OFFS_8[MAX_SB_SIZE] = {
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+    128, 128, 128, 128, 128, 128, 128, 128,
+#if CONFIG_EXT_PARTITION
     128, 128, 128, 128, 128, 128, 128, 128,
     128, 128, 128, 128, 128, 128, 128, 128,
     128, 128, 128, 128, 128, 128, 128, 128,
@@ -76,9 +139,19 @@
     128, 128, 128, 128, 128, 128, 128, 128,
     128, 128, 128, 128, 128, 128, 128, 128,
     128, 128, 128, 128, 128, 128, 128, 128
+#endif  // CONFIG_EXT_PARTITION
 };
 
-static const uint16_t VP9_HIGH_VAR_OFFS_10[64] = {
+static const uint16_t VP10_HIGH_VAR_OFFS_10[MAX_SB_SIZE] = {
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+    128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
+#if CONFIG_EXT_PARTITION
     128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
     128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
     128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
@@ -87,9 +160,19 @@
     128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
     128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4,
     128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4, 128*4
+#endif  // CONFIG_EXT_PARTITION
 };
 
-static const uint16_t VP9_HIGH_VAR_OFFS_12[64] = {
+static const uint16_t VP10_HIGH_VAR_OFFS_12[MAX_SB_SIZE] = {
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+    128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
+#if CONFIG_EXT_PARTITION
     128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
     128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
     128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
@@ -98,6 +181,7 @@
     128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
     128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16,
     128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16, 128*16
+#endif  // CONFIG_EXT_PARTITION
 };
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -106,7 +190,7 @@
                                            BLOCK_SIZE bs) {
   unsigned int sse;
   const unsigned int var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
-                                              VP9_VAR_OFFS, 0, &sse);
+                                              VP10_VAR_OFFS, 0, &sse);
   return ROUND_POWER_OF_TWO(var, num_pels_log2_lookup[bs]);
 }
 
@@ -117,18 +201,18 @@
   switch (bd) {
     case 10:
       var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
-                               CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10),
+                               CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_10),
                                0, &sse);
       break;
     case 12:
       var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
-                               CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12),
+                               CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_12),
                                0, &sse);
       break;
     case 8:
     default:
       var = cpi->fn_ptr[bs].vf(ref->buf, ref->stride,
-                               CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8),
+                               CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_8),
                                0, &sse);
       break;
   }
@@ -170,11 +254,11 @@
 
 // Lighter version of set_offsets that only sets the mode info
 // pointers.
-static INLINE void set_mode_info_offsets(VP10_COMP *const cpi,
-                                         MACROBLOCK *const x,
-                                         MACROBLOCKD *const xd,
-                                         int mi_row,
-                                         int mi_col) {
+static void set_mode_info_offsets(VP10_COMP *const cpi,
+                                  MACROBLOCK *const x,
+                                  MACROBLOCKD *const xd,
+                                  int mi_row,
+                                  int mi_col) {
   VP10_COMMON *const cm = &cpi->common;
   const int idx_str = xd->mi_stride * mi_row + mi_col;
   xd->mi = cm->mi_grid_visible + idx_str;
@@ -182,21 +266,26 @@
   x->mbmi_ext = cpi->mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
 }
 
-static void set_offsets(VP10_COMP *cpi, const TileInfo *const tile,
-                        MACROBLOCK *const x, int mi_row, int mi_col,
-                        BLOCK_SIZE bsize) {
+static void set_offsets_without_segment_id(VP10_COMP *cpi,
+                                           const TileInfo *const tile,
+                                           MACROBLOCK *const x,
+                                           int mi_row, int mi_col,
+                                           BLOCK_SIZE bsize) {
   VP10_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *mbmi;
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
-  const struct segmentation *const seg = &cm->seg;
 
   set_skip_context(xd, mi_row, mi_col);
 
   set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
 
-  mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_VAR_TX
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context =
+    xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+  xd->max_tx_size = max_txsize_lookup[bsize];
+#endif
 
   // Set up destination pointers.
   vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
@@ -220,14 +309,30 @@
   x->rddiv = cpi->rd.RDDIV;
   x->rdmult = cpi->rd.RDMULT;
 
+  // required by vp10_append_sub8x8_mvs_for_idx() and vp10_find_best_ref_mvs()
+  xd->tile = *tile;
+}
+
+static void set_offsets(VP10_COMP *cpi, const TileInfo *const tile,
+                        MACROBLOCK *const x, int mi_row, int mi_col,
+                        BLOCK_SIZE bsize) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi;
+  const struct segmentation *const seg = &cm->seg;
+
+  set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+
+  mbmi = &xd->mi[0]->mbmi;
+
   // Setup segment ID.
   if (seg->enabled) {
-    if (cpi->oxcf.aq_mode != VARIANCE_AQ) {
+    if (!cpi->vaq_refresh) {
       const uint8_t *const map = seg->update_map ? cpi->segmentation_map
                                                  : cm->last_frame_seg_map;
       mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
     }
-    vp10_init_plane_quantizers(cpi, x);
+    vp10_init_plane_quantizers(cpi, x, mbmi->segment_id);
 
     x->encode_breakout = cpi->segment_encode_breakout[mbmi->segment_id];
   } else {
@@ -235,10 +340,102 @@
     x->encode_breakout = cpi->encode_breakout;
   }
 
-  // required by vp10_append_sub8x8_mvs_for_idx() and vp10_find_best_ref_mvs()
-  xd->tile = *tile;
+#if CONFIG_SUPERTX
+  mbmi->segment_id_supertx = MAX_SEGMENTS;
+#endif  // CONFIG_SUPERTX
 }
 
+#if CONFIG_SUPERTX
+static void set_offsets_supertx(VP10_COMP *cpi, ThreadData *td,
+                                const TileInfo *const tile,
+                                int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  MACROBLOCK *const x = &td->mb;
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+
+  set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+
+  // Set up distance of MB to edge of frame in 1/8th pel units.
+  assert(!(mi_col & (mi_width - 1)) && !(mi_row & (mi_height - 1)));
+  set_mi_row_col(xd, tile, mi_row, mi_height, mi_col, mi_width,
+                 cm->mi_rows, cm->mi_cols);
+}
+
+static void set_offsets_extend(VP10_COMP *cpi, ThreadData *td,
+                               const TileInfo *const tile,
+                               int mi_row_pred, int mi_col_pred,
+                               int mi_row_ori, int mi_col_ori,
+                               BLOCK_SIZE bsize_pred) {
+  // Used in supertx
+  // (mi_row_ori, mi_col_ori, bsize_ori): region for mv
+  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+  MACROBLOCK *const x = &td->mb;
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize_pred];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize_pred];
+
+  set_mode_info_offsets(cpi, x, xd, mi_row_ori, mi_col_ori);
+
+  // Set up limit values for MV components.
+  // MVs beyond this range do not produce a new/different prediction block.
+  x->mv_row_min = -(((mi_row_pred + mi_height) * MI_SIZE) + VP9_INTERP_EXTEND);
+  x->mv_col_min = -(((mi_col_pred + mi_width) * MI_SIZE) + VP9_INTERP_EXTEND);
+  x->mv_row_max = (cm->mi_rows - mi_row_pred) * MI_SIZE + VP9_INTERP_EXTEND;
+  x->mv_col_max = (cm->mi_cols - mi_col_pred) * MI_SIZE + VP9_INTERP_EXTEND;
+
+  // Set up distance of MB to edge of frame in 1/8th pel units.
+  assert(!(mi_col_pred & (mi_width - 1)) && !(mi_row_pred & (mi_height - 1)));
+  set_mi_row_col(xd, tile, mi_row_pred, mi_height, mi_col_pred, mi_width,
+                 cm->mi_rows, cm->mi_cols);
+  xd->up_available    = (mi_row_ori > tile->mi_row_start);
+  xd->left_available  = (mi_col_ori > tile->mi_col_start);
+
+  // R/D setup.
+  x->rddiv = cpi->rd.RDDIV;
+  x->rdmult = cpi->rd.RDMULT;
+}
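
A worked example may help here (editorial sketch; assumes MI_SIZE == 8 and
VP9_INTERP_EXTEND == 4, their usual values in this tree):

    /* For a prediction region at mi_row_pred = 8 with mi_height = 8:
     *   x->mv_row_min = -((8 + 8) * 8 + 4) = -132
     *   x->mv_row_max = (cm->mi_rows - 8) * 8 + 4
     * A row MV outside [mv_row_min, mv_row_max] could only reference the
     * extended border, so it cannot produce a different prediction. */
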
+
+static void set_segment_id_supertx(const VP10_COMP *const cpi,
+                                   MACROBLOCK *const x,
+                                   const int mi_row, const int mi_col,
+                                   const BLOCK_SIZE bsize) {
+  const VP10_COMMON *cm = &cpi->common;
+  const struct segmentation *seg = &cm->seg;
+  const int miw =
+      VPXMIN(num_8x8_blocks_wide_lookup[bsize], cm->mi_cols - mi_col);
+  const int mih =
+      VPXMIN(num_8x8_blocks_high_lookup[bsize], cm->mi_rows - mi_row);
+  const int mi_offset = mi_row * cm->mi_stride + mi_col;
+  MODE_INFO **const mip = cm->mi_grid_visible + mi_offset;
+  int r, c;
+  int seg_id_supertx = MAX_SEGMENTS;
+
+  if (!seg->enabled) {
+    seg_id_supertx = 0;
+    x->encode_breakout = cpi->encode_breakout;
+  } else {
+    // Find the minimum segment_id
+    for (r = 0; r < mih; r++)
+      for (c = 0; c < miw; c++)
+        seg_id_supertx = VPXMIN(mip[r * cm->mi_stride + c]->mbmi.segment_id,
+                                seg_id_supertx);
+    assert(0 <= seg_id_supertx && seg_id_supertx < MAX_SEGMENTS);
+
+    // Initialize the plane quantizers
+    vp10_init_plane_quantizers(cpi, x, seg_id_supertx);
+    x->encode_breakout = cpi->segment_encode_breakout[seg_id_supertx];
+  }
+
+  // Assign the segment_id back to segment_id_supertx
+  for (r = 0; r < mih; r++)
+    for (c = 0; c < miw; c++)
+      mip[r * cm->mi_stride + c]->mbmi.segment_id_supertx = seg_id_supertx;
+}
+#endif  // CONFIG_SUPERTX
+
 static void set_block_size(VP10_COMP * const cpi,
                            MACROBLOCK *const x,
                            MACROBLOCKD *const xd,
@@ -250,215 +447,107 @@
   }
 }
 
-typedef struct {
-  int64_t sum_square_error;
-  int64_t sum_error;
-  int log2_count;
-  int variance;
-} var;
-
-typedef struct {
-  var none;
-  var horz[2];
-  var vert[2];
-} partition_variance;
-
-typedef struct {
-  partition_variance part_variances;
-  var split[4];
-} v4x4;
-
-typedef struct {
-  partition_variance part_variances;
-  v4x4 split[4];
-} v8x8;
-
-typedef struct {
-  partition_variance part_variances;
-  v8x8 split[4];
-} v16x16;
-
-typedef struct {
-  partition_variance part_variances;
-  v16x16 split[4];
-} v32x32;
-
-typedef struct {
-  partition_variance part_variances;
-  v32x32 split[4];
-} v64x64;
-
-typedef struct {
-  partition_variance *part_variances;
-  var *split[4];
-} variance_node;
-
-typedef enum {
-  V16X16,
-  V32X32,
-  V64X64,
-} TREE_LEVEL;
-
-static void tree_to_node(void *data, BLOCK_SIZE bsize, variance_node *node) {
-  int i;
-  node->part_variances = NULL;
-  switch (bsize) {
-    case BLOCK_64X64: {
-      v64x64 *vt = (v64x64 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].part_variances.none;
-      break;
-    }
-    case BLOCK_32X32: {
-      v32x32 *vt = (v32x32 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].part_variances.none;
-      break;
-    }
-    case BLOCK_16X16: {
-      v16x16 *vt = (v16x16 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].part_variances.none;
-      break;
-    }
-    case BLOCK_8X8: {
-      v8x8 *vt = (v8x8 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i].part_variances.none;
-      break;
-    }
-    case BLOCK_4X4: {
-      v4x4 *vt = (v4x4 *) data;
-      node->part_variances = &vt->part_variances;
-      for (i = 0; i < 4; i++)
-        node->split[i] = &vt->split[i];
-      break;
-    }
-    default: {
-      assert(0);
-      break;
-    }
-  }
-}
-
-// Set variance values given sum square error, sum error, count.
-static void fill_variance(int64_t s2, int64_t s, int c, var *v) {
-  v->sum_square_error = s2;
-  v->sum_error = s;
-  v->log2_count = c;
-}
-
-static void get_variance(var *v) {
-  v->variance = (int)(256 * (v->sum_square_error -
-      ((v->sum_error * v->sum_error) >> v->log2_count)) >> v->log2_count);
-}
-
-static void sum_2_variances(const var *a, const var *b, var *r) {
-  assert(a->log2_count == b->log2_count);
-  fill_variance(a->sum_square_error + b->sum_square_error,
-                a->sum_error + b->sum_error, a->log2_count + 1, r);
-}
-
-static void fill_variance_tree(void *data, BLOCK_SIZE bsize) {
-  variance_node node;
-  memset(&node, 0, sizeof(node));
-  tree_to_node(data, bsize, &node);
-  sum_2_variances(node.split[0], node.split[1], &node.part_variances->horz[0]);
-  sum_2_variances(node.split[2], node.split[3], &node.part_variances->horz[1]);
-  sum_2_variances(node.split[0], node.split[2], &node.part_variances->vert[0]);
-  sum_2_variances(node.split[1], node.split[3], &node.part_variances->vert[1]);
-  sum_2_variances(&node.part_variances->vert[0], &node.part_variances->vert[1],
-                  &node.part_variances->none);
-}
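
For reference, a minimal standalone sketch (not part of the patch) of the
scaled-variance identity that get_variance() evaluates with shift arithmetic,
variance = 256 * (sum_sq - sum^2 / n) / n with n = 1 << log2_count:

    #include <assert.h>
    #include <stdint.h>

    static int scaled_variance(int64_t s2, int64_t s, int log2_count) {
      /* Same shift arithmetic as get_variance() above. */
      return (int)((256 * (s2 - ((s * s) >> log2_count))) >> log2_count);
    }

    int main(void) {
      /* Two 4x4 averages of 10 and 14 merged by sum_2_variances():
       * s2 = 10*10 + 14*14 = 296, s = 24, log2_count = 1. */
      assert(scaled_variance(296, 24, 1) == 1024);  /* 256 * (296 - 288) / 2 */
      return 0;
    }
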
-
-static int set_vt_partitioning(VP10_COMP *cpi,
+static void set_vt_partitioning(VP10_COMP *cpi,
                                MACROBLOCK *const x,
                                MACROBLOCKD *const xd,
-                               void *data,
-                               BLOCK_SIZE bsize,
+                               VAR_TREE *vt,
                                int mi_row,
                                int mi_col,
-                               int64_t threshold,
-                               BLOCK_SIZE bsize_min,
-                               int force_split) {
+                               const int64_t *const threshold,
+                               const BLOCK_SIZE *const bsize_min) {
   VP10_COMMON * const cm = &cpi->common;
-  variance_node vt;
-  const int block_width = num_8x8_blocks_wide_lookup[bsize];
-  const int block_height = num_8x8_blocks_high_lookup[bsize];
-  const int low_res = (cm->width <= 352 && cm->height <= 288);
+  const int hbw = num_8x8_blocks_wide_lookup[vt->bsize] / 2;
+  const int hbh = num_8x8_blocks_high_lookup[vt->bsize] / 2;
+  const int has_cols = mi_col + hbw < cm->mi_cols;
+  const int has_rows = mi_row + hbh < cm->mi_rows;
 
-  assert(block_height == block_width);
-  tree_to_node(data, bsize, &vt);
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
 
-  if (force_split == 1)
-    return 0;
+  assert(vt->bsize >= BLOCK_8X8);
+
+  assert(hbh == hbw);
+
+  if (vt->bsize == BLOCK_8X8 && cm->frame_type != KEY_FRAME) {
+    set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_8X8);
+    return;
+  }
+
+  if (vt->force_split || (!has_cols && !has_rows))
+    goto split;
 
   // For bsize == bsize_min[0] (16x16/8x8 for 8x8/4x4 downsampling), select it
   // if the variance is below the threshold; otherwise split will be selected.
   // No check for vert/horiz split as too few samples for variance.
-  if (bsize == bsize_min) {
-    // Variance already computed to set the force_split.
-    if (low_res || cm->frame_type == KEY_FRAME)
-      get_variance(&vt.part_variances->none);
-    if (mi_col + block_width / 2 < cm->mi_cols &&
-        mi_row + block_height / 2 < cm->mi_rows &&
-        vt.part_variances->none.variance < threshold) {
-      set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
-      return 1;
+  if (vt->bsize == bsize_min[0]) {
+    if (has_cols && has_rows &&
+        vt->variances.none.variance < threshold[0]) {
+      set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize);
+      return;
+    } else {
+      BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_SPLIT);
+      set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
+      if (vt->bsize > BLOCK_8X8) {
+        set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize);
+        set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize);
+        set_block_size(cpi, x, xd, mi_row + hbh, mi_col + hbw, subsize);
+      }
+      return;
     }
-    return 0;
-  } else if (bsize > bsize_min) {
-    // Variance already computed to set the force_split.
-    if (low_res || cm->frame_type == KEY_FRAME)
-      get_variance(&vt.part_variances->none);
+  } else if (vt->bsize > bsize_min[0]) {
     // For key frame: take split for bsize above 32X32 or very high variance.
     if (cm->frame_type == KEY_FRAME &&
-        (bsize > BLOCK_32X32 ||
-        vt.part_variances->none.variance > (threshold << 4))) {
-      return 0;
+        (vt->bsize > BLOCK_32X32 ||
+        vt->variances.none.variance > (threshold[0] << 4))) {
+      goto split;
     }
     // If variance is low, take the bsize (no split).
-    if (mi_col + block_width / 2 < cm->mi_cols &&
-        mi_row + block_height / 2 < cm->mi_rows &&
-        vt.part_variances->none.variance < threshold) {
-      set_block_size(cpi, x, xd, mi_row, mi_col, bsize);
-      return 1;
+    if (has_cols && has_rows &&
+        vt->variances.none.variance < threshold[0]) {
+      set_block_size(cpi, x, xd, mi_row, mi_col, vt->bsize);
+      return;
     }
 
     // Check vertical split.
-    if (mi_row + block_height / 2 < cm->mi_rows) {
-      BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_VERT);
-      get_variance(&vt.part_variances->vert[0]);
-      get_variance(&vt.part_variances->vert[1]);
-      if (vt.part_variances->vert[0].variance < threshold &&
-          vt.part_variances->vert[1].variance < threshold &&
+    if (has_rows) {
+      BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_VERT);
+      if (vt->variances.vert[0].variance < threshold[0] &&
+          vt->variances.vert[1].variance < threshold[0] &&
           get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
         set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
-        set_block_size(cpi, x, xd, mi_row, mi_col + block_width / 2, subsize);
-        return 1;
+        set_block_size(cpi, x, xd, mi_row, mi_col + hbw, subsize);
+        return;
       }
     }
     // Check horizontal split.
-    if (mi_col + block_width / 2 < cm->mi_cols) {
-      BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_HORZ);
-      get_variance(&vt.part_variances->horz[0]);
-      get_variance(&vt.part_variances->horz[1]);
-      if (vt.part_variances->horz[0].variance < threshold &&
-          vt.part_variances->horz[1].variance < threshold &&
+    if (has_cols) {
+      BLOCK_SIZE subsize = get_subsize(vt->bsize, PARTITION_HORZ);
+      if (vt->variances.horz[0].variance < threshold[0] &&
+          vt->variances.horz[1].variance < threshold[0] &&
           get_plane_block_size(subsize, &xd->plane[1]) < BLOCK_INVALID) {
         set_block_size(cpi, x, xd, mi_row, mi_col, subsize);
-        set_block_size(cpi, x, xd, mi_row + block_height / 2, mi_col, subsize);
-        return 1;
+        set_block_size(cpi, x, xd, mi_row + hbh, mi_col, subsize);
+        return;
       }
     }
-
-    return 0;
   }
-  return 0;
+
+split:
+  {
+    set_vt_partitioning(cpi, x, xd, vt->split[0],
+                        mi_row, mi_col,
+                        threshold + 1, bsize_min + 1);
+    set_vt_partitioning(cpi, x, xd, vt->split[1],
+                        mi_row, mi_col + hbw,
+                        threshold + 1, bsize_min + 1);
+    set_vt_partitioning(cpi, x, xd, vt->split[2],
+                        mi_row + hbh, mi_col,
+                        threshold + 1, bsize_min + 1);
+    set_vt_partitioning(cpi, x, xd, vt->split[3],
+                        mi_row + hbh, mi_col + hbw,
+                        threshold + 1, bsize_min + 1);
+    return;
+  }
 }
 
 // Set the variance split thresholds for following the block sizes:
@@ -472,23 +561,24 @@
   const int64_t threshold_base = (int64_t)(threshold_multiplier *
       cpi->y_dequant[q][1]);
   if (is_key_frame) {
-    thresholds[0] = threshold_base;
-    thresholds[1] = threshold_base >> 2;
-    thresholds[2] = threshold_base >> 2;
-    thresholds[3] = threshold_base << 2;
-  } else {
     thresholds[1] = threshold_base;
+    thresholds[2] = threshold_base >> 2;
+    thresholds[3] = threshold_base >> 2;
+    thresholds[4] = threshold_base << 2;
+  } else {
+    thresholds[2] = threshold_base;
     if (cm->width <= 352 && cm->height <= 288) {
-      thresholds[0] = threshold_base >> 2;
-      thresholds[2] = threshold_base << 3;
+      thresholds[1] = threshold_base >> 2;
+      thresholds[3] = threshold_base << 3;
     } else {
-      thresholds[0] = threshold_base;
-      thresholds[1] = (5 * threshold_base) >> 2;
+      thresholds[1] = threshold_base;
+      thresholds[2] = (5 * threshold_base) >> 2;
       if (cm->width >= 1920 && cm->height >= 1080)
-        thresholds[1] = (7 * threshold_base) >> 2;
-      thresholds[2] = threshold_base << cpi->oxcf.speed;
+        thresholds[2] = (7 * threshold_base) >> 2;
+      thresholds[3] = threshold_base << cpi->oxcf.speed;
     }
   }
+  thresholds[0] = INT64_MIN;
 }
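
Indexing note (editorial): the thresholds are now one per tree level counted
from a 128x128 root, so with CONFIG_EXT_PARTITION index 0 guards 128x128 and
index 1 guards 64x64. The INT64_MIN sentinel written above makes every
`variance < threshold[0]` comparison fail at a 128x128 root, so that level
always splits at least once; choose_partitioning() skips the sentinel on
64x64 superblocks:

    const int start_level = cm->sb_size == BLOCK_64X64 ? 1 : 0;
    const int64_t *const thre = thresholds + start_level;  /* thre[0] = root */
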
 
 void vp10_set_variance_partition_thresholds(VP10_COMP *cpi, int q) {
@@ -517,10 +607,10 @@
 }
 
 // Compute the minmax over the 8x8 subblocks.
-static int compute_minmax_8x8(const uint8_t *s, int sp, const uint8_t *d,
-                              int dp, int x16_idx, int y16_idx,
+static int compute_minmax_8x8(const uint8_t *src, int src_stride,
+                              const uint8_t *ref, int ref_stride,
 #if CONFIG_VP9_HIGHBITDEPTH
-                              int highbd_flag,
+                              int highbd,
 #endif
                               int pixels_wide,
                               int pixels_high) {
@@ -529,24 +619,26 @@
   int minmax_min = 255;
   // Loop over the 4 8x8 subblocks.
   for (k = 0; k < 4; k++) {
-    int x8_idx = x16_idx + ((k & 1) << 3);
-    int y8_idx = y16_idx + ((k >> 1) << 3);
+    const int x8_idx = ((k & 1) << 3);
+    const int y8_idx = ((k >> 1) << 3);
     int min = 0;
     int max = 0;
     if (x8_idx < pixels_wide && y8_idx < pixels_high) {
+      const int src_offset = y8_idx * src_stride + x8_idx;
+      const int ref_offset = y8_idx * ref_stride + x8_idx;
 #if CONFIG_VP9_HIGHBITDEPTH
-      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        vpx_highbd_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
-                              d + y8_idx * dp + x8_idx, dp,
+      if (highbd) {
+        vpx_highbd_minmax_8x8(src + src_offset, src_stride,
+                              ref + ref_offset, ref_stride,
                               &min, &max);
       } else {
-        vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
-                       d + y8_idx * dp + x8_idx, dp,
+        vpx_minmax_8x8(src + src_offset, src_stride,
+                       ref + ref_offset, ref_stride,
                        &min, &max);
       }
 #else
-      vpx_minmax_8x8(s + y8_idx * sp + x8_idx, sp,
-                     d + y8_idx * dp + x8_idx, dp,
+      vpx_minmax_8x8(src + src_offset, src_stride,
+                     ref + ref_offset, ref_stride,
                      &min, &max);
 #endif
       if ((max - min) > minmax_max)
@@ -558,115 +650,259 @@
   return (minmax_max - minmax_min);
 }
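
The helper above returns the spread between the largest and smallest per-8x8
(max - min) range. As an illustration only, a plain-C model of the
vpx_minmax_8x8() contract it relies on (the library ships optimized versions):

    #include <stdint.h>
    #include <stdlib.h>

    /* Min and max absolute source/reference difference over an 8x8 block. */
    static void minmax_8x8_model(const uint8_t *s, int sp,
                                 const uint8_t *d, int dp,
                                 int *min, int *max) {
      int r, c;
      *min = 255;
      *max = 0;
      for (r = 0; r < 8; ++r, s += sp, d += dp) {
        for (c = 0; c < 8; ++c) {
          const int diff = abs(s[c] - d[c]);
          if (diff < *min) *min = diff;
          if (diff > *max) *max = diff;
        }
      }
    }
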
 
-static void fill_variance_4x4avg(const uint8_t *s, int sp, const uint8_t *d,
-                                 int dp, int x8_idx, int y8_idx, v8x8 *vst,
 #if CONFIG_VP9_HIGHBITDEPTH
-                                 int highbd_flag,
-#endif
-                                 int pixels_wide,
-                                 int pixels_high,
-                                 int is_key_frame) {
-  int k;
-  for (k = 0; k < 4; k++) {
-    int x4_idx = x8_idx + ((k & 1) << 2);
-    int y4_idx = y8_idx + ((k >> 1) << 2);
-    unsigned int sse = 0;
-    int sum = 0;
-    if (x4_idx < pixels_wide && y4_idx < pixels_high) {
-      int s_avg;
-      int d_avg = 128;
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = vpx_highbd_avg_4x4(s + y4_idx * sp + x4_idx, sp);
-        if (!is_key_frame)
-          d_avg = vpx_highbd_avg_4x4(d + y4_idx * dp + x4_idx, dp);
-      } else {
-        s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
-        if (!is_key_frame)
-          d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
-      }
+static INLINE int avg_4x4(const uint8_t *const src, const int stride,
+                          const int highbd) {
+  if (highbd) {
+    return vpx_highbd_avg_4x4(src, stride);
+  } else {
+    return vpx_avg_4x4(src, stride);
+  }
+}
 #else
-      s_avg = vpx_avg_4x4(s + y4_idx * sp + x4_idx, sp);
-      if (!is_key_frame)
-        d_avg = vpx_avg_4x4(d + y4_idx * dp + x4_idx, dp);
+static INLINE int avg_4x4(const uint8_t *const src, const int stride) {
+  return vpx_avg_4x4(src, stride);
+}
 #endif
-      sum = s_avg - d_avg;
-      sse = sum * sum;
-    }
-    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int avg_8x8(const uint8_t *const src, const int stride,
+                          const int highbd) {
+  if (highbd) {
+    return vpx_highbd_avg_8x8(src, stride);
+  } else {
+    return vpx_avg_8x8(src, stride);
+  }
+}
+#else
+static INLINE int avg_8x8(const uint8_t *const src, const int stride) {
+  return vpx_avg_8x8(src, stride);
+}
+#endif
+
+static void init_variance_tree(VAR_TREE *const vt,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               const int highbd,
+#endif
+                               BLOCK_SIZE bsize,
+                               BLOCK_SIZE leaf_size,
+                               const int width, const int height,
+                               const uint8_t *const src, const int src_stride,
+                               const uint8_t *const ref, const int ref_stride) {
+  assert(bsize >= leaf_size);
+
+  vt->bsize = bsize;
+
+  vt->force_split = 0;
+
+  vt->src = src;
+  vt->src_stride = src_stride;
+  vt->ref = ref;
+  vt->ref_stride = ref_stride;
+
+  vt->width = width;
+  vt->height = height;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  vt->highbd = highbd;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  if (bsize > leaf_size) {
+    const BLOCK_SIZE subsize = get_subsize(bsize, PARTITION_SPLIT);
+    const int px = num_4x4_blocks_wide_lookup[subsize] * 4;
+
+    init_variance_tree(vt->split[0],
+#if CONFIG_VP9_HIGHBITDEPTH
+                       highbd,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                       subsize, leaf_size,
+                       VPXMIN(px, width), VPXMIN(px, height),
+                       src, src_stride,
+                       ref, ref_stride);
+    init_variance_tree(vt->split[1],
+#if CONFIG_VP9_HIGHBITDEPTH
+                       highbd,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                       subsize, leaf_size,
+                       width - px, VPXMIN(px, height),
+                       src + px, src_stride,
+                       ref + px, ref_stride);
+    init_variance_tree(vt->split[2],
+#if CONFIG_VP9_HIGHBITDEPTH
+                       highbd,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                       subsize, leaf_size,
+                       VPXMIN(px, width), height - px,
+                       src + px * src_stride, src_stride,
+                       ref + px * ref_stride, ref_stride);
+    init_variance_tree(vt->split[3],
+#if CONFIG_VP9_HIGHBITDEPTH
+                       highbd,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                       subsize, leaf_size,
+                       width - px, height - px,
+                       src + px * src_stride + px, src_stride,
+                       ref + px * ref_stride + px, ref_stride);
   }
 }
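
Edge-handling note (editorial): px is the child quadrant's size in pixels, so
the right and bottom children receive `width - px` and `height - px`, which
can be zero or negative for superblocks straddling the frame edge;
fill_variance_tree() below records zero variance for such empty nodes instead
of reading out of bounds. For example:

    /* A 64x64 tree with width = 40 splits into 32x32 children of widths
     * VPXMIN(32, 40) = 32 and 40 - 32 = 8; with width = 24 the right-hand
     * children get 24 - 32 = -8 and are treated as empty. */
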
 
-static void fill_variance_8x8avg(const uint8_t *s, int sp, const uint8_t *d,
-                                 int dp, int x16_idx, int y16_idx, v16x16 *vst,
-#if CONFIG_VP9_HIGHBITDEPTH
-                                 int highbd_flag,
-#endif
-                                 int pixels_wide,
-                                 int pixels_high,
-                                 int is_key_frame) {
-  int k;
-  for (k = 0; k < 4; k++) {
-    int x8_idx = x16_idx + ((k & 1) << 3);
-    int y8_idx = y16_idx + ((k >> 1) << 3);
+
+// Fill the variance tree based on averaged (sub-sampled) pixel values at
+// the leaf node size.
+static void fill_variance_tree(VAR_TREE *const vt,
+                               const BLOCK_SIZE leaf_size) {
+  if (vt->bsize > leaf_size) {
+    fill_variance_tree(vt->split[0], leaf_size);
+    fill_variance_tree(vt->split[1], leaf_size);
+    fill_variance_tree(vt->split[2], leaf_size);
+    fill_variance_tree(vt->split[3], leaf_size);
+    fill_variance_node(vt);
+  } else if (vt->width <= 0 || vt->height <= 0) {
+    fill_variance(0, 0, 0, &vt->variances.none);
+  } else {
     unsigned int sse = 0;
     int sum = 0;
-    if (x8_idx < pixels_wide && y8_idx < pixels_high) {
-      int s_avg;
-      int d_avg = 128;
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (highbd_flag & YV12_FLAG_HIGHBITDEPTH) {
-        s_avg = vpx_highbd_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-        if (!is_key_frame)
-          d_avg = vpx_highbd_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-      } else {
-        s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-        if (!is_key_frame)
-          d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-      }
-#else
-      s_avg = vpx_avg_8x8(s + y8_idx * sp + x8_idx, sp);
-      if (!is_key_frame)
-        d_avg = vpx_avg_8x8(d + y8_idx * dp + x8_idx, dp);
-#endif
-      sum = s_avg - d_avg;
-      sse = sum * sum;
+    int src_avg;
+    int ref_avg;
+    assert(leaf_size == BLOCK_4X4 || leaf_size == BLOCK_8X8);
+    if (leaf_size == BLOCK_4X4) {
+      src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+      ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
+    } else {
+      src_avg = avg_8x8(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+      ref_avg = avg_8x8(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
     }
-    fill_variance(sse, sum, 0, &vst->split[k].part_variances.none);
+    sum = src_avg - ref_avg;
+    sse = sum * sum;
+    fill_variance(sse, sum, 0, &vt->variances.none);
   }
 }
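
The IF_HBD() macro used above is assumed to be the usual variadic
pass-through (a sketch; the real definition lives in a common header), so one
call site serves both signatures of avg_4x4()/avg_8x8():

    #if CONFIG_VP9_HIGHBITDEPTH
    #define IF_HBD(...) __VA_ARGS__
    #else
    #define IF_HBD(...)
    #endif
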
 
+static void refine_variance_tree(VAR_TREE *const vt, const int64_t threshold) {
+  if (vt->bsize >= BLOCK_8X8) {
+    if (vt->bsize == BLOCK_16X16) {
+      if (vt->variances.none.variance <= threshold)
+        return;
+      else
+        vt->force_split = 0;
+    }
+
+    refine_variance_tree(vt->split[0], threshold);
+    refine_variance_tree(vt->split[1], threshold);
+    refine_variance_tree(vt->split[2], threshold);
+    refine_variance_tree(vt->split[3], threshold);
+
+    if (vt->bsize <= BLOCK_16X16)
+      fill_variance_node(vt);
+  } else if (vt->width <= 0 || vt->height <= 0) {
+    fill_variance(0, 0, 0, &vt->variances.none);
+  } else {
+    const int src_avg = avg_4x4(vt->src, vt->src_stride IF_HBD(, vt->highbd));
+    const int ref_avg = avg_4x4(vt->ref, vt->ref_stride IF_HBD(, vt->highbd));
+    const int sum = src_avg - ref_avg;
+    const unsigned int sse = sum * sum;
+    assert(vt->bsize == BLOCK_4X4);
+    fill_variance(sse, sum, 0, &vt->variances.none);
+  }
+}
+
+static int check_split_key_frame(VAR_TREE *const vt,
+                                 const int64_t threshold) {
+  if (vt->bsize == BLOCK_32X32) {
+    vt->force_split = vt->variances.none.variance > threshold;
+  } else {
+    vt->force_split |= check_split_key_frame(vt->split[0], threshold);
+    vt->force_split |= check_split_key_frame(vt->split[1], threshold);
+    vt->force_split |= check_split_key_frame(vt->split[2], threshold);
+    vt->force_split |= check_split_key_frame(vt->split[3], threshold);
+  }
+  return vt->force_split;
+}
+
+static int check_split(VP10_COMP *const cpi,
+                       VAR_TREE *const vt,
+                       const int segment_id,
+                       const int64_t *const thresholds) {
+  if (vt->bsize == BLOCK_16X16) {
+    vt->force_split = vt->variances.none.variance > thresholds[0];
+    if (!vt->force_split &&
+        vt->variances.none.variance > thresholds[-1] &&
+        !cyclic_refresh_segment_id_boosted(segment_id)) {
+      // We have some nominal amount of 16x16 variance (based on average),
+      // compute the minmax over the 8x8 sub-blocks, and if above threshold,
+      // force split to 8x8 block for this 16x16 block.
+      int minmax = compute_minmax_8x8(vt->src, vt->src_stride,
+                                      vt->ref, vt->ref_stride,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                      vt->highbd,
+#endif
+                                      vt->width, vt->height);
+      vt->force_split = minmax > cpi->vbp_threshold_minmax;
+    }
+  } else {
+    vt->force_split |= check_split(cpi, vt->split[0],
+                                   segment_id, thresholds + 1);
+    vt->force_split |= check_split(cpi, vt->split[1],
+                                   segment_id, thresholds + 1);
+    vt->force_split |= check_split(cpi, vt->split[2],
+                                   segment_id, thresholds + 1);
+    vt->force_split |= check_split(cpi, vt->split[3],
+                                   segment_id, thresholds + 1);
+
+    if (vt->bsize == BLOCK_32X32 && !vt->force_split) {
+      vt->force_split = vt->variances.none.variance > thresholds[0];
+    }
+  }
+
+  return vt->force_split;
+}
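
Bounds note (editorial): the thresholds[-1] read above is safe because
check_split() always receives a pointer at least two elements into
cpi->vbp_thresholds[] by the time vt->bsize reaches BLOCK_16X16 (the
recursion advances one level per split), and it matches the pre-refactor test:

    /* Removed code above, for comparison:
     *   ... variance > thresholds[1] && !cyclic_refresh_segment_id_boosted(...)
     * i.e. thresholds[-1] here is the enclosing 32x32 level's threshold. */
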
+
 // This function chooses partitioning based on the variance between source and
-// reconstructed last, where variance is computed for down-sampled inputs.
-static int choose_partitioning(VP10_COMP *cpi,
+// reconstructed last (or golden), where variance is computed for down-sampled
+// inputs.
+static void choose_partitioning(VP10_COMP *const cpi,
+                                ThreadData *const td,
                                 const TileInfo *const tile,
-                                MACROBLOCK *x,
-                                int mi_row, int mi_col) {
-  VP10_COMMON * const cm = &cpi->common;
-  MACROBLOCKD *xd = &x->e_mbd;
-  int i, j, k, m;
-  v64x64 vt;
-  v16x16 vt2[16];
-  int force_split[21];
-  uint8_t *s;
-  const uint8_t *d;
-  int sp;
-  int dp;
-  int pixels_wide = 64, pixels_high = 64;
-  int64_t thresholds[4] = {cpi->vbp_thresholds[0], cpi->vbp_thresholds[1],
-      cpi->vbp_thresholds[2], cpi->vbp_thresholds[3]};
+                                MACROBLOCK *const x,
+                                const int mi_row, const int mi_col) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  VAR_TREE *const vt = td->var_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
+  int i;
+  const uint8_t *src;
+  const uint8_t *ref;
+  int src_stride;
+  int ref_stride;
+  int pixels_wide = 8 * num_8x8_blocks_wide_lookup[cm->sb_size];
+  int pixels_high = 8 * num_8x8_blocks_high_lookup[cm->sb_size];
+  int64_t thresholds[5] = {
+    cpi->vbp_thresholds[0],
+    cpi->vbp_thresholds[1],
+    cpi->vbp_thresholds[2],
+    cpi->vbp_thresholds[3],
+    cpi->vbp_thresholds[4],
+  };
+  BLOCK_SIZE bsize_min[5] = {
+    BLOCK_16X16,
+    BLOCK_16X16,
+    BLOCK_16X16,
+    cpi->vbp_bsize_min,
+    BLOCK_8X8
+  };
+  const int start_level = cm->sb_size == BLOCK_64X64 ? 1 : 0;
+  const int64_t *const thre = thresholds + start_level;
+  const BLOCK_SIZE *const bmin = bsize_min + start_level;
 
-  // Always use 4x4 partition for key frame.
   const int is_key_frame = (cm->frame_type == KEY_FRAME);
-  const int use_4x4_partition = is_key_frame;
   const int low_res = (cm->width <= 352 && cm->height <= 288);
-  int variance4x4downsample[16];
 
   int segment_id = CR_SEGMENT_ID_BASE;
+
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ && cm->seg.enabled) {
     const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map :
                                                     cm->last_frame_seg_map;
-    segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+    segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col);
 
     if (cyclic_refresh_segment_id_boosted(segment_id)) {
       int q = vp10_get_qindex(&cm->seg, segment_id, cm->base_qindex);
@@ -674,28 +910,38 @@
     }
   }
 
-  set_offsets(cpi, tile, x, mi_row, mi_col, BLOCK_64X64);
+  set_offsets(cpi, tile, x, mi_row, mi_col, cm->sb_size);
 
   if (xd->mb_to_right_edge < 0)
     pixels_wide += (xd->mb_to_right_edge >> 3);
   if (xd->mb_to_bottom_edge < 0)
     pixels_high += (xd->mb_to_bottom_edge >> 3);
 
-  s = x->plane[0].src.buf;
-  sp = x->plane[0].src.stride;
+  src = x->plane[0].src.buf;
+  src_stride = x->plane[0].src.stride;
 
   if (!is_key_frame) {
     MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
     unsigned int uv_sad;
     const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, LAST_FRAME);
-
-    const YV12_BUFFER_CONFIG *yv12_g = NULL;
+    const YV12_BUFFER_CONFIG *yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
     unsigned int y_sad, y_sad_g;
-    const BLOCK_SIZE bsize = BLOCK_32X32
-        + (mi_col + 4 < cm->mi_cols) * 2 + (mi_row + 4 < cm->mi_rows);
+
+    const int hbs = cm->mib_size / 2;
+    const int split_vert = mi_col + hbs >= cm->mi_cols;
+    const int split_horz = mi_row + hbs >= cm->mi_rows;
+    BLOCK_SIZE bsize;
+
+    if (split_vert && split_horz)
+      bsize = get_subsize(cm->sb_size, PARTITION_SPLIT);
+    else if (split_vert)
+      bsize = get_subsize(cm->sb_size, PARTITION_VERT);
+    else if (split_horz)
+      bsize = get_subsize(cm->sb_size, PARTITION_HORZ);
+    else
+      bsize = cm->sb_size;
 
     assert(yv12 != NULL);
-    yv12_g = get_ref_frame_buffer(cpi, GOLDEN_FRAME);
 
     if (yv12_g && yv12_g != yv12) {
       vp10_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
@@ -712,11 +958,17 @@
                          &cm->frame_refs[LAST_FRAME - 1].sf);
     mbmi->ref_frame[0] = LAST_FRAME;
     mbmi->ref_frame[1] = NONE;
-    mbmi->sb_type = BLOCK_64X64;
+    mbmi->sb_type = cm->sb_size;
     mbmi->mv[0].as_int = 0;
+#if CONFIG_DUAL_FILTER
+    for (i = 0; i < 4; ++i)
+      mbmi->interp_filter[i] = BILINEAR;
+#else
     mbmi->interp_filter = BILINEAR;
+#endif
 
     y_sad = vp10_int_pro_motion_estimation(cpi, x, bsize, mi_row, mi_col);
+
     if (y_sad_g < y_sad) {
       vp10_setup_pre_planes(xd, 0, yv12_g, mi_row, mi_col,
                            &cm->frame_refs[GOLDEN_FRAME - 1].sf);
@@ -727,9 +979,9 @@
       x->pred_mv[LAST_FRAME] = mbmi->mv[0].as_mv;
     }
 
-    vp10_build_inter_predictors_sb(xd, mi_row, mi_col, BLOCK_64X64);
+    vp10_build_inter_predictors_sb(xd, mi_row, mi_col, cm->sb_size);
 
-    for (i = 1; i <= 2; ++i) {
+    for (i = 1; i < MAX_MB_PLANE; ++i) {
       struct macroblock_plane  *p = &x->plane[i];
       struct macroblockd_plane *pd = &xd->plane[i];
       const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
@@ -743,201 +995,95 @@
       x->color_sensitivity[i - 1] = uv_sad > (y_sad >> 2);
     }
 
-    d = xd->plane[0].dst.buf;
-    dp = xd->plane[0].dst.stride;
+    ref = xd->plane[0].dst.buf;
+    ref_stride = xd->plane[0].dst.stride;
 
-    // If the y_sad is very small, take 64x64 as partition and exit.
-    // Don't check on boosted segment for now, as 64x64 is suppressed there.
-    if (segment_id == CR_SEGMENT_ID_BASE &&
-        y_sad < cpi->vbp_threshold_sad) {
-      const int block_width = num_8x8_blocks_wide_lookup[BLOCK_64X64];
-      const int block_height = num_8x8_blocks_high_lookup[BLOCK_64X64];
-      if (mi_col + block_width / 2 < cm->mi_cols &&
-          mi_row + block_height / 2 < cm->mi_rows) {
-        set_block_size(cpi, x, xd, mi_row, mi_col, BLOCK_64X64);
-        return 0;
+    // If the y_sad is very small, take the largest partition and exit.
+    // Don't check on boosted segments for now, as the largest partition is
+    // suppressed there.
+    if (segment_id == CR_SEGMENT_ID_BASE && y_sad < cpi->vbp_threshold_sad) {
+      if (!split_vert && !split_horz) {
+        set_block_size(cpi, x, xd, mi_row, mi_col, cm->sb_size);
+        return;
       }
     }
   } else {
-    d = VP9_VAR_OFFS;
-    dp = 0;
+    ref = VP10_VAR_OFFS;
+    ref_stride = 0;
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
       switch (xd->bd) {
         case 10:
-          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_10);
+          ref = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_10);
           break;
         case 12:
-          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_12);
+          ref = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_12);
           break;
         case 8:
         default:
-          d = CONVERT_TO_BYTEPTR(VP9_HIGH_VAR_OFFS_8);
+          ref = CONVERT_TO_BYTEPTR(VP10_HIGH_VAR_OFFS_8);
           break;
       }
     }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
-  // Index for force_split: 0 for 64x64, 1-4 for 32x32 blocks,
-  // 5-20 for the 16x16 blocks.
-  force_split[0] = 0;
-  // Fill in the entire tree of 8x8 (or 4x4 under some conditions) variances
-  // for splits.
-  for (i = 0; i < 4; i++) {
-    const int x32_idx = ((i & 1) << 5);
-    const int y32_idx = ((i >> 1) << 5);
-    const int i2 = i << 2;
-    force_split[i + 1] = 0;
-    for (j = 0; j < 4; j++) {
-      const int x16_idx = x32_idx + ((j & 1) << 4);
-      const int y16_idx = y32_idx + ((j >> 1) << 4);
-      const int split_index = 5 + i2 + j;
-      v16x16 *vst = &vt.split[i].split[j];
-      force_split[split_index] = 0;
-      variance4x4downsample[i2 + j] = 0;
-      if (!is_key_frame) {
-        fill_variance_8x8avg(s, sp, d, dp, x16_idx, y16_idx, vst,
+  init_variance_tree(vt,
 #if CONFIG_VP9_HIGHBITDEPTH
-                            xd->cur_buf->flags,
-#endif
-                            pixels_wide,
-                            pixels_high,
-                            is_key_frame);
-        fill_variance_tree(&vt.split[i].split[j], BLOCK_16X16);
-        get_variance(&vt.split[i].split[j].part_variances.none);
-        if (vt.split[i].split[j].part_variances.none.variance >
-            thresholds[2]) {
-          // 16X16 variance is above threshold for split, so force split to 8x8
-          // for this 16x16 block (this also forces splits for upper levels).
-          force_split[split_index] = 1;
-          force_split[i + 1] = 1;
-          force_split[0] = 1;
-        } else if (vt.split[i].split[j].part_variances.none.variance >
-                   thresholds[1] &&
-                   !cyclic_refresh_segment_id_boosted(segment_id)) {
-          // We have some nominal amount of 16x16 variance (based on average),
-          // compute the minmax over the 8x8 sub-blocks, and if above threshold,
-          // force split to 8x8 block for this 16x16 block.
-          int minmax = compute_minmax_8x8(s, sp, d, dp, x16_idx, y16_idx,
-#if CONFIG_VP9_HIGHBITDEPTH
-                                          xd->cur_buf->flags,
-#endif
-                                          pixels_wide, pixels_high);
-          if (minmax > cpi->vbp_threshold_minmax) {
-            force_split[split_index] = 1;
-            force_split[i + 1] = 1;
-            force_split[0] = 1;
-          }
-        }
-      }
-      if (is_key_frame || (low_res &&
-          vt.split[i].split[j].part_variances.none.variance >
-          (thresholds[1] << 1))) {
-        force_split[split_index] = 0;
-        // Go down to 4x4 down-sampling for variance.
-        variance4x4downsample[i2 + j] = 1;
-        for (k = 0; k < 4; k++) {
-          int x8_idx = x16_idx + ((k & 1) << 3);
-          int y8_idx = y16_idx + ((k >> 1) << 3);
-          v8x8 *vst2 = is_key_frame ? &vst->split[k] :
-              &vt2[i2 + j].split[k];
-          fill_variance_4x4avg(s, sp, d, dp, x8_idx, y8_idx, vst2,
-#if CONFIG_VP9_HIGHBITDEPTH
-                               xd->cur_buf->flags,
-#endif
-                               pixels_wide,
-                               pixels_high,
-                               is_key_frame);
-        }
-      }
+                     xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH,
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+                     cm->sb_size,
+                     (is_key_frame || low_res) ? BLOCK_4X4 : BLOCK_8X8,
+                     pixels_wide, pixels_high,
+                     src, src_stride, ref, ref_stride);
+
+  // Fill in the entire tree of variances and compute splits.
+  if (is_key_frame)  {
+    fill_variance_tree(vt, BLOCK_4X4);
+    check_split_key_frame(vt, thre[1]);
+  } else {
+    fill_variance_tree(vt, BLOCK_8X8);
+    check_split(cpi, vt, segment_id, thre);
+    if (low_res) {
+      refine_variance_tree(vt, thre[1] << 1);
     }
   }
 
-  // Fill the rest of the variance tree by summing split partition values.
-  for (i = 0; i < 4; i++) {
-    const int i2 = i << 2;
-    for (j = 0; j < 4; j++) {
-      if (variance4x4downsample[i2 + j] == 1) {
-        v16x16 *vtemp = (!is_key_frame) ? &vt2[i2 + j] :
-            &vt.split[i].split[j];
-        for (m = 0; m < 4; m++)
-          fill_variance_tree(&vtemp->split[m], BLOCK_8X8);
-        fill_variance_tree(vtemp, BLOCK_16X16);
-      }
-    }
-    fill_variance_tree(&vt.split[i], BLOCK_32X32);
-    // If variance of this 32x32 block is above the threshold, force the block
-    // to split. This also forces a split on the upper (64x64) level.
-    if (!force_split[i + 1]) {
-      get_variance(&vt.split[i].part_variances.none);
-      if (vt.split[i].part_variances.none.variance > thresholds[1]) {
-        force_split[i + 1] = 1;
-        force_split[0] = 1;
-      }
-    }
-  }
-  if (!force_split[0]) {
-    fill_variance_tree(&vt, BLOCK_64X64);
-    get_variance(&vt.part_variances.none);
-  }
+  vt->force_split |= mi_col + cm->mib_size > cm->mi_cols ||
+                     mi_row + cm->mib_size > cm->mi_rows;
 
   // Now go through the entire structure, splitting every block size until
   // we get to one that's got a variance lower than our threshold.
-  if ( mi_col + 8 > cm->mi_cols || mi_row + 8 > cm->mi_rows ||
-      !set_vt_partitioning(cpi, x, xd, &vt, BLOCK_64X64, mi_row, mi_col,
-                           thresholds[0], BLOCK_16X16, force_split[0])) {
-    for (i = 0; i < 4; ++i) {
-      const int x32_idx = ((i & 1) << 2);
-      const int y32_idx = ((i >> 1) << 2);
-      const int i2 = i << 2;
-      if (!set_vt_partitioning(cpi, x, xd, &vt.split[i], BLOCK_32X32,
-                               (mi_row + y32_idx), (mi_col + x32_idx),
-                               thresholds[1], BLOCK_16X16,
-                               force_split[i + 1])) {
-        for (j = 0; j < 4; ++j) {
-          const int x16_idx = ((j & 1) << 1);
-          const int y16_idx = ((j >> 1) << 1);
-          // For inter frames: if variance4x4downsample[] == 1 for this 16x16
-          // block, then the variance is based on 4x4 down-sampling, so use vt2
-          // in set_vt_partioning(), otherwise use vt.
-          v16x16 *vtemp = (!is_key_frame &&
-                           variance4x4downsample[i2 + j] == 1) ?
-                           &vt2[i2 + j] : &vt.split[i].split[j];
-          if (!set_vt_partitioning(cpi, x, xd, vtemp, BLOCK_16X16,
-                                   mi_row + y32_idx + y16_idx,
-                                   mi_col + x32_idx + x16_idx,
-                                   thresholds[2],
-                                   cpi->vbp_bsize_min,
-                                   force_split[5 + i2  + j])) {
-            for (k = 0; k < 4; ++k) {
-              const int x8_idx = (k & 1);
-              const int y8_idx = (k >> 1);
-              if (use_4x4_partition) {
-                if (!set_vt_partitioning(cpi, x, xd, &vtemp->split[k],
-                                         BLOCK_8X8,
-                                         mi_row + y32_idx + y16_idx + y8_idx,
-                                         mi_col + x32_idx + x16_idx + x8_idx,
-                                         thresholds[3], BLOCK_8X8, 0)) {
-                  set_block_size(cpi, x, xd,
-                                 (mi_row + y32_idx + y16_idx + y8_idx),
-                                 (mi_col + x32_idx + x16_idx + x8_idx),
-                                 BLOCK_4X4);
-                }
-              } else {
-                set_block_size(cpi, x, xd,
-                               (mi_row + y32_idx + y16_idx + y8_idx),
-                               (mi_col + x32_idx + x16_idx + x8_idx),
-                               BLOCK_8X8);
-              }
-            }
-          }
-        }
-      }
+  set_vt_partitioning(cpi, x, xd, vt, mi_row, mi_col, thre, bmin);
+}
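
Since the function above is long, a summary of its control flow (editorial
note, derived from the merged code):

    /* choose_partitioning() flow:
     *  1. set_offsets(..., cm->sb_size); clamp pixels_wide/high at frame edges.
     *  2. Inter frames: coarse int-pro motion estimation against LAST
     *     (switching to GOLDEN when its SAD is lower); if y_sad is tiny on a
     *     base segment, emit one sb_size block and return early.
     *  3. init_variance_tree() wires src/ref pointers and clamped dimensions
     *     through the tree.
     *  4. fill_variance_tree() computes leaf sums; check_split() or
     *     check_split_key_frame() marks forced splits; refine_variance_tree()
     *     re-measures at 4x4 for low-resolution inter frames.
     *  5. set_vt_partitioning() walks the tree and emits the block sizes.
     */
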
+
+#if CONFIG_DUAL_FILTER
+static void reset_intmv_filter_type(VP10_COMMON *cm,
+                                    MACROBLOCKD *xd, MB_MODE_INFO *mbmi) {
+  int dir;
+  for (dir = 0; dir < 2; ++dir) {
+    if (!has_subpel_mv_component(xd->mi[0], xd, dir) &&
+        (mbmi->ref_frame[1] == NONE ||
+         !has_subpel_mv_component(xd->mi[0], xd, dir + 2)))
+      mbmi->interp_filter[dir] = (cm->interp_filter == SWITCHABLE) ?
+          EIGHTTAP_REGULAR : cm->interp_filter;
+    mbmi->interp_filter[dir + 2] = mbmi->interp_filter[dir];
+  }
+}
+
+static void update_filter_type_count(FRAME_COUNTS *counts,
+                                     const MACROBLOCKD *xd,
+                                     const MB_MODE_INFO *mbmi) {
+  int dir;
+  for (dir = 0; dir < 2; ++dir) {
+    if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
+        (mbmi->ref_frame[1] > INTRA_FRAME &&
+         has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
+      const int ctx = vp10_get_pred_context_switchable_interp(xd, dir);
+      ++counts->switchable_interp[ctx][mbmi->interp_filter[dir]];
     }
   }
-  return 0;
 }
+#endif
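
Layout note (editorial): with CONFIG_DUAL_FILTER, mbmi->interp_filter[] holds
four entries; indices 0 and 1 are the two filter directions for the first
reference and indices 2 and 3 the same directions for the second, which is
why both helpers probe dir and dir + 2. reset_intmv_filter_type() keeps the
pairs mirrored:

    mbmi->interp_filter[dir + 2] = mbmi->interp_filter[dir];
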
 
 static void update_state(VP10_COMP *cpi, ThreadData *td,
                          PICK_MODE_CONTEXT *ctx,
@@ -967,11 +1113,37 @@
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
   int max_plane;
 
+#if CONFIG_REF_MV
+  int8_t rf_type;
+#endif
+
+#if !CONFIG_SUPERTX
   assert(mi->mbmi.sb_type == bsize);
+#endif
 
   *mi_addr = *mi;
   *x->mbmi_ext = ctx->mbmi_ext;
 
+#if CONFIG_DUAL_FILTER
+  reset_intmv_filter_type(cm, xd, mbmi);
+#endif
+
+#if CONFIG_REF_MV
+  rf_type = vp10_ref_frame_type(mbmi->ref_frame);
+  if (x->mbmi_ext->ref_mv_count[rf_type] > 1 &&
+      mbmi->sb_type >= BLOCK_8X8 &&
+      mbmi->mode == NEWMV) {
+    for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+      int_mv this_mv = (i == 0) ?
+          x->mbmi_ext->ref_mv_stack[rf_type][mbmi->ref_mv_idx].this_mv :
+          x->mbmi_ext->ref_mv_stack[rf_type][mbmi->ref_mv_idx].comp_mv;
+      clamp_mv_ref(&this_mv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
+      x->mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0] = this_mv;
+      mbmi->pred_mv[i] = this_mv;
+    }
+  }
+#endif
+
   // If segmentation in use
   if (seg->enabled) {
     // For in frame complexity AQ copy the segment id from the segment map.
@@ -985,8 +1157,8 @@
     // and then update the quantizer.
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
       vp10_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi, mi_row,
-                                        mi_col, bsize, ctx->rate, ctx->dist,
-                                        x->skip);
+                                         mi_col, bsize, ctx->rate, ctx->dist,
+                                         x->skip);
     }
   }
 
@@ -1018,7 +1190,7 @@
       }
 
   if (cpi->oxcf.aq_mode)
-    vp10_init_plane_quantizers(cpi, x);
+    vp10_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
 
   if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
     mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
@@ -1026,6 +1198,12 @@
   }
 
   x->skip = ctx->skip;
+
+#if CONFIG_VAR_TX
+  for (i = 0; i < 1; ++i)
+    memcpy(x->blk_skip[i], ctx->blk_skip[i],
+           sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif
   memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
          sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
 
@@ -1055,19 +1233,23 @@
   if (!frame_is_intra_only(cm)) {
     if (is_inter_block(mbmi)) {
       vp10_update_mv_count(td);
-
-      if (cm->interp_filter == SWITCHABLE) {
+      if (cm->interp_filter == SWITCHABLE
+#if CONFIG_EXT_INTERP
+          && vp10_is_interp_needed(xd)
+#endif
+          ) {
+#if CONFIG_DUAL_FILTER
+        update_filter_type_count(td->counts, xd, mbmi);
+#else
         const int ctx = vp10_get_pred_context_switchable_interp(xd);
         ++td->counts->switchable_interp[ctx][mbmi->interp_filter];
+#endif
       }
     }
 
     rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
     rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
     rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
-
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-      rdc->filter_diff[i] += ctx->best_filter_diff[i];
   }
 
   for (h = 0; h < y_mis; ++h) {
@@ -1082,6 +1264,406 @@
   }
 }
 
+#if CONFIG_SUPERTX
+static void update_state_supertx(VP10_COMP *cpi, ThreadData *td,
+                                 PICK_MODE_CONTEXT *ctx,
+                                 int mi_row, int mi_col, BLOCK_SIZE bsize,
+                                 int output_enabled) {
+  int y, x_idx;
+#if CONFIG_VAR_TX || CONFIG_REF_MV
+  int i;
+#endif
+  VP10_COMMON *const cm = &cpi->common;
+  RD_COUNTS *const rdc = &td->rd_counts;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *mi = &ctx->mic;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  MODE_INFO *mi_addr = xd->mi[0];
+  const struct segmentation *const seg = &cm->seg;
+  const int mis = cm->mi_stride;
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  const int x_mis = VPXMIN(mi_width, cm->mi_cols - mi_col);
+  const int y_mis = VPXMIN(mi_height, cm->mi_rows - mi_row);
+  MV_REF *const frame_mvs =
+      cm->cur_frame->mvs + mi_row * cm->mi_cols + mi_col;
+  int w, h;
+
+#if CONFIG_REF_MV
+  int8_t rf_type;
+#endif
+
+  *mi_addr = *mi;
+  *x->mbmi_ext = ctx->mbmi_ext;
+  assert(is_inter_block(mbmi));
+  assert(mbmi->tx_size == ctx->mic.mbmi.tx_size);
+
+#if CONFIG_DUAL_FILTER
+  reset_intmv_filter_type(cm, xd, mbmi);
+#endif
+
+#if CONFIG_REF_MV
+  rf_type = vp10_ref_frame_type(mbmi->ref_frame);
+  if (x->mbmi_ext->ref_mv_count[rf_type] > 1 &&
+      mbmi->sb_type >= BLOCK_8X8 &&
+      mbmi->mode == NEWMV) {
+    for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+      int_mv this_mv = (i == 0) ?
+          x->mbmi_ext->ref_mv_stack[rf_type][mbmi->ref_mv_idx].this_mv :
+          x->mbmi_ext->ref_mv_stack[rf_type][mbmi->ref_mv_idx].comp_mv;
+      clamp_mv_ref(&this_mv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
+      lower_mv_precision(&this_mv.as_mv, cm->allow_high_precision_mv);
+      x->mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0] = this_mv;
+      mbmi->pred_mv[i] = this_mv;
+    }
+  }
+#endif
+
+  // If segmentation in use
+  if (seg->enabled) {
+    if (cpi->vaq_refresh) {
+      const int energy = bsize <= BLOCK_16X16 ?
+                         x->mb_energy : vp10_block_energy(cpi, x, bsize);
+      mi_addr->mbmi.segment_id = vp10_vaq_segment_id(energy);
+    } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+      // For cyclic refresh mode, now update the segment map
+      // and set the segment id.
+      vp10_cyclic_refresh_update_segment(cpi, &xd->mi[0]->mbmi,
+                                         mi_row, mi_col, bsize,
+                                         ctx->rate, ctx->dist, 1);
+    } else {
+      // Otherwise just set the segment id based on the current segment map
+      const uint8_t *const map = seg->update_map ? cpi->segmentation_map
+                                                 : cm->last_frame_seg_map;
+      mi_addr->mbmi.segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+    }
+    mi_addr->mbmi.segment_id_supertx = MAX_SEGMENTS;
+  }
+
+  // Restore the coding context of the MB to the state that was in place
+  // when the mode was picked for it
+  for (y = 0; y < mi_height; y++)
+    for (x_idx = 0; x_idx < mi_width; x_idx++)
+      if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width > x_idx
+        && (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height > y) {
+        xd->mi[x_idx + y * mis] = mi_addr;
+      }
+
+  if (is_inter_block(mbmi) && mbmi->sb_type < BLOCK_8X8) {
+    mbmi->mv[0].as_int = mi->bmi[3].as_mv[0].as_int;
+    mbmi->mv[1].as_int = mi->bmi[3].as_mv[1].as_int;
+  }
+
+  x->skip = ctx->skip;
+
+#if CONFIG_VAR_TX
+  for (i = 0; i < 1; ++i)
+    memcpy(x->blk_skip[i], ctx->blk_skip[i],
+           sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif  // CONFIG_VAR_TX
+  memcpy(x->zcoeff_blk[mbmi->tx_size], ctx->zcoeff_blk,
+         sizeof(uint8_t) * ctx->num_4x4_blk);
+
+#if CONFIG_VAR_TX
+  {
+    const TX_SIZE mtx = mbmi->tx_size;
+    int idy, idx;
+    for (idy = 0; idy < (1 << mtx) / 2; ++idy)
+      for (idx = 0; idx < (1 << mtx) / 2; ++idx)
+        mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
+  }
+#endif  // CONFIG_VAR_TX
+  // Turn motion variation off for supertx
+  mbmi->motion_variation = SIMPLE_TRANSLATION;
+
+  if (!output_enabled)
+    return;
+
+  if (!frame_is_intra_only(cm)) {
+    vp10_update_mv_count(td);
+
+    if (cm->interp_filter == SWITCHABLE
+#if CONFIG_EXT_INTERP
+        && vp10_is_interp_needed(xd)
+#endif
+        ) {
+#if CONFIG_DUAL_FILTER
+      update_filter_type_count(td->counts, xd, mbmi);
+#else
+      const int ctx = vp10_get_pred_context_switchable_interp(xd);
+      ++td->counts->switchable_interp[ctx][mbmi->interp_filter];
+#endif
+    }
+
+    rdc->comp_pred_diff[SINGLE_REFERENCE] += ctx->single_pred_diff;
+    rdc->comp_pred_diff[COMPOUND_REFERENCE] += ctx->comp_pred_diff;
+    rdc->comp_pred_diff[REFERENCE_MODE_SELECT] += ctx->hybrid_pred_diff;
+  }
+
+  for (h = 0; h < y_mis; ++h) {
+    MV_REF *const frame_mv = frame_mvs + h * cm->mi_cols;
+    for (w = 0; w < x_mis; ++w) {
+      MV_REF *const mv = frame_mv + w;
+      mv->ref_frame[0] = mi->mbmi.ref_frame[0];
+      mv->ref_frame[1] = mi->mbmi.ref_frame[1];
+      mv->mv[0].as_int = mi->mbmi.mv[0].as_int;
+      mv->mv[1].as_int = mi->mbmi.mv[1].as_int;
+    }
+  }
+}
+
+static void update_state_sb_supertx(VP10_COMP *cpi, ThreadData *td,
+                                    const TileInfo *const tile,
+                                    int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize,
+                                    int output_enabled, PC_TREE *pc_tree) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct macroblock_plane *const p = x->plane;
+  struct macroblockd_plane *const pd = xd->plane;
+  int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+  PARTITION_TYPE partition = pc_tree->partitioning;
+  BLOCK_SIZE subsize = get_subsize(bsize, partition);
+  int i;
+#if CONFIG_EXT_PARTITION_TYPES
+  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+  PICK_MODE_CONTEXT *pmc = NULL;
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
+    x->mb_energy = vp10_block_energy(cpi, x, bsize);
+
+  switch (partition) {
+    case PARTITION_NONE:
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+      update_state_supertx(cpi, td, &pc_tree->none, mi_row, mi_col,
+                           subsize, output_enabled);
+      break;
+    case PARTITION_VERT:
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+      update_state_supertx(cpi, td, &pc_tree->vertical[0], mi_row, mi_col,
+                           subsize, output_enabled);
+      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+        set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
+        update_state_supertx(cpi, td, &pc_tree->vertical[1],
+                             mi_row, mi_col + hbs, subsize, output_enabled);
+      }
+      pmc = &pc_tree->vertical_supertx;
+      break;
+    case PARTITION_HORZ:
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+      update_state_supertx(cpi, td, &pc_tree->horizontal[0], mi_row, mi_col,
+                           subsize, output_enabled);
+      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+        set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
+        update_state_supertx(cpi, td, &pc_tree->horizontal[1], mi_row + hbs,
+                             mi_col, subsize, output_enabled);
+      }
+      pmc = &pc_tree->horizontal_supertx;
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8) {
+        set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+        update_state_supertx(cpi, td, pc_tree->leaf_split[0], mi_row, mi_col,
+                             subsize, output_enabled);
+      } else {
+        set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+        update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, subsize,
+                                output_enabled, pc_tree->split[0]);
+        set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
+        update_state_sb_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize,
+                                output_enabled, pc_tree->split[1]);
+        set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
+        update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize,
+                                output_enabled, pc_tree->split[2]);
+        set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, subsize);
+        update_state_sb_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs,
+                                subsize, output_enabled, pc_tree->split[3]);
+      }
+      pmc = &pc_tree->split_supertx;
+      break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_HORZ_A:
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
+      update_state_supertx(cpi, td, &pc_tree->horizontala[0], mi_row, mi_col,
+                           bsize2, output_enabled);
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
+      update_state_supertx(cpi, td, &pc_tree->horizontala[1], mi_row,
+                           mi_col + hbs, bsize2, output_enabled);
+      set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, subsize);
+      update_state_supertx(cpi, td, &pc_tree->horizontala[2], mi_row + hbs,
+                           mi_col, subsize, output_enabled);
+      pmc = &pc_tree->horizontala_supertx;
+      break;
+    case PARTITION_HORZ_B:
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+      update_state_supertx(cpi, td, &pc_tree->horizontalb[0], mi_row, mi_col,
+                           subsize, output_enabled);
+      set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
+      update_state_supertx(cpi, td, &pc_tree->horizontalb[1], mi_row + hbs,
+                           mi_col, bsize2, output_enabled);
+      set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
+      update_state_supertx(cpi, td, &pc_tree->horizontalb[2], mi_row + hbs,
+                           mi_col + hbs, bsize2, output_enabled);
+      pmc = &pc_tree->horizontalb_supertx;
+      break;
+    case PARTITION_VERT_A:
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, bsize2);
+      update_state_supertx(cpi, td, &pc_tree->verticala[0], mi_row, mi_col,
+                           bsize2, output_enabled);
+      set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col, bsize2);
+      update_state_supertx(cpi, td, &pc_tree->verticala[1], mi_row + hbs,
+                           mi_col, bsize2, output_enabled);
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, subsize);
+      update_state_supertx(cpi, td, &pc_tree->verticala[2], mi_row,
+                           mi_col + hbs, subsize, output_enabled);
+      pmc = &pc_tree->verticala_supertx;
+      break;
+    case PARTITION_VERT_B:
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col, subsize);
+      update_state_supertx(cpi, td, &pc_tree->verticalb[0], mi_row, mi_col,
+                           subsize, output_enabled);
+      set_offsets_supertx(cpi, td, tile, mi_row, mi_col + hbs, bsize2);
+      update_state_supertx(cpi, td, &pc_tree->verticalb[1], mi_row,
+                           mi_col + hbs, bsize2, output_enabled);
+      set_offsets_supertx(cpi, td, tile, mi_row + hbs, mi_col + hbs, bsize2);
+      update_state_supertx(cpi, td, &pc_tree->verticalb[2], mi_row + hbs,
+                           mi_col + hbs, bsize2, output_enabled);
+      pmc = &pc_tree->verticalb_supertx;
+      break;
+#endif  // CONFIG_EXT_PARTITION_TYPES
+    default:
+      assert(0);
+  }
+
+  for (i = 0; i < MAX_MB_PLANE; ++i) {
+    if (pmc != NULL) {
+      p[i].coeff = pmc->coeff_pbuf[i][1];
+      p[i].qcoeff = pmc->qcoeff_pbuf[i][1];
+      pd[i].dqcoeff = pmc->dqcoeff_pbuf[i][1];
+      p[i].eobs = pmc->eobs_pbuf[i][1];
+    } else {
+      // These should never be used
+      p[i].coeff = NULL;
+      p[i].qcoeff = NULL;
+      pd[i].dqcoeff = NULL;
+      p[i].eobs = NULL;
+    }
+  }
+}
+
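+// Write the chosen supertx transform size/type and skip state back into ctx.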
+static void update_supertx_param(ThreadData *td,
+                                 PICK_MODE_CONTEXT *ctx,
+                                 int best_tx,
+                                 TX_SIZE supertx_size) {
+  MACROBLOCK *const x = &td->mb;
+#if CONFIG_VAR_TX
+  int i;
+
+  for (i = 0; i < 1; ++i)
+    memcpy(ctx->blk_skip[i], x->blk_skip[i],
+           sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif  // CONFIG_VAR_TX
+  memcpy(ctx->zcoeff_blk, x->zcoeff_blk[supertx_size],
+         sizeof(uint8_t) * ctx->num_4x4_blk);
+  ctx->mic.mbmi.tx_size = supertx_size;
+  ctx->skip = x->skip;
+  ctx->mic.mbmi.tx_type = best_tx;
+}
+
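+// Apply update_supertx_param() to every coded block in the partition tree.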
+static void update_supertx_param_sb(VP10_COMP *cpi, ThreadData *td,
+                                    int mi_row, int mi_col,
+                                    BLOCK_SIZE bsize,
+                                    int best_tx,
+                                    TX_SIZE supertx_size, PC_TREE *pc_tree) {
+  VP10_COMMON *const cm = &cpi->common;
+  int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
+  PARTITION_TYPE partition = pc_tree->partitioning;
+  BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+  int i;
+#endif
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  switch (partition) {
+    case PARTITION_NONE:
+      update_supertx_param(td, &pc_tree->none,
+                           best_tx,
+                           supertx_size);
+      break;
+    case PARTITION_VERT:
+      update_supertx_param(td, &pc_tree->vertical[0],
+                           best_tx,
+                           supertx_size);
+      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8)
+        update_supertx_param(td, &pc_tree->vertical[1],
+                             best_tx,
+                             supertx_size);
+      break;
+    case PARTITION_HORZ:
+      update_supertx_param(td, &pc_tree->horizontal[0],
+                           best_tx,
+                           supertx_size);
+      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8)
+        update_supertx_param(td, &pc_tree->horizontal[1],
+                             best_tx,
+                             supertx_size);
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8) {
+        update_supertx_param(td, pc_tree->leaf_split[0],
+                             best_tx,
+                             supertx_size);
+      } else {
+        update_supertx_param_sb(cpi, td, mi_row, mi_col, subsize,
+                                best_tx,
+                                supertx_size, pc_tree->split[0]);
+        update_supertx_param_sb(cpi, td, mi_row, mi_col + hbs, subsize,
+                                best_tx,
+                                supertx_size, pc_tree->split[1]);
+        update_supertx_param_sb(cpi, td, mi_row + hbs, mi_col, subsize,
+                                best_tx,
+                                supertx_size, pc_tree->split[2]);
+        update_supertx_param_sb(cpi, td, mi_row + hbs, mi_col + hbs, subsize,
+                                best_tx,
+                                supertx_size, pc_tree->split[3]);
+      }
+      break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_HORZ_A:
+      for (i = 0; i < 3; i++)
+        update_supertx_param(td, &pc_tree->horizontala[i], best_tx,
+                             supertx_size);
+      break;
+    case PARTITION_HORZ_B:
+      for (i = 0; i < 3; i++)
+        update_supertx_param(td, &pc_tree->horizontalb[i], best_tx,
+                             supertx_size);
+      break;
+    case PARTITION_VERT_A:
+      for (i = 0; i < 3; i++)
+        update_supertx_param(td, &pc_tree->verticala[i], best_tx,
+                             supertx_size);
+      break;
+    case PARTITION_VERT_B:
+      for (i = 0; i < 3; i++)
+        update_supertx_param(td, &pc_tree->verticalb[i], best_tx,
+                             supertx_size);
+      break;
+#endif  // CONFIG_EXT_PARTITION_TYPES
+    default:
+      assert(0);
+  }
+}
+#endif  // CONFIG_SUPERTX
+
 void vp10_setup_src_planes(MACROBLOCK *x, const YV12_BUFFER_CONFIG *src,
                           int mi_row, int mi_col) {
   uint8_t *const buffers[3] = {src->y_buffer, src->u_buffer, src->v_buffer };
@@ -1102,7 +1684,7 @@
                                int8_t segment_id) {
   int segment_qindex;
   VP10_COMMON *const cm = &cpi->common;
-  vp10_init_plane_quantizers(cpi, x);
+  vp10_init_plane_quantizers(cpi, x, segment_id);
   vpx_clear_system_state();
   segment_qindex = vp10_get_qindex(&cm->seg, segment_id,
                                   cm->base_qindex);
@@ -1113,6 +1695,12 @@
                              TileDataEnc *tile_data,
                              MACROBLOCK *const x,
                              int mi_row, int mi_col, RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+                             int *totalrate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+                             PARTITION_TYPE partition,
+#endif
                              BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
                              int64_t best_rd) {
   VP10_COMMON *const cm = &cpi->common;
@@ -1132,6 +1720,17 @@
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
   mbmi = &xd->mi[0]->mbmi;
   mbmi->sb_type = bsize;
+#if CONFIG_SUPERTX
+  // Set tx_size here because skip blocks would otherwise leave it unset.
+  // It must be valid at this point: supertx_enable in write_modes_sb is
+  // derived from it, and if the stale value in memory happened to equal the
+  // supertx size, the packer would code this block as a supertx block even
+  // though rdopt never chose supertx for it.
+  mbmi->tx_size = max_txsize_lookup[bsize];
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+  mbmi->partition = partition;
+#endif
 
   for (i = 0; i < MAX_MB_PLANE; ++i) {
     p[i].coeff = ctx->coeff_pbuf[i][0];
@@ -1146,7 +1745,6 @@
   ctx->is_coded = 0;
   ctx->skippable = 0;
   ctx->pred_pixel_ready = 0;
-  x->skip_recode = 0;
 
   // Set to zero to make sure we do not use the previous encoded frame stats
   mbmi->skip = 0;
@@ -1169,26 +1767,20 @@
   orig_rdmult = x->rdmult;
 
   if (aq_mode == VARIANCE_AQ) {
-    const int energy = bsize <= BLOCK_16X16 ? x->mb_energy
-                                            : vp10_block_energy(cpi, x, bsize);
-    if (cm->frame_type == KEY_FRAME ||
-        cpi->refresh_alt_ref_frame ||
-        (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref)) {
+    if (cpi->vaq_refresh) {
+      const int energy = bsize <= BLOCK_16X16 ?
+                         x->mb_energy : vp10_block_energy(cpi, x, bsize);
       mbmi->segment_id = vp10_vaq_segment_id(energy);
-    } else {
-      const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
-                                                    : cm->last_frame_seg_map;
-      mbmi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
+      // Re-initialise quantiser
+      vp10_init_plane_quantizers(cpi, x, mbmi->segment_id);
+      x->encode_breakout = cpi->segment_encode_breakout[mbmi->segment_id];
     }
     x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
   } else if (aq_mode == COMPLEXITY_AQ) {
     x->rdmult = set_segment_rdmult(cpi, x, mbmi->segment_id);
   } else if (aq_mode == CYCLIC_REFRESH_AQ) {
-    const uint8_t *const map = cm->seg.update_map ? cpi->segmentation_map
-                                                  : cm->last_frame_seg_map;
     // If segment is boosted, use rdmult for that segment.
-    if (cyclic_refresh_segment_id_boosted(
-            get_segment_id(cm, map, bsize, mi_row, mi_col)))
+    if (cyclic_refresh_segment_id_boosted(mbmi->segment_id))
       x->rdmult = vp10_cyclic_refresh_get_rdmult(cpi->cyclic_refresh);
   }
 
@@ -1196,17 +1788,42 @@
   // as a predictor for MBs that follow in the SB
   if (frame_is_intra_only(cm)) {
     vp10_rd_pick_intra_mode_sb(cpi, x, rd_cost, bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+    *totalrate_nocoef = 0;
+#endif  // CONFIG_SUPERTX
   } else {
     if (bsize >= BLOCK_8X8) {
-      if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP))
+      if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
         vp10_rd_pick_inter_mode_sb_seg_skip(cpi, tile_data, x, rd_cost, bsize,
                                            ctx, best_rd);
-      else
-        vp10_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col,
-                                  rd_cost, bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+        *totalrate_nocoef = rd_cost->rate;
+#endif  // CONFIG_SUPERTX
+      } else {
+        vp10_rd_pick_inter_mode_sb(cpi, tile_data, x, mi_row, mi_col, rd_cost,
+#if CONFIG_SUPERTX
+                                   totalrate_nocoef,
+#endif  // CONFIG_SUPERTX
+                                   bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+        assert(*totalrate_nocoef >= 0);
+#endif  // CONFIG_SUPERTX
+      }
     } else {
-      vp10_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col,
-                                    rd_cost, bsize, ctx, best_rd);
+      if (segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+        // The decoder rejects sub8x8 partitions when SEG_LVL_SKIP is set.
+        rd_cost->rate = INT_MAX;
+      } else {
+        vp10_rd_pick_inter_mode_sub8x8(cpi, tile_data, x, mi_row, mi_col,
+                                       rd_cost,
+#if CONFIG_SUPERTX
+                                       totalrate_nocoef,
+#endif  // CONFIG_SUPERTX
+                                       bsize, ctx, best_rd);
+#if CONFIG_SUPERTX
+        assert(*totalrate_nocoef >= 0);
+#endif  // CONFIG_SUPERTX
+      }
     }
   }
 
@@ -1231,7 +1848,56 @@
   ctx->dist = rd_cost->dist;
 }
 
-static void update_stats(VP10_COMMON *cm, ThreadData *td) {
+#if CONFIG_REF_MV
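+// Update the inter mode counts. The packed mode_context is consumed one
+// binary decision at a time: first NEWMV vs. the rest, then ZEROMV vs. the
+// rest, then NEARESTMV vs. NEARMV under the derived refmv context.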
+static void update_inter_mode_stats(FRAME_COUNTS *counts,
+                                    PREDICTION_MODE mode,
+#if CONFIG_EXT_INTER
+                                    int is_compound,
+#endif  // CONFIG_EXT_INTER
+                                    int16_t mode_context) {
+  int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+#if CONFIG_EXT_INTER
+  if (mode == NEWMV || mode == NEWFROMNEARMV) {
+    if (!is_compound)
+      ++counts->new2mv_mode[mode == NEWFROMNEARMV];
+#else
+  if (mode == NEWMV) {
+#endif  // CONFIG_EXT_INTER
+    ++counts->newmv_mode[mode_ctx][0];
+    return;
+  } else {
+    ++counts->newmv_mode[mode_ctx][1];
+
+    if (mode_context & (1 << ALL_ZERO_FLAG_OFFSET)) {
+      return;
+    }
+
+    mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
+    if (mode == ZEROMV) {
+      ++counts->zeromv_mode[mode_ctx][0];
+      return;
+    } else {
+      ++counts->zeromv_mode[mode_ctx][1];
+      mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+
+      if (mode_context & (1 << SKIP_NEARESTMV_OFFSET))
+        mode_ctx = 6;
+      if (mode_context & (1 << SKIP_NEARMV_OFFSET))
+        mode_ctx = 7;
+      if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET))
+        mode_ctx = 8;
+
+      ++counts->refmv_mode[mode_ctx][mode != NEARESTMV];
+    }
+  }
+}
+#endif
+
+static void update_stats(VP10_COMMON *cm, ThreadData *td
+#if CONFIG_SUPERTX
+                         , int supertx_enabled
+#endif
+                         ) {
   const MACROBLOCK *x = &td->mb;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MODE_INFO *const mi = xd->mi[0];
@@ -1245,34 +1911,177 @@
     const int seg_ref_active = segfeature_active(&cm->seg, mbmi->segment_id,
                                                  SEG_LVL_REF_FRAME);
     if (!seg_ref_active) {
+#if CONFIG_SUPERTX
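+      // Blocks covered by a supertx partition do not signal intra/inter
+      // individually, so do not count the flag for them.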
+      if (!supertx_enabled)
+#endif
       counts->intra_inter[vp10_get_intra_inter_context(xd)][inter_block]++;
       // If the segment reference feature is enabled we have only a single
       // reference frame allowed for the segment so exclude it from
       // the reference frame counts used to work out probabilities.
       if (inter_block) {
         const MV_REFERENCE_FRAME ref0 = mbmi->ref_frame[0];
+#if CONFIG_EXT_REFS
+        const MV_REFERENCE_FRAME ref1 = mbmi->ref_frame[1];
+#endif  // CONFIG_EXT_REFS
+
         if (cm->reference_mode == REFERENCE_MODE_SELECT)
           counts->comp_inter[vp10_get_reference_mode_context(cm, xd)]
                             [has_second_ref(mbmi)]++;
 
         if (has_second_ref(mbmi)) {
-          counts->comp_ref[vp10_get_pred_context_comp_ref_p(cm, xd)]
+#if CONFIG_EXT_REFS
+          const int bit = (ref0 == GOLDEN_FRAME || ref0 == LAST3_FRAME);
+
+          counts->comp_ref[vp10_get_pred_context_comp_ref_p(cm, xd)][0][bit]++;
+          if (!bit) {
+            counts->comp_ref[vp10_get_pred_context_comp_ref_p1(cm, xd)][1]
+                            [ref0 == LAST_FRAME]++;
+          } else {
+            counts->comp_ref[vp10_get_pred_context_comp_ref_p2(cm, xd)][2]
+                            [ref0 == GOLDEN_FRAME]++;
+          }
+
+          counts->comp_bwdref[vp10_get_pred_context_comp_bwdref_p(cm, xd)][0]
+                             [ref1 == ALTREF_FRAME]++;
+#else
+          counts->comp_ref[vp10_get_pred_context_comp_ref_p(cm, xd)][0]
                           [ref0 == GOLDEN_FRAME]++;
+#endif  // CONFIG_EXT_REFS
         } else {
+#if CONFIG_EXT_REFS
+          const int bit = (ref0 == ALTREF_FRAME || ref0 == BWDREF_FRAME);
+
+          counts->single_ref[vp10_get_pred_context_single_ref_p1(xd)][0][bit]++;
+          if (bit) {
+            counts->single_ref[vp10_get_pred_context_single_ref_p2(xd)][1]
+                              [ref0 != BWDREF_FRAME]++;
+          } else {
+            const int bit1 = !(ref0 == LAST2_FRAME || ref0 == LAST_FRAME);
+            counts->single_ref[vp10_get_pred_context_single_ref_p3(xd)][2]
+                              [bit1]++;
+            if (!bit1) {
+              counts->single_ref[vp10_get_pred_context_single_ref_p4(xd)][3]
+                                [ref0 != LAST_FRAME]++;
+            } else {
+              counts->single_ref[vp10_get_pred_context_single_ref_p5(xd)][4]
+                                [ref0 != LAST3_FRAME]++;
+            }
+          }
+#else
           counts->single_ref[vp10_get_pred_context_single_ref_p1(xd)][0]
                             [ref0 != LAST_FRAME]++;
-          if (ref0 != LAST_FRAME)
+          if (ref0 != LAST_FRAME) {
             counts->single_ref[vp10_get_pred_context_single_ref_p2(xd)][1]
                               [ref0 != GOLDEN_FRAME]++;
+          }
+#endif  // CONFIG_EXT_REFS
         }
+
+#if CONFIG_EXT_INTER
+    if (cm->reference_mode != COMPOUND_REFERENCE &&
+#if CONFIG_SUPERTX
+        !supertx_enabled &&
+#endif
+        is_interintra_allowed(mbmi)) {
+      const int bsize_group = size_group_lookup[bsize];
+      if (mbmi->ref_frame[1] == INTRA_FRAME) {
+        counts->interintra[bsize_group][1]++;
+        counts->interintra_mode[bsize_group][mbmi->interintra_mode]++;
+        if (is_interintra_wedge_used(bsize))
+          counts->wedge_interintra[bsize][mbmi->use_wedge_interintra]++;
+      } else {
+        counts->interintra[bsize_group][0]++;
       }
     }
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+#if CONFIG_SUPERTX
+        if (!supertx_enabled)
+#endif  // CONFIG_SUPERTX
+#if CONFIG_EXT_INTER
+        if (mbmi->ref_frame[1] != INTRA_FRAME)
+#endif  // CONFIG_EXT_INTER
+          if (is_motvar_allowed(mbmi))
+            counts->motvar[mbmi->sb_type][mbmi->motion_variation]++;
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+
+#if CONFIG_EXT_INTER
+        if (cm->reference_mode != SINGLE_REFERENCE &&
+            is_inter_compound_mode(mbmi->mode) &&
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+            !(is_motvar_allowed(mbmi) &&
+              mbmi->motion_variation != SIMPLE_TRANSLATION) &&
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+            is_interinter_wedge_used(bsize)) {
+          counts->wedge_interinter[bsize][mbmi->use_wedge_interinter]++;
+        }
+#endif  // CONFIG_EXT_INTER
+      }
+    }
+
     if (inter_block &&
         !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
-      const int mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
+      int16_t mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
       if (bsize >= BLOCK_8X8) {
         const PREDICTION_MODE mode = mbmi->mode;
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+        if (has_second_ref(mbmi)) {
+          mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+          ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+        } else {
+#endif  // CONFIG_EXT_INTER
+        mode_ctx = vp10_mode_context_analyzer(mbmi_ext->mode_context,
+                                              mbmi->ref_frame, bsize, -1);
+        update_inter_mode_stats(counts, mode,
+#if CONFIG_EXT_INTER
+                                has_second_ref(mbmi),
+#endif  // CONFIG_EXT_INTER
+                                mode_ctx);
+
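+        // Count the reference MV index (drl) decisions: one binary choice
+        // per candidate entry in the reference MV stack.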
+        if (mode == NEWMV) {
+          uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
+          int idx;
+
+          for (idx = 0; idx < 2; ++idx) {
+            if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+              uint8_t drl_ctx =
+                  vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+              ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx];
+
+              if (mbmi->ref_mv_idx == idx)
+                break;
+            }
+          }
+        }
+
+        if (mode == NEARMV) {
+          uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
+          int idx;
+
+          for (idx = 1; idx < 3; ++idx) {
+            if (mbmi_ext->ref_mv_count[ref_frame_type] > idx + 1) {
+              uint8_t drl_ctx =
+                  vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type], idx);
+              ++counts->drl_mode[drl_ctx][mbmi->ref_mv_idx != idx - 1];
+
+              if (mbmi->ref_mv_idx == idx - 1)
+                break;
+            }
+          }
+        }
+#if CONFIG_EXT_INTER
+        }
+#endif  // CONFIG_EXT_INTER
+#else
+#if CONFIG_EXT_INTER
+        if (is_inter_compound_mode(mode))
+          ++counts->inter_compound_mode[mode_ctx][INTER_COMPOUND_OFFSET(mode)];
+        else
+#endif  // CONFIG_EXT_INTER
         ++counts->inter_mode[mode_ctx][INTER_OFFSET(mode)];
+#endif
       } else {
         const int num_4x4_w = num_4x4_blocks_wide_lookup[bsize];
         const int num_4x4_h = num_4x4_blocks_high_lookup[bsize];
@@ -1281,7 +2090,33 @@
           for (idx = 0; idx < 2; idx += num_4x4_w) {
             const int j = idy * 2 + idx;
             const PREDICTION_MODE b_mode = mi->bmi[j].as_mode;
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+          if (has_second_ref(mbmi)) {
+            mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+            ++counts->inter_compound_mode[mode_ctx]
+                                         [INTER_COMPOUND_OFFSET(b_mode)];
+          } else {
+#endif  // CONFIG_EXT_INTER
+            mode_ctx = vp10_mode_context_analyzer(mbmi_ext->mode_context,
+                                                  mbmi->ref_frame, bsize, j);
+            update_inter_mode_stats(counts, b_mode,
+#if CONFIG_EXT_INTER
+                                    has_second_ref(mbmi),
+#endif  // CONFIG_EXT_INTER
+                                    mode_ctx);
+#if CONFIG_EXT_INTER
+            }
+#endif  // CONFIG_EXT_INTER
+#else
+#if CONFIG_EXT_INTER
+            if (is_inter_compound_mode(b_mode))
+              ++counts->inter_compound_mode[mode_ctx]
+                                           [INTER_COMPOUND_OFFSET(b_mode)];
+            else
+#endif  // CONFIG_EXT_INTER
             ++counts->inter_mode[mode_ctx][INTER_OFFSET(b_mode)];
+#endif
           }
         }
       }
@@ -1289,12 +2124,23 @@
   }
 }
 
-static void restore_context(MACROBLOCK *const x, int mi_row, int mi_col,
-                            ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
-                            ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
-                            PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
-                            BLOCK_SIZE bsize) {
-  MACROBLOCKD *const xd = &x->e_mbd;
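+// Snapshot of the above/left entropy and partition contexts (plus the
+// transform-size contexts under CONFIG_VAR_TX) used to save and restore
+// state around the RD partition search.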
+typedef struct {
+  ENTROPY_CONTEXT a[2 * MAX_MIB_SIZE * MAX_MB_PLANE];
+  ENTROPY_CONTEXT l[2 * MAX_MIB_SIZE * MAX_MB_PLANE];
+  PARTITION_CONTEXT sa[MAX_MIB_SIZE];
+  PARTITION_CONTEXT sl[MAX_MIB_SIZE];
+#if CONFIG_VAR_TX
+  TXFM_CONTEXT *p_ta;
+  TXFM_CONTEXT *p_tl;
+  TXFM_CONTEXT ta[MAX_MIB_SIZE];
+  TXFM_CONTEXT tl[MAX_MIB_SIZE];
+#endif
+} RD_SEARCH_MACROBLOCK_CONTEXT;
+
+static void restore_context(MACROBLOCK *x,
+                            const RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                            int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  MACROBLOCKD *xd = &x->e_mbd;
   int p;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
@@ -1303,28 +2149,34 @@
   for (p = 0; p < MAX_MB_PLANE; p++) {
     memcpy(
         xd->above_context[p] + ((mi_col * 2) >> xd->plane[p].subsampling_x),
-        a + num_4x4_blocks_wide * p,
+        ctx->a + num_4x4_blocks_wide * p,
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
         xd->plane[p].subsampling_x);
     memcpy(
         xd->left_context[p]
-            + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
-        l + num_4x4_blocks_high * p,
+            + ((mi_row & MAX_MIB_MASK) * 2 >> xd->plane[p].subsampling_y),
+        ctx->l + num_4x4_blocks_high * p,
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
         xd->plane[p].subsampling_y);
   }
-  memcpy(xd->above_seg_context + mi_col, sa,
+  memcpy(xd->above_seg_context + mi_col, ctx->sa,
          sizeof(*xd->above_seg_context) * mi_width);
-  memcpy(xd->left_seg_context + (mi_row & MI_MASK), sl,
+  memcpy(xd->left_seg_context + (mi_row & MAX_MIB_MASK), ctx->sl,
          sizeof(xd->left_seg_context[0]) * mi_height);
+#if CONFIG_VAR_TX
+  xd->above_txfm_context = ctx->p_ta;
+  xd->left_txfm_context = ctx->p_tl;
+  memcpy(xd->above_txfm_context, ctx->ta,
+         sizeof(*xd->above_txfm_context) * mi_width);
+  memcpy(xd->left_txfm_context, ctx->tl,
+         sizeof(*xd->left_txfm_context) * mi_height);
+#endif
 }
 
-static void save_context(MACROBLOCK *const x, int mi_row, int mi_col,
-                         ENTROPY_CONTEXT a[16 * MAX_MB_PLANE],
-                         ENTROPY_CONTEXT l[16 * MAX_MB_PLANE],
-                         PARTITION_CONTEXT sa[8], PARTITION_CONTEXT sl[8],
-                         BLOCK_SIZE bsize) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
+static void save_context(const MACROBLOCK *x,
+                         RD_SEARCH_MACROBLOCK_CONTEXT *ctx,
+                         int mi_row, int mi_col, BLOCK_SIZE bsize) {
+  const MACROBLOCKD *xd = &x->e_mbd;
   int p;
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[bsize];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[bsize];
@@ -1334,35 +2186,53 @@
   // buffer the above/left context information of the block in search.
   for (p = 0; p < MAX_MB_PLANE; ++p) {
     memcpy(
-        a + num_4x4_blocks_wide * p,
+        ctx->a + num_4x4_blocks_wide * p,
         xd->above_context[p] + (mi_col * 2 >> xd->plane[p].subsampling_x),
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_wide) >>
         xd->plane[p].subsampling_x);
     memcpy(
-        l + num_4x4_blocks_high * p,
+        ctx->l + num_4x4_blocks_high * p,
         xd->left_context[p]
-            + ((mi_row & MI_MASK) * 2 >> xd->plane[p].subsampling_y),
+            + ((mi_row & MAX_MIB_MASK) * 2 >> xd->plane[p].subsampling_y),
         (sizeof(ENTROPY_CONTEXT) * num_4x4_blocks_high) >>
         xd->plane[p].subsampling_y);
   }
-  memcpy(sa, xd->above_seg_context + mi_col,
+  memcpy(ctx->sa, xd->above_seg_context + mi_col,
          sizeof(*xd->above_seg_context) * mi_width);
-  memcpy(sl, xd->left_seg_context + (mi_row & MI_MASK),
+  memcpy(ctx->sl, xd->left_seg_context + (mi_row & MAX_MIB_MASK),
          sizeof(xd->left_seg_context[0]) * mi_height);
+#if CONFIG_VAR_TX
+  memcpy(ctx->ta, xd->above_txfm_context,
+         sizeof(*xd->above_txfm_context) * mi_width);
+  memcpy(ctx->tl, xd->left_txfm_context,
+         sizeof(*xd->left_txfm_context) * mi_height);
+  ctx->p_ta = xd->above_txfm_context;
+  ctx->p_tl = xd->left_txfm_context;
+#endif
 }
 
 static void encode_b(VP10_COMP *cpi, const TileInfo *const tile,
                      ThreadData *td,
                      TOKENEXTRA **tp, int mi_row, int mi_col,
                      int output_enabled, BLOCK_SIZE bsize,
+#if CONFIG_EXT_PARTITION_TYPES
+                     PARTITION_TYPE partition,
+#endif
                      PICK_MODE_CONTEXT *ctx) {
   MACROBLOCK *const x = &td->mb;
   set_offsets(cpi, tile, x, mi_row, mi_col, bsize);
+#if CONFIG_EXT_PARTITION_TYPES
+  x->e_mbd.mi[0]->mbmi.partition = partition;
+#endif
   update_state(cpi, td, ctx, mi_row, mi_col, bsize, output_enabled);
   encode_superblock(cpi, td, tp, output_enabled, mi_row, mi_col, bsize, ctx);
 
   if (output_enabled) {
+#if CONFIG_SUPERTX
+    update_stats(&cpi->common, td, 0);
+#else
     update_stats(&cpi->common, td);
+#endif
   }
 }
 
@@ -1371,54 +2241,164 @@
                       TOKENEXTRA **tp, int mi_row, int mi_col,
                       int output_enabled, BLOCK_SIZE bsize,
                       PC_TREE *pc_tree) {
-  VP10_COMMON *const cm = &cpi->common;
+  const VP10_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
 
-  const int bsl = b_width_log2_lookup[bsize], hbs = (1 << bsl) / 4;
-  int ctx;
-  PARTITION_TYPE partition;
-  BLOCK_SIZE subsize = bsize;
+  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+  const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+
+  assert(bsize >= BLOCK_8X8);
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
-  if (bsize >= BLOCK_8X8) {
-    ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
-    subsize = get_subsize(bsize, pc_tree->partitioning);
-  } else {
-    ctx = 0;
-    subsize = BLOCK_4X4;
-  }
-
-  partition = partition_lookup[bsl][subsize];
-  if (output_enabled && bsize != BLOCK_4X4)
+  if (output_enabled)
     td->counts->partition[ctx][partition]++;
 
+#if CONFIG_SUPERTX
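+  // If the whole partition can be coded with a single (super) transform,
+  // encode it that way here and return; otherwise fall through to the
+  // normal per-block encoding below.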
+  if (!frame_is_intra_only(cm) &&
+      bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+      partition != PARTITION_NONE &&
+      !xd->lossless[0]) {
+    int supertx_enabled;
+    TX_SIZE supertx_size = max_txsize_lookup[bsize];
+    supertx_enabled = check_supertx_sb(bsize, supertx_size, pc_tree);
+    if (supertx_enabled) {
+      const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+      const int mi_height = num_8x8_blocks_high_lookup[bsize];
+      int x_idx, y_idx, i;
+      uint8_t *dst_buf[3];
+      int dst_stride[3];
+      set_skip_context(xd, mi_row, mi_col);
+      set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+      update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize,
+                              output_enabled, pc_tree);
+
+      vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm),
+                            mi_row, mi_col);
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        dst_buf[i] = xd->plane[i].dst.buf;
+        dst_stride[i] = xd->plane[i].dst.stride;
+      }
+      predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col,
+                         output_enabled, bsize, bsize,
+                         dst_buf, dst_stride, pc_tree);
+
+      set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+      set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize);
+
+      if (!x->skip) {
+        x->skip_optimize = 0;
+        x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
+
+        vp10_encode_sb_supertx(x, bsize);
+        vp10_tokenize_sb_supertx(cpi, td, tp, !output_enabled, bsize);
+      } else {
+        xd->mi[0]->mbmi.skip = 1;
+        if (output_enabled)
+          td->counts->skip[vp10_get_skip_context(xd)][1]++;
+        reset_skip_context(xd, bsize);
+      }
+      if (output_enabled) {
+        for (y_idx = 0; y_idx < mi_height; y_idx++)
+          for (x_idx = 0; x_idx < mi_width; x_idx++) {
+            if ((xd->mb_to_right_edge >> (3 + MI_SIZE_LOG2)) + mi_width >
+                    x_idx &&
+                (xd->mb_to_bottom_edge >> (3 + MI_SIZE_LOG2)) + mi_height >
+                    y_idx) {
+              xd->mi[x_idx + y_idx * cm->mi_stride]->mbmi.skip =
+                  xd->mi[0]->mbmi.skip;
+            }
+          }
+        td->counts->supertx
+            [partition_supertx_context_lookup[partition]][supertx_size][1]++;
+        td->counts->supertx_size[supertx_size]++;
+#if CONFIG_EXT_TX
+        if (get_ext_tx_types(supertx_size, bsize, 1) > 1 &&
+            !xd->mi[0]->mbmi.skip) {
+          int eset = get_ext_tx_set(supertx_size, bsize, 1);
+          if (eset > 0) {
+            ++td->counts->inter_ext_tx[eset][supertx_size]
+                                      [xd->mi[0]->mbmi.tx_type];
+          }
+        }
+#else
+        if (supertx_size < TX_32X32 &&
+            !xd->mi[0]->mbmi.skip) {
+          ++td->counts->inter_ext_tx[supertx_size][xd->mi[0]->mbmi.tx_type];
+        }
+#endif  // CONFIG_EXT_TX
+      }
+#if CONFIG_EXT_PARTITION_TYPES
+      update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize,
+                                   partition);
+#else
+      if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
+        update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif
+#if CONFIG_VAR_TX
+      set_txfm_ctx(xd->left_txfm_context, supertx_size, xd->n8_h);
+      set_txfm_ctx(xd->above_txfm_context, supertx_size, mi_width);
+#endif  // CONFIG_VAR_TX
+      return;
+    } else {
+      if (output_enabled) {
+        td->counts->supertx
+            [partition_supertx_context_lookup[partition]][supertx_size][0]++;
+      }
+    }
+  }
+#endif  // CONFIG_SUPERTX
+
   switch (partition) {
     case PARTITION_NONE:
       encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+               partition,
+#endif
                &pc_tree->none);
       break;
     case PARTITION_VERT:
       encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+               partition,
+#endif
                &pc_tree->vertical[0]);
       if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
         encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled,
-                 subsize, &pc_tree->vertical[1]);
+                 subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+                 partition,
+#endif
+                 &pc_tree->vertical[1]);
       }
       break;
     case PARTITION_HORZ:
       encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+               partition,
+#endif
                &pc_tree->horizontal[0]);
       if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
         encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled,
-                 subsize, &pc_tree->horizontal[1]);
+                 subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+                 partition,
+#endif
+                 &pc_tree->horizontal[1]);
       }
       break;
     case PARTITION_SPLIT:
       if (bsize == BLOCK_8X8) {
         encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+#if CONFIG_EXT_PARTITION_TYPES
+                 partition,
+#endif
                  pc_tree->leaf_split[0]);
       } else {
         encode_sb(cpi, td, tile, tp, mi_row, mi_col, output_enabled, subsize,
@@ -1431,17 +2411,56 @@
                   subsize, pc_tree->split[3]);
       }
       break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_HORZ_A:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, bsize2,
+               partition, &pc_tree->horizontala[0]);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, bsize2,
+               partition, &pc_tree->horizontala[1]);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, subsize,
+               partition, &pc_tree->horizontala[2]);
+      break;
+    case PARTITION_HORZ_B:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+               partition, &pc_tree->horizontalb[0]);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, bsize2,
+               partition, &pc_tree->horizontalb[1]);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, output_enabled,
+               bsize2, partition, &pc_tree->horizontalb[2]);
+      break;
+    case PARTITION_VERT_A:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, bsize2,
+               partition, &pc_tree->verticala[0]);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col, output_enabled, bsize2,
+               partition, &pc_tree->verticala[1]);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, subsize,
+               partition, &pc_tree->verticala[2]);
+
+      break;
+    case PARTITION_VERT_B:
+      encode_b(cpi, tile, td, tp, mi_row, mi_col, output_enabled, subsize,
+               partition, &pc_tree->verticalb[0]);
+      encode_b(cpi, tile, td, tp, mi_row, mi_col + hbs, output_enabled, bsize2,
+               partition, &pc_tree->verticalb[1]);
+      encode_b(cpi, tile, td, tp, mi_row + hbs, mi_col + hbs, output_enabled,
+               bsize2, partition, &pc_tree->verticalb[2]);
+      break;
+#endif  // CONFIG_EXT_PARTITION_TYPES
     default:
       assert(0 && "Invalid partition type.");
       break;
   }
 
+#if CONFIG_EXT_PARTITION_TYPES
+  update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+#else
   if (partition != PARTITION_SPLIT || bsize == BLOCK_8X8)
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
 // Check to see if the given partition size is allowed for a specified number
-// of 8x8 block rows and columns remaining in the image.
+// of mi block rows and columns remaining in the image.
 // If not then return the largest allowed partition size
 static BLOCK_SIZE find_partition_size(BLOCK_SIZE bsize,
                                       int rows_left, int cols_left,
@@ -1460,85 +2479,93 @@
   return bsize;
 }
 
-static void set_partial_b64x64_partition(MODE_INFO *mi, int mis,
-    int bh_in, int bw_in, int row8x8_remaining, int col8x8_remaining,
-    BLOCK_SIZE bsize, MODE_INFO **mi_8x8) {
+static void set_partial_sb_partition(const VP10_COMMON *const cm,
+                                     MODE_INFO *mi,
+                                     int bh_in, int bw_in,
+                                     int mi_rows_remaining,
+                                     int mi_cols_remaining,
+                                     BLOCK_SIZE bsize, MODE_INFO **mib) {
   int bh = bh_in;
   int r, c;
-  for (r = 0; r < MI_BLOCK_SIZE; r += bh) {
+  for (r = 0; r < cm->mib_size; r += bh) {
     int bw = bw_in;
-    for (c = 0; c < MI_BLOCK_SIZE; c += bw) {
-      const int index = r * mis + c;
-      mi_8x8[index] = mi + index;
-      mi_8x8[index]->mbmi.sb_type = find_partition_size(bsize,
-          row8x8_remaining - r, col8x8_remaining - c, &bh, &bw);
+    for (c = 0; c < cm->mib_size; c += bw) {
+      const int index = r * cm->mi_stride + c;
+      mib[index] = mi + index;
+      mib[index]->mbmi.sb_type = find_partition_size(bsize,
+          mi_rows_remaining - r, mi_cols_remaining - c, &bh, &bw);
     }
   }
 }
 
-// This function attempts to set all mode info entries in a given SB64
+// This function attempts to set all mode info entries in a given superblock
 // to the same block partition size.
 // However, at the bottom and right borders of the image the requested size
 // may not be allowed in which case this code attempts to choose the largest
 // allowable partition.
 static void set_fixed_partitioning(VP10_COMP *cpi, const TileInfo *const tile,
-                                   MODE_INFO **mi_8x8, int mi_row, int mi_col,
+                                   MODE_INFO **mib, int mi_row, int mi_col,
                                    BLOCK_SIZE bsize) {
   VP10_COMMON *const cm = &cpi->common;
-  const int mis = cm->mi_stride;
-  const int row8x8_remaining = tile->mi_row_end - mi_row;
-  const int col8x8_remaining = tile->mi_col_end - mi_col;
+  const int mi_rows_remaining = tile->mi_row_end - mi_row;
+  const int mi_cols_remaining = tile->mi_col_end - mi_col;
   int block_row, block_col;
-  MODE_INFO *mi_upper_left = cm->mi + mi_row * mis + mi_col;
+  MODE_INFO *const mi_upper_left = cm->mi + mi_row * cm->mi_stride + mi_col;
   int bh = num_8x8_blocks_high_lookup[bsize];
   int bw = num_8x8_blocks_wide_lookup[bsize];
 
-  assert((row8x8_remaining > 0) && (col8x8_remaining > 0));
+  assert((mi_rows_remaining > 0) && (mi_cols_remaining > 0));
 
-  // Apply the requested partition size to the SB64 if it is all "in image"
-  if ((col8x8_remaining >= MI_BLOCK_SIZE) &&
-      (row8x8_remaining >= MI_BLOCK_SIZE)) {
-    for (block_row = 0; block_row < MI_BLOCK_SIZE; block_row += bh) {
-      for (block_col = 0; block_col < MI_BLOCK_SIZE; block_col += bw) {
-        int index = block_row * mis + block_col;
-        mi_8x8[index] = mi_upper_left + index;
-        mi_8x8[index]->mbmi.sb_type = bsize;
+  // Apply the requested partition size to the SB if it is all "in image"
+  if ((mi_cols_remaining >= cm->mib_size) &&
+      (mi_rows_remaining >= cm->mib_size)) {
+    for (block_row = 0; block_row < cm->mib_size; block_row += bh) {
+      for (block_col = 0; block_col < cm->mib_size; block_col += bw) {
+        int index = block_row * cm->mi_stride + block_col;
+        mib[index] = mi_upper_left + index;
+        mib[index]->mbmi.sb_type = bsize;
       }
     }
   } else {
-    // Else this is a partial SB64.
-    set_partial_b64x64_partition(mi_upper_left, mis, bh, bw, row8x8_remaining,
-        col8x8_remaining, bsize, mi_8x8);
+    // Else this is a partial SB.
+    set_partial_sb_partition(cm, mi_upper_left, bh, bw,
+                             mi_rows_remaining, mi_cols_remaining, bsize, mib);
   }
 }
 
 static void rd_use_partition(VP10_COMP *cpi,
                              ThreadData *td,
                              TileDataEnc *tile_data,
-                             MODE_INFO **mi_8x8, TOKENEXTRA **tp,
+                             MODE_INFO **mib, TOKENEXTRA **tp,
                              int mi_row, int mi_col,
                              BLOCK_SIZE bsize,
                              int *rate, int64_t *dist,
+#if CONFIG_SUPERTX
+                             int *rate_nocoef,
+#endif
                              int do_recon, PC_TREE *pc_tree) {
   VP10_COMMON *const cm = &cpi->common;
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int mis = cm->mi_stride;
-  const int bsl = b_width_log2_lookup[bsize];
-  const int mi_step = num_4x4_blocks_wide_lookup[bsize] / 2;
-  const int bss = (1 << bsl) / 4;
-  int i, pl;
-  PARTITION_TYPE partition = PARTITION_NONE;
-  BLOCK_SIZE subsize;
-  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
-  PARTITION_CONTEXT sl[8], sa[8];
+  const int bs = num_8x8_blocks_wide_lookup[bsize];
+  const int hbs = bs / 2;
+  int i;
+  const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+  const PARTITION_TYPE partition = get_partition(cm, mi_row, mi_col, bsize);
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
   RD_COST last_part_rdc, none_rdc, chosen_rdc;
   BLOCK_SIZE sub_subsize = BLOCK_4X4;
   int splits_below = 0;
-  BLOCK_SIZE bs_type = mi_8x8[0]->mbmi.sb_type;
+  BLOCK_SIZE bs_type = mib[0]->mbmi.sb_type;
   int do_partition_search = 1;
   PICK_MODE_CONTEXT *ctx = &pc_tree->none;
+#if CONFIG_SUPERTX
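+  // Rate totals excluding the coefficient cost, used for the supertx
+  // RD decision.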
+  int last_part_rate_nocoef = INT_MAX;
+  int none_rate_nocoef = INT_MAX;
+  int chosen_rate_nocoef = INT_MAX;
+#endif
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
@@ -1550,13 +2577,17 @@
   vp10_rd_cost_reset(&none_rdc);
   vp10_rd_cost_reset(&chosen_rdc);
 
-  partition = partition_lookup[bsl][bs_type];
-  subsize = get_subsize(bsize, partition);
-
   pc_tree->partitioning = partition;
-  save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
 
-  if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode) {
+#if CONFIG_VAR_TX
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context =
+    xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+#endif
+
+  save_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+  if (bsize == BLOCK_16X16 && cpi->vaq_refresh) {
     set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
     x->mb_energy = vp10_block_energy(cpi, x, bsize);
   }
@@ -1570,7 +2601,7 @@
       splits_below = 1;
       for (i = 0; i < 4; i++) {
         int jj = i >> 1, ii = i & 0x01;
-        MODE_INFO *this_mi = mi_8x8[jj * bss * mis + ii * bss];
+        MODE_INFO *this_mi = mib[jj * hbs * cm->mi_stride + ii * hbs];
         if (this_mi && this_mi->mbmi.sb_type >= sub_subsize) {
           splits_below = 0;
         }
@@ -1580,22 +2611,30 @@
     // If partition is not none try none unless each of the 4 splits are split
     // even further..
     if (partition != PARTITION_NONE && !splits_below &&
-        mi_row + (mi_step >> 1) < cm->mi_rows &&
-        mi_col + (mi_step >> 1) < cm->mi_cols) {
+        mi_row + hbs < cm->mi_rows &&
+        mi_col + hbs < cm->mi_cols) {
       pc_tree->partitioning = PARTITION_NONE;
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc, bsize,
-                       ctx, INT64_MAX);
-
-      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &none_rdc,
+#if CONFIG_SUPERTX
+                       &none_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_NONE,
+#endif
+                       bsize, ctx, INT64_MAX);
 
       if (none_rdc.rate < INT_MAX) {
         none_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
         none_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, none_rdc.rate,
                                  none_rdc.dist);
+#if CONFIG_SUPERTX
+        none_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE];
+#endif
       }
 
-      restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
-      mi_8x8[0]->mbmi.sb_type = bs_type;
+      restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+      mib[0]->mbmi.sb_type = bs_type;
       pc_tree->partitioning = partition;
     }
   }
@@ -1603,224 +2642,348 @@
   switch (partition) {
     case PARTITION_NONE:
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+                       &last_part_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_NONE,
+#endif
                        bsize, ctx, INT64_MAX);
       break;
     case PARTITION_HORZ:
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+                       &last_part_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_HORZ,
+#endif
                        subsize, &pc_tree->horizontal[0],
                        INT64_MAX);
       if (last_part_rdc.rate != INT_MAX &&
-          bsize >= BLOCK_8X8 && mi_row + (mi_step >> 1) < cm->mi_rows) {
+          bsize >= BLOCK_8X8 && mi_row + hbs < cm->mi_rows) {
         RD_COST tmp_rdc;
+#if CONFIG_SUPERTX
+        int rt_nocoef = 0;
+#endif
         PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
         vp10_rd_cost_init(&tmp_rdc);
         update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
         encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
         rd_pick_sb_modes(cpi, tile_data, x,
-                         mi_row + (mi_step >> 1), mi_col, &tmp_rdc,
+                         mi_row + hbs, mi_col, &tmp_rdc,
+#if CONFIG_SUPERTX
+                         &rt_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+                         PARTITION_HORZ,
+#endif
                          subsize, &pc_tree->horizontal[1], INT64_MAX);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           vp10_rd_cost_reset(&last_part_rdc);
+#if CONFIG_SUPERTX
+          last_part_rate_nocoef = INT_MAX;
+#endif
           break;
         }
         last_part_rdc.rate += tmp_rdc.rate;
         last_part_rdc.dist += tmp_rdc.dist;
         last_part_rdc.rdcost += tmp_rdc.rdcost;
+#if CONFIG_SUPERTX
+        last_part_rate_nocoef += rt_nocoef;
+#endif
       }
       break;
     case PARTITION_VERT:
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+                       &last_part_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_VERT,
+#endif
                        subsize, &pc_tree->vertical[0], INT64_MAX);
       if (last_part_rdc.rate != INT_MAX &&
-          bsize >= BLOCK_8X8 && mi_col + (mi_step >> 1) < cm->mi_cols) {
+          bsize >= BLOCK_8X8 && mi_col + hbs < cm->mi_cols) {
         RD_COST tmp_rdc;
+#if CONFIG_SUPERTX
+        int rt_nocoef = 0;
+#endif
         PICK_MODE_CONTEXT *ctx = &pc_tree->vertical[0];
         vp10_rd_cost_init(&tmp_rdc);
         update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
         encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize, ctx);
         rd_pick_sb_modes(cpi, tile_data, x,
-                         mi_row, mi_col + (mi_step >> 1), &tmp_rdc,
+                         mi_row, mi_col + hbs, &tmp_rdc,
+#if CONFIG_SUPERTX
+                         &rt_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+                         PARTITION_VERT,
+#endif
                          subsize, &pc_tree->vertical[bsize > BLOCK_8X8],
                          INT64_MAX);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           vp10_rd_cost_reset(&last_part_rdc);
+#if CONFIG_SUPERTX
+          last_part_rate_nocoef = INT_MAX;
+#endif
           break;
         }
         last_part_rdc.rate += tmp_rdc.rate;
         last_part_rdc.dist += tmp_rdc.dist;
         last_part_rdc.rdcost += tmp_rdc.rdcost;
+#if CONFIG_SUPERTX
+        last_part_rate_nocoef += rt_nocoef;
+#endif
       }
       break;
     case PARTITION_SPLIT:
       if (bsize == BLOCK_8X8) {
         rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &last_part_rdc,
+#if CONFIG_SUPERTX
+                         &last_part_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+                         PARTITION_SPLIT,
+#endif
                          subsize, pc_tree->leaf_split[0], INT64_MAX);
         break;
       }
       last_part_rdc.rate = 0;
       last_part_rdc.dist = 0;
       last_part_rdc.rdcost = 0;
+#if CONFIG_SUPERTX
+      last_part_rate_nocoef = 0;
+#endif
       for (i = 0; i < 4; i++) {
-        int x_idx = (i & 1) * (mi_step >> 1);
-        int y_idx = (i >> 1) * (mi_step >> 1);
+        int x_idx = (i & 1) * hbs;
+        int y_idx = (i >> 1) * hbs;
         int jj = i >> 1, ii = i & 0x01;
         RD_COST tmp_rdc;
+#if CONFIG_SUPERTX
+        int rt_nocoef;
+#endif
         if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
           continue;
 
         vp10_rd_cost_init(&tmp_rdc);
         rd_use_partition(cpi, td, tile_data,
-                         mi_8x8 + jj * bss * mis + ii * bss, tp,
+                         mib + jj * hbs * cm->mi_stride + ii * hbs, tp,
                          mi_row + y_idx, mi_col + x_idx, subsize,
                          &tmp_rdc.rate, &tmp_rdc.dist,
+#if CONFIG_SUPERTX
+                         &rt_nocoef,
+#endif
                          i != 3, pc_tree->split[i]);
         if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
           vp10_rd_cost_reset(&last_part_rdc);
+#if CONFIG_SUPERTX
+          last_part_rate_nocoef = INT_MAX;
+#endif
           break;
         }
         last_part_rdc.rate += tmp_rdc.rate;
         last_part_rdc.dist += tmp_rdc.dist;
+#if CONFIG_SUPERTX
+        last_part_rate_nocoef += rt_nocoef;
+#endif
       }
       break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_VERT_A:
+    case PARTITION_VERT_B:
+    case PARTITION_HORZ_A:
+    case PARTITION_HORZ_B:
+      assert(0 && "Cannot handle extended partiton types");
+#endif  //  CONFIG_EXT_PARTITION_TYPES
     default:
       assert(0);
       break;
   }
 
-  pl = partition_plane_context(xd, mi_row, mi_col, bsize);
   if (last_part_rdc.rate < INT_MAX) {
     last_part_rdc.rate += cpi->partition_cost[pl][partition];
     last_part_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                                   last_part_rdc.rate, last_part_rdc.dist);
+#if CONFIG_SUPERTX
+    last_part_rate_nocoef += cpi->partition_cost[pl][partition];
+#endif
   }
 
   if (do_partition_search
       && cpi->sf.adjust_partitioning_from_last_frame
       && cpi->sf.partition_search_type == SEARCH_PARTITION
       && partition != PARTITION_SPLIT && bsize > BLOCK_8X8
-      && (mi_row + mi_step < cm->mi_rows ||
-          mi_row + (mi_step >> 1) == cm->mi_rows)
-      && (mi_col + mi_step < cm->mi_cols ||
-          mi_col + (mi_step >> 1) == cm->mi_cols)) {
+      && (mi_row + bs < cm->mi_rows ||
+          mi_row + hbs == cm->mi_rows)
+      && (mi_col + bs < cm->mi_cols ||
+          mi_col + hbs == cm->mi_cols)) {
     BLOCK_SIZE split_subsize = get_subsize(bsize, PARTITION_SPLIT);
     chosen_rdc.rate = 0;
     chosen_rdc.dist = 0;
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+#if CONFIG_SUPERTX
+    chosen_rate_nocoef = 0;
+#endif
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
     pc_tree->partitioning = PARTITION_SPLIT;
 
     // Split partition.
     for (i = 0; i < 4; i++) {
-      int x_idx = (i & 1) * (mi_step >> 1);
-      int y_idx = (i >> 1) * (mi_step >> 1);
+      int x_idx = (i & 1) * hbs;
+      int y_idx = (i >> 1) * hbs;
       RD_COST tmp_rdc;
-      ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
-      PARTITION_CONTEXT sl[8], sa[8];
+#if CONFIG_SUPERTX
+      int rt_nocoef = 0;
+#endif
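+      // Shadows the function-level x_ctx; used to save/restore context
+      // around each sub-block's mode search.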
+      RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
 
       if ((mi_row + y_idx >= cm->mi_rows) || (mi_col + x_idx >= cm->mi_cols))
         continue;
 
-      save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+      save_context(x, &x_ctx, mi_row, mi_col, bsize);
       pc_tree->split[i]->partitioning = PARTITION_NONE;
       rd_pick_sb_modes(cpi, tile_data, x,
                        mi_row + y_idx, mi_col + x_idx, &tmp_rdc,
+#if CONFIG_SUPERTX
+                       &rt_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_SPLIT,
+#endif
                        split_subsize, &pc_tree->split[i]->none, INT64_MAX);
 
-      restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+      restore_context(x, &x_ctx, mi_row, mi_col, bsize);
 
       if (tmp_rdc.rate == INT_MAX || tmp_rdc.dist == INT64_MAX) {
         vp10_rd_cost_reset(&chosen_rdc);
+#if CONFIG_SUPERTX
+        chosen_rate_nocoef = INT_MAX;
+#endif
         break;
       }
 
       chosen_rdc.rate += tmp_rdc.rate;
       chosen_rdc.dist += tmp_rdc.dist;
+#if CONFIG_SUPERTX
+      chosen_rate_nocoef += rt_nocoef;
+#endif
 
       if (i != 3)
         encode_sb(cpi, td, tile_info, tp,  mi_row + y_idx, mi_col + x_idx, 0,
                   split_subsize, pc_tree->split[i]);
 
-      pl = partition_plane_context(xd, mi_row + y_idx, mi_col + x_idx,
-                                   split_subsize);
       chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+#if CONFIG_SUPERTX
+      chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_SPLIT];
+#endif
     }
-    pl = partition_plane_context(xd, mi_row, mi_col, bsize);
     if (chosen_rdc.rate < INT_MAX) {
       chosen_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
       chosen_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                                  chosen_rdc.rate, chosen_rdc.dist);
+#if CONFIG_SUPERTX
+      chosen_rate_nocoef += cpi->partition_cost[pl][PARTITION_NONE];
+#endif
     }
   }
 
   // If last_part is better set the partitioning to that.
   if (last_part_rdc.rdcost < chosen_rdc.rdcost) {
-    mi_8x8[0]->mbmi.sb_type = bsize;
+    mib[0]->mbmi.sb_type = bsize;
     if (bsize >= BLOCK_8X8)
       pc_tree->partitioning = partition;
     chosen_rdc = last_part_rdc;
+#if CONFIG_SUPERTX
+    chosen_rate_nocoef = last_part_rate_nocoef;
+#endif
   }
   // If none was better set the partitioning to that.
   if (none_rdc.rdcost < chosen_rdc.rdcost) {
     if (bsize >= BLOCK_8X8)
       pc_tree->partitioning = PARTITION_NONE;
     chosen_rdc = none_rdc;
+#if CONFIG_SUPERTX
+    chosen_rate_nocoef = none_rate_nocoef;
+#endif
   }
 
-  restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+  restore_context(x, &x_ctx, mi_row, mi_col, bsize);
 
   // We must have chosen a partitioning and encoding or we'll fail later on.
   // No other opportunities for success.
-  if (bsize == BLOCK_64X64)
+  if (bsize == cm->sb_size)
     assert(chosen_rdc.rate < INT_MAX && chosen_rdc.dist < INT64_MAX);
 
   if (do_recon) {
-    int output_enabled = (bsize == BLOCK_64X64);
+    int output_enabled = (bsize == cm->sb_size);
     encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled, bsize,
               pc_tree);
   }
 
   *rate = chosen_rdc.rate;
   *dist = chosen_rdc.dist;
+#if CONFIG_SUPERTX
+  *rate_nocoef = chosen_rate_nocoef;
+#endif
 }
 
 static const BLOCK_SIZE min_partition_size[BLOCK_SIZES] = {
-  BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,
-  BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,
-  BLOCK_8X8,   BLOCK_8X8,   BLOCK_8X8,
-  BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
-  BLOCK_16X16
+                              BLOCK_4X4,    //                     4x4
+    BLOCK_4X4,   BLOCK_4X4,   BLOCK_4X4,    //    4x8,    8x4,     8x8
+    BLOCK_4X4,   BLOCK_4X4,   BLOCK_8X8,    //   8x16,   16x8,   16x16
+    BLOCK_8X8,   BLOCK_8X8, BLOCK_16X16,    //  16x32,  32x16,   32x32
+  BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,    //  32x64,  64x32,   64x64
+#if CONFIG_EXT_PARTITION
+  BLOCK_16X16, BLOCK_16X16, BLOCK_16X16     // 64x128, 128x64, 128x128
+#endif  // CONFIG_EXT_PARTITION
 };
 
 static const BLOCK_SIZE max_partition_size[BLOCK_SIZES] = {
-  BLOCK_8X8,   BLOCK_16X16, BLOCK_16X16,
-  BLOCK_16X16, BLOCK_32X32, BLOCK_32X32,
-  BLOCK_32X32, BLOCK_64X64, BLOCK_64X64,
-  BLOCK_64X64, BLOCK_64X64, BLOCK_64X64,
-  BLOCK_64X64
+                                    BLOCK_8X8,  //                     4x4
+    BLOCK_16X16,   BLOCK_16X16,   BLOCK_16X16,  //    4x8,    8x4,     8x8
+    BLOCK_32X32,   BLOCK_32X32,   BLOCK_32X32,  //   8x16,   16x8,   16x16
+    BLOCK_64X64,   BLOCK_64X64,   BLOCK_64X64,  //  16x32,  32x16,   32x32
+  BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST,  //  32x64,  64x32,   64x64
+#if CONFIG_EXT_PARTITION
+  BLOCK_LARGEST, BLOCK_LARGEST, BLOCK_LARGEST   // 64x128, 128x64, 128x128
+#endif  // CONFIG_EXT_PARTITION
 };
 
+// Next square block size less than or equal to the current block size.
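+// For example, BLOCK_16X32 and BLOCK_32X16 both map to BLOCK_16X16.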
+static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = {
+                                BLOCK_4X4,  //                     4x4
+    BLOCK_4X4,   BLOCK_4X4,     BLOCK_8X8,  //    4x8,    8x4,     8x8
+    BLOCK_8X8,   BLOCK_8X8,   BLOCK_16X16,  //   8x16,   16x8,   16x16
+  BLOCK_16X16, BLOCK_16X16,   BLOCK_32X32,  //  16x32,  32x16,   32x32
+  BLOCK_32X32, BLOCK_32X32,   BLOCK_64X64,  //  32x64,  64x32,   64x64
+#if CONFIG_EXT_PARTITION
+  BLOCK_64X64, BLOCK_64X64, BLOCK_128X128   // 64x128, 128x64, 128x128
+#endif  // CONFIG_EXT_PARTITION
+};
 
 // Look at all the mode_info entries for blocks that are part of this
 // partition and find the min and max values for sb_type.
-// At the moment this is designed to work on a 64x64 SB but could be
+// At the moment this is designed to work on a superblock but could be
 // adjusted to use a size parameter.
 //
 // The min and max are assumed to have been initialized prior to calling this
-// function so repeat calls can accumulate a min and max of more than one sb64.
-static void get_sb_partition_size_range(MACROBLOCKD *xd, MODE_INFO **mi_8x8,
+// function so repeat calls can accumulate a min and max of more than one
+// superblock.
+static void get_sb_partition_size_range(const VP10_COMMON *const cm,
+                                        MACROBLOCKD *xd, MODE_INFO **mib,
                                         BLOCK_SIZE *min_block_size,
-                                        BLOCK_SIZE *max_block_size,
-                                        int bs_hist[BLOCK_SIZES]) {
-  int sb_width_in_blocks = MI_BLOCK_SIZE;
-  int sb_height_in_blocks  = MI_BLOCK_SIZE;
+                                        BLOCK_SIZE *max_block_size) {
   int i, j;
   int index = 0;
 
   // Check the sb_type for each block that belongs to this region.
-  for (i = 0; i < sb_height_in_blocks; ++i) {
-    for (j = 0; j < sb_width_in_blocks; ++j) {
-      MODE_INFO *mi = mi_8x8[index+j];
-      BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : 0;
-      bs_hist[sb_type]++;
+  for (i = 0; i < cm->mib_size; ++i) {
+    for (j = 0; j < cm->mib_size; ++j) {
+      MODE_INFO *mi = mib[index+j];
+      BLOCK_SIZE sb_type = mi ? mi->mbmi.sb_type : BLOCK_4X4;
       *min_block_size = VPXMIN(*min_block_size, sb_type);
       *max_block_size = VPXMAX(*max_block_size, sb_type);
     }
@@ -1828,15 +2991,6 @@
   }
 }
 
-// Next square block size less or equal than current block size.
-static const BLOCK_SIZE next_square_size[BLOCK_SIZES] = {
-  BLOCK_4X4, BLOCK_4X4, BLOCK_4X4,
-  BLOCK_8X8, BLOCK_8X8, BLOCK_8X8,
-  BLOCK_16X16, BLOCK_16X16, BLOCK_16X16,
-  BLOCK_32X32, BLOCK_32X32, BLOCK_32X32,
-  BLOCK_64X64
-};
-
 // Look at neighboring blocks and set a min and max partition size based on
 // what they chose.
 static void rd_auto_partition_range(VP10_COMP *cpi, const TileInfo *const tile,
@@ -1848,17 +3002,16 @@
   MODE_INFO **mi = xd->mi;
   const int left_in_image = xd->left_available && mi[-1];
   const int above_in_image = xd->up_available && mi[-xd->mi_stride];
-  const int row8x8_remaining = tile->mi_row_end - mi_row;
-  const int col8x8_remaining = tile->mi_col_end - mi_col;
+  const int mi_rows_remaining = tile->mi_row_end - mi_row;
+  const int mi_cols_remaining = tile->mi_col_end - mi_col;
   int bh, bw;
   BLOCK_SIZE min_size = BLOCK_4X4;
-  BLOCK_SIZE max_size = BLOCK_64X64;
-  int bs_hist[BLOCK_SIZES] = {0};
+  BLOCK_SIZE max_size = BLOCK_LARGEST;
 
   // Trap case where we do not have a prediction.
   if (left_in_image || above_in_image || cm->frame_type != KEY_FRAME) {
     // Default "min to max" and "max to min"
-    min_size = BLOCK_64X64;
+    min_size = BLOCK_LARGEST;
     max_size = BLOCK_4X4;
 
     // NOTE: each call to get_sb_partition_size_range() uses the previous
@@ -1867,19 +3020,17 @@
     if (cm->frame_type != KEY_FRAME) {
       MODE_INFO **prev_mi =
           &cm->prev_mi_grid_visible[mi_row * xd->mi_stride + mi_col];
-      get_sb_partition_size_range(xd, prev_mi, &min_size, &max_size, bs_hist);
+      get_sb_partition_size_range(cm, xd, prev_mi, &min_size, &max_size);
     }
-    // Find the min and max partition sizes used in the left SB64
+    // Find the min and max partition sizes used in the left superblock
     if (left_in_image) {
-      MODE_INFO **left_sb64_mi = &mi[-MI_BLOCK_SIZE];
-      get_sb_partition_size_range(xd, left_sb64_mi, &min_size, &max_size,
-                                  bs_hist);
+      MODE_INFO **left_sb_mi = &mi[-cm->mib_size];
+      get_sb_partition_size_range(cm, xd, left_sb_mi, &min_size, &max_size);
     }
-    // Find the min and max partition sizes used in the above SB64.
+    // Find the min and max partition sizes used in the above superblock.
     if (above_in_image) {
-      MODE_INFO **above_sb64_mi = &mi[-xd->mi_stride * MI_BLOCK_SIZE];
-      get_sb_partition_size_range(xd, above_sb64_mi, &min_size, &max_size,
-                                  bs_hist);
+      MODE_INFO **above_sb_mi = &mi[-xd->mi_stride * cm->mib_size];
+      get_sb_partition_size_range(cm, xd, above_sb_mi, &min_size, &max_size);
     }
 
     // Adjust observed min and max for "relaxed" auto partition case.
@@ -1890,29 +3041,28 @@
   }
 
   // Check border cases where max and min from neighbors may not be legal.
-  max_size = find_partition_size(max_size,
-                                 row8x8_remaining, col8x8_remaining,
+  max_size = find_partition_size(max_size, mi_rows_remaining, mi_cols_remaining,
                                  &bh, &bw);
+  min_size = VPXMIN(min_size, max_size);
+
   // Test for blocks at the edge of the active image.
   // This may be the actual edge of the image or where there are formatting
   // bars.
   if (vp10_active_edge_sb(cpi, mi_row, mi_col)) {
     min_size = BLOCK_4X4;
   } else {
-    min_size =
-        VPXMIN(cpi->sf.rd_auto_partition_min_limit, VPXMIN(min_size, max_size));
+    min_size = VPXMIN(cpi->sf.rd_auto_partition_min_limit, min_size);
   }
 
   // When use_square_partition_only is true, make sure at least one square
   // partition is allowed by selecting the next smaller square size as
   // *min_block_size.
-  if (cpi->sf.use_square_partition_only &&
-      next_square_size[max_size] < min_size) {
-     min_size = next_square_size[max_size];
+  if (cpi->sf.use_square_partition_only) {
+    min_size = VPXMIN(min_size, next_square_size[max_size]);
   }
 
-  *min_block_size = min_size;
-  *max_block_size = max_size;
+  *min_block_size = VPXMIN(min_size, cm->sb_size);
+  *max_block_size = VPXMIN(max_size, cm->sb_size);
 }
 
 // TODO(jingning) refactor functions setting partition search range
@@ -1928,7 +3078,7 @@
   MODE_INFO **prev_mi = &cm->prev_mi_grid_visible[idx_str];
   BLOCK_SIZE bs, min_size, max_size;
 
-  min_size = BLOCK_64X64;
+  min_size = BLOCK_LARGEST;
   max_size = BLOCK_4X4;
 
   if (prev_mi) {
@@ -1965,8 +3115,8 @@
     max_size = max_partition_size[max_size];
   }
 
-  *min_bs = min_size;
-  *max_bs = max_size;
+  *min_bs = VPXMIN(min_size, cm->sb_size);
+  *max_bs = VPXMIN(max_size, cm->sb_size);
 }
 
 static INLINE void store_pred_mv(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx) {
@@ -1978,16 +3128,27 @@
 }
 
 #if CONFIG_FP_MB_STATS
-const int num_16x16_blocks_wide_lookup[BLOCK_SIZES] =
-  {1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 4, 4};
-const int num_16x16_blocks_high_lookup[BLOCK_SIZES] =
-  {1, 1, 1, 1, 1, 1, 1, 2, 1, 2, 4, 2, 4};
 const int qindex_skip_threshold_lookup[BLOCK_SIZES] =
-  {0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120};
+  {0, 10, 10, 30, 40, 40, 60, 80, 80, 90, 100, 100, 120,
+#if CONFIG_EXT_PARTITION
+  // TODO(debargha): What are the correct numbers here?
+  130, 130, 150
+#endif  // CONFIG_EXT_PARTITION
+  };
 const int qindex_split_threshold_lookup[BLOCK_SIZES] =
-  {0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120};
+  {0, 3, 3, 7, 15, 15, 30, 40, 40, 60, 80, 80, 120,
+#if CONFIG_EXT_PARTITION
+  // TODO(debargha): What are the correct numbers here?
+  160, 160, 240
+#endif  // CONFIG_EXT_PARTITION
+  };
 const int complexity_16x16_blocks_threshold[BLOCK_SIZES] =
-  {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6};
+  {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 6,
+#if CONFIG_EXT_PARTITION
+  // TODO(debargha): What are the correct numbers here?
+  8, 8, 10
+#endif  // CONFIG_EXT_PARTITION
+  };
 
 typedef enum {
   MV_ZERO = 0,
@@ -2022,6 +3183,191 @@
 }
 #endif
 
+#if CONFIG_EXT_PARTITION_TYPES
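+// Evaluate one of the extended 3-way partitions (PARTITION_HORZ_A/B,
+// PARTITION_VERT_A/B): two quarter-size square blocks plus one half-size
+// rectangular block. The caller passes the mi position and block size of
+// each of the three sub-blocks in coding order.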
+static void rd_test_partition3(VP10_COMP *cpi, ThreadData *td,
+                               TileDataEnc *tile_data,
+                               TOKENEXTRA **tp, PC_TREE *pc_tree,
+                               RD_COST *best_rdc, PICK_MODE_CONTEXT ctxs[3],
+                               PICK_MODE_CONTEXT *ctx,
+                               int mi_row, int mi_col, BLOCK_SIZE bsize,
+                               PARTITION_TYPE partition,
+#if CONFIG_SUPERTX
+                               int64_t best_rd, int *best_rate_nocoef,
+                               RD_SEARCH_MACROBLOCK_CONTEXT* x_ctx,
+#endif
+                               int mi_row0, int mi_col0, BLOCK_SIZE subsize0,
+                               int mi_row1, int mi_col1, BLOCK_SIZE subsize1,
+                               int mi_row2, int mi_col2, BLOCK_SIZE subsize2) {
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  RD_COST this_rdc, sum_rdc;
+#if CONFIG_SUPERTX
+  VP10_COMMON *const cm = &cpi->common;
+  TileInfo *const tile_info = &tile_data->tile_info;
+  int this_rate_nocoef, sum_rate_nocoef;
+  int abort_flag;
+  const int supertx_allowed =
+      !frame_is_intra_only(cm) &&
+      bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+      !xd->lossless[0];
+#endif
+  if (cpi->sf.adaptive_motion_search)
+    load_pred_mv(x, ctx);
+
+  rd_pick_sb_modes(cpi, tile_data, x, mi_row0, mi_col0, &sum_rdc,
+#if CONFIG_SUPERTX
+                   &sum_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+                   partition,
+#endif
+                   subsize0, &ctxs[0], best_rdc->rdcost);
+#if CONFIG_SUPERTX
+  abort_flag = sum_rdc.rdcost >= best_rd;
+#endif
+
+#if CONFIG_SUPERTX
+  if (sum_rdc.rdcost < INT64_MAX) {
+#else
+  if (sum_rdc.rdcost < best_rdc->rdcost) {
+#endif
+    PICK_MODE_CONTEXT *ctx = &ctxs[0];
+    update_state(cpi, td, ctx, mi_row0, mi_col0, subsize0, 0);
+    encode_superblock(cpi, td, tp, 0, mi_row0, mi_col0, subsize0, ctx);
+
+    if (cpi->sf.adaptive_motion_search)
+      load_pred_mv(x, ctx);
+
+#if CONFIG_SUPERTX
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row1, mi_col1, &this_rdc,
+                     &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+                     partition,
+#endif
+                     subsize1, &ctxs[1], INT64_MAX - sum_rdc.rdcost);
+#else
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row1, mi_col1, &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+                     partition,
+#endif
+                     subsize1, &ctxs[1], best_rdc->rdcost - sum_rdc.rdcost);
+#endif  // CONFIG_SUPERTX
+
+    if (this_rdc.rate == INT_MAX) {
+      sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+      sum_rate_nocoef = INT_MAX;
+#endif
+    } else {
+      sum_rdc.rate += this_rdc.rate;
+      sum_rdc.dist += this_rdc.dist;
+      sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+      sum_rate_nocoef += this_rate_nocoef;
+#endif
+    }
+
+#if CONFIG_SUPERTX
+    if (sum_rdc.rdcost < INT64_MAX) {
+#else
+    if (sum_rdc.rdcost < best_rdc->rdcost) {
+#endif
+      PICK_MODE_CONTEXT *ctx = &ctxs[1];
+      update_state(cpi, td, ctx, mi_row1, mi_col1, subsize1, 0);
+      encode_superblock(cpi, td, tp, 0, mi_row1, mi_col1, subsize1, ctx);
+
+      if (cpi->sf.adaptive_motion_search)
+        load_pred_mv(x, ctx);
+
+#if CONFIG_SUPERTX
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row2, mi_col2, &this_rdc,
+                       &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+                       partition,
+#endif
+                       subsize2, &ctxs[2], INT64_MAX - sum_rdc.rdcost);
+#else
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row2, mi_col2, &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+                       partition,
+#endif
+                       subsize2, &ctxs[2], best_rdc->rdcost - sum_rdc.rdcost);
+#endif  // CONFIG_SUPERTX
+
+      if (this_rdc.rate == INT_MAX) {
+        sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+        sum_rate_nocoef = INT_MAX;
+#endif
+      } else {
+        sum_rdc.rate += this_rdc.rate;
+        sum_rdc.dist += this_rdc.dist;
+        sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+        sum_rate_nocoef += this_rate_nocoef;
+#endif
+      }
+
+#if CONFIG_SUPERTX
+      if (supertx_allowed && !abort_flag && sum_rdc.rdcost < INT64_MAX) {
+        TX_SIZE supertx_size = max_txsize_lookup[bsize];
+        const PARTITION_TYPE best_partition = pc_tree->partitioning;
+        pc_tree->partitioning = partition;
+        sum_rdc.rate += vp10_cost_bit(
+            cm->fc->supertx_prob
+            [partition_supertx_context_lookup[partition]][supertx_size],
+            0);
+        sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate,
+                                sum_rdc.dist);
+
+        if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+          TX_TYPE best_tx = DCT_DCT;
+          RD_COST tmp_rdc = {sum_rate_nocoef, 0, 0};
+
+          restore_context(x, x_ctx, mi_row, mi_col, bsize);
+
+          rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+                        &tmp_rdc.rate, &tmp_rdc.dist, &best_tx, pc_tree);
+
+          tmp_rdc.rate += vp10_cost_bit(
+              cm->fc->supertx_prob
+              [partition_supertx_context_lookup[partition]][supertx_size],
+              1);
+          tmp_rdc.rdcost =
+              RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+          if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+            sum_rdc = tmp_rdc;
+            update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize, best_tx,
+                                    supertx_size, pc_tree);
+          }
+        }
+
+        pc_tree->partitioning = best_partition;
+      }
+#endif  // CONFIG_SUPERTX
+
+      if (sum_rdc.rdcost < best_rdc->rdcost) {
+        int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+        sum_rdc.rate += cpi->partition_cost[pl][partition];
+        sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate,
+                                sum_rdc.dist);
+#if CONFIG_SUPERTX
+        sum_rate_nocoef += cpi->partition_cost[pl][partition];
+#endif
+        if (sum_rdc.rdcost < best_rdc->rdcost) {
+#if CONFIG_SUPERTX
+          *best_rate_nocoef = sum_rate_nocoef;
+          assert(*best_rate_nocoef >= 0);
+#endif
+          *best_rdc = sum_rdc;
+          pc_tree->partitioning = partition;
+        }
+      }
+    }
+  }
+}
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
 // TODO(jingning,jimbankoski,rbultje): properly skip partition types that are
 // unlikely to be selected depending on previous rate-distortion optimization
 // results, for encoding speed-up.
@@ -2029,21 +3375,37 @@
                               TileDataEnc *tile_data,
                               TOKENEXTRA **tp, int mi_row, int mi_col,
                               BLOCK_SIZE bsize, RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+                              int *rate_nocoef,
+#endif
                               int64_t best_rd, PC_TREE *pc_tree) {
   VP10_COMMON *const cm = &cpi->common;
   TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const int mi_step = num_8x8_blocks_wide_lookup[bsize] / 2;
-  ENTROPY_CONTEXT l[16 * MAX_MB_PLANE], a[16 * MAX_MB_PLANE];
-  PARTITION_CONTEXT sl[8], sa[8];
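+  // Snapshot of the entropy and partition-plane contexts, saved before each
+  // candidate partition is searched and restored afterwards so that every
+  // candidate starts from the same state.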
+  RD_SEARCH_MACROBLOCK_CONTEXT x_ctx;
   TOKENEXTRA *tp_orig = *tp;
   PICK_MODE_CONTEXT *ctx = &pc_tree->none;
-  int i, pl;
+  int i;
+  const int pl = partition_plane_context(xd, mi_row, mi_col, bsize);
+  int *partition_cost = cpi->partition_cost[pl];
+  int tmp_partition_cost[PARTITION_TYPES];
   BLOCK_SIZE subsize;
   RD_COST this_rdc, sum_rdc, best_rdc;
+#if CONFIG_SUPERTX
+  int this_rate_nocoef, sum_rate_nocoef = 0, best_rate_nocoef = INT_MAX;
+  int abort_flag;
+  const int supertx_allowed =
+      !frame_is_intra_only(cm) &&
+      bsize <= MAX_SUPERTX_BLOCK_SIZE &&
+      !xd->lossless[0];
+#endif  // CONFIG_SUPERTX
   int do_split = bsize >= BLOCK_8X8;
   int do_rect = 1;
+#if CONFIG_EXT_PARTITION_TYPES
+  BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
 
   // Override skipping rectangular partition operations for edge blocks
   const int force_horz_split = (mi_row + mi_step >= cm->mi_rows);
@@ -2066,6 +3428,39 @@
                                bsize >= BLOCK_8X8;
   (void) *tp_orig;
 
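+  // Superblocks that straddle the bottom or right frame border cannot use
+  // every partition type, so the usual partition costs do not apply. Build a
+  // temporary cost table: only the binary choice between the forced
+  // rectangular partition and PARTITION_SPLIT is actually signalled, and
+  // when both splits are forced nothing is coded at all.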
+  if (force_horz_split || force_vert_split) {
+    tmp_partition_cost[PARTITION_NONE] = INT_MAX;
+
+    if (!force_vert_split) {  // force_horz_split only
+      tmp_partition_cost[PARTITION_VERT] = INT_MAX;
+      tmp_partition_cost[PARTITION_HORZ] =
+          vp10_cost_bit(cm->fc->partition_prob[pl][PARTITION_HORZ], 0);
+      tmp_partition_cost[PARTITION_SPLIT] =
+          vp10_cost_bit(cm->fc->partition_prob[pl][PARTITION_HORZ], 1);
+    } else if (!force_horz_split) {  // force_vert_split only
+      tmp_partition_cost[PARTITION_HORZ] = INT_MAX;
+      tmp_partition_cost[PARTITION_VERT] =
+          vp10_cost_bit(cm->fc->partition_prob[pl][PARTITION_VERT], 0);
+      tmp_partition_cost[PARTITION_SPLIT] =
+          vp10_cost_bit(cm->fc->partition_prob[pl][PARTITION_VERT], 1);
+    } else {  // force_horz_split && force_vert_split
+      tmp_partition_cost[PARTITION_HORZ] = INT_MAX;
+      tmp_partition_cost[PARTITION_VERT] = INT_MAX;
+      tmp_partition_cost[PARTITION_SPLIT] = 0;
+    }
+
+    partition_cost = tmp_partition_cost;
+  }
+
+#if CONFIG_VAR_TX
+#ifndef NDEBUG
+  // Nothing should rely on the default value of this array, which is just
+  // leftover from encoding the previous block. Set it to a magic number
+  // when debugging.
+  memset(x->blk_skip[0], 234, sizeof(x->blk_skip[0]));
+#endif  // NDEBUG
+#endif  // CONFIG_VAR_TX
+
   assert(num_8x8_blocks_wide_lookup[bsize] ==
              num_8x8_blocks_high_lookup[bsize]);
 
@@ -2076,7 +3471,7 @@
 
   set_offsets(cpi, tile_info, x, mi_row, mi_col, bsize);
 
-  if (bsize == BLOCK_16X16 && cpi->oxcf.aq_mode)
+  if (bsize == BLOCK_16X16 && cpi->vaq_refresh)
     x->mb_energy = vp10_block_energy(cpi, x, bsize);
 
   if (cpi->sf.cb_partition_search && bsize == BLOCK_16X16) {
@@ -2102,7 +3497,13 @@
     partition_vert_allowed &= force_vert_split;
   }
 
-  save_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+#if CONFIG_VAR_TX
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context =
+    xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+#endif
+
+  save_context(x, &x_ctx, mi_row, mi_col, bsize);
 
 #if CONFIG_FP_MB_STATS
   if (cpi->use_fp_mb_stats) {
@@ -2165,14 +3566,22 @@
 
   // PARTITION_NONE
   if (partition_none_allowed) {
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col,
-                     &this_rdc, bsize, ctx, best_rdc.rdcost);
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &this_rdc,
+#if CONFIG_SUPERTX
+                     &this_rate_nocoef,
+#endif
+#if CONFIG_EXT_PARTITION_TYPES
+                     PARTITION_NONE,
+#endif
+                     bsize, ctx, best_rdc.rdcost);
     if (this_rdc.rate != INT_MAX) {
       if (bsize >= BLOCK_8X8) {
-        pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-        this_rdc.rate += cpi->partition_cost[pl][PARTITION_NONE];
+        this_rdc.rate += partition_cost[PARTITION_NONE];
         this_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                                  this_rdc.rate, this_rdc.dist);
+#if CONFIG_SUPERTX
+        this_rate_nocoef += partition_cost[PARTITION_NONE];
+#endif
       }
 
       if (this_rdc.rdcost < best_rdc.rdcost) {
@@ -2180,12 +3589,16 @@
         int rate_breakout_thr = cpi->sf.partition_search_breakout_rate_thr;
 
         best_rdc = this_rdc;
+#if CONFIG_SUPERTX
+        best_rate_nocoef = this_rate_nocoef;
+        assert(best_rate_nocoef >= 0);
+#endif
         if (bsize >= BLOCK_8X8)
           pc_tree->partitioning = PARTITION_NONE;
 
         // Adjust dist breakout threshold according to the partition size.
-        dist_breakout_thr >>= 8 - (b_width_log2_lookup[bsize] +
-            b_height_log2_lookup[bsize]);
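+        // The shift amount is the log2 ratio between the pixel count of the
+        // largest superblock and that of this block, so smaller blocks get a
+        // proportionally smaller breakout threshold.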
+        dist_breakout_thr >>= (2 * (MAX_SB_SIZE_LOG2 - 2))
+          - (b_width_log2_lookup[bsize] + b_height_log2_lookup[bsize]);
 
         rate_breakout_thr *= num_pels_log2_lookup[bsize];
 
@@ -2248,7 +3661,8 @@
 #endif
       }
     }
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
   }
 
   // store estimated motion vector
@@ -2262,17 +3676,85 @@
     subsize = get_subsize(bsize, PARTITION_SPLIT);
     if (bsize == BLOCK_8X8) {
       i = 4;
+#if CONFIG_DUAL_FILTER
+      if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
+        pc_tree->leaf_split[0]->pred_interp_filter =
+            ctx->mic.mbmi.interp_filter[0];
+#else
       if (cpi->sf.adaptive_pred_interp_filter && partition_none_allowed)
         pc_tree->leaf_split[0]->pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
-      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
-                       pc_tree->leaf_split[0], best_rdc.rdcost);
-      if (sum_rdc.rate == INT_MAX)
+#endif
+#if CONFIG_SUPERTX
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+                       &sum_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_SPLIT,
+#endif
+                       subsize, pc_tree->leaf_split[0], INT64_MAX);
+#else
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_SPLIT,
+#endif
+                       subsize, pc_tree->leaf_split[0], best_rdc.rdcost);
+#endif  // CONFIG_SUPERTX
+      if (sum_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+        sum_rate_nocoef = INT_MAX;
+#endif
+      }
+#if CONFIG_SUPERTX
+      if (supertx_allowed && sum_rdc.rdcost < INT64_MAX) {
+        TX_SIZE supertx_size = max_txsize_lookup[bsize];
+        const PARTITION_TYPE best_partition = pc_tree->partitioning;
+
+        pc_tree->partitioning = PARTITION_SPLIT;
+
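+        // Supertx is signalled with one bit per partition: charge the "off"
+        // flag to the normal split path here; the supertx candidate below is
+        // charged the "on" flag before the two RD costs are compared.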
+        sum_rdc.rate += vp10_cost_bit(
+            cm->fc->supertx_prob
+            [partition_supertx_context_lookup[PARTITION_SPLIT]][supertx_size],
+            0);
+        sum_rdc.rdcost =
+            RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+        if (is_inter_mode(pc_tree->leaf_split[0]->mic.mbmi.mode)) {
+          TX_TYPE best_tx = DCT_DCT;
+          RD_COST tmp_rdc = {sum_rate_nocoef, 0, 0};
+
+          restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+          rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+                        &tmp_rdc.rate, &tmp_rdc.dist,
+                        &best_tx,
+                        pc_tree);
+
+          tmp_rdc.rate += vp10_cost_bit(
+              cm->fc->supertx_prob
+              [partition_supertx_context_lookup[PARTITION_SPLIT]][supertx_size],
+              1);
+          tmp_rdc.rdcost =
+              RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+          if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+            sum_rdc = tmp_rdc;
+            update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize,
+                                    best_tx,
+                                    supertx_size, pc_tree);
+          }
+        }
+
+        pc_tree->partitioning = best_partition;
+      }
+#endif  // CONFIG_SUPERTX
     } else {
+#if CONFIG_SUPERTX
+      for (i = 0; i < 4 && sum_rdc.rdcost < INT64_MAX; ++i) {
+#else
       for (i = 0; i < 4 && sum_rdc.rdcost < best_rdc.rdcost; ++i) {
-      const int x_idx = (i & 1) * mi_step;
-      const int y_idx = (i >> 1) * mi_step;
+#endif  // CONFIG_SUPERTX
+        const int x_idx = (i & 1) * mi_step;
+        const int y_idx = (i >> 1) * mi_step;
 
         if (mi_row + y_idx >= cm->mi_rows || mi_col + x_idx >= cm->mi_cols)
           continue;
@@ -2281,30 +3763,91 @@
           load_pred_mv(x, ctx);
 
         pc_tree->split[i]->index = i;
+#if CONFIG_SUPERTX
+        rd_pick_partition(cpi, td, tile_data, tp,
+                          mi_row + y_idx, mi_col + x_idx,
+                          subsize, &this_rdc, &this_rate_nocoef,
+                          INT64_MAX - sum_rdc.rdcost, pc_tree->split[i]);
+#else
         rd_pick_partition(cpi, td, tile_data, tp,
                           mi_row + y_idx, mi_col + x_idx,
                           subsize, &this_rdc,
                           best_rdc.rdcost - sum_rdc.rdcost, pc_tree->split[i]);
+#endif  // CONFIG_SUPERTX
 
         if (this_rdc.rate == INT_MAX) {
           sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+          sum_rate_nocoef = INT_MAX;
+#endif  // CONFIG_SUPERTX
           break;
         } else {
           sum_rdc.rate += this_rdc.rate;
           sum_rdc.dist += this_rdc.dist;
           sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+          sum_rate_nocoef += this_rate_nocoef;
+#endif  // CONFIG_SUPERTX
         }
       }
+#if CONFIG_SUPERTX
+      if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && i == 4) {
+        TX_SIZE supertx_size = max_txsize_lookup[bsize];
+        const PARTITION_TYPE best_partition = pc_tree->partitioning;
+
+        pc_tree->partitioning = PARTITION_SPLIT;
+
+        sum_rdc.rate += vp10_cost_bit(
+            cm->fc->supertx_prob
+            [partition_supertx_context_lookup[PARTITION_SPLIT]][supertx_size],
+            0);
+        sum_rdc.rdcost =
+            RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+        if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+          TX_TYPE best_tx = DCT_DCT;
+          RD_COST tmp_rdc = {sum_rate_nocoef, 0, 0};
+
+          restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+          rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+                        &tmp_rdc.rate, &tmp_rdc.dist,
+                        &best_tx,
+                        pc_tree);
+
+          tmp_rdc.rate += vp10_cost_bit(
+              cm->fc->supertx_prob
+              [partition_supertx_context_lookup[PARTITION_SPLIT]][supertx_size],
+              1);
+          tmp_rdc.rdcost =
+              RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+          if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+            sum_rdc = tmp_rdc;
+            update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize,
+                                    best_tx,
+                                    supertx_size, pc_tree);
+          }
+        }
+
+        pc_tree->partitioning = best_partition;
+      }
+#endif  // CONFIG_SUPERTX
     }
 
     if (sum_rdc.rdcost < best_rdc.rdcost && i == 4) {
-      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_SPLIT];
+      sum_rdc.rate += partition_cost[PARTITION_SPLIT];
       sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                               sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+      sum_rate_nocoef += partition_cost[PARTITION_SPLIT];
+#endif  // CONFIG_SUPERTX
 
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
+#if CONFIG_SUPERTX
+        best_rate_nocoef = sum_rate_nocoef;
+        assert(best_rate_nocoef >= 0);
+#endif  // CONFIG_SUPERTX
         pc_tree->partitioning = PARTITION_SPLIT;
       }
     } else {
@@ -2313,23 +3856,44 @@
       if (cpi->sf.less_rectangular_check)
         do_rect &= !partition_none_allowed;
     }
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
-  }
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+  }  // if (do_split)
 
   // PARTITION_HORZ
   if (partition_horz_allowed &&
       (do_rect || vp10_active_h_edge(cpi, mi_row, mi_step))) {
-      subsize = get_subsize(bsize, PARTITION_HORZ);
+    subsize = get_subsize(bsize, PARTITION_HORZ);
     if (cpi->sf.adaptive_motion_search)
       load_pred_mv(x, ctx);
+#if CONFIG_DUAL_FILTER
+    if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+        partition_none_allowed)
+      pc_tree->horizontal[0].pred_interp_filter =
+          ctx->mic.mbmi.interp_filter[0];
+#else
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
         partition_none_allowed)
       pc_tree->horizontal[0].pred_interp_filter =
           ctx->mic.mbmi.interp_filter;
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
-                     &pc_tree->horizontal[0], best_rdc.rdcost);
+#endif
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+#if CONFIG_SUPERTX
+                     &sum_rate_nocoef,
+#endif  // CONFIG_SUPERTX
+#if CONFIG_EXT_PARTITION_TYPES
+                     PARTITION_HORZ,
+#endif
+                     subsize, &pc_tree->horizontal[0], best_rdc.rdcost);
 
-    if (sum_rdc.rdcost < best_rdc.rdcost && mi_row + mi_step < cm->mi_rows &&
+#if CONFIG_SUPERTX
+    abort_flag = (sum_rdc.rdcost >= best_rd && bsize > BLOCK_8X8) ||
+        (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8);
+    if (sum_rdc.rdcost < INT64_MAX &&
+#else
+    if (sum_rdc.rdcost < best_rdc.rdcost &&
+#endif  // CONFIG_SUPERTX
+        mi_row + mi_step < cm->mi_rows &&
         bsize > BLOCK_8X8) {
       PICK_MODE_CONTEXT *ctx = &pc_tree->horizontal[0];
       update_state(cpi, td, ctx, mi_row, mi_col, subsize, 0);
@@ -2337,47 +3901,145 @@
 
       if (cpi->sf.adaptive_motion_search)
         load_pred_mv(x, ctx);
+
+#if CONFIG_DUAL_FILTER
+      if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+          partition_none_allowed)
+        pc_tree->horizontal[1].pred_interp_filter =
+            ctx->mic.mbmi.interp_filter[0];
+#else
       if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
           partition_none_allowed)
         pc_tree->horizontal[1].pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
+#endif
+#if CONFIG_SUPERTX
       rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col,
-                       &this_rdc, subsize, &pc_tree->horizontal[1],
+                       &this_rdc, &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_HORZ,
+#endif
+                       subsize, &pc_tree->horizontal[1],
+                       INT64_MAX);
+#else
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row + mi_step, mi_col,
+                       &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_HORZ,
+#endif
+                       subsize, &pc_tree->horizontal[1],
                        best_rdc.rdcost - sum_rdc.rdcost);
+#endif  // CONFIG_SUPERTX
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+        sum_rate_nocoef = INT_MAX;
+#endif  // CONFIG_SUPERTX
       } else {
         sum_rdc.rate += this_rdc.rate;
         sum_rdc.dist += this_rdc.dist;
         sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+        sum_rate_nocoef += this_rate_nocoef;
+#endif  // CONFIG_SUPERTX
       }
     }
 
-    if (sum_rdc.rdcost < best_rdc.rdcost) {
-      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_HORZ];
+#if CONFIG_SUPERTX
+    if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) {
+      TX_SIZE supertx_size = max_txsize_lookup[bsize];
+      const PARTITION_TYPE best_partition = pc_tree->partitioning;
+
+      pc_tree->partitioning = PARTITION_HORZ;
+
+      sum_rdc.rate += vp10_cost_bit(
+          cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_HORZ]]
+          [supertx_size], 0);
       sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+      if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+        TX_TYPE best_tx = DCT_DCT;
+        RD_COST tmp_rdc = {sum_rate_nocoef, 0, 0};
+
+        restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+        rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+                      &tmp_rdc.rate, &tmp_rdc.dist,
+                      &best_tx,
+                      pc_tree);
+
+        tmp_rdc.rate += vp10_cost_bit(
+            cm->fc->supertx_prob
+            [partition_supertx_context_lookup[PARTITION_HORZ]][supertx_size],
+            1);
+        tmp_rdc.rdcost =
+            RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+        if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+          sum_rdc = tmp_rdc;
+          update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize,
+                                  best_tx,
+                                  supertx_size, pc_tree);
+        }
+      }
+
+      pc_tree->partitioning = best_partition;
+    }
+#endif  // CONFIG_SUPERTX
+
+    if (sum_rdc.rdcost < best_rdc.rdcost) {
+      sum_rdc.rate += partition_cost[PARTITION_HORZ];
+      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+      sum_rate_nocoef += partition_cost[PARTITION_HORZ];
+#endif  // CONFIG_SUPERTX
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
+#if CONFIG_SUPERTX
+        best_rate_nocoef = sum_rate_nocoef;
+        assert(best_rate_nocoef >= 0);
+#endif  // CONFIG_SUPERTX
         pc_tree->partitioning = PARTITION_HORZ;
       }
     }
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
   }
+
   // PARTITION_VERT
   if (partition_vert_allowed &&
       (do_rect || vp10_active_v_edge(cpi, mi_col, mi_step))) {
-      subsize = get_subsize(bsize, PARTITION_VERT);
+    subsize = get_subsize(bsize, PARTITION_VERT);
 
     if (cpi->sf.adaptive_motion_search)
       load_pred_mv(x, ctx);
+
+#if CONFIG_DUAL_FILTER
+    if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+        partition_none_allowed)
+      pc_tree->vertical[0].pred_interp_filter =
+          ctx->mic.mbmi.interp_filter[0];
+#else
     if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
         partition_none_allowed)
       pc_tree->vertical[0].pred_interp_filter =
           ctx->mic.mbmi.interp_filter;
-    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc, subsize,
-                     &pc_tree->vertical[0], best_rdc.rdcost);
-    if (sum_rdc.rdcost < best_rdc.rdcost && mi_col + mi_step < cm->mi_cols &&
+#endif
+    rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col, &sum_rdc,
+#if CONFIG_SUPERTX
+                     &sum_rate_nocoef,
+#endif  // CONFIG_SUPERTX
+#if CONFIG_EXT_PARTITION_TYPES
+                     PARTITION_VERT,
+#endif
+                     subsize, &pc_tree->vertical[0], best_rdc.rdcost);
+#if CONFIG_SUPERTX
+    abort_flag = (sum_rdc.rdcost >= best_rd && bsize > BLOCK_8X8) ||
+                 (sum_rdc.rate == INT_MAX && bsize == BLOCK_8X8);
+    if (sum_rdc.rdcost < INT64_MAX &&
+#else
+    if (sum_rdc.rdcost < best_rdc.rdcost &&
+#endif  // CONFIG_SUPERTX
+        mi_col + mi_step < cm->mi_cols &&
         bsize > BLOCK_8X8) {
       update_state(cpi, td, &pc_tree->vertical[0], mi_row, mi_col, subsize, 0);
       encode_superblock(cpi, td, tp, 0, mi_row, mi_col, subsize,
@@ -2385,51 +4047,190 @@
 
       if (cpi->sf.adaptive_motion_search)
         load_pred_mv(x, ctx);
+
+#if CONFIG_DUAL_FILTER
+      if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
+          partition_none_allowed)
+        pc_tree->vertical[1].pred_interp_filter =
+            ctx->mic.mbmi.interp_filter[0];
+#else
       if (cpi->sf.adaptive_pred_interp_filter && bsize == BLOCK_8X8 &&
           partition_none_allowed)
         pc_tree->vertical[1].pred_interp_filter =
             ctx->mic.mbmi.interp_filter;
+#endif
+#if CONFIG_SUPERTX
+      rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step, &this_rdc,
+                       &this_rate_nocoef,
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_VERT,
+#endif
+                       subsize, &pc_tree->vertical[1],
+                       INT64_MAX - sum_rdc.rdcost);
+#else
       rd_pick_sb_modes(cpi, tile_data, x, mi_row, mi_col + mi_step,
-                       &this_rdc, subsize,
+                       &this_rdc,
+#if CONFIG_EXT_PARTITION_TYPES
+                       PARTITION_VERT,
+#endif
+                       subsize,
                        &pc_tree->vertical[1], best_rdc.rdcost - sum_rdc.rdcost);
+#endif  // CONFIG_SUPERTX
       if (this_rdc.rate == INT_MAX) {
         sum_rdc.rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+        sum_rate_nocoef = INT_MAX;
+#endif  // CONFIG_SUPERTX
       } else {
         sum_rdc.rate += this_rdc.rate;
         sum_rdc.dist += this_rdc.dist;
         sum_rdc.rdcost += this_rdc.rdcost;
+#if CONFIG_SUPERTX
+        sum_rate_nocoef += this_rate_nocoef;
+#endif  // CONFIG_SUPERTX
       }
     }
+#if CONFIG_SUPERTX
+    if (supertx_allowed && sum_rdc.rdcost < INT64_MAX && !abort_flag) {
+      TX_SIZE supertx_size = max_txsize_lookup[bsize];
+      const PARTITION_TYPE best_partition = pc_tree->partitioning;
+
+      pc_tree->partitioning = PARTITION_VERT;
+
+      sum_rdc.rate += vp10_cost_bit(
+          cm->fc->supertx_prob[partition_supertx_context_lookup[PARTITION_VERT]]
+                              [supertx_size], 0);
+      sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv, sum_rdc.rate, sum_rdc.dist);
+
+      if (!check_intra_sb(cpi, tile_info, mi_row, mi_col, bsize, pc_tree)) {
+        TX_TYPE best_tx = DCT_DCT;
+        RD_COST tmp_rdc = {sum_rate_nocoef, 0, 0};
+
+        restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+
+        rd_supertx_sb(cpi, td, tile_info, mi_row, mi_col, bsize,
+                      &tmp_rdc.rate, &tmp_rdc.dist,
+                      &best_tx,
+                      pc_tree);
+
+        tmp_rdc.rate += vp10_cost_bit(
+            cm->fc->supertx_prob
+            [partition_supertx_context_lookup[PARTITION_VERT]][supertx_size],
+            1);
+        tmp_rdc.rdcost =
+            RDCOST(x->rdmult, x->rddiv, tmp_rdc.rate, tmp_rdc.dist);
+        if (tmp_rdc.rdcost < sum_rdc.rdcost) {
+          sum_rdc = tmp_rdc;
+          update_supertx_param_sb(cpi, td, mi_row, mi_col, bsize,
+                                  best_tx,
+                                  supertx_size, pc_tree);
+        }
+      }
+
+      pc_tree->partitioning = best_partition;
+    }
+#endif  // CONFIG_SUPERTX
 
     if (sum_rdc.rdcost < best_rdc.rdcost) {
-      pl = partition_plane_context(xd, mi_row, mi_col, bsize);
-      sum_rdc.rate += cpi->partition_cost[pl][PARTITION_VERT];
+      sum_rdc.rate += partition_cost[PARTITION_VERT];
       sum_rdc.rdcost = RDCOST(x->rdmult, x->rddiv,
                               sum_rdc.rate, sum_rdc.dist);
+#if CONFIG_SUPERTX
+      sum_rate_nocoef += partition_cost[PARTITION_VERT];
+#endif  // CONFIG_SUPERTX
       if (sum_rdc.rdcost < best_rdc.rdcost) {
         best_rdc = sum_rdc;
+#if CONFIG_SUPERTX
+        best_rate_nocoef = sum_rate_nocoef;
+        assert(best_rate_nocoef >= 0);
+#endif  // CONFIG_SUPERTX
         pc_tree->partitioning = PARTITION_VERT;
       }
     }
-    restore_context(x, mi_row, mi_col, a, l, sa, sl, bsize);
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
   }
 
+#if CONFIG_EXT_PARTITION_TYPES
+  // PARTITION_HORZ_A
+  if (partition_horz_allowed && do_rect && bsize > BLOCK_8X8 &&
+      partition_none_allowed) {
+    subsize = get_subsize(bsize, PARTITION_HORZ_A);
+    rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+                       pc_tree->horizontala,
+                       ctx, mi_row, mi_col, bsize, PARTITION_HORZ_A,
+#if CONFIG_SUPERTX
+                       best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+                       mi_row, mi_col, bsize2,
+                       mi_row, mi_col + mi_step, bsize2,
+                       mi_row + mi_step, mi_col, subsize);
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+  }
+  // PARTITION_HORZ_B
+  if (partition_horz_allowed && do_rect && bsize > BLOCK_8X8 &&
+      partition_none_allowed) {
+    subsize = get_subsize(bsize, PARTITION_HORZ_B);
+    rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+                       pc_tree->horizontalb,
+                       ctx, mi_row, mi_col, bsize, PARTITION_HORZ_B,
+#if CONFIG_SUPERTX
+                       best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+                       mi_row, mi_col, subsize,
+                       mi_row + mi_step, mi_col, bsize2,
+                       mi_row + mi_step, mi_col + mi_step, bsize2);
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+  }
+  // PARTITION_VERT_A
+  if (partition_vert_allowed && do_rect && bsize > BLOCK_8X8 &&
+      partition_none_allowed) {
+    subsize = get_subsize(bsize, PARTITION_VERT_A);
+    rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+                       pc_tree->verticala,
+                       ctx, mi_row, mi_col, bsize, PARTITION_VERT_A,
+#if CONFIG_SUPERTX
+                       best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+                       mi_row, mi_col, bsize2,
+                       mi_row + mi_step, mi_col, bsize2,
+                       mi_row, mi_col + mi_step, subsize);
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+  }
+  // PARTITION_VERT_B
+  if (partition_vert_allowed && do_rect && bsize > BLOCK_8X8 &&
+      partition_none_allowed) {
+    subsize = get_subsize(bsize, PARTITION_VERT_B);
+    rd_test_partition3(cpi, td, tile_data, tp, pc_tree, &best_rdc,
+                       pc_tree->verticalb,
+                       ctx, mi_row, mi_col, bsize, PARTITION_VERT_B,
+#if CONFIG_SUPERTX
+                       best_rd, &best_rate_nocoef, &x_ctx,
+#endif
+                       mi_row, mi_col, subsize,
+                       mi_row, mi_col + mi_step, bsize2,
+                       mi_row + mi_step, mi_col + mi_step, bsize2);
+    restore_context(x, &x_ctx, mi_row, mi_col, bsize);
+  }
+#endif  // CONFIG_EXT_PARTITION_TYPES
+
   // TODO(jbb): This code added so that we avoid static analysis
   // warning related to the fact that best_rd isn't used after this
   // point.  This code should be refactored so that the duplicate
   // checks occur in some sub function and thus are used...
   (void) best_rd;
   *rd_cost = best_rdc;
-
+#if CONFIG_SUPERTX
+  *rate_nocoef = best_rate_nocoef;
+#endif  // CONFIG_SUPERTX
 
   if (best_rdc.rate < INT_MAX && best_rdc.dist < INT64_MAX &&
       pc_tree->index != 3) {
-    int output_enabled = (bsize == BLOCK_64X64);
+    int output_enabled = (bsize == cm->sb_size);
     encode_sb(cpi, td, tile_info, tp, mi_row, mi_col, output_enabled,
               bsize, pc_tree);
   }
 
-  if (bsize == BLOCK_64X64) {
+  if (bsize == cm->sb_size) {
     assert(tp_orig < *tp || (tp_orig == *tp && xd->mi[0]->mbmi.skip));
     assert(best_rdc.rate < INT_MAX);
     assert(best_rdc.dist < INT64_MAX);
@@ -2444,34 +4245,42 @@
                              int mi_row,
                              TOKENEXTRA **tp) {
   VP10_COMMON *const cm = &cpi->common;
-  TileInfo *const tile_info = &tile_data->tile_info;
+  const TileInfo *const tile_info = &tile_data->tile_info;
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   SPEED_FEATURES *const sf = &cpi->sf;
   int mi_col;
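+  // Number of 8x8 leaf nodes in a fully split superblock: 8 * 8 = 64 for
+  // 64x64 superblocks, 16 * 16 = 256 when 128x128 superblocks are enabled.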
+#if CONFIG_EXT_PARTITION
+  const int leaf_nodes = 256;
+#else
+  const int leaf_nodes = 64;
+#endif  // CONFIG_EXT_PARTITION
 
   // Initialize the left context for the new SB row
-  memset(&xd->left_context, 0, sizeof(xd->left_context));
-  memset(xd->left_seg_context, 0, sizeof(xd->left_seg_context));
+  vp10_zero_left_context(xd);
 
   // Code each SB in the row
   for (mi_col = tile_info->mi_col_start; mi_col < tile_info->mi_col_end;
-       mi_col += MI_BLOCK_SIZE) {
+       mi_col += cm->mib_size) {
     const struct segmentation *const seg = &cm->seg;
     int dummy_rate;
     int64_t dummy_dist;
     RD_COST dummy_rdc;
+#if CONFIG_SUPERTX
+    int dummy_rate_nocoef;
+#endif  // CONFIG_SUPERTX
     int i;
     int seg_skip = 0;
 
     const int idx_str = cm->mi_stride * mi_row + mi_col;
     MODE_INFO **mi = cm->mi_grid_visible + idx_str;
+    PC_TREE *const pc_root = td->pc_root[cm->mib_size_log2 - MIN_MIB_SIZE_LOG2];
 
     if (sf->adaptive_pred_interp_filter) {
-      for (i = 0; i < 64; ++i)
+      for (i = 0; i < leaf_nodes; ++i)
         td->leaf_tree[i].pred_interp_filter = SWITCHABLE;
 
-      for (i = 0; i < 64; ++i) {
+      for (i = 0; i < leaf_nodes; ++i) {
         td->pc_tree[i].vertical[0].pred_interp_filter = SWITCHABLE;
         td->pc_tree[i].vertical[1].pred_interp_filter = SWITCHABLE;
         td->pc_tree[i].horizontal[0].pred_interp_filter = SWITCHABLE;
@@ -2480,67 +4289,102 @@
     }
 
     vp10_zero(x->pred_mv);
-    td->pc_root->index = 0;
+    pc_root->index = 0;
 
     if (seg->enabled) {
       const uint8_t *const map = seg->update_map ? cpi->segmentation_map
                                                  : cm->last_frame_seg_map;
-      int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+      int segment_id = get_segment_id(cm, map, cm->sb_size, mi_row, mi_col);
       seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
     }
 
     x->source_variance = UINT_MAX;
     if (sf->partition_search_type == FIXED_PARTITION || seg_skip) {
-      const BLOCK_SIZE bsize =
-          seg_skip ? BLOCK_64X64 : sf->always_this_block_size;
-      set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+      BLOCK_SIZE bsize;
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
+      bsize = seg_skip ? cm->sb_size : sf->always_this_block_size;
       set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
-                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
+                       cm->sb_size, &dummy_rate, &dummy_dist,
+#if CONFIG_SUPERTX
+                       &dummy_rate_nocoef,
+#endif  // CONFIG_SUPERTX
+                       1, pc_root);
     } else if (cpi->partition_search_skippable_frame) {
       BLOCK_SIZE bsize;
-      set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+      set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
       bsize = get_rd_var_based_fixed_partition(cpi, x, mi_row, mi_col);
       set_fixed_partitioning(cpi, tile_info, mi, mi_row, mi_col, bsize);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
-                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
-    } else if (sf->partition_search_type == VAR_BASED_PARTITION &&
-               cm->frame_type != KEY_FRAME) {
-      choose_partitioning(cpi, tile_info, x, mi_row, mi_col);
+                       cm->sb_size, &dummy_rate, &dummy_dist,
+#if CONFIG_SUPERTX
+                       &dummy_rate_nocoef,
+#endif  // CONFIG_SUPERTX
+                       1, pc_root);
+    } else if (sf->partition_search_type == VAR_BASED_PARTITION) {
+      choose_partitioning(cpi, td, tile_info, x, mi_row, mi_col);
       rd_use_partition(cpi, td, tile_data, mi, tp, mi_row, mi_col,
-                       BLOCK_64X64, &dummy_rate, &dummy_dist, 1, td->pc_root);
+                       cm->sb_size, &dummy_rate, &dummy_dist,
+#if CONFIG_SUPERTX
+                       &dummy_rate_nocoef,
+#endif  // CONFIG_SUPERTX
+                       1, pc_root);
     } else {
       // If required set upper and lower partition size limits
       if (sf->auto_min_max_partition_size) {
-        set_offsets(cpi, tile_info, x, mi_row, mi_col, BLOCK_64X64);
+        set_offsets(cpi, tile_info, x, mi_row, mi_col, cm->sb_size);
         rd_auto_partition_range(cpi, tile_info, xd, mi_row, mi_col,
                                 &x->min_partition_size,
                                 &x->max_partition_size);
       }
-      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, BLOCK_64X64,
-                        &dummy_rdc, INT64_MAX, td->pc_root);
+      rd_pick_partition(cpi, td, tile_data, tp, mi_row, mi_col, cm->sb_size,
+                        &dummy_rdc,
+#if CONFIG_SUPERTX
+                        &dummy_rate_nocoef,
+#endif  // CONFIG_SUPERTX
+                        INT64_MAX, pc_root);
     }
   }
+#if CONFIG_ENTROPY
+  if (cm->do_subframe_update &&
+      cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
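+    // Backward-adaptive subframe update: at evenly spaced superblock rows,
+    // adapt the coefficient probabilities from the counts gathered so far,
+    // snapshot both into the subframe stats, and refresh the token costs so
+    // the rest of the frame is coded against the updated contexts.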
+    if ((mi_row + MI_SIZE) % (MI_SIZE *
+        VPXMAX(cm->mi_rows / MI_SIZE / COEF_PROBS_BUFS, 1)) == 0 &&
+        mi_row + MI_SIZE < cm->mi_rows &&
+        cm->coef_probs_update_idx < COEF_PROBS_BUFS - 1) {
+      TX_SIZE t;
+      SUBFRAME_STATS *subframe_stats = &cpi->subframe_stats;
+
+      for (t = TX_4X4; t <= TX_32X32; ++t)
+        vp10_full_to_model_counts(cpi->td.counts->coef[t],
+                                  cpi->td.rd_counts.coef_counts[t]);
+      vp10_partial_adapt_probs(cm, mi_row, mi_col);
+      ++cm->coef_probs_update_idx;
+      vp10_copy(subframe_stats->coef_probs_buf[cm->coef_probs_update_idx],
+                cm->fc->coef_probs);
+      vp10_copy(subframe_stats->coef_counts_buf[cm->coef_probs_update_idx],
+                cpi->td.rd_counts.coef_counts);
+      vp10_copy(subframe_stats->eob_counts_buf[cm->coef_probs_update_idx],
+                cm->counts.eob_branch);
+      vp10_fill_token_costs(x->token_costs,
+#if CONFIG_ANS
+                            cm->fc->coef_cdfs,
+#endif  // CONFIG_ANS
+                            cm->fc->coef_probs);
+    }
+  }
+#endif  // CONFIG_ENTROPY
 }
 
 static void init_encode_frame_mb_context(VP10_COMP *cpi) {
   MACROBLOCK *const x = &cpi->td.mb;
   VP10_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  const int aligned_mi_cols = mi_cols_aligned_to_sb(cm->mi_cols);
 
   // Copy data over into macro block data structures.
   vp10_setup_src_planes(x, cpi->Source, 0, 0);
 
-  vp10_setup_block_planes(&x->e_mbd, cm->subsampling_x, cm->subsampling_y);
-
-  // Note: this memset assumes above_context[0], [1] and [2]
-  // are allocated as part of the same buffer.
-  memset(xd->above_context[0], 0,
-         sizeof(*xd->above_context[0]) *
-         2 * aligned_mi_cols * MAX_MB_PLANE);
-  memset(xd->above_seg_context, 0,
-         sizeof(*xd->above_seg_context) * aligned_mi_cols);
+  vp10_setup_block_planes(xd, cm->subsampling_x, cm->subsampling_y);
 }
 
 static int check_dual_ref_flags(VP10_COMP *cpi) {
@@ -2549,11 +4393,18 @@
   if (segfeature_active(&cpi->common.seg, 1, SEG_LVL_REF_FRAME)) {
     return 0;
   } else {
-    return (!!(ref_flags & VP9_GOLD_FLAG) + !!(ref_flags & VP9_LAST_FLAG)
-        + !!(ref_flags & VP9_ALT_FLAG)) >= 2;
+    return (!!(ref_flags & VP9_GOLD_FLAG) +
+            !!(ref_flags & VP9_LAST_FLAG) +
+#if CONFIG_EXT_REFS
+            !!(ref_flags & VP9_LAST2_FLAG) +
+            !!(ref_flags & VP9_LAST3_FLAG) +
+            !!(ref_flags & VP9_BWD_FLAG) +
+#endif  // CONFIG_EXT_REFS
+            !!(ref_flags & VP9_ALT_FLAG)) >= 2;
   }
 }
 
+#if !CONFIG_VAR_TX
 static void reset_skip_tx_size(VP10_COMMON *cm, TX_SIZE max_tx_size) {
   int mi_row, mi_col;
   const int mis = cm->mi_stride;
@@ -2566,6 +4417,7 @@
     }
   }
 }
+#endif
 
 static MV_REFERENCE_FRAME get_frame_type(const VP10_COMP *cpi) {
   if (frame_is_intra_only(&cpi->common))
@@ -2575,6 +4427,8 @@
   else if (cpi->refresh_golden_frame || cpi->refresh_alt_ref_frame)
     return GOLDEN_FRAME;
   else
+    // TODO(zoeliu): Investigate whether a frame_type other than
+    // INTRA/ALTREF/GOLDEN/LAST needs to be specified separately.
     return LAST_FRAME;
 }
 
@@ -2592,11 +4446,11 @@
 
 void vp10_init_tile_data(VP10_COMP *cpi) {
   VP10_COMMON *const cm = &cpi->common;
-  const int tile_cols = 1 << cm->log2_tile_cols;
-  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
   int tile_col, tile_row;
   TOKENEXTRA *pre_tok = cpi->tile_tok[0][0];
-  int tile_tok = 0;
+  unsigned int tile_tok = 0;
 
   if (cpi->tile_data == NULL || cpi->allocated_tiles < tile_cols * tile_rows) {
     if (cpi->tile_data != NULL)
@@ -2607,7 +4461,7 @@
 
     for (tile_row = 0; tile_row < tile_rows; ++tile_row)
       for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-        TileDataEnc *tile_data =
+        TileDataEnc *const tile_data =
             &cpi->tile_data[tile_row * tile_cols + tile_col];
         int i, j;
         for (i = 0; i < BLOCK_SIZES; ++i) {
@@ -2621,7 +4475,7 @@
 
   for (tile_row = 0; tile_row < tile_rows; ++tile_row) {
     for (tile_col = 0; tile_col < tile_cols; ++tile_col) {
-      TileInfo *tile_info =
+      TileInfo *const tile_info =
           &cpi->tile_data[tile_row * tile_cols + tile_col].tile_info;
       vp10_tile_init(tile_info, cm, tile_row, tile_col);
 
@@ -2635,37 +4489,36 @@
 void vp10_encode_tile(VP10_COMP *cpi, ThreadData *td,
                      int tile_row, int tile_col) {
   VP10_COMMON *const cm = &cpi->common;
-  const int tile_cols = 1 << cm->log2_tile_cols;
-  TileDataEnc *this_tile =
-      &cpi->tile_data[tile_row * tile_cols + tile_col];
+  TileDataEnc *const this_tile =
+      &cpi->tile_data[tile_row * cm->tile_cols + tile_col];
   const TileInfo * const tile_info = &this_tile->tile_info;
   TOKENEXTRA *tok = cpi->tile_tok[tile_row][tile_col];
   int mi_row;
 
+  vp10_zero_above_context(cm, tile_info->mi_col_start, tile_info->mi_col_end);
+
   // Set up pointers to per thread motion search counters.
   td->mb.m_search_count_ptr = &td->rd_counts.m_search_count;
   td->mb.ex_search_count_ptr = &td->rd_counts.ex_search_count;
 
   for (mi_row = tile_info->mi_row_start; mi_row < tile_info->mi_row_end;
-       mi_row += MI_BLOCK_SIZE) {
+       mi_row += cm->mib_size) {
     encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
   }
+
   cpi->tok_count[tile_row][tile_col] =
       (unsigned int)(tok - cpi->tile_tok[tile_row][tile_col]);
-  assert(tok - cpi->tile_tok[tile_row][tile_col] <=
-      allocated_tokens(*tile_info));
+  assert(cpi->tok_count[tile_row][tile_col] <= allocated_tokens(*tile_info));
 }
 
 static void encode_tiles(VP10_COMP *cpi) {
   VP10_COMMON *const cm = &cpi->common;
-  const int tile_cols = 1 << cm->log2_tile_cols;
-  const int tile_rows = 1 << cm->log2_tile_rows;
   int tile_col, tile_row;
 
   vp10_init_tile_data(cpi);
 
-  for (tile_row = 0; tile_row < tile_rows; ++tile_row)
-    for (tile_col = 0; tile_col < tile_cols; ++tile_col)
+  for (tile_row = 0; tile_row < cm->tile_rows; ++tile_row)
+    for (tile_col = 0; tile_col < cm->tile_cols; ++tile_col)
       vp10_encode_tile(cpi, &cpi->td, tile_row, tile_col);
 }
 
@@ -2692,20 +4545,21 @@
   RD_COUNTS *const rdc = &cpi->td.rd_counts;
   int i;
 
+  x->min_partition_size = VPXMIN(x->min_partition_size, cm->sb_size);
+  x->max_partition_size = VPXMIN(x->max_partition_size, cm->sb_size);
+
   xd->mi = cm->mi_grid_visible;
   xd->mi[0] = cm->mi;
 
   vp10_zero(*td->counts);
   vp10_zero(rdc->coef_counts);
   vp10_zero(rdc->comp_pred_diff);
-  vp10_zero(rdc->filter_diff);
   rdc->m_search_count = 0;   // Count of motion search hits.
   rdc->ex_search_count = 0;  // Exhaustive mesh search hits.
 
   for (i = 0; i < MAX_SEGMENTS; ++i) {
-    const int qindex = CONFIG_MISC_FIXES && cm->seg.enabled ?
-                       vp10_get_qindex(&cm->seg, i, cm->base_qindex) :
-                       cm->base_qindex;
+    const int qindex = cm->seg.enabled ?
+        vp10_get_qindex(&cm->seg, i, cm->base_qindex) : cm->base_qindex;
     xd->lossless[i] = qindex == 0 &&
                       cm->y_dc_delta_q == 0 &&
                       cm->uv_dc_delta_q == 0 &&
@@ -2722,18 +4576,44 @@
   vp10_initialize_rd_consts(cpi);
   vp10_initialize_me_consts(cpi, x, cm->base_qindex);
   init_encode_frame_mb_context(cpi);
+
   cm->use_prev_frame_mvs = !cm->error_resilient_mode &&
                            cm->width == cm->last_width &&
                            cm->height == cm->last_height &&
                            !cm->intra_only &&
                            cm->last_show_frame;
+#if CONFIG_EXT_REFS
+  // NOTE(zoeliu): Since cm->prev_frame can be neither a frame with
+  //               show_existing_frame=1 nor a frame that is not used as a
+  //               reference, by the time it is referred to, the frame buffer
+  //               it originally pointed to may already have expired and been
+  //               reassigned to the current newly coded frame. Hence, we need
+  //               to check whether this is the case, and if so, we have two
+  //               choices:
+  //               (1) Simply disable the use of previous frame mvs; or
+  //               (2) Have cm->prev_frame point to one reference frame buffer,
+  //                   e.g. LAST_FRAME.
+  if (cm->use_prev_frame_mvs && !enc_is_ref_frame_buf(cpi, cm->prev_frame)) {
+    // Reassign the LAST_FRAME buffer to cm->prev_frame.
+    const int last_fb_buf_idx = get_ref_frame_buf_idx(cpi, LAST_FRAME);
+    cm->prev_frame = &cm->buffer_pool->frame_bufs[last_fb_buf_idx];
+  }
+#endif  // CONFIG_EXT_REFS
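
For context on the check above: enc_is_ref_frame_buf() has to decide whether the buffer cm->prev_frame points at still backs one of the active references. A simplified sketch of such a liveness test, using a hypothetical RefSlot type (these names are illustrative, not the encoder's actual ones):

    typedef struct { const void *buf; } RefSlot;

    /* Returns 1 iff candidate is still referenced by some active slot;
     * pointer identity is the liveness criterion. */
    static int buf_is_live(const RefSlot *slots, int n, const void *candidate) {
      int i;
      for (i = 0; i < n; ++i)
        if (slots[i].buf == candidate) return 1;
      return 0;
    }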
+
   // Special case: set prev_mi to NULL when the previous mode info
   // context cannot be used.
   cm->prev_mi = cm->use_prev_frame_mvs ?
                 cm->prev_mip + cm->mi_stride + 1 : NULL;
 
-  x->quant_fp = cpi->sf.use_quant_fp;
-  vp10_zero(x->skip_txfm);
+#if CONFIG_VAR_TX
+#if CONFIG_REF_MV
+  vp10_zero(x->blk_skip_drl);
+#endif
+#endif
+
+  if (cpi->sf.partition_search_type == VAR_BASED_PARTITION &&
+      cpi->td.var_root[0] == NULL)
+    vp10_setup_var_tree(&cpi->common, &cpi->td);
 
   {
     struct vpx_usec_timer emr_timer;
@@ -2747,7 +4627,10 @@
 #endif
 
     // If allowed, encoding tiles in parallel with one thread handling one tile.
-    if (VPXMIN(cpi->oxcf.max_threads, 1 << cm->log2_tile_cols) > 1)
+    // TODO(geza.lore): The multi-threaded encoder is not safe with more than
+    // 1 tile row, as it uses the single above_context et al. arrays from
+    // cpi->common.
+    if (VPXMIN(cpi->oxcf.max_threads, cm->tile_cols) > 1 && cm->tile_rows == 1)
       vp10_encode_tiles_mt(cpi);
     else
       encode_tiles(cpi);
@@ -2762,23 +4645,6 @@
 #endif
 }
 
-static INTERP_FILTER get_interp_filter(
-    const int64_t threshes[SWITCHABLE_FILTER_CONTEXTS], int is_alt_ref) {
-  if (!is_alt_ref &&
-      threshes[EIGHTTAP_SMOOTH] > threshes[EIGHTTAP] &&
-      threshes[EIGHTTAP_SMOOTH] > threshes[EIGHTTAP_SHARP] &&
-      threshes[EIGHTTAP_SMOOTH] > threshes[SWITCHABLE - 1]) {
-    return EIGHTTAP_SMOOTH;
-  } else if (threshes[EIGHTTAP_SHARP] > threshes[EIGHTTAP] &&
-             threshes[EIGHTTAP_SHARP] > threshes[SWITCHABLE - 1]) {
-    return EIGHTTAP_SHARP;
-  } else if (threshes[EIGHTTAP] > threshes[SWITCHABLE - 1]) {
-    return EIGHTTAP;
-  } else {
-    return SWITCHABLE;
-  }
-}
-
 void vp10_encode_frame(VP10_COMP *cpi) {
   VP10_COMMON *const cm = &cpi->common;
 
@@ -2796,9 +4662,19 @@
       cpi->allow_comp_inter_inter = 0;
     } else {
       cpi->allow_comp_inter_inter = 1;
+
+#if CONFIG_EXT_REFS
+      cm->comp_fwd_ref[0] = LAST_FRAME;
+      cm->comp_fwd_ref[1] = LAST2_FRAME;
+      cm->comp_fwd_ref[2] = LAST3_FRAME;
+      cm->comp_fwd_ref[3] = GOLDEN_FRAME;
+      cm->comp_bwd_ref[0] = BWDREF_FRAME;
+      cm->comp_bwd_ref[1] = ALTREF_FRAME;
+#else
       cm->comp_fixed_ref = ALTREF_FRAME;
       cm->comp_var_ref[0] = LAST_FRAME;
       cm->comp_var_ref[1] = GOLDEN_FRAME;
+#endif  // CONFIG_EXT_REFS
     }
   } else {
     cpi->allow_comp_inter_inter = 0;
@@ -2814,12 +4690,14 @@
     // either compound, single or hybrid prediction as per whatever has
     // worked best for that type of frame in the past.
     // It also predicts whether another coding mode would have worked
-    // better that this coding mode. If that is the case, it remembers
+    // better than this coding mode. If that is the case, it remembers
     // that for subsequent frames.
     // It does the same analysis for transform size selection also.
+    //
+    // TODO(zoeliu): Investigate whether a frame_type other than
+    // INTRA/ALTREF/GOLDEN/LAST needs to be specified separately.
     const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
     int64_t *const mode_thrs = rd_opt->prediction_type_threshes[frame_type];
-    int64_t *const filter_thrs = rd_opt->filter_threshes[frame_type];
     const int is_alt_ref = frame_type == ALTREF_FRAME;
 
     /* prediction (compound, single or hybrid) mode selection */
@@ -2836,17 +4714,15 @@
     else
       cm->reference_mode = REFERENCE_MODE_SELECT;
 
-    if (cm->interp_filter == SWITCHABLE)
-      cm->interp_filter = get_interp_filter(filter_thrs, is_alt_ref);
+#if CONFIG_DUAL_FILTER
+    cm->interp_filter = SWITCHABLE;
+#endif
 
     encode_frame_internal(cpi);
 
     for (i = 0; i < REFERENCE_MODES; ++i)
       mode_thrs[i] = (mode_thrs[i] + rdc->comp_pred_diff[i] / cm->MBs) / 2;
 
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-      filter_thrs[i] = (filter_thrs[i] + rdc->filter_diff[i] / cm->MBs) / 2;
-
     if (cm->reference_mode == REFERENCE_MODE_SELECT) {
       int single_count_zero = 0;
       int comp_count_zero = 0;
@@ -2865,42 +4741,57 @@
       }
     }
 
+#if !CONFIG_VAR_TX
     if (cm->tx_mode == TX_MODE_SELECT) {
       int count4x4 = 0;
       int count8x8_lp = 0, count8x8_8x8p = 0;
       int count16x16_16x16p = 0, count16x16_lp = 0;
       int count32x32 = 0;
-
       for (i = 0; i < TX_SIZE_CONTEXTS; ++i) {
-        count4x4 += counts->tx.p32x32[i][TX_4X4];
-        count4x4 += counts->tx.p16x16[i][TX_4X4];
-        count4x4 += counts->tx.p8x8[i][TX_4X4];
+        count4x4 += counts->tx_size[0][i][TX_4X4];
+        count4x4 += counts->tx_size[1][i][TX_4X4];
+        count4x4 += counts->tx_size[2][i][TX_4X4];
 
-        count8x8_lp += counts->tx.p32x32[i][TX_8X8];
-        count8x8_lp += counts->tx.p16x16[i][TX_8X8];
-        count8x8_8x8p += counts->tx.p8x8[i][TX_8X8];
+        count8x8_lp += counts->tx_size[1][i][TX_8X8];
+        count8x8_lp += counts->tx_size[2][i][TX_8X8];
+        count8x8_8x8p += counts->tx_size[0][i][TX_8X8];
 
-        count16x16_16x16p += counts->tx.p16x16[i][TX_16X16];
-        count16x16_lp += counts->tx.p32x32[i][TX_16X16];
-        count32x32 += counts->tx.p32x32[i][TX_32X32];
+        count16x16_16x16p += counts->tx_size[1][i][TX_16X16];
+        count16x16_lp += counts->tx_size[2][i][TX_16X16];
+        count32x32 += counts->tx_size[2][i][TX_32X32];
       }
       if (count4x4 == 0 && count16x16_lp == 0 && count16x16_16x16p == 0 &&
+#if CONFIG_SUPERTX
+          cm->counts.supertx_size[TX_16X16] == 0 &&
+          cm->counts.supertx_size[TX_32X32] == 0 &&
+#endif  // CONFIG_SUPERTX
           count32x32 == 0) {
         cm->tx_mode = ALLOW_8X8;
         reset_skip_tx_size(cm, TX_8X8);
       } else if (count8x8_8x8p == 0 && count16x16_16x16p == 0 &&
-                 count8x8_lp == 0 && count16x16_lp == 0 && count32x32 == 0) {
+                 count8x8_lp == 0 && count16x16_lp == 0 &&
+#if CONFIG_SUPERTX
+                 cm->counts.supertx_size[TX_8X8] == 0 &&
+                 cm->counts.supertx_size[TX_16X16] == 0 &&
+                 cm->counts.supertx_size[TX_32X32] == 0 &&
+#endif  // CONFIG_SUPERTX
+                 count32x32 == 0) {
         cm->tx_mode = ONLY_4X4;
         reset_skip_tx_size(cm, TX_4X4);
-      } else if (count8x8_lp == 0 && count16x16_lp == 0 && count4x4 == 0) {
+      } else if (count8x8_lp == 0 && count16x16_lp == 0 &&
+                 count4x4 == 0) {
         cm->tx_mode = ALLOW_32X32;
-      } else if (count32x32 == 0 && count8x8_lp == 0 && count4x4 == 0) {
+      } else if (count32x32 == 0 && count8x8_lp == 0 &&
+#if CONFIG_SUPERTX
+                 cm->counts.supertx_size[TX_32X32] == 0 &&
+#endif  // CONFIG_SUPERTX
+                 count4x4 == 0) {
         cm->tx_mode = ALLOW_16X16;
         reset_skip_tx_size(cm, TX_16X16);
       }
     }
+#endif
   } else {
-    cm->reference_mode = SINGLE_REFERENCE;
     encode_frame_internal(cpi);
   }
 }
@@ -2941,6 +4832,144 @@
   ++counts->uv_mode[y_mode][uv_mode];
 }
 
+#if CONFIG_VAR_TX
+static void update_txfm_count(MACROBLOCKD *xd,
+                              FRAME_COUNTS *counts,
+                              TX_SIZE tx_size, int blk_row, int blk_col) {
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  const int tx_row = blk_row >> 1;
+  const int tx_col = blk_col >> 1;
+  int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  int ctx = txfm_partition_context(xd->above_txfm_context + tx_col,
+                                   xd->left_txfm_context + tx_row,
+                                   tx_size);
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> 5;
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> 5;
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  if (tx_size == plane_tx_size) {
+    ++counts->txfm_partition[ctx][0];
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + tx_col,
+                          xd->left_txfm_context + tx_row, tx_size);
+  } else {
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int bh = num_4x4_blocks_high_lookup[bsize];
+    int i;
+    ++counts->txfm_partition[ctx][1];
+
+    if (tx_size == TX_8X8) {
+      mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + tx_col,
+                            xd->left_txfm_context + tx_row, TX_4X4);
+      return;
+    }
+
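+    // Recurse into the four quadrants: (i >> 1) selects the row half and
+    // (i & 0x01) the column half, each offset by half the parent transform
+    // block's size in 4x4 units (bh / 2; these transform blocks are square,
+    // so one dimension suffices).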
+    for (i = 0; i < 4; ++i) {
+      int offsetr = (i >> 1) * bh / 2;
+      int offsetc = (i & 0x01) * bh / 2;
+      update_txfm_count(xd, counts, tx_size - 1,
+                        blk_row + offsetr, blk_col + offsetc);
+    }
+  }
+}
+
+static void tx_partition_count_update(VP10_COMMON *cm,
+                                      MACROBLOCKD *xd,
+                                      BLOCK_SIZE plane_bsize,
+                                      int mi_row, int mi_col,
+                                      FRAME_COUNTS *td_counts) {
+  const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+  TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+  BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+  int bh = num_4x4_blocks_wide_lookup[txb_size];
+  int idx, idy;
+
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context =
+    xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+  for (idy = 0; idy < mi_height; idy += bh)
+    for (idx = 0; idx < mi_width; idx += bh)
+      update_txfm_count(xd, td_counts, max_tx_size, idy, idx);
+}
+
+static void set_txfm_context(MACROBLOCKD *xd, TX_SIZE tx_size,
+                             int blk_row, int blk_col) {
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  const int tx_row = blk_row >> 1;
+  const int tx_col = blk_col >> 1;
+  int max_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  const TX_SIZE plane_tx_size = mbmi->inter_tx_size[tx_row][tx_col];
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> 5;
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> 5;
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  if (tx_size == plane_tx_size) {
+    mbmi->tx_size = tx_size;
+    txfm_partition_update(xd->above_txfm_context + tx_col,
+                          xd->left_txfm_context + tx_row, tx_size);
+  } else {
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int bsl = b_width_log2_lookup[bsize];
+    int i;
+
+    if (tx_size == TX_8X8) {
+      mbmi->inter_tx_size[tx_row][tx_col] = TX_4X4;
+      mbmi->tx_size = TX_4X4;
+      txfm_partition_update(xd->above_txfm_context + tx_col,
+                            xd->left_txfm_context + tx_row, TX_4X4);
+      return;
+    }
+
+    assert(bsl > 0);
+    --bsl;
+    for (i = 0; i < 4; ++i) {
+      int offsetr = (i >> 1) << bsl;
+      int offsetc = (i & 0x01) << bsl;
+      set_txfm_context(xd, tx_size - 1,
+                       blk_row + offsetr, blk_col + offsetc);
+    }
+  }
+}
+
+static void tx_partition_set_contexts(VP10_COMMON *cm,
+                                      MACROBLOCKD *xd,
+                                      BLOCK_SIZE plane_bsize,
+                                      int mi_row, int mi_col) {
+  const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+  const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+  TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+  BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+  int bh = num_4x4_blocks_wide_lookup[txb_size];
+  int idx, idy;
+
+  xd->above_txfm_context = cm->above_txfm_context + mi_col;
+  xd->left_txfm_context =
+    xd->left_txfm_context_buffer + (mi_row & MAX_MIB_MASK);
+
+  for (idy = 0; idy < mi_height; idy += bh)
+    for (idx = 0; idx < mi_width; idx += bh)
+      set_txfm_context(xd, max_tx_size, idy, idx);
+}
+#endif
+
 static void encode_superblock(VP10_COMP *cpi, ThreadData *td,
                               TOKENEXTRA **t, int output_enabled,
                               int mi_row, int mi_col, BLOCK_SIZE bsize,
@@ -2957,14 +4986,6 @@
   const int mi_width = num_8x8_blocks_wide_lookup[bsize];
   const int mi_height = num_8x8_blocks_high_lookup[bsize];
 
-  x->skip_recode = !x->select_tx_size && mbmi->sb_type >= BLOCK_8X8 &&
-                   cpi->oxcf.aq_mode != COMPLEXITY_AQ &&
-                   cpi->oxcf.aq_mode != CYCLIC_REFRESH_AQ &&
-                   cpi->sf.allow_skip_recode;
-
-  if (!x->skip_recode)
-    memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
-
   x->skip_optimize = ctx->is_coded;
   ctx->is_coded = 1;
   x->use_lp32x32fdct = cpi->sf.use_lp32x32fdct;
@@ -2973,14 +4994,47 @@
     int plane;
     mbmi->skip = 1;
     for (plane = 0; plane < MAX_MB_PLANE; ++plane)
-      vp10_encode_intra_block_plane(x, VPXMAX(bsize, BLOCK_8X8), plane);
+      vp10_encode_intra_block_plane(x, VPXMAX(bsize, BLOCK_8X8), plane, 1);
     if (output_enabled)
       sum_intra_stats(td->counts, mi, xd->above_mi, xd->left_mi,
                       frame_is_intra_only(cm));
+
+#if CONFIG_EXT_INTRA
+    if (output_enabled && bsize >= BLOCK_8X8) {
+      FRAME_COUNTS *counts = td->counts;
+      if (mbmi->mode == DC_PRED &&
+          mbmi->palette_mode_info.palette_size[0] == 0)
+        ++counts->ext_intra[0][mbmi->ext_intra_mode_info.use_ext_intra_mode[0]];
+      if (mbmi->uv_mode == DC_PRED &&
+          mbmi->palette_mode_info.palette_size[1] == 0)
+        ++counts->ext_intra[1][mbmi->ext_intra_mode_info.use_ext_intra_mode[1]];
+      if (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED) {
+        int p_angle;
+        const int intra_filter_ctx = vp10_get_pred_context_intra_interp(xd);
+        p_angle = mode_to_angle_map[mbmi->mode] +
+            mbmi->angle_delta[0] * ANGLE_STEP;
+        if (vp10_is_intra_filter_switchable(p_angle))
+          ++counts->intra_filter[intra_filter_ctx][mbmi->intra_filter];
+      }
+    }
+#endif  // CONFIG_EXT_INTRA
+
+    if (bsize >= BLOCK_8X8 && output_enabled) {
+      for (plane = 0; plane <= 1; ++plane) {
+        if (mbmi->palette_mode_info.palette_size[plane] > 0) {
+          mbmi->palette_mode_info.palette_first_color_idx[plane] =
+              xd->plane[plane].color_index_map[0];
+          // TODO(huisu): this increases the use of the token buffer. Needs a
+          // stress test to verify.
+          vp10_tokenize_palette_sb(td, bsize, plane, t);
+        }
+      }
+    }
     vp10_tokenize_sb(cpi, td, t, !output_enabled, VPXMAX(bsize, BLOCK_8X8));
   } else {
     int ref;
     const int is_compound = has_second_ref(mbmi);
+
     set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
     for (ref = 0; ref < 1 + is_compound; ++ref) {
       YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi,
@@ -2996,34 +5050,105 @@
     vp10_build_inter_predictors_sbuv(xd, mi_row, mi_col,
                                      VPXMAX(bsize, BLOCK_8X8));
 
+#if CONFIG_OBMC
+    if (mbmi->motion_variation == OBMC_CAUSAL) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+      DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+#else
+      DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]);
+      DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
+      int dst_stride1[MAX_MB_PLANE] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE};
+      int dst_stride2[MAX_MB_PLANE] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE};
+
+      assert(mbmi->sb_type >= BLOCK_8X8);
+
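+      // tmp_buf1/tmp_buf2 hold per-plane predictions built from the above and
+      // left neighbors' motion vectors; vp10_build_obmc_inter_prediction()
+      // below blends them into the current block's prediction.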
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        int len = sizeof(uint16_t);
+        dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+        dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
+        dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * 2 * len);
+        dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+        dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
+        dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * 2 * len);
+      } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      dst_buf1[0] = tmp_buf1;
+      dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
+      dst_buf1[2] = tmp_buf1 + MAX_SB_SQUARE * 2;
+      dst_buf2[0] = tmp_buf2;
+      dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
+      dst_buf2[2] = tmp_buf2 + MAX_SB_SQUARE * 2;
+#if CONFIG_VP9_HIGHBITDEPTH
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      vp10_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
+                                           dst_stride1);
+      vp10_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
+                                          dst_stride2);
+      vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm),
+                            mi_row, mi_col);
+      vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col,
+                                       dst_buf1, dst_stride1,
+                                       dst_buf2, dst_stride2);
+    }
+#endif  // CONFIG_OBMC
+
     vp10_encode_sb(x, VPXMAX(bsize, BLOCK_8X8));
+#if CONFIG_VAR_TX
+    vp10_tokenize_sb_inter(cpi, td, t, !output_enabled,
+                           mi_row, mi_col, VPXMAX(bsize, BLOCK_8X8));
+#else
     vp10_tokenize_sb(cpi, td, t, !output_enabled, VPXMAX(bsize, BLOCK_8X8));
+#endif
   }
 
   if (output_enabled) {
     if (cm->tx_mode == TX_MODE_SELECT &&
         mbmi->sb_type >= BLOCK_8X8  &&
         !(is_inter_block(mbmi) && (mbmi->skip || seg_skip))) {
-      ++get_tx_counts(max_txsize_lookup[bsize], get_tx_size_context(xd),
-                      &td->counts->tx)[mbmi->tx_size];
+#if CONFIG_VAR_TX
+      if (is_inter_block(mbmi))
+        tx_partition_count_update(cm, xd, bsize, mi_row, mi_col, td->counts);
+#endif
+      ++td->counts->tx_size[max_txsize_lookup[bsize] - TX_8X8]
+                           [get_tx_size_context(xd)][mbmi->tx_size];
     } else {
       int x, y;
       TX_SIZE tx_size;
       // The new intra coding scheme requires no change of transform size
-      if (is_inter_block(&mi->mbmi)) {
+      if (is_inter_block(&mi->mbmi))
         tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
                          max_txsize_lookup[bsize]);
-      } else {
+      else
         tx_size = (bsize >= BLOCK_8X8) ? mbmi->tx_size : TX_4X4;
-      }
 
       for (y = 0; y < mi_height; y++)
         for (x = 0; x < mi_width; x++)
           if (mi_col + x < cm->mi_cols && mi_row + y < cm->mi_rows)
             mi_8x8[mis * y + x]->mbmi.tx_size = tx_size;
     }
-    ++td->counts->tx.tx_totals[mbmi->tx_size];
-    ++td->counts->tx.tx_totals[get_uv_tx_size(mbmi, &xd->plane[1])];
+    ++td->counts->tx_size_totals[mbmi->tx_size];
+    ++td->counts->tx_size_totals[get_uv_tx_size(mbmi, &xd->plane[1])];
+#if CONFIG_EXT_TX
+    if (get_ext_tx_types(mbmi->tx_size, bsize, is_inter_block(mbmi)) > 1 &&
+        cm->base_qindex > 0 && !mbmi->skip &&
+        !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
+      int eset = get_ext_tx_set(mbmi->tx_size, bsize,
+                                is_inter_block(mbmi));
+      if (eset > 0) {
+        if (is_inter_block(mbmi)) {
+          ++td->counts->inter_ext_tx[eset][mbmi->tx_size][mbmi->tx_type];
+        } else {
+          ++td->counts->intra_ext_tx[eset][mbmi->tx_size][mbmi->mode]
+              [mbmi->tx_type];
+        }
+      }
+    }
+#else
     if (mbmi->tx_size < TX_32X32 &&
         cm->base_qindex > 0 && !mbmi->skip &&
         !segfeature_active(&cm->seg, mbmi->segment_id, SEG_LVL_SKIP)) {
@@ -3035,5 +5160,1088 @@
                                   [mbmi->tx_type];
       }
     }
+#endif  // CONFIG_EXT_TX
+  }
+
+#if CONFIG_VAR_TX
+  if (cm->tx_mode == TX_MODE_SELECT && mbmi->sb_type >= BLOCK_8X8 &&
+      is_inter_block(mbmi) && !(mbmi->skip || seg_skip)) {
+    if (!output_enabled)
+      tx_partition_set_contexts(cm, xd, bsize, mi_row, mi_col);
+  } else {
+    TX_SIZE tx_size;
+    // The new intra coding scheme requires no change of transform size
+    if (is_inter_block(mbmi))
+      tx_size = VPXMIN(tx_mode_to_biggest_tx_size[cm->tx_mode],
+                       max_txsize_lookup[bsize]);
+    else
+      tx_size = (bsize >= BLOCK_8X8) ? mbmi->tx_size : TX_4X4;
+    mbmi->tx_size = tx_size;
+    set_txfm_ctx(xd->left_txfm_context, tx_size, xd->n8_h);
+    set_txfm_ctx(xd->above_txfm_context, tx_size, xd->n8_w);
+  }
+#endif
+}
+
+#if CONFIG_SUPERTX
+static int check_intra_b(PICK_MODE_CONTEXT *ctx) {
+  if (!is_inter_mode((&ctx->mic)->mbmi.mode))
+    return 1;
+#if CONFIG_EXT_INTER
+  if (ctx->mic.mbmi.ref_frame[1] == INTRA_FRAME)
+    return 1;
+#endif  // CONFIG_EXT_INTER
+  return 0;
+}
+
+static int check_intra_sb(VP10_COMP *cpi, const TileInfo *const tile,
+                          int mi_row, int mi_col, BLOCK_SIZE bsize,
+                          PC_TREE *pc_tree) {
+  const VP10_COMMON *const cm = &cpi->common;
+
+  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+  int i;
+#endif
+
+  assert(bsize >= BLOCK_8X8);
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return 1;
+
+  switch (partition) {
+    case PARTITION_NONE:
+      return check_intra_b(&pc_tree->none);
+      break;
+    case PARTITION_VERT:
+      if (check_intra_b(&pc_tree->vertical[0]))
+        return 1;
+      if (mi_col + hbs < cm->mi_cols && bsize > BLOCK_8X8) {
+        if (check_intra_b(&pc_tree->vertical[1]))
+          return 1;
+      }
+      break;
+    case PARTITION_HORZ:
+      if (check_intra_b(&pc_tree->horizontal[0]))
+        return 1;
+      if (mi_row + hbs < cm->mi_rows && bsize > BLOCK_8X8) {
+        if (check_intra_b(&pc_tree->horizontal[1]))
+          return 1;
+      }
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8) {
+        if (check_intra_b(pc_tree->leaf_split[0]))
+          return 1;
+      } else {
+        if (check_intra_sb(cpi, tile, mi_row, mi_col, subsize,
+                           pc_tree->split[0]))
+          return 1;
+        if (check_intra_sb(cpi, tile, mi_row, mi_col + hbs, subsize,
+                           pc_tree->split[1]))
+          return 1;
+        if (check_intra_sb(cpi, tile, mi_row + hbs, mi_col, subsize,
+                           pc_tree->split[2]))
+          return 1;
+        if (check_intra_sb(cpi, tile, mi_row + hbs, mi_col + hbs, subsize,
+                           pc_tree->split[3]))
+          return 1;
+      }
+      break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_HORZ_A:
+      for (i = 0; i < 3; i++) {
+        if (check_intra_b(&pc_tree->horizontala[i]))
+          return 1;
+      }
+      break;
+    case PARTITION_HORZ_B:
+      for (i = 0; i < 3; i++) {
+        if (check_intra_b(&pc_tree->horizontalb[i]))
+          return 1;
+      }
+      break;
+    case PARTITION_VERT_A:
+      for (i = 0; i < 3; i++) {
+        if (check_intra_b(&pc_tree->verticala[i]))
+          return 1;
+      }
+      break;
+    case PARTITION_VERT_B:
+      for (i = 0; i < 3; i++) {
+        if (check_intra_b(&pc_tree->verticalb[i]))
+          return 1;
+      }
+      break;
+#endif  // CONFIG_EXT_PARTITION_TYPES
+    default:
+      assert(0);
+  }
+  return 0;
+}
+
+static int check_supertx_b(TX_SIZE supertx_size, PICK_MODE_CONTEXT *ctx) {
+  return ctx->mic.mbmi.tx_size == supertx_size;
+}
+
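+// Note: check_supertx_sb() below inspects only the first leaf of each
+// partition arm; under supertx the whole superblock presumably codes with a
+// single transform size, so one representative leaf suffices.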
+static int check_supertx_sb(BLOCK_SIZE bsize, TX_SIZE supertx_size,
+                            PC_TREE *pc_tree) {
+  PARTITION_TYPE partition;
+  BLOCK_SIZE subsize;
+
+  partition = pc_tree->partitioning;
+  subsize = get_subsize(bsize, partition);
+  switch (partition) {
+    case PARTITION_NONE:
+      return check_supertx_b(supertx_size, &pc_tree->none);
+    case PARTITION_VERT:
+      return check_supertx_b(supertx_size, &pc_tree->vertical[0]);
+    case PARTITION_HORZ:
+      return check_supertx_b(supertx_size, &pc_tree->horizontal[0]);
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8)
+        return check_supertx_b(supertx_size, pc_tree->leaf_split[0]);
+      else
+        return check_supertx_sb(subsize, supertx_size, pc_tree->split[0]);
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_HORZ_A:
+      return check_supertx_b(supertx_size, &pc_tree->horizontala[0]);
+    case PARTITION_HORZ_B:
+      return check_supertx_b(supertx_size, &pc_tree->horizontalb[0]);
+    case PARTITION_VERT_A:
+      return check_supertx_b(supertx_size, &pc_tree->verticala[0]);
+    case PARTITION_VERT_B:
+      return check_supertx_b(supertx_size, &pc_tree->verticalb[0]);
+#endif  // CONFIG_EXT_PARTITION_TYPES
+    default:
+      assert(0);
+      return 0;
   }
 }
+
+static void predict_superblock(VP10_COMP *cpi, ThreadData *td,
+#if CONFIG_EXT_INTER
+                               int mi_row_ori, int mi_col_ori,
+#endif  // CONFIG_EXT_INTER
+                               int mi_row_pred, int mi_col_pred,
+                               BLOCK_SIZE bsize_pred, int b_sub8x8, int block) {
+  // Used in supertx
+  // (mi_row_ori, mi_col_ori): location for mv
+  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *mi_8x8 = xd->mi[0];
+  MODE_INFO *mi = mi_8x8;
+  MB_MODE_INFO *mbmi = &mi->mbmi;
+  int ref;
+  const int is_compound = has_second_ref(mbmi);
+
+  set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+  for (ref = 0; ref < 1 + is_compound; ++ref) {
+    YV12_BUFFER_CONFIG *cfg = get_ref_frame_buffer(cpi,
+                                                   mbmi->ref_frame[ref]);
+    vp10_setup_pre_planes(xd, ref, cfg, mi_row_pred, mi_col_pred,
+                         &xd->block_refs[ref]->sf);
+  }
+
+  if (!b_sub8x8)
+    vp10_build_inter_predictors_sb_extend(
+        xd,
+#if CONFIG_EXT_INTER
+        mi_row_ori, mi_col_ori,
+#endif  // CONFIG_EXT_INTER
+        mi_row_pred, mi_col_pred, bsize_pred);
+  else
+    vp10_build_inter_predictors_sb_sub8x8_extend(
+        xd,
+#if CONFIG_EXT_INTER
+        mi_row_ori, mi_col_ori,
+#endif  // CONFIG_EXT_INTER
+        mi_row_pred, mi_col_pred, bsize_pred, block);
+}
+
+static void predict_b_extend(VP10_COMP *cpi, ThreadData *td,
+                             const TileInfo *const tile,
+                             int block,
+                             int mi_row_ori, int mi_col_ori,
+                             int mi_row_pred, int mi_col_pred,
+                             int mi_row_top, int mi_col_top,
+                             uint8_t * dst_buf[3], int dst_stride[3],
+                             BLOCK_SIZE bsize_top, BLOCK_SIZE bsize_pred,
+                             int output_enabled, int b_sub8x8, int bextend) {
+  // Used in supertx
+  // (mi_row_ori, mi_col_ori): location for mv
+  // (mi_row_pred, mi_col_pred, bsize_pred): region to predict
+  // (mi_row_top, mi_col_top, bsize_top): region of the top partition size
+  // block: sub location of sub8x8 blocks
+  // b_sub8x8: 1: ori is sub8x8; 0: ori is not sub8x8
+  // bextend: 1: region to predict is an extension of ori; 0: not
+
+  MACROBLOCK *const x = &td->mb;
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int r = (mi_row_pred - mi_row_top) * MI_SIZE;
+  int c = (mi_col_pred - mi_col_top) * MI_SIZE;
+  const int mi_width_top = num_8x8_blocks_wide_lookup[bsize_top];
+  const int mi_height_top = num_8x8_blocks_high_lookup[bsize_top];
+
+  if (mi_row_pred < mi_row_top || mi_col_pred < mi_col_top ||
+      mi_row_pred >= mi_row_top + mi_height_top ||
+      mi_col_pred >= mi_col_top + mi_width_top ||
+      mi_row_pred >= cm->mi_rows || mi_col_pred >= cm->mi_cols)
+    return;
+
+  set_offsets_extend(cpi, td, tile, mi_row_pred, mi_col_pred,
+                     mi_row_ori, mi_col_ori, bsize_pred);
+  xd->plane[0].dst.stride = dst_stride[0];
+  xd->plane[1].dst.stride = dst_stride[1];
+  xd->plane[2].dst.stride = dst_stride[2];
+  xd->plane[0].dst.buf = dst_buf[0] +
+                         (r >> xd->plane[0].subsampling_y) * dst_stride[0] +
+                         (c >> xd->plane[0].subsampling_x);
+  xd->plane[1].dst.buf = dst_buf[1] +
+                         (r >> xd->plane[1].subsampling_y) * dst_stride[1] +
+                         (c >> xd->plane[1].subsampling_x);
+  xd->plane[2].dst.buf = dst_buf[2] +
+                         (r >> xd->plane[2].subsampling_y) * dst_stride[2] +
+                         (c >> xd->plane[2].subsampling_x);
+
+  predict_superblock(cpi, td,
+#if CONFIG_EXT_INTER
+                     mi_row_ori, mi_col_ori,
+#endif  // CONFIG_EXT_INTER
+                     mi_row_pred, mi_col_pred, bsize_pred,
+                     b_sub8x8, block);
+
+  if (output_enabled && !bextend)
+    update_stats(&cpi->common, td, 1);
+}
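
A quick check on the offset math above: r and c are the predicted region's pixel offsets from the top partition's origin (MI_SIZE is 8, so one mi unit is 8 pixels), shifted per plane for chroma subsampling. For example, with mi_row_pred - mi_row_top = 2 and 4:2:0 chroma (subsampling_y = 1), r = 16 luma rows and the chroma row offset is 16 >> 1 = 8.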
+
+static void extend_dir(VP10_COMP *cpi, ThreadData *td,
+                       const TileInfo *const tile,
+                       int block, BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+                       int mi_row, int mi_col,
+                       int mi_row_top, int mi_col_top,
+                       int output_enabled,
+                       uint8_t * dst_buf[3], int dst_stride[3], int dir) {
+  // dir: 0-lower, 1-upper, 2-left, 3-right
+  //      4-lowerleft, 5-upperleft, 6-lowerright, 7-upperright
+  MACROBLOCKD *xd = &td->mb.e_mbd;
+  const int mi_width = num_8x8_blocks_wide_lookup[bsize];
+  const int mi_height = num_8x8_blocks_high_lookup[bsize];
+  int xss = xd->plane[1].subsampling_x;
+  int yss = xd->plane[1].subsampling_y;
+  int b_sub8x8 = (bsize < BLOCK_8X8) ? 1 : 0;
+
+  BLOCK_SIZE extend_bsize;
+  int unit, mi_row_pred, mi_col_pred;
+
+  if (dir == 0 || dir == 1) {  // lower and upper
+    extend_bsize = (mi_width == 1 || bsize < BLOCK_8X8 || xss < yss) ?
+                   BLOCK_8X8 : BLOCK_16X8;
+    unit = num_8x8_blocks_wide_lookup[extend_bsize];
+    mi_row_pred = mi_row + ((dir == 0) ? mi_height : -1);
+    mi_col_pred = mi_col;
+
+    predict_b_extend(cpi, td, tile, block, mi_row, mi_col,
+                     mi_row_pred, mi_col_pred,
+                     mi_row_top, mi_col_top, dst_buf, dst_stride,
+                     top_bsize, extend_bsize,
+                     output_enabled, b_sub8x8, 1);
+
+    if (mi_width > unit) {
+      int i;
+      for (i = 0; i < mi_width/unit - 1; i++) {
+        mi_col_pred += unit;
+        predict_b_extend(cpi, td, tile, block, mi_row, mi_col,
+                         mi_row_pred, mi_col_pred, mi_row_top, mi_col_top,
+                         dst_buf, dst_stride, top_bsize, extend_bsize,
+                         output_enabled, b_sub8x8, 1);
+      }
+    }
+  } else if (dir == 2 || dir == 3) {  // left and right
+    extend_bsize = (mi_height == 1 || bsize < BLOCK_8X8 || yss < xss) ?
+                   BLOCK_8X8 : BLOCK_8X16;
+    unit = num_8x8_blocks_high_lookup[extend_bsize];
+    mi_row_pred = mi_row;
+    mi_col_pred = mi_col + ((dir == 3) ? mi_width : -1);
+
+    predict_b_extend(cpi, td, tile, block, mi_row, mi_col,
+                     mi_row_pred, mi_col_pred, mi_row_top, mi_col_top,
+                     dst_buf, dst_stride, top_bsize, extend_bsize,
+                     output_enabled, b_sub8x8, 1);
+
+    if (mi_height > unit) {
+      int i;
+      for (i = 0; i < mi_height/unit - 1; i++) {
+        mi_row_pred += unit;
+        predict_b_extend(cpi, td, tile, block, mi_row, mi_col,
+                         mi_row_pred, mi_col_pred, mi_row_top, mi_col_top,
+                         dst_buf, dst_stride, top_bsize, extend_bsize,
+                         output_enabled, b_sub8x8, 1);
+      }
+    }
+  } else {
+    extend_bsize = BLOCK_8X8;
+    mi_row_pred = mi_row + ((dir == 4 || dir == 6) ? mi_height : -1);
+    mi_col_pred = mi_col + ((dir == 6 || dir == 7) ? mi_width : -1);
+
+    predict_b_extend(cpi, td, tile, block, mi_row, mi_col,
+                     mi_row_pred, mi_col_pred, mi_row_top, mi_col_top,
+                     dst_buf, dst_stride, top_bsize, extend_bsize,
+                     output_enabled, b_sub8x8, 1);
+  }
+}
+
+static void extend_all(VP10_COMP *cpi, ThreadData *td,
+                       const TileInfo *const tile,
+                       int block,
+                       BLOCK_SIZE bsize, BLOCK_SIZE top_bsize,
+                       int mi_row, int mi_col,
+                       int mi_row_top, int mi_col_top,
+                       int output_enabled,
+                       uint8_t * dst_buf[3], int dst_stride[3]) {
+  assert(block >= 0 && block < 4);
+  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 0);
+  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 1);
+  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 2);
+  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 3);
+  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 4);
+  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 5);
+  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 6);
+  extend_dir(cpi, td, tile, block, bsize, top_bsize, mi_row, mi_col,
+             mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride, 7);
+}
+
+// This function generates prediction for multiple blocks, between which the
+// discontinuity around the boundary is reduced by smoothing masks. The basic
+// smoothing mask is a soft step function along the horz/vert direction. In the
+// more complicated case where a block is split into 4 subblocks, the basic
+// mask is first applied to the neighboring subblocks (2 pairs) in the
+// horizontal direction, and then applied to the 2 masked predictions in the
+// vertical direction. If the block is split into more than one level, the
+// masked prediction at every stage is stored in the dst_buf[] passed down from
+// the higher level.
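
As a rough, standalone illustration of the soft-step blend described above (the taper values are invented for this sketch; the real mask tables are internal to vp10_build_masked_inter_predictor_complex()):

    #include <stdint.h>

    /* Blend one 8-pixel row of two predictions p0/p1 across a boundary.
     * Weights are out of 64 and ramp from all-p0 to all-p1; the +32 rounds
     * to nearest before the >> 6 normalization. */
    static void soft_step_blend8(uint8_t *out, const uint8_t *p0,
                                 const uint8_t *p1) {
      static const int w[8] = { 64, 56, 48, 36, 28, 16, 8, 0 };  /* invented */
      int i;
      for (i = 0; i < 8; ++i)
        out[i] = (uint8_t)((w[i] * p0[i] + (64 - w[i]) * p1[i] + 32) >> 6);
    }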
+static void predict_sb_complex(VP10_COMP *cpi, ThreadData *td,
+                               const TileInfo *const tile,
+                               int mi_row, int mi_col,
+                               int mi_row_top, int mi_col_top,
+                               int output_enabled, BLOCK_SIZE bsize,
+                               BLOCK_SIZE top_bsize,
+                               uint8_t *dst_buf[3], int dst_stride[3],
+                               PC_TREE *pc_tree) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+
+  const int ctx = partition_plane_context(xd, mi_row, mi_col, bsize);
+  const int hbs = num_8x8_blocks_wide_lookup[bsize] / 2;
+  const PARTITION_TYPE partition = pc_tree->partitioning;
+  const BLOCK_SIZE subsize = get_subsize(bsize, partition);
+#if CONFIG_EXT_PARTITION_TYPES
+  const BLOCK_SIZE bsize2 = get_subsize(bsize, PARTITION_SPLIT);
+#endif
+
+  int i;
+  uint8_t *dst_buf1[3], *dst_buf2[3], *dst_buf3[3];
+  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+  DECLARE_ALIGNED(16, uint8_t, tmp_buf3[MAX_MB_PLANE * MAX_TX_SQUARE * 2]);
+  int dst_stride1[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE};
+  int dst_stride2[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE};
+  int dst_stride3[3] = {MAX_TX_SIZE, MAX_TX_SIZE, MAX_TX_SIZE};
+
+  assert(bsize >= BLOCK_8X8);
+
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    int len = sizeof(uint16_t);
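+    // Each plane occupies MAX_TX_SQUARE samples; in high bit depth a sample
+    // is a uint16_t, so byte offsets into the raw buffers are scaled by len.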
+    dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+    dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_TX_SQUARE * len);
+    dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_TX_SQUARE * len);
+    dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+    dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_TX_SQUARE * len);
+    dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_TX_SQUARE * len);
+    dst_buf3[0] = CONVERT_TO_BYTEPTR(tmp_buf3);
+    dst_buf3[1] = CONVERT_TO_BYTEPTR(tmp_buf3 + MAX_TX_SQUARE * len);
+    dst_buf3[2] = CONVERT_TO_BYTEPTR(tmp_buf3 + 2 * MAX_TX_SQUARE * len);
+  } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    dst_buf1[0] = tmp_buf1;
+    dst_buf1[1] = tmp_buf1 + MAX_TX_SQUARE;
+    dst_buf1[2] = tmp_buf1 + 2 * MAX_TX_SQUARE;
+    dst_buf2[0] = tmp_buf2;
+    dst_buf2[1] = tmp_buf2 + MAX_TX_SQUARE;
+    dst_buf2[2] = tmp_buf2 + 2 * MAX_TX_SQUARE;
+    dst_buf3[0] = tmp_buf3;
+    dst_buf3[1] = tmp_buf3 + MAX_TX_SQUARE;
+    dst_buf3[2] = tmp_buf3 + 2 * MAX_TX_SQUARE;
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  if (output_enabled && bsize < top_bsize)
+    cm->counts.partition[ctx][partition]++;
+
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].dst.buf = dst_buf[i];
+    xd->plane[i].dst.stride = dst_stride[i];
+  }
+
+  switch (partition) {
+    case PARTITION_NONE:
+      assert(bsize < top_bsize);
+      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride,
+                       top_bsize, bsize, output_enabled, 0, 0);
+      extend_all(cpi, td, tile, 0, bsize, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+      break;
+    case PARTITION_HORZ:
+      if (bsize == BLOCK_8X8) {
+        // First half
+        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride,
+                         top_bsize, BLOCK_8X8, output_enabled, 1, 0);
+        if (bsize < top_bsize)
+          extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride);
+
+        // Second half
+        predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                         top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+        if (bsize < top_bsize)
+          extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf1, dst_stride1);
+
+        // Smooth
+        xd->plane[0].dst.buf = dst_buf[0];
+        xd->plane[0].dst.stride = dst_stride[0];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[0], dst_stride[0],
+                                                  dst_buf1[0], dst_stride1[0],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_HORZ, 0);
+      } else {
+        // First half
+        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride,
+                         top_bsize, subsize, output_enabled, 0, 0);
+        if (bsize < top_bsize)
+          extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride);
+        else
+          extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride, 0);
+
+        if (mi_row + hbs < cm->mi_rows) {
+          // Second half
+          predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col,
+                           mi_row + hbs, mi_col, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, top_bsize, subsize,
+                           output_enabled, 0, 0);
+          if (bsize < top_bsize)
+            extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
+                       mi_col, mi_row_top, mi_col_top, output_enabled,
+                       dst_buf1, dst_stride1);
+          else
+            extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs,
+                       mi_col, mi_row_top, mi_col_top, output_enabled,
+                       dst_buf1, dst_stride1, 1);
+
+          // Smooth
+          for (i = 0; i < MAX_MB_PLANE; i++) {
+            xd->plane[i].dst.buf = dst_buf[i];
+            xd->plane[i].dst.stride = dst_stride[i];
+            vp10_build_masked_inter_predictor_complex(
+                xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+                mi_row, mi_col, mi_row_top, mi_col_top,
+                bsize, top_bsize, PARTITION_HORZ, i);
+          }
+        }
+      }
+      break;
+    case PARTITION_VERT:
+      if (bsize == BLOCK_8X8) {
+        // First half
+        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride,
+                         top_bsize, BLOCK_8X8, output_enabled, 1, 0);
+        if (bsize < top_bsize)
+          extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride);
+
+        // Second half
+        predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                         top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+        if (bsize < top_bsize)
+          extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf1, dst_stride1);
+
+        // Smooth
+        xd->plane[0].dst.buf = dst_buf[0];
+        xd->plane[0].dst.stride = dst_stride[0];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[0], dst_stride[0],
+                                                  dst_buf1[0], dst_stride1[0],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_VERT, 0);
+      } else {
+        // First half (the exact bsize is not important here)
+        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride,
+                         top_bsize, subsize, output_enabled, 0, 0);
+        if (bsize < top_bsize)
+          extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride);
+        else
+          extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride, 3);
+
+        if (mi_col + hbs < cm->mi_cols) {
+          predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs,
+                           mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                           dst_buf1, dst_stride1, top_bsize, subsize,
+                           output_enabled, 0, 0);
+          if (bsize < top_bsize)
+            extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row,
+                       mi_col + hbs, mi_row_top, mi_col_top, output_enabled,
+                       dst_buf1, dst_stride1);
+          else
+            extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row,
+                       mi_col + hbs, mi_row_top, mi_col_top, output_enabled,
+                       dst_buf1, dst_stride1, 2);
+
+          for (i = 0; i < MAX_MB_PLANE; i++) {
+            xd->plane[i].dst.buf = dst_buf[i];
+            xd->plane[i].dst.stride = dst_stride[i];
+            vp10_build_masked_inter_predictor_complex(
+                xd, dst_buf[i], dst_stride[i], dst_buf1[i], dst_stride1[i],
+                mi_row, mi_col, mi_row_top, mi_col_top,
+                bsize, top_bsize, PARTITION_VERT, i);
+          }
+        }
+      }
+      break;
+    case PARTITION_SPLIT:
+      if (bsize == BLOCK_8X8) {
+        predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf, dst_stride,
+                         top_bsize, BLOCK_8X8, output_enabled, 1, 0);
+        predict_b_extend(cpi, td, tile, 1, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                         top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+        predict_b_extend(cpi, td, tile, 2, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+                         top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+        predict_b_extend(cpi, td, tile, 3, mi_row, mi_col, mi_row, mi_col,
+                         mi_row_top, mi_col_top, dst_buf3, dst_stride3,
+                         top_bsize, BLOCK_8X8, output_enabled, 1, 1);
+
+        if (bsize < top_bsize) {
+          extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf, dst_stride);
+          extend_all(cpi, td, tile, 1, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf1, dst_stride1);
+          extend_all(cpi, td, tile, 2, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf2, dst_stride2);
+          extend_all(cpi, td, tile, 3, subsize, top_bsize, mi_row, mi_col,
+                     mi_row_top, mi_col_top, output_enabled,
+                     dst_buf3, dst_stride3);
+        }
+      } else {
+        predict_sb_complex(cpi, td, tile, mi_row, mi_col,
+                           mi_row_top, mi_col_top, output_enabled, subsize,
+                           top_bsize, dst_buf, dst_stride,
+                           pc_tree->split[0]);
+        if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+          predict_sb_complex(cpi, td, tile, mi_row, mi_col + hbs,
+                             mi_row_top, mi_col_top, output_enabled, subsize,
+                             top_bsize, dst_buf1, dst_stride1,
+                             pc_tree->split[1]);
+        if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols)
+          predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col,
+                             mi_row_top, mi_col_top, output_enabled, subsize,
+                             top_bsize, dst_buf2, dst_stride2,
+                             pc_tree->split[2]);
+        if (mi_row + hbs < cm->mi_rows && mi_col + hbs < cm->mi_cols)
+          predict_sb_complex(cpi, td, tile, mi_row + hbs, mi_col + hbs,
+                             mi_row_top, mi_col_top, output_enabled, subsize,
+                             top_bsize, dst_buf3, dst_stride3,
+                             pc_tree->split[3]);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        if (bsize == BLOCK_8X8 && i != 0)
+          continue;  // Skip <4x4 chroma smoothing
+        if (mi_row < cm->mi_rows && mi_col + hbs < cm->mi_cols) {
+          vp10_build_masked_inter_predictor_complex(xd,
+                                                    dst_buf[i],
+                                                    dst_stride[i],
+                                                    dst_buf1[i],
+                                                    dst_stride1[i],
+                                                    mi_row, mi_col,
+                                                    mi_row_top, mi_col_top,
+                                                    bsize, top_bsize,
+                                                    PARTITION_VERT, i);
+          if (mi_row + hbs < cm->mi_rows) {
+            vp10_build_masked_inter_predictor_complex(xd,
+                                                      dst_buf2[i],
+                                                      dst_stride2[i],
+                                                      dst_buf3[i],
+                                                      dst_stride3[i],
+                                                      mi_row, mi_col,
+                                                      mi_row_top, mi_col_top,
+                                                      bsize, top_bsize,
+                                                      PARTITION_VERT, i);
+            vp10_build_masked_inter_predictor_complex(xd,
+                                                      dst_buf[i],
+                                                      dst_stride[i],
+                                                      dst_buf2[i],
+                                                      dst_stride2[i],
+                                                      mi_row, mi_col,
+                                                      mi_row_top, mi_col_top,
+                                                      bsize, top_bsize,
+                                                      PARTITION_HORZ, i);
+          }
+        } else if (mi_row + hbs < cm->mi_rows && mi_col < cm->mi_cols) {
+          vp10_build_masked_inter_predictor_complex(xd,
+                                                    dst_buf[i],
+                                                    dst_stride[i],
+                                                    dst_buf2[i],
+                                                    dst_stride2[i],
+                                                    mi_row, mi_col,
+                                                    mi_row_top, mi_col_top,
+                                                    bsize, top_bsize,
+                                                    PARTITION_HORZ, i);
+        }
+      }
+      break;
+#if CONFIG_EXT_PARTITION_TYPES
+    case PARTITION_HORZ_A:
+      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride,
+                       top_bsize, bsize2, output_enabled, 0, 0);
+      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+
+      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs,
+                       mi_row, mi_col + hbs, mi_row_top, mi_col_top,
+                       dst_buf1, dst_stride1, top_bsize, bsize2,
+                       output_enabled, 0, 0);
+      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+
+      predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+                       mi_col, mi_row_top, mi_col_top, dst_buf2, dst_stride2,
+                       top_bsize, subsize, output_enabled, 0, 0);
+      if (bsize < top_bsize)
+        extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf2, dst_stride2);
+      else
+        extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row + hbs, mi_col,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf2, dst_stride2, 1);
+
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf[i];
+        xd->plane[i].dst.stride = dst_stride[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_VERT, i);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf2[i], dst_stride2[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_HORZ, i);
+      }
+
+      break;
+    case PARTITION_VERT_A:
+
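+      // Two square sub-blocks on the left and one full-height block on the
+      // right: blend the left pair across their horizontal seam, then blend
+      // the merged left half with the right half across the vertical seam.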
+      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride,
+                       top_bsize, bsize2, output_enabled, 0, 0);
+      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+
+      predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+                       mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                       top_bsize, bsize2, output_enabled, 0, 0);
+      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+
+      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+                       mi_col + hbs, mi_row_top, mi_col_top, dst_buf2,
+                       dst_stride2, top_bsize, subsize, output_enabled,
+                       0, 0);
+      if (bsize < top_bsize)
+        extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf2, dst_stride2);
+      else
+        extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col + hbs,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf2, dst_stride2, 2);
+
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf[i];
+        xd->plane[i].dst.stride = dst_stride[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_HORZ, i);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf2[i], dst_stride2[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_VERT, i);
+      }
+      break;
+    case PARTITION_HORZ_B:
+
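+      // One full-width block on top and two square sub-blocks below: blend
+      // the bottom pair across their vertical seam, then blend the top half
+      // with the merged bottom half across the horizontal seam.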
+      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride,
+                       top_bsize, subsize, output_enabled, 0, 0);
+      if (bsize < top_bsize)
+        extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                   mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+      else
+        extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf, dst_stride, 0);
+
+      predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col, mi_row + hbs,
+                       mi_col, mi_row_top, mi_col_top, dst_buf1, dst_stride1,
+                       top_bsize, bsize2, output_enabled, 0, 0);
+      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs, mi_col,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+
+      predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
+                       mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2, top_bsize, bsize2,
+                       output_enabled, 0, 0);
+      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
+                 mi_col + hbs, mi_row_top, mi_col_top, output_enabled, dst_buf2,
+                 dst_stride2);
+
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf1[i];
+        xd->plane[i].dst.stride = dst_stride1[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  dst_buf2[i], dst_stride2[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_VERT, i);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf[i];
+        xd->plane[i].dst.stride = dst_stride[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_HORZ, i);
+      }
+      break;
+    case PARTITION_VERT_B:
+
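+      // One full-height block on the left and two square sub-blocks on the
+      // right: blend the right pair across their horizontal seam, then blend
+      // the left half with the merged right half across the vertical seam.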
+      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col, mi_row, mi_col,
+                       mi_row_top, mi_col_top, dst_buf, dst_stride,
+                       top_bsize, subsize, output_enabled, 0, 0);
+      if (bsize < top_bsize)
+        extend_all(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                   mi_row_top, mi_col_top, output_enabled, dst_buf, dst_stride);
+      else
+        extend_dir(cpi, td, tile, 0, subsize, top_bsize, mi_row, mi_col,
+                   mi_row_top, mi_col_top, output_enabled,
+                   dst_buf, dst_stride, 3);
+
+      predict_b_extend(cpi, td, tile, 0, mi_row, mi_col + hbs, mi_row,
+                       mi_col + hbs, mi_row_top, mi_col_top, dst_buf1,
+                       dst_stride1, top_bsize, bsize2, output_enabled,
+                       0, 0);
+      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row, mi_col + hbs,
+                 mi_row_top, mi_col_top, output_enabled, dst_buf1, dst_stride1);
+
+      predict_b_extend(cpi, td, tile, 0, mi_row + hbs, mi_col + hbs,
+                       mi_row + hbs, mi_col + hbs, mi_row_top, mi_col_top,
+                       dst_buf2, dst_stride2, top_bsize, bsize2,
+                       output_enabled, 0, 0);
+      extend_all(cpi, td, tile, 0, bsize2, top_bsize, mi_row + hbs,
+                 mi_col + hbs, mi_row_top, mi_col_top, output_enabled, dst_buf2,
+                 dst_stride2);
+
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf1[i];
+        xd->plane[i].dst.stride = dst_stride1[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  dst_buf2[i], dst_stride2[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_HORZ, i);
+      }
+      for (i = 0; i < MAX_MB_PLANE; i++) {
+        xd->plane[i].dst.buf = dst_buf[i];
+        xd->plane[i].dst.stride = dst_stride[i];
+        vp10_build_masked_inter_predictor_complex(xd,
+                                                  dst_buf[i], dst_stride[i],
+                                                  dst_buf1[i], dst_stride1[i],
+                                                  mi_row, mi_col,
+                                                  mi_row_top, mi_col_top,
+                                                  bsize, top_bsize,
+                                                  PARTITION_VERT, i);
+      }
+      break;
+#endif  // CONFIG_EXT_PARTITION_TYPES
+    default:
+      assert(0);
+  }
+
+#if CONFIG_EXT_PARTITION_TYPES
+  if (bsize < top_bsize)
+    update_ext_partition_context(xd, mi_row, mi_col, subsize, bsize, partition);
+#else
+  if (bsize < top_bsize && (partition != PARTITION_SPLIT || bsize == BLOCK_8X8))
+    update_partition_context(xd, mi_row, mi_col, subsize, bsize);
+#endif  // CONFIG_EXT_PARTITION_TYPES
+}
+
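+// For a supertx-coded block, build the masked prediction for the whole
+// partition once, then measure the rate/distortion of coding the combined
+// residue with each candidate transform type and return the best choice.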
+static void rd_supertx_sb(VP10_COMP *cpi, ThreadData *td,
+                          const TileInfo *const tile,
+                          int mi_row, int mi_col, BLOCK_SIZE bsize,
+                          int *tmp_rate, int64_t *tmp_dist,
+                          TX_TYPE *best_tx,
+                          PC_TREE *pc_tree) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int plane, pnskip, skippable, skippable_uv, rate_uv, this_rate,
+      base_rate = *tmp_rate;
+  int64_t sse, pnsse, sse_uv, this_dist, dist_uv;
+  uint8_t *dst_buf[3];
+  int dst_stride[3];
+  TX_SIZE tx_size;
+  MB_MODE_INFO *mbmi;
+  TX_TYPE tx_type, best_tx_nostx;
+#if CONFIG_EXT_TX
+  int ext_tx_set;
+#endif  // CONFIG_EXT_TX
+  int tmp_rate_tx = 0, skip_tx = 0;
+  int64_t tmp_dist_tx = 0, rd_tx, bestrd_tx = INT64_MAX;
+  uint8_t tmp_zcoeff_blk = 0;
+
+  set_skip_context(xd, mi_row, mi_col);
+  set_mode_info_offsets(cpi, x, xd, mi_row, mi_col);
+  update_state_sb_supertx(cpi, td, tile, mi_row, mi_col, bsize,
+                          0, pc_tree);
+  vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm),
+                        mi_row, mi_col);
+  for (plane = 0; plane < MAX_MB_PLANE; plane++) {
+    dst_buf[plane] = xd->plane[plane].dst.buf;
+    dst_stride[plane] = xd->plane[plane].dst.stride;
+  }
+  predict_sb_complex(cpi, td, tile, mi_row, mi_col, mi_row, mi_col,
+                     0, bsize, bsize, dst_buf, dst_stride, pc_tree);
+
+  set_offsets_without_segment_id(cpi, tile, x, mi_row, mi_col, bsize);
+  set_segment_id_supertx(cpi, x, mi_row, mi_col, bsize);
+
+  mbmi = &xd->mi[0]->mbmi;
+  best_tx_nostx = mbmi->tx_type;
+
+  *best_tx = DCT_DCT;
+
+  // chroma: the UV cost does not depend on the luma tx_type searched below,
+  // so compute it once up front.
+  skippable_uv = 1;
+  rate_uv = 0;
+  dist_uv = 0;
+  sse_uv = 0;
+  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+#if CONFIG_VAR_TX
+    ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
+    ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    int coeff_ctx = 1;
+
+    this_rate = 0;
+    this_dist = 0;
+    pnsse = 0;
+    pnskip = 1;
+
+    tx_size = max_txsize_lookup[bsize];
+    tx_size = get_uv_tx_size_impl(tx_size, bsize,
+                                  cm->subsampling_x, cm->subsampling_y);
+    vp10_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl);
+    coeff_ctx = combine_entropy_contexts(ctxa[0], ctxl[0]);
+
+    vp10_subtract_plane(x, bsize, plane);
+    vp10_tx_block_rd_b(cpi, x, tx_size,
+                       0, 0, plane, 0,
+                       get_plane_block_size(bsize, pd), coeff_ctx,
+                       &this_rate, &this_dist, &pnsse, &pnskip);
+#else
+    tx_size = max_txsize_lookup[bsize];
+    tx_size = get_uv_tx_size_impl(tx_size, bsize,
+                                  cm->subsampling_x, cm->subsampling_y);
+    vp10_subtract_plane(x, bsize, plane);
+    vp10_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist,
+                                  &pnskip, &pnsse,
+                                  INT64_MAX, plane, bsize, tx_size, 0);
+#endif  // CONFIG_VAR_TX
+
+    rate_uv += this_rate;
+    dist_uv += this_dist;
+    sse_uv += pnsse;
+    skippable_uv &= pnskip;
+  }
+
+  // luma: search the candidate transform types and keep the best by RD cost.
+  tx_size = max_txsize_lookup[bsize];
+  vp10_subtract_plane(x, bsize, 0);
+#if CONFIG_EXT_TX
+  ext_tx_set = get_ext_tx_set(tx_size, bsize, 1);
+#endif  // CONFIG_EXT_TX
+  for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+#if CONFIG_VAR_TX
+    ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
+    ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
+    const struct macroblockd_plane *const pd = &xd->plane[0];
+    int coeff_ctx = 1;
+#endif  // CONFIG_VAR_TX
+#if CONFIG_EXT_TX
+    if (!ext_tx_used_inter[ext_tx_set][tx_type])
+      continue;
+#else
+    if (tx_size >= TX_32X32 && tx_type != DCT_DCT)
+      continue;
+#endif  // CONFIG_EXT_TX
+    mbmi->tx_type = tx_type;
+
+#if CONFIG_VAR_TX
+    this_rate = 0;
+    this_dist = 0;
+    pnsse = 0;
+    pnskip = 1;
+
+    vp10_get_entropy_contexts(bsize, tx_size, pd, ctxa, ctxl);
+    coeff_ctx = combine_entropy_contexts(ctxa[0], ctxl[0]);
+    vp10_tx_block_rd_b(cpi, x, tx_size,
+                       0, 0, 0, 0,
+                       bsize, coeff_ctx,
+                       &this_rate, &this_dist, &pnsse, &pnskip);
+#else
+    vp10_txfm_rd_in_plane_supertx(x, cpi, &this_rate, &this_dist, &pnskip,
+                                  &pnsse, INT64_MAX, 0, bsize, tx_size, 0);
+#endif  // CONFIG_VAR_TX
+
+#if CONFIG_EXT_TX
+    if (get_ext_tx_types(tx_size, bsize, 1) > 1 &&
+        !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+        this_rate != INT_MAX) {
+      if (ext_tx_set > 0)
+        this_rate += cpi->inter_tx_type_costs[ext_tx_set]
+            [mbmi->tx_size][mbmi->tx_type];
+    }
+#else
+    if (tx_size < TX_32X32 &&
+        !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
+        this_rate != INT_MAX) {
+      this_rate += cpi->inter_tx_type_costs[tx_size][mbmi->tx_type];
+    }
+#endif  // CONFIG_EXT_TX
+    *tmp_rate = rate_uv + this_rate;
+    *tmp_dist = dist_uv + this_dist;
+    sse = sse_uv + pnsse;
+    skippable = skippable_uv && pnskip;
+    if (skippable) {
+      *tmp_rate = vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+      x->skip = 1;
+    } else {
+      if (RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist)
+          < RDCOST(x->rdmult, x->rddiv, 0, sse)) {
+        *tmp_rate += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
+        x->skip = 0;
+      } else {
+        *tmp_dist = sse;
+        *tmp_rate = vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+        x->skip = 1;
+      }
+    }
+    *tmp_rate += base_rate;
+    rd_tx = RDCOST(x->rdmult, x->rddiv, *tmp_rate, *tmp_dist);
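+    // A new transform type must beat the current best RD cost by at least
+    // 1%; DCT_DCT, evaluated first, always seeds the search.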
+    if (rd_tx < bestrd_tx * 0.99 || tx_type == DCT_DCT) {
+      *best_tx = tx_type;
+      bestrd_tx = rd_tx;
+      tmp_rate_tx = *tmp_rate;
+      tmp_dist_tx = *tmp_dist;
+      skip_tx = x->skip;
+      tmp_zcoeff_blk = x->zcoeff_blk[tx_size][0];
+    }
+  }
+  x->zcoeff_blk[tx_size][0] = tmp_zcoeff_blk;
+  *tmp_rate = tmp_rate_tx;
+  *tmp_dist = tmp_dist_tx;
+  x->skip = skip_tx;
+#if CONFIG_VAR_TX
+  for (plane = 0; plane < 1; ++plane)
+    memset(x->blk_skip[plane], x->skip,
+           sizeof(uint8_t) * pc_tree->none.num_4x4_blk);
+#endif  // CONFIG_VAR_TX
+  xd->mi[0]->mbmi.tx_type = best_tx_nostx;
+}
+#endif  // CONFIG_SUPERTX
diff --git a/vp10/encoder/encodemb.c b/vp10/encoder/encodemb.c
index fb11e46..3810be5 100644
--- a/vp10/encoder/encodemb.c
+++ b/vp10/encoder/encodemb.c
@@ -23,14 +23,11 @@
 #include "vp10/common/scan.h"
 
 #include "vp10/encoder/encodemb.h"
+#include "vp10/encoder/hybrid_fwd_txfm.h"
+#include "vp10/encoder/quantize.h"
 #include "vp10/encoder/rd.h"
 #include "vp10/encoder/tokenize.h"
 
-struct optimize_ctx {
-  ENTROPY_CONTEXT ta[MAX_MB_PLANE][16];
-  ENTROPY_CONTEXT tl[MAX_MB_PLANE][16];
-};
-
 void vp10_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
   struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &x->e_mbd.plane[plane];
@@ -50,84 +47,96 @@
                      pd->dst.buf, pd->dst.stride);
 }
 
-#define RDTRUNC(RM, DM, R, D) ((128 + (R) * (RM)) & 0xFF)
-
 typedef struct vp10_token_state {
   int           rate;
-  int           error;
+  int64_t       error;
   int           next;
   int16_t       token;
   tran_low_t    qc;
+  tran_low_t    dqc;
 } vp10_token_state;
 
-// TODO(jimbankoski): experiment to find optimal RD numbers.
-static const int plane_rd_mult[PLANE_TYPES] = { 4, 2 };
+// These numbers are empirically obtained.
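+// Indexed as [is_inter][plane_type]: row 0 intra, row 1 inter; column 0
+// luma, column 1 chroma.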
+static const int plane_rd_mult[REF_TYPES][PLANE_TYPES] = {
+    {10, 6}, {8, 5},
+};
 
 #define UPDATE_RD_COST()\
 {\
   rd_cost0 = RDCOST(rdmult, rddiv, rate0, error0);\
   rd_cost1 = RDCOST(rdmult, rddiv, rate1, error1);\
-  if (rd_cost0 == rd_cost1) {\
-    rd_cost0 = RDTRUNC(rdmult, rddiv, rate0, error0);\
-    rd_cost1 = RDTRUNC(rdmult, rddiv, rate1, error1);\
-  }\
 }
 
-// This function is a place holder for now but may ultimately need
-// to scan previous tokens to work out the correct context.
-static int trellis_get_coeff_context(const int16_t *scan,
-                                     const int16_t *nb,
-                                     int idx, int token,
-                                     uint8_t *token_cache) {
-  int bak = token_cache[scan[idx]], pt;
-  token_cache[scan[idx]] = vp10_pt_energy_class[token];
-  pt = get_coef_context(nb, token_cache, idx + 1);
-  token_cache[scan[idx]] = bak;
-  return pt;
-}
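+// Per-band coefficient counts for each transform size (the last nonzero
+// entry is the remainder of the block, e.g. 1024 - 21 for 32x32); the
+// second table gives the cumulative scan index at which each band starts.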
+static const int16_t band_count_table[TX_SIZES][8] = {
+  { 1, 2, 3, 4,  3,   16 - 13, 0 },
+  { 1, 2, 3, 4, 11,   64 - 21, 0 },
+  { 1, 2, 3, 4, 11,  256 - 21, 0 },
+  { 1, 2, 3, 4, 11, 1024 - 21, 0 },
+};
 
-static int optimize_b(MACROBLOCK *mb, int plane, int block,
-                      TX_SIZE tx_size, int ctx) {
+static const int16_t band_cum_count_table[TX_SIZES][8] = {
+  { 0, 1, 3, 6, 10, 13, 16, 0 },
+  { 0, 1, 3, 6, 10, 21, 64, 0 },
+  { 0, 1, 3, 6, 10, 21, 256, 0 },
+  { 0, 1, 3, 6, 10, 21, 1024, 0 },
+};
+
+int vp10_optimize_b(MACROBLOCK *mb, int plane, int block,
+                    TX_SIZE tx_size, int ctx) {
   MACROBLOCKD *const xd = &mb->e_mbd;
   struct macroblock_plane *const p = &mb->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   const int ref = is_inter_block(&xd->mi[0]->mbmi);
-  vp10_token_state tokens[1025][2];
-  unsigned best_index[1025][2];
-  uint8_t token_cache[1024];
+  vp10_token_state tokens[MAX_TX_SQUARE + 1][2];
+  unsigned best_index[MAX_TX_SQUARE + 1][2];
+  uint8_t token_cache[MAX_TX_SQUARE];
   const tran_low_t *const coeff = BLOCK_OFFSET(mb->plane[plane].coeff, block);
   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   const int eob = p->eobs[block];
   const PLANE_TYPE type = pd->plane_type;
   const int default_eob = 16 << (tx_size << 1);
-  const int mul = 1 + (tx_size == TX_32X32);
-  const int16_t *dequant_ptr = pd->dequant;
-  const uint8_t *const band_translate = get_band_translate(tx_size);
-  TX_TYPE tx_type = get_tx_type(type, xd, block);
-  const scan_order *const so = get_scan(tx_size, tx_type);
-  const int16_t *const scan = so->scan;
-  const int16_t *const nb = so->neighbors;
+  const int16_t* const dequant_ptr = pd->dequant;
+  const uint8_t* const band_translate = get_band_translate(tx_size);
+  TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size);
+  const scan_order* const so =
+      get_scan(tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+  const int16_t* const scan = so->scan;
+  const int16_t* const nb = so->neighbors;
+#if CONFIG_NEW_QUANT
+  int dq = get_dq_profile_from_ctx(ctx);
+  const dequant_val_type_nuq *dequant_val = pd->dequant_val_nuq[dq];
+#endif  // CONFIG_NEW_QUANT
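+  // Larger transforms keep coefficients at a scaled-down precision, so the
+  // effective dequantization step (DC, AC) is the dequant value >> shift.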
+  const int shift = get_tx_scale(xd, tx_type, tx_size);
+  const int dq_step[2] = { dequant_ptr[0] >> shift, dequant_ptr[1] >> shift };
   int next = eob, sz = 0;
-  int64_t rdmult = mb->rdmult * plane_rd_mult[type], rddiv = mb->rddiv;
+  const int64_t rdmult = (mb->rdmult * plane_rd_mult[ref][type]) >> 1;
+  const int64_t rddiv = mb->rddiv;
   int64_t rd_cost0, rd_cost1;
-  int rate0, rate1, error0, error1;
+  int rate0, rate1;
+  int64_t error0, error1;
   int16_t t0, t1;
-  EXTRABIT e0;
-  int best, band, pt, i, final_eob;
+  int best, band = (eob < default_eob) ?
+      band_translate[eob] : band_translate[eob - 1];
+  int pt, i, final_eob;
 #if CONFIG_VP9_HIGHBITDEPTH
-  const int16_t *cat6_high_cost = vp10_get_high_cost_table(xd->bd);
+  const int *cat6_high_cost = vp10_get_high_cost_table(xd->bd);
 #else
-  const int16_t *cat6_high_cost = vp10_get_high_cost_table(8);
+  const int *cat6_high_cost = vp10_get_high_cost_table(8);
 #endif
+  unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
+                   mb->token_costs[tx_size][type][ref];
+  const int16_t *band_counts = &band_count_table[tx_size][band];
+  int16_t band_left = eob - band_cum_count_table[tx_size][band] + 1;
+  int shortcut = 0;
+  int next_shortcut = 0;
+
+  token_costs += band;
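+  // token_costs now points at the band containing eob; it is stepped back
+  // one band at a time below, with band_left counting the scan positions
+  // remaining in the current band.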
 
   assert((!type && !plane) || (type && plane));
   assert(eob <= default_eob);
 
   /* Now set up a Viterbi trellis to evaluate alternative roundings. */
-  if (!ref)
-    rdmult = (rdmult * 9) >> 4;
-
   /* Initialize the sentinel node of the trellis. */
   tokens[eob][0].rate = 0;
   tokens[eob][0].error = 0;
@@ -136,64 +145,100 @@
   tokens[eob][0].qc = 0;
   tokens[eob][1] = tokens[eob][0];
 
-  for (i = 0; i < eob; i++)
-    token_cache[scan[i]] =
-        vp10_pt_energy_class[vp10_get_token(qcoeff[scan[i]])];
+  for (i = 0; i < eob; i++) {
+    const int rc = scan[i];
+    tokens[i][0].rate = vp10_get_token_cost(qcoeff[rc], &t0, cat6_high_cost);
+    tokens[i][0].token = t0;
+    token_cache[rc] = vp10_pt_energy_class[t0];
+  }
 
   for (i = eob; i-- > 0;) {
-    int base_bits, d2, dx;
+    int base_bits, dx;
+    int64_t d2;
     const int rc = scan[i];
     int x = qcoeff[rc];
+    next_shortcut = shortcut;
+
     /* Only add a trellis state for non-zero coefficients. */
-    if (x) {
-      int shortcut = 0;
+    if (UNLIKELY(x)) {
       error0 = tokens[next][0].error;
       error1 = tokens[next][1].error;
       /* Evaluate the first possibility for this state. */
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
-      vp10_get_token_extra(x, &t0, &e0);
-      /* Consider both possible successor states. */
-      if (next < default_eob) {
-        band = band_translate[i + 1];
-        pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
-        rate0 += mb->token_costs[tx_size][type][ref][band][0][pt]
-                                [tokens[next][0].token];
-        rate1 += mb->token_costs[tx_size][type][ref][band][0][pt]
-                                [tokens[next][1].token];
+
+      if (next_shortcut) {
+        /* Consider both possible successor states. */
+        if (next < default_eob) {
+          pt = get_coef_context(nb, token_cache, i + 1);
+          rate0 += (*token_costs)[0][pt][tokens[next][0].token];
+          rate1 += (*token_costs)[0][pt][tokens[next][1].token];
+        }
+        UPDATE_RD_COST();
+        /* And pick the best. */
+        best = rd_cost1 < rd_cost0;
+      } else {
+        if (next < default_eob) {
+          pt = get_coef_context(nb, token_cache, i + 1);
+          rate0 += (*token_costs)[0][pt][tokens[next][0].token];
+        }
+        best = 0;
       }
-      UPDATE_RD_COST();
-      /* And pick the best. */
-      best = rd_cost1 < rd_cost0;
-      base_bits = vp10_get_cost(t0, e0, cat6_high_cost);
-      dx = mul * (dqcoeff[rc] - coeff[rc]);
+
+      dx = (dqcoeff[rc] - coeff[rc]) * (1 << shift);
 #if CONFIG_VP9_HIGHBITDEPTH
       if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
         dx >>= xd->bd - 8;
       }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-      d2 = dx * dx;
-      tokens[i][0].rate = base_bits + (best ? rate1 : rate0);
+      d2 = (int64_t)dx * dx;
+      tokens[i][0].rate += (best ? rate1 : rate0);
       tokens[i][0].error = d2 + (best ? error1 : error0);
       tokens[i][0].next = next;
-      tokens[i][0].token = t0;
       tokens[i][0].qc = x;
+      tokens[i][0].dqc = dqcoeff[rc];
       best_index[i][0] = best;
 
       /* Evaluate the second possibility for this state. */
       rate0 = tokens[next][0].rate;
       rate1 = tokens[next][1].rate;
 
-      if ((abs(x) * dequant_ptr[rc != 0] > abs(coeff[rc]) * mul) &&
-          (abs(x) * dequant_ptr[rc != 0] < abs(coeff[rc]) * mul +
-                                               dequant_ptr[rc != 0]))
-        shortcut = 1;
-      else
+      // The threshold of 3 is empirically obtained.
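+      // shortcut: the current index overshoots the original coefficient by
+      // less than one quantization step, so x - 1 is the other plausible
+      // rounding and deserves its own trellis state.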
+      if (UNLIKELY(abs(x) > 3)) {
         shortcut = 0;
+      } else {
+#if CONFIG_NEW_QUANT
+        shortcut = (
+            (vp10_dequant_abscoeff_nuq(
+                abs(x), dequant_ptr[rc != 0],
+                dequant_val[band_translate[i]]) > (abs(coeff[rc]) << shift)) &&
+            (vp10_dequant_abscoeff_nuq(
+                abs(x) - 1, dequant_ptr[rc != 0],
+                dequant_val[band_translate[i]]) < (abs(coeff[rc]) << shift)));
+#else   // CONFIG_NEW_QUANT
+        if ((abs(x) * dequant_ptr[rc != 0] > (abs(coeff[rc]) << shift)) &&
+            (abs(x) * dequant_ptr[rc != 0] < (abs(coeff[rc]) << shift) +
+              dequant_ptr[rc != 0]))
+          shortcut = 1;
+        else
+          shortcut = 0;
+#endif   // CONFIG_NEW_QUANT
+      }
 
       if (shortcut) {
         sz = -(x < 0);
         x -= 2 * sz + 1;
+      } else {
+        tokens[i][1] = tokens[i][0];
+        best_index[i][1] = best_index[i][0];
+        next = i;
+
+        if (UNLIKELY(!(--band_left))) {
+          --band_counts;
+          band_left = *band_counts;
+          --token_costs;
+        }
+        continue;
       }
 
       /* Consider both possible successor states. */
@@ -203,47 +248,84 @@
          */
         t0 = tokens[next][0].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
         t1 = tokens[next][1].token == EOB_TOKEN ? EOB_TOKEN : ZERO_TOKEN;
-        e0 = 0;
+        base_bits = 0;
       } else {
-        vp10_get_token_extra(x, &t0, &e0);
+        base_bits = vp10_get_token_cost(x, &t0, cat6_high_cost);
         t1 = t0;
       }
-      if (next < default_eob) {
-        band = band_translate[i + 1];
-        if (t0 != EOB_TOKEN) {
-          pt = trellis_get_coeff_context(scan, nb, i, t0, token_cache);
-          rate0 += mb->token_costs[tx_size][type][ref][band][!x][pt]
-                                  [tokens[next][0].token];
+
+      if (next_shortcut) {
+        if (LIKELY(next < default_eob)) {
+          if (t0 != EOB_TOKEN) {
+            token_cache[rc] = vp10_pt_energy_class[t0];
+            pt = get_coef_context(nb, token_cache, i + 1);
+            rate0 += (*token_costs)[!x][pt][tokens[next][0].token];
+          }
+          if (t1 != EOB_TOKEN) {
+            token_cache[rc] = vp10_pt_energy_class[t1];
+            pt = get_coef_context(nb, token_cache, i + 1);
+            rate1 += (*token_costs)[!x][pt][tokens[next][1].token];
+          }
         }
-        if (t1 != EOB_TOKEN) {
-          pt = trellis_get_coeff_context(scan, nb, i, t1, token_cache);
-          rate1 += mb->token_costs[tx_size][type][ref][band][!x][pt]
-                                  [tokens[next][1].token];
+
+        UPDATE_RD_COST();
+        /* And pick the best. */
+        best = rd_cost1 < rd_cost0;
+      } else {
+        // The two states in next stage are identical.
+        if (next < default_eob && t0 != EOB_TOKEN) {
+          token_cache[rc] = vp10_pt_energy_class[t0];
+          pt = get_coef_context(nb, token_cache, i + 1);
+          rate0 += (*token_costs)[!x][pt][tokens[next][0].token];
         }
+        best = 0;
       }
 
-      UPDATE_RD_COST();
-      /* And pick the best. */
-      best = rd_cost1 < rd_cost0;
-      base_bits = vp10_get_cost(t0, e0, cat6_high_cost);
-
-      if (shortcut) {
+#if CONFIG_NEW_QUANT
+      dx = vp10_dequant_coeff_nuq(
+          x, dequant_ptr[rc != 0],
+          dequant_val[band_translate[i]]) - (coeff[rc] << shift);
 #if CONFIG_VP9_HIGHBITDEPTH
-        if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-          dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
-        } else {
-          dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
-        }
-#else
-        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-        d2 = dx * dx;
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        dx >>= xd->bd - 8;
       }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#else   // CONFIG_NEW_QUANT
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        dx -= ((dequant_ptr[rc != 0] >> (xd->bd - 8)) + sz) ^ sz;
+      } else {
+        dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+      }
+#else
+      dx -= (dequant_ptr[rc != 0] + sz) ^ sz;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_NEW_QUANT
+      d2 = (int64_t)dx * dx;
+
       tokens[i][1].rate = base_bits + (best ? rate1 : rate0);
       tokens[i][1].error = d2 + (best ? error1 : error0);
       tokens[i][1].next = next;
       tokens[i][1].token = best ? t1 : t0;
       tokens[i][1].qc = x;
+
+      if (x) {
+        tran_low_t offset = dq_step[rc != 0];
+        // The 32x32 transform coefficient uses half quantization step size.
+        // Account for the rounding difference in the dequantized coefficient
+        // value when the quantization index is dropped from an even number
+        // to an odd number.
+        if (shift & x)
+          offset += (dequant_ptr[rc != 0] & 0x01);
+
+        if (sz == 0)
+          tokens[i][1].dqc = dqcoeff[rc] - offset;
+        else
+          tokens[i][1].dqc = dqcoeff[rc] + offset;
+      } else {
+        tokens[i][1].dqc = 0;
+      }
+
       best_index[i][1] = best;
       /* Finally, make this the new head of the trellis. */
       next = i;
@@ -251,49 +333,58 @@
       /* There's no choice to make for a zero coefficient, so we don't
        *  add a new trellis node, but we do need to update the costs.
        */
-      band = band_translate[i + 1];
       t0 = tokens[next][0].token;
       t1 = tokens[next][1].token;
+      pt = get_coef_context(nb, token_cache, i + 1);
       /* Update the cost of each path if we're past the EOB token. */
       if (t0 != EOB_TOKEN) {
-        tokens[next][0].rate +=
-            mb->token_costs[tx_size][type][ref][band][1][0][t0];
+        tokens[next][0].rate += (*token_costs)[1][pt][t0];
         tokens[next][0].token = ZERO_TOKEN;
       }
       if (t1 != EOB_TOKEN) {
-        tokens[next][1].rate +=
-            mb->token_costs[tx_size][type][ref][band][1][0][t1];
+        tokens[next][1].rate += (*token_costs)[1][pt][t1];
         tokens[next][1].token = ZERO_TOKEN;
       }
       best_index[i][0] = best_index[i][1] = 0;
+      shortcut = (tokens[next][0].rate != tokens[next][1].rate);
       /* Don't update next, because we didn't add a new node. */
     }
+
+    if (UNLIKELY(!(--band_left))) {
+      --band_counts;
+      band_left = *band_counts;
+      --token_costs;
+    }
   }
 
   /* Now pick the best path through the whole trellis. */
-  band = band_translate[i + 1];
   rate0 = tokens[next][0].rate;
   rate1 = tokens[next][1].rate;
   error0 = tokens[next][0].error;
   error1 = tokens[next][1].error;
   t0 = tokens[next][0].token;
   t1 = tokens[next][1].token;
-  rate0 += mb->token_costs[tx_size][type][ref][band][0][ctx][t0];
-  rate1 += mb->token_costs[tx_size][type][ref][band][0][ctx][t1];
+  rate0 += (*token_costs)[0][ctx][t0];
+  rate1 += (*token_costs)[0][ctx][t1];
   UPDATE_RD_COST();
   best = rd_cost1 < rd_cost0;
+
   final_eob = -1;
-  memset(qcoeff, 0, sizeof(*qcoeff) * (16 << (tx_size * 2)));
-  memset(dqcoeff, 0, sizeof(*dqcoeff) * (16 << (tx_size * 2)));
+
   for (i = next; i < eob; i = next) {
     const int x = tokens[i][best].qc;
     const int rc = scan[i];
-    if (x) {
-      final_eob = i;
-    }
 
+    if (x) final_eob = i;
     qcoeff[rc] = x;
-    dqcoeff[rc] = (x * dequant_ptr[rc != 0]) / mul;
+    dqcoeff[rc] = tokens[i][best].dqc;
+
+#if CONFIG_NEW_QUANT
+    dqcoeff[rc] = vp10_dequant_abscoeff_nuq(abs(x), dequant_ptr[rc != 0],
+                                            dequant_val[band_translate[i]]);
+    if (shift) dqcoeff[rc] = ROUND_POWER_OF_TWO(dqcoeff[rc], shift);
+    if (x < 0) dqcoeff[rc] = -dqcoeff[rc];
+#endif  // CONFIG_NEW_QUANT
 
     next = tokens[i][best].next;
     best = best_index[i][best];
@@ -301,410 +392,175 @@
   final_eob++;
 
   mb->plane[plane].eobs[block] = final_eob;
+  assert(final_eob <= default_eob);
   return final_eob;
 }
 
-static INLINE void fdct32x32(int rd_transform,
-                             const int16_t *src, tran_low_t *dst,
-                             int src_stride) {
-  if (rd_transform)
-    vpx_fdct32x32_rd(src, dst, src_stride);
-  else
-    vpx_fdct32x32(src, dst, src_stride);
-}
-
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE void highbd_fdct32x32(int rd_transform, const int16_t *src,
-                                    tran_low_t *dst, int src_stride) {
-  if (rd_transform)
-    vpx_highbd_fdct32x32_rd(src, dst, src_stride);
-  else
-    vpx_highbd_fdct32x32(src, dst, src_stride);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+typedef enum QUANT_FUNC {
+  QUANT_FUNC_LOWBD = 0,
+  QUANT_FUNC_HIGHBD = 1,
+  QUANT_FUNC_LAST = 2
+} QUANT_FUNC;
 
-void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
-                       int diff_stride, TX_TYPE tx_type, int lossless) {
-  if (lossless) {
-    vp10_fwht4x4(src_diff, coeff, diff_stride);
-  } else {
-    switch (tx_type) {
-      case DCT_DCT:
-        vpx_fdct4x4(src_diff, coeff, diff_stride);
-        break;
-      case ADST_DCT:
-      case DCT_ADST:
-      case ADST_ADST:
-        vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
-        break;
-      default:
-        assert(0);
-        break;
-    }
-  }
-}
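+// Quantizer dispatch per xform-quant mode (fp, b, dc, skip) and bit depth;
+// the skip row is NULL because VP10_XFORM_QUANT_SKIP_QUANT is handled
+// before dispatch.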
+static VP10_QUANT_FACADE
+    quant_func_list[VP10_XFORM_QUANT_LAST][QUANT_FUNC_LAST] = {
+        {vp10_quantize_fp_facade, vp10_highbd_quantize_fp_facade},
+        {vp10_quantize_b_facade, vp10_highbd_quantize_b_facade},
+        {vp10_quantize_dc_facade, vp10_highbd_quantize_dc_facade},
+        {NULL, NULL}};
 
-static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
-                         int diff_stride, TX_TYPE tx_type) {
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
+#else
+typedef enum QUANT_FUNC {
+  QUANT_FUNC_LOWBD = 0,
+  QUANT_FUNC_LAST = 1
+} QUANT_FUNC;
 
-static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
-                           int diff_stride, TX_TYPE tx_type) {
-  switch (tx_type) {
-    case DCT_DCT:
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
+static VP10_QUANT_FACADE
+    quant_func_list[VP10_XFORM_QUANT_LAST][QUANT_FUNC_LAST] = {
+        {vp10_quantize_fp_facade},
+        {vp10_quantize_b_facade},
+        {vp10_quantize_dc_facade},
+        {NULL}};
+#endif
 
-static void fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
-                           tran_low_t *coeff, int diff_stride,
-                           TX_TYPE tx_type) {
-  switch (tx_type) {
-    case DCT_DCT:
-      fdct32x32(rd_transform, src_diff, coeff, diff_stride);
-      break;
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      assert(0);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
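+// Forward transform option per xform-quant mode: the dc-only quantizer
+// pairs with the reduced FWD_TXFM_OPT_DC transform, the rest use the
+// normal transform.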
+static FWD_TXFM_OPT fwd_txfm_opt_list[VP10_XFORM_QUANT_LAST] = {
+    FWD_TXFM_OPT_NORMAL, FWD_TXFM_OPT_NORMAL, FWD_TXFM_OPT_DC,
+    FWD_TXFM_OPT_NORMAL};
 
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
-                              int diff_stride, TX_TYPE tx_type, int lossless) {
-  if (lossless) {
-    assert(tx_type == DCT_DCT);
-    vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
-  } else {
-    switch (tx_type) {
-      case DCT_DCT:
-        vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
-        break;
-      case ADST_DCT:
-      case DCT_ADST:
-      case ADST_ADST:
-        vp10_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
-        break;
-      default:
-        assert(0);
-        break;
-    }
-  }
-}
-
-static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
-                         int diff_stride, TX_TYPE tx_type) {
-  switch (tx_type) {
-    case DCT_DCT:
-      vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
-      break;
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      vp10_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
-
-static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
-                           int diff_stride, TX_TYPE tx_type) {
-  switch (tx_type) {
-    case DCT_DCT:
-      vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
-      break;
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      vp10_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
-
-static void highbd_fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
-                                  tran_low_t *coeff, int diff_stride,
-                                  TX_TYPE tx_type) {
-  switch (tx_type) {
-    case DCT_DCT:
-      highbd_fdct32x32(rd_transform, src_diff, coeff, diff_stride);
-      break;
-    case ADST_DCT:
-    case DCT_ADST:
-    case ADST_ADST:
-      assert(0);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block,
-                         int blk_row, int blk_col,
-                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
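+// Forward-transform one block and, unless skip-quant is requested, quantize
+// it with the quantizer selected by xform_quant_idx.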
+void vp10_xform_quant(MACROBLOCK *x, int plane, int block, int blk_row,
+                      int blk_col, BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                      VP10_XFORM_QUANT xform_quant_idx) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
-  TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
-  const scan_order *const scan_order = get_scan(tx_size, tx_type);
+  TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+  const scan_order *const scan_order =
+      get_scan(tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int16_t *src_diff;
+  const int tx1d_size = get_tx1d_size(tx_size);
+  const int tx2d_size = tx1d_size * tx1d_size;
+
+  FWD_TXFM_PARAM fwd_txfm_param;
+  QUANT_PARAM qparam;
+
+  fwd_txfm_param.tx_type = tx_type;
+  fwd_txfm_param.tx_size = tx_size;
+  fwd_txfm_param.fwd_txfm_opt = fwd_txfm_opt_list[xform_quant_idx];
+  fwd_txfm_param.rd_transform = x->use_lp32x32fdct;
+  fwd_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+
   src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
 
+  qparam.log_scale = get_tx_scale(xd, tx_type, tx_size);
 #if CONFIG_VP9_HIGHBITDEPTH
+  fwd_txfm_param.bd = xd->bd;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    switch (tx_size) {
-      case TX_32X32:
-        highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vp10_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                     p->round_fp, p->quant_fp, p->quant_shift,
-                                     qcoeff, dqcoeff, pd->dequant,
-                                     eob, scan_order->scan,
-                                     scan_order->iscan);
-        break;
-      case TX_16X16:
-        vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
-        vp10_highbd_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
-                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                               pd->dequant, eob,
-                               scan_order->scan, scan_order->iscan);
-        break;
-      case TX_8X8:
-        vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
-        vp10_highbd_quantize_fp(coeff, 64, x->skip_block, p->zbin, p->round_fp,
-                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                               pd->dequant, eob,
-                               scan_order->scan, scan_order->iscan);
-        break;
-      case TX_4X4:
-        if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-          vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
-        } else {
-          vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
-        }
-        vp10_highbd_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
-                               p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                               pd->dequant, eob,
-                               scan_order->scan, scan_order->iscan);
-        break;
-      default:
-        assert(0);
+    highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+    if (xform_quant_idx != VP10_XFORM_QUANT_SKIP_QUANT) {
+      if (x->skip_block) {
+        vp10_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
+      } else {
+        quant_func_list[xform_quant_idx][QUANT_FUNC_HIGHBD](
+            coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob,
+            scan_order, &qparam);
+      }
     }
     return;
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  switch (tx_size) {
-    case TX_32X32:
-      fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-      vp10_quantize_fp_32x32(coeff, 1024, x->skip_block, p->zbin, p->round_fp,
-                            p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                            pd->dequant, eob, scan_order->scan,
-                            scan_order->iscan);
-      break;
-    case TX_16X16:
-      vpx_fdct16x16(src_diff, coeff, diff_stride);
-      vp10_quantize_fp(coeff, 256, x->skip_block, p->zbin, p->round_fp,
-                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                      pd->dequant, eob,
-                      scan_order->scan, scan_order->iscan);
-      break;
-    case TX_8X8:
-      vp10_fdct8x8_quant(src_diff, diff_stride, coeff, 64,
-                        x->skip_block, p->zbin, p->round_fp,
-                        p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                        pd->dequant, eob,
-                        scan_order->scan, scan_order->iscan);
-      break;
-    case TX_4X4:
-      if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-        vp10_fwht4x4(src_diff, coeff, diff_stride);
-      } else {
-        vpx_fdct4x4(src_diff, coeff, diff_stride);
-      }
-      vp10_quantize_fp(coeff, 16, x->skip_block, p->zbin, p->round_fp,
-                      p->quant_fp, p->quant_shift, qcoeff, dqcoeff,
-                      pd->dequant, eob,
-                      scan_order->scan, scan_order->iscan);
-      break;
-    default:
-      assert(0);
-      break;
-  }
-}
-
-void vp10_xform_quant_dc(MACROBLOCK *x, int plane, int block,
-                         int blk_row, int blk_col,
-                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  const struct macroblock_plane *const p = &x->plane[plane];
-  const struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
-  uint16_t *const eob = &p->eobs[block];
-  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
-  const int16_t *src_diff;
-  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
-
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    switch (tx_size) {
-      case TX_32X32:
-        vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
-                                     p->quant_fp[0], qcoeff, dqcoeff,
-                                     pd->dequant[0], eob);
-        break;
-      case TX_16X16:
-        vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff,
-                               pd->dequant[0], eob);
-        break;
-      case TX_8X8:
-        vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff,
-                               pd->dequant[0], eob);
-        break;
-      case TX_4X4:
-        if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-          vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
-        } else {
-          vpx_highbd_fdct4x4(src_diff, coeff, diff_stride);
-        }
-        vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff,
-                               pd->dequant[0], eob);
-        break;
-      default:
-        assert(0);
+  fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+  if (xform_quant_idx != VP10_XFORM_QUANT_SKIP_QUANT) {
+    if (x->skip_block) {
+      vp10_quantize_skip(tx2d_size, qcoeff, dqcoeff, eob);
+    } else {
+      quant_func_list[xform_quant_idx][QUANT_FUNC_LOWBD](
+          coeff, tx2d_size, p, qcoeff, pd, dqcoeff, eob,
+          scan_order, &qparam);
     }
-    return;
-  }
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-  switch (tx_size) {
-    case TX_32X32:
-      vpx_fdct32x32_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc_32x32(coeff, x->skip_block, p->round,
-                            p->quant_fp[0], qcoeff, dqcoeff,
-                            pd->dequant[0], eob);
-      break;
-    case TX_16X16:
-      vpx_fdct16x16_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc(coeff, 256, x->skip_block, p->round,
-                     p->quant_fp[0], qcoeff, dqcoeff,
-                     pd->dequant[0], eob);
-      break;
-    case TX_8X8:
-      vpx_fdct8x8_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc(coeff, 64, x->skip_block, p->round,
-                      p->quant_fp[0], qcoeff, dqcoeff,
-                      pd->dequant[0], eob);
-      break;
-    case TX_4X4:
-      if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-        vp10_fwht4x4(src_diff, coeff, diff_stride);
-      } else {
-        vpx_fdct4x4(src_diff, coeff, diff_stride);
-      }
-      vpx_quantize_dc(coeff, 16, x->skip_block, p->round,
-                      p->quant_fp[0], qcoeff, dqcoeff,
-                      pd->dequant[0], eob);
-      break;
-    default:
-      assert(0);
-      break;
   }
 }
 
-
-
-void vp10_xform_quant(MACROBLOCK *x, int plane, int block,
-                      int blk_row, int blk_col,
-                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size) {
+#if CONFIG_NEW_QUANT
+void vp10_xform_quant_nuq(MACROBLOCK *x, int plane, int block, int blk_row,
+                          int blk_col, BLOCK_SIZE plane_bsize,
+                          TX_SIZE tx_size, int ctx) {
   MACROBLOCKD *const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
   PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
-  TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
-  const scan_order *const scan_order = get_scan(tx_size, tx_type);
+  TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+  const scan_order *const scan_order =
+      get_scan(tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
   tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
   tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  int dq = get_dq_profile_from_ctx(ctx);
   uint16_t *const eob = &p->eobs[block];
   const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int16_t *src_diff;
+  const uint8_t* band = get_band_translate(tx_size);
+
+  FWD_TXFM_PARAM fwd_txfm_param;
+
+  fwd_txfm_param.tx_type = tx_type;
+  fwd_txfm_param.tx_size = tx_size;
+  fwd_txfm_param.fwd_txfm_opt = fwd_txfm_opt_list[VP10_XFORM_QUANT_FP];
+  fwd_txfm_param.rd_transform = x->use_lp32x32fdct;
+  fwd_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+
   src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
 
+// TODO(sarahparker) add all of these new quant quantize functions
+// to quant_func_list; just trying to get this experiment to work for now.
 #if CONFIG_VP9_HIGHBITDEPTH
+  fwd_txfm_param.bd = xd->bd;
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-     switch (tx_size) {
+    highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+    switch (tx_size) {
       case TX_32X32:
-        highbd_fwd_txfm_32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride,
-                         tx_type);
-        vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                    p->round, p->quant, p->quant_shift, qcoeff,
-                                    dqcoeff, pd->dequant, eob,
-                                    scan_order->scan, scan_order->iscan);
+        highbd_quantize_32x32_nuq(coeff, 1024, x->skip_block,
+                                  p->quant, p->quant_shift, pd->dequant,
+                                  (const cuml_bins_type_nuq *)
+                                      p->cuml_bins_nuq[dq],
+                                  (const dequant_val_type_nuq *)
+                                      pd->dequant_val_nuq[dq],
+                                  qcoeff, dqcoeff, eob,
+                                  scan_order->scan, band);
         break;
       case TX_16X16:
-        highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type);
-        vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
-                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, eob,
-                              scan_order->scan, scan_order->iscan);
+        highbd_quantize_nuq(coeff, 256, x->skip_block,
+                            p->quant, p->quant_shift, pd->dequant,
+                            (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+                            (const dequant_val_type_nuq *)
+                                pd->dequant_val_nuq[dq],
+                            qcoeff, dqcoeff, eob,
+                            scan_order->scan, band);
         break;
       case TX_8X8:
-        highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type);
-        vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
-                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, eob,
-                              scan_order->scan, scan_order->iscan);
+        highbd_quantize_nuq(coeff, 64, x->skip_block,
+                            p->quant, p->quant_shift, pd->dequant,
+                            (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+                            (const dequant_val_type_nuq *)
+                                pd->dequant_val_nuq[dq],
+                            qcoeff, dqcoeff, eob,
+                            scan_order->scan, band);
         break;
       case TX_4X4:
-        vp10_highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
-                                 xd->lossless[xd->mi[0]->mbmi.segment_id]);
-        vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
-                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, eob,
-                              scan_order->scan, scan_order->iscan);
+        highbd_quantize_nuq(coeff, 16, x->skip_block,
+                            p->quant, p->quant_shift, pd->dequant,
+                            (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+                            (const dequant_val_type_nuq *)
+                                pd->dequant_val_nuq[dq],
+                            qcoeff, dqcoeff, eob,
+                            scan_order->scan, band);
         break;
       default:
         assert(0);
@@ -713,35 +569,40 @@
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
+  fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
   switch (tx_size) {
     case TX_32X32:
-      fwd_txfm_32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride, tx_type);
-      vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
-                           p->quant, p->quant_shift, qcoeff, dqcoeff,
-                           pd->dequant, eob, scan_order->scan,
-                           scan_order->iscan);
+      quantize_32x32_nuq(coeff, 1024, x->skip_block,
+                         p->quant, p->quant_shift, pd->dequant,
+                         (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+                         (const dequant_val_type_nuq *)
+                         pd->dequant_val_nuq[dq],
+                         qcoeff, dqcoeff, eob,
+                         scan_order->scan, band);
       break;
     case TX_16X16:
-      fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type);
-      vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
-                     p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
+      quantize_nuq(coeff, 256, x->skip_block,
+                   p->quant, p->quant_shift, pd->dequant,
+                   (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+                   (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+                   qcoeff, dqcoeff, eob,
+                   scan_order->scan, band);
       break;
     case TX_8X8:
-      fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type);
-      vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
-                     p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
+      quantize_nuq(coeff, 64, x->skip_block,
+                   p->quant, p->quant_shift, pd->dequant,
+                   (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+                   (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+                   qcoeff, dqcoeff, eob,
+                   scan_order->scan, band);
       break;
     case TX_4X4:
-      vp10_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
-                        xd->lossless[xd->mi[0]->mbmi.segment_id]);
-      vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
-                     p->quant, p->quant_shift, qcoeff, dqcoeff,
-                     pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
+      quantize_nuq(coeff, 16, x->skip_block,
+                   p->quant, p->quant_shift, pd->dequant,
+                   (const cuml_bins_type_nuq *)p->cuml_bins_nuq[dq],
+                   (const dequant_val_type_nuq *)pd->dequant_val_nuq[dq],
+                   qcoeff, dqcoeff, eob,
+                   scan_order->scan, band);
       break;
     default:
       assert(0);
@@ -749,136 +610,496 @@
   }
 }
 
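+// Forward transform and quantization with the CONFIG_NEW_QUANT non-uniform
+// quantizer (NUQ): the entropy context ctx selects a dq profile, which picks
+// the cumulative-bin and dequantization-value tables used by the per-size
+// quantizers below. This _fp variant feeds p->quant_fp and omits quant_shift,
+// mirroring the quantize_fp/quantize_b split of the uniform quantizer.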
+void vp10_xform_quant_fp_nuq(MACROBLOCK *x, int plane, int block, int blk_row,
+                             int blk_col, BLOCK_SIZE plane_bsize,
+                             TX_SIZE tx_size, int ctx) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  int dq = get_dq_profile_from_ctx(ctx);
+  PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+  TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+  const scan_order *const scan_order =
+      get_scan(tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int16_t *src_diff;
+  const uint8_t *band = get_band_translate(tx_size);
+
+  FWD_TXFM_PARAM fwd_txfm_param;
+
+  fwd_txfm_param.tx_type = tx_type;
+  fwd_txfm_param.tx_size = tx_size;
+  fwd_txfm_param.fwd_txfm_opt = fwd_txfm_opt_list[VP10_XFORM_QUANT_FP];
+  fwd_txfm_param.rd_transform = x->use_lp32x32fdct;
+  fwd_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+
+  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+// TODO(sarahparker): add all of these new quantize functions to
+// quant_func_list; for now this is just to get the experiment working.
+#if CONFIG_VP9_HIGHBITDEPTH
+  fwd_txfm_param.bd = xd->bd;
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+    switch (tx_size) {
+      case TX_32X32:
+        highbd_quantize_32x32_fp_nuq(coeff, 1024, x->skip_block,
+                                     p->quant_fp, pd->dequant,
+                                     (const cuml_bins_type_nuq *)
+                                         p->cuml_bins_nuq[dq],
+                                     (const dequant_val_type_nuq *)
+                                         pd->dequant_val_nuq[dq],
+                                     qcoeff, dqcoeff, eob,
+                                     scan_order->scan, band);
+        break;
+      case TX_16X16:
+        highbd_quantize_fp_nuq(coeff, 256, x->skip_block,
+                               p->quant_fp, pd->dequant,
+                               (const cuml_bins_type_nuq *)
+                                   p->cuml_bins_nuq[dq],
+                               (const dequant_val_type_nuq *)
+                                   pd->dequant_val_nuq[dq],
+                               qcoeff, dqcoeff, eob,
+                               scan_order->scan, band);
+        break;
+      case TX_8X8:
+        highbd_quantize_fp_nuq(coeff, 64, x->skip_block,
+                               p->quant_fp, pd->dequant,
+                               (const cuml_bins_type_nuq *)
+                                   p->cuml_bins_nuq[dq],
+                               (const dequant_val_type_nuq *)
+                                   pd->dequant_val_nuq[dq],
+                               qcoeff, dqcoeff, eob,
+                               scan_order->scan, band);
+        break;
+      case TX_4X4:
+        highbd_quantize_fp_nuq(coeff, 16, x->skip_block,
+                               p->quant_fp, pd->dequant,
+                               (const cuml_bins_type_nuq *)
+                                   p->cuml_bins_nuq[dq],
+                               (const dequant_val_type_nuq *)
+                                   pd->dequant_val_nuq[dq],
+                               qcoeff, dqcoeff, eob,
+                               scan_order->scan, band);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+  switch (tx_size) {
+    case TX_32X32:
+      quantize_32x32_fp_nuq(coeff, 1024, x->skip_block,
+                            p->quant_fp, pd->dequant,
+                            (const cuml_bins_type_nuq *)
+                                p->cuml_bins_nuq[dq],
+                            (const dequant_val_type_nuq *)
+                                pd->dequant_val_nuq[dq],
+                            qcoeff, dqcoeff, eob,
+                            scan_order->scan, band);
+      break;
+    case TX_16X16:
+      quantize_fp_nuq(coeff, 256, x->skip_block,
+                      p->quant_fp, pd->dequant,
+                      (const cuml_bins_type_nuq *)
+                          p->cuml_bins_nuq[dq],
+                      (const dequant_val_type_nuq *)
+                          pd->dequant_val_nuq[dq],
+                      qcoeff, dqcoeff, eob,
+                      scan_order->scan, band);
+      break;
+    case TX_8X8:
+      quantize_fp_nuq(coeff, 64, x->skip_block,
+                      p->quant_fp, pd->dequant,
+                      (const cuml_bins_type_nuq *)
+                          p->cuml_bins_nuq[dq],
+                      (const dequant_val_type_nuq *)
+                          pd->dequant_val_nuq[dq],
+                      qcoeff, dqcoeff, eob,
+                      scan_order->scan, band);
+      break;
+    case TX_4X4:
+      quantize_fp_nuq(coeff, 16, x->skip_block,
+                      p->quant_fp, pd->dequant,
+                      (const cuml_bins_type_nuq *)
+                          p->cuml_bins_nuq[dq],
+                      (const dequant_val_type_nuq *)
+                          pd->dequant_val_nuq[dq],
+                      qcoeff, dqcoeff, eob,
+                      scan_order->scan, band);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
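+// DC-only NUQ transform/quantization: only the [0] (DC) entries of the
+// quantizer and bin tables are passed down, so only the DC coefficient is
+// quantized and eob is at most 1.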
+void vp10_xform_quant_dc_nuq(MACROBLOCK *x, int plane, int block, int blk_row,
+                             int blk_col, BLOCK_SIZE plane_bsize,
+                             TX_SIZE tx_size, int ctx) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+  TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int16_t *src_diff;
+  int dq = get_dq_profile_from_ctx(ctx);
+
+  FWD_TXFM_PARAM fwd_txfm_param;
+
+  fwd_txfm_param.tx_type = tx_type;
+  fwd_txfm_param.tx_size = tx_size;
+  fwd_txfm_param.fwd_txfm_opt = fwd_txfm_opt_list[VP10_XFORM_QUANT_DC];
+  fwd_txfm_param.rd_transform = x->use_lp32x32fdct;
+  fwd_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+
+  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+// TODO(sarahparker): add all of these new quantize functions to
+// quant_func_list; for now this is just to get the experiment working.
+#if CONFIG_VP9_HIGHBITDEPTH
+  fwd_txfm_param.bd = xd->bd;
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+    switch (tx_size) {
+      case TX_32X32:
+        highbd_quantize_dc_32x32_nuq(coeff, 1024, x->skip_block,
+                                     p->quant[0], p->quant_shift[0],
+                                     pd->dequant[0],
+                                     p->cuml_bins_nuq[dq][0],
+                                     pd->dequant_val_nuq[dq][0],
+                                     qcoeff, dqcoeff, eob);
+        break;
+      case TX_16X16:
+        highbd_quantize_dc_nuq(coeff, 256, x->skip_block,
+                               p->quant[0], p->quant_shift[0],
+                               pd->dequant[0],
+                               p->cuml_bins_nuq[dq][0],
+                               pd->dequant_val_nuq[dq][0],
+                               qcoeff, dqcoeff, eob);
+        break;
+      case TX_8X8:
+        highbd_quantize_dc_nuq(coeff, 64, x->skip_block,
+                               p->quant[0], p->quant_shift[0],
+                               pd->dequant[0],
+                               p->cuml_bins_nuq[dq][0],
+                               pd->dequant_val_nuq[dq][0],
+                               qcoeff, dqcoeff, eob);
+        break;
+      case TX_4X4:
+        highbd_quantize_dc_nuq(coeff, 16, x->skip_block,
+                               p->quant[0], p->quant_shift[0],
+                               pd->dequant[0],
+                               p->cuml_bins_nuq[dq][0],
+                               pd->dequant_val_nuq[dq][0],
+                               qcoeff, dqcoeff, eob);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+  switch (tx_size) {
+    case TX_32X32:
+      quantize_dc_32x32_nuq(coeff, 1024, x->skip_block,
+                            p->quant[0], p->quant_shift[0], pd->dequant[0],
+                            p->cuml_bins_nuq[dq][0],
+                            pd->dequant_val_nuq[dq][0],
+                            qcoeff, dqcoeff, eob);
+      break;
+    case TX_16X16:
+      quantize_dc_nuq(coeff, 256, x->skip_block,
+                      p->quant[0], p->quant_shift[0], pd->dequant[0],
+                      p->cuml_bins_nuq[dq][0],
+                      pd->dequant_val_nuq[dq][0],
+                      qcoeff, dqcoeff, eob);
+      break;
+    case TX_8X8:
+      quantize_dc_nuq(coeff, 64, x->skip_block,
+                      p->quant[0], p->quant_shift[0], pd->dequant[0],
+                      p->cuml_bins_nuq[dq][0],
+                      pd->dequant_val_nuq[dq][0],
+                      qcoeff, dqcoeff, eob);
+      break;
+    case TX_4X4:
+      quantize_dc_nuq(coeff, 16, x->skip_block,
+                      p->quant[0], p->quant_shift[0], pd->dequant[0],
+                      p->cuml_bins_nuq[dq][0],
+                      pd->dequant_val_nuq[dq][0],
+                      qcoeff, dqcoeff, eob);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
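+// DC-only fast-path NUQ variant: as vp10_xform_quant_dc_nuq, but using
+// p->quant_fp[0] and no quant_shift.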
+void vp10_xform_quant_dc_fp_nuq(MACROBLOCK *x, int plane, int block,
+                                int blk_row, int blk_col,
+                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                                int ctx) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+  TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+  tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  uint16_t *const eob = &p->eobs[block];
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int16_t *src_diff;
+  int dq = get_dq_profile_from_ctx(ctx);
+
+  FWD_TXFM_PARAM fwd_txfm_param;
+
+  fwd_txfm_param.tx_type = tx_type;
+  fwd_txfm_param.tx_size = tx_size;
+  fwd_txfm_param.fwd_txfm_opt = fwd_txfm_opt_list[VP10_XFORM_QUANT_DC];
+  fwd_txfm_param.rd_transform = x->use_lp32x32fdct;
+  fwd_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+
+  src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+// TODO(sarahparker): add all of these new quantize functions to
+// quant_func_list; for now this is just to get the experiment working.
+#if CONFIG_VP9_HIGHBITDEPTH
+  fwd_txfm_param.bd = xd->bd;
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    highbd_fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+    switch (tx_size) {
+      case TX_32X32:
+        highbd_quantize_dc_32x32_fp_nuq(coeff, 1024, x->skip_block,
+                                        p->quant_fp[0], pd->dequant[0],
+                                        p->cuml_bins_nuq[dq][0],
+                                        pd->dequant_val_nuq[dq][0],
+                                        qcoeff, dqcoeff, eob);
+        break;
+      case TX_16X16:
+        highbd_quantize_dc_fp_nuq(coeff, 256, x->skip_block,
+                                  p->quant_fp[0], pd->dequant[0],
+                                  p->cuml_bins_nuq[dq][0],
+                                  pd->dequant_val_nuq[dq][0],
+                                  qcoeff, dqcoeff, eob);
+        break;
+      case TX_8X8:
+        highbd_quantize_dc_fp_nuq(coeff, 64, x->skip_block,
+                                  p->quant_fp[0], pd->dequant[0],
+                                  p->cuml_bins_nuq[dq][0],
+                                  pd->dequant_val_nuq[dq][0],
+                                  qcoeff, dqcoeff, eob);
+        break;
+      case TX_4X4:
+        highbd_quantize_dc_fp_nuq(coeff, 16, x->skip_block,
+                                  p->quant_fp[0], pd->dequant[0],
+                                  p->cuml_bins_nuq[dq][0],
+                                  pd->dequant_val_nuq[dq][0],
+                                  qcoeff, dqcoeff, eob);
+        break;
+      default:
+        assert(0);
+    }
+    return;
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  fwd_txfm(src_diff, coeff, diff_stride, &fwd_txfm_param);
+  switch (tx_size) {
+    case TX_32X32:
+      quantize_dc_32x32_fp_nuq(coeff, 1024, x->skip_block,
+                               p->quant_fp[0], pd->dequant[0],
+                               p->cuml_bins_nuq[dq][0],
+                               pd->dequant_val_nuq[dq][0],
+                               qcoeff, dqcoeff, eob);
+      break;
+    case TX_16X16:
+      quantize_dc_fp_nuq(coeff, 256, x->skip_block,
+                         p->quant_fp[0], pd->dequant[0],
+                         p->cuml_bins_nuq[dq][0],
+                         pd->dequant_val_nuq[dq][0],
+                         qcoeff, dqcoeff, eob);
+      break;
+    case TX_8X8:
+      quantize_dc_fp_nuq(coeff, 64, x->skip_block,
+                         p->quant_fp[0], pd->dequant[0],
+                         p->cuml_bins_nuq[dq][0],
+                         pd->dequant_val_nuq[dq][0],
+                         qcoeff, dqcoeff, eob);
+      break;
+    case TX_4X4:
+      quantize_dc_fp_nuq(coeff, 16, x->skip_block,
+                         p->quant_fp[0], pd->dequant[0],
+                         p->cuml_bins_nuq[dq][0],
+                         pd->dequant_val_nuq[dq][0],
+                         qcoeff, dqcoeff, eob);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+#endif  // CONFIG_NEW_QUANT
+
 static void encode_block(int plane, int block, int blk_row, int blk_col,
                          BLOCK_SIZE plane_bsize,
                          TX_SIZE tx_size, void *arg) {
   struct encode_b_args *const args = arg;
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
-  struct optimize_ctx *const ctx = args->ctx;
+  int ctx;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint8_t *dst;
   ENTROPY_CONTEXT *a, *l;
-  TX_TYPE tx_type = get_tx_type(pd->plane_type, xd, block);
+  INV_TXFM_PARAM inv_txfm_param;
+#if CONFIG_VAR_TX
+  int i;
+  const int bwl = b_width_log2_lookup[plane_bsize];
+#endif
   dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
-  a = &ctx->ta[plane][blk_col];
-  l = &ctx->tl[plane][blk_row];
+  a = &args->ta[blk_col];
+  l = &args->tl[blk_row];
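+  // The combined above/left entropy context both seeds the coefficient
+  // optimization below and, under CONFIG_NEW_QUANT, selects the dq profile
+  // used by vp10_xform_quant_fp_nuq().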
+#if CONFIG_VAR_TX
+  ctx = get_entropy_context(tx_size, a, l);
+#else
+  ctx = combine_entropy_contexts(*a, *l);
+#endif
 
   // TODO(jingning): Per transformed block zero forcing is only enabled for
   // the luma component; chroma components will be integrated as well.
-  if (x->zcoeff_blk[tx_size][block] && plane == 0) {
+  // Turn this back on when the rate-distortion loop is synchronized with
+  // the recursive transform block coding.
+//  if (x->zcoeff_blk[tx_size][block] && plane == 0) {
+//    p->eobs[block] = 0;
+//    *a = *l = 0;
+//    return;
+//  }
+
+#if CONFIG_VAR_TX
+  // Assert not magic number (uninitialised).
+  assert(x->blk_skip[plane][(blk_row << bwl) + blk_col] != 234);
+
+  if (x->blk_skip[plane][(blk_row << bwl) + blk_col] == 0) {
+#else
+  {
+#endif
+#if CONFIG_NEW_QUANT
+    vp10_xform_quant_fp_nuq(x, plane, block, blk_row, blk_col, plane_bsize,
+                            tx_size, ctx);
+#else
+    vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
+                     tx_size, VP10_XFORM_QUANT_FP);
+#endif  // CONFIG_NEW_QUANT
+  }
+#if CONFIG_VAR_TX
+  else {
     p->eobs[block] = 0;
-    *a = *l = 0;
-    return;
   }
+#endif
 
-  if (!x->skip_recode) {
-    if (x->quant_fp) {
-      // Encoding process for rtc mode
-      if (x->skip_txfm[0] == SKIP_TXFM_AC_DC && plane == 0) {
-        // skip forward transform
-        p->eobs[block] = 0;
-        *a = *l = 0;
-        return;
-      } else {
-        vp10_xform_quant_fp(x, plane, block, blk_row, blk_col,
-                            plane_bsize, tx_size);
-      }
-    } else {
-      if (max_txsize_lookup[plane_bsize] == tx_size) {
-        int txfm_blk_index = (plane << 2) + (block >> (tx_size << 1));
-        if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_NONE) {
-          // full forward transform and quantization
-          vp10_xform_quant(x, plane, block, blk_row, blk_col,
-                           plane_bsize, tx_size);
-        } else if (x->skip_txfm[txfm_blk_index] == SKIP_TXFM_AC_ONLY) {
-          // fast path forward transform and quantization
-          vp10_xform_quant_dc(x, plane, block, blk_row, blk_col,
-                              plane_bsize, tx_size);
-        } else {
-          // skip forward transform
-          p->eobs[block] = 0;
-          *a = *l = 0;
-          return;
-        }
-      } else {
-        vp10_xform_quant(x, plane, block, blk_row, blk_col,
-                         plane_bsize, tx_size);
-      }
-    }
-  }
-
-  if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
-    const int ctx = combine_entropy_contexts(*a, *l);
-    *a = *l = optimize_b(x, plane, block, tx_size, ctx) > 0;
+  if (p->eobs[block]) {
+    *a = *l = vp10_optimize_b(x, plane, block, tx_size, ctx) > 0;
   } else {
     *a = *l = p->eobs[block] > 0;
   }
 
+#if CONFIG_VAR_TX
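+  // A transform block spans (1 << tx_size) 4x4 positions per side, and ta/tl
+  // are indexed at 4x4 granularity, so replicate the context across every
+  // position this block covers (e.g. 4 entries for TX_16X16).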
+  for (i = 0; i < (1 << tx_size); ++i) {
+    a[i] = a[0];
+    l[i] = l[0];
+  }
+#endif
+
   if (p->eobs[block])
     *(args->skip) = 0;
 
   if (p->eobs[block] == 0)
     return;
+
+  // inverse transform parameters
+  inv_txfm_param.tx_type = get_tx_type(pd->plane_type, xd, block, tx_size);
+  inv_txfm_param.tx_size = tx_size;
+  inv_txfm_param.eob = p->eobs[block];
+  inv_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    switch (tx_size) {
-      case TX_32X32:
-        vp10_highbd_inv_txfm_add_32x32(dqcoeff, dst, pd->dst.stride,
-                                       p->eobs[block], xd->bd, tx_type);
-        break;
-      case TX_16X16:
-        vp10_highbd_inv_txfm_add_16x16(dqcoeff, dst, pd->dst.stride,
-                                       p->eobs[block], xd->bd, tx_type);
-        break;
-      case TX_8X8:
-        vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, pd->dst.stride,
-                                     p->eobs[block], xd->bd, tx_type);
-        break;
-      case TX_4X4:
-        // this is like vp10_short_idct4x4 but has a special case around eob<=1
-        // which is significant (not just an optimization) for the lossless
-        // case.
-        vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride,
-                                     p->eobs[block], xd->bd, tx_type,
-                                     xd->lossless[xd->mi[0]->mbmi.segment_id]);
-        break;
-      default:
-        assert(0 && "Invalid transform size");
-        break;
-    }
-
+    inv_txfm_param.bd = xd->bd;
+    highbd_inv_txfm_add(dqcoeff, dst, pd->dst.stride, &inv_txfm_param);
     return;
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+  inv_txfm_add(dqcoeff, dst, pd->dst.stride, &inv_txfm_param);
+}
 
-  switch (tx_size) {
-    case TX_32X32:
-      vp10_inv_txfm_add_32x32(dqcoeff, dst, pd->dst.stride, p->eobs[block],
-                              tx_type);
-      break;
-    case TX_16X16:
-      vp10_inv_txfm_add_16x16(dqcoeff, dst, pd->dst.stride, p->eobs[block],
-                              tx_type);
-      break;
-    case TX_8X8:
-      vp10_inv_txfm_add_8x8(dqcoeff, dst, pd->dst.stride, p->eobs[block],
-                            tx_type);
-      break;
-    case TX_4X4:
-      // this is like vp10_short_idct4x4 but has a special case around eob<=1
-      // which is significant (not just an optimization) for the lossless
-      // case.
-      vp10_inv_txfm_add_4x4(dqcoeff, dst, pd->dst.stride, p->eobs[block],
-                            tx_type, xd->lossless[xd->mi[0]->mbmi.segment_id]);
-      break;
-    default:
-      assert(0 && "Invalid transform size");
-      break;
+#if CONFIG_VAR_TX
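+// Recursive transform block coding for inter blocks. If the transform size
+// recorded for this position matches tx_size, the block is coded directly;
+// otherwise it is split into four tx_size-1 children. Each child spans
+// 1 << (2 * (tx_size - 1)) 4x4 units, hence the step added to the block
+// index: e.g. a TX_32X32 parent yields four TX_16X16 children at 4x4-unit
+// offsets (0,0), (0,4), (4,0), (4,4), each advancing block by 16.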
+static void encode_block_inter(int plane, int block, int blk_row, int blk_col,
+                               BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                               void *arg) {
+  struct encode_b_args *const args = arg;
+  MACROBLOCK *const x = args->x;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  const TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_row][tx_col], bsize, 0, 0) :
+      mbmi->inter_tx_size[tx_row][tx_col];
+
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  if (tx_size == plane_tx_size) {
+    encode_block(plane, block, blk_row, blk_col, plane_bsize,
+                 tx_size, arg);
+  } else {
+    int bsl = b_width_log2_lookup[bsize];
+    int i;
+
+    assert(bsl > 0);
+    --bsl;
+
+    for (i = 0; i < 4; ++i) {
+      const int offsetr = blk_row + ((i >> 1) << bsl);
+      const int offsetc = blk_col + ((i & 0x01) << bsl);
+      int step = 1 << (2 * (tx_size - 1));
+
+      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide)
+        continue;
+
+      encode_block_inter(plane, block + i * step, offsetr, offsetc,
+                         plane_bsize, tx_size - 1, arg);
+    }
   }
 }
+#endif
 
 static void encode_block_pass1(int plane, int block, int blk_row, int blk_col,
                                BLOCK_SIZE plane_bsize,
@@ -889,14 +1110,24 @@
   struct macroblockd_plane *const pd = &xd->plane[plane];
   tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   uint8_t *dst;
+#if CONFIG_NEW_QUANT
+  int ctx;
+#endif  // CONFIG_NEW_QUANT
   dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
 
-  vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
+#if CONFIG_NEW_QUANT
+  ctx = 0;
+  vp10_xform_quant_fp_nuq(x, plane, block, blk_row, blk_col, plane_bsize,
+                          tx_size, ctx);
+#else
+  vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize,
+                   tx_size, VP10_XFORM_QUANT_FP);
+#endif  // CONFIG_NEW_QUANT
 
   if (p->eobs[block] > 0) {
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      if (xd->lossless[0]) {
+      if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
         vp10_highbd_iwht4x4_add(dqcoeff, dst, pd->dst.stride,
                                 p->eobs[block], xd->bd);
       } else {
@@ -906,7 +1137,7 @@
       return;
     }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-    if (xd->lossless[0]) {
+    if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
       vp10_iwht4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
     } else {
       vp10_idct4x4_add(dqcoeff, dst, pd->dst.stride, p->eobs[block]);
@@ -917,14 +1148,14 @@
 void vp10_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize) {
   vp10_subtract_plane(x, bsize, 0);
   vp10_foreach_transformed_block_in_plane(&x->e_mbd, bsize, 0,
-                                         encode_block_pass1, x);
+                                          encode_block_pass1, x);
 }
 
 void vp10_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   struct optimize_ctx ctx;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
-  struct encode_b_args arg = {x, &ctx, &mbmi->skip};
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip, NULL, NULL, 1};
   int plane;
 
   mbmi->skip = 1;
@@ -933,21 +1164,73 @@
     return;
 
   for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
-    if (!x->skip_recode)
-      vp10_subtract_plane(x, bsize, plane);
+#if CONFIG_VAR_TX
+    // TODO(jingning): Clean this up.
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+    const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+    const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+    const int bh = num_4x4_blocks_wide_lookup[txb_size];
+    int idx, idy;
+    int block = 0;
+    int step = 1 << (max_tx_size * 2);
+    vp10_get_entropy_contexts(bsize, TX_4X4, pd, ctx.ta[plane], ctx.tl[plane]);
+#else
+    const struct macroblockd_plane* const pd = &xd->plane[plane];
+    const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
+    vp10_get_entropy_contexts(bsize, tx_size, pd, ctx.ta[plane], ctx.tl[plane]);
+#endif
+    vp10_subtract_plane(x, bsize, plane);
+    arg.ta = ctx.ta[plane];
+    arg.tl = ctx.tl[plane];
 
-    if (x->optimize && (!x->skip_recode || !x->skip_optimize)) {
-      const struct macroblockd_plane* const pd = &xd->plane[plane];
-      const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
-      vp10_get_entropy_contexts(bsize, tx_size, pd,
-                               ctx.ta[plane], ctx.tl[plane]);
+#if CONFIG_VAR_TX
+    for (idy = 0; idy < mi_height; idy += bh) {
+      for (idx = 0; idx < mi_width; idx += bh) {
+        encode_block_inter(plane, block, idy, idx, plane_bsize,
+                           max_tx_size, &arg);
+        block += step;
+      }
     }
-
+#else
     vp10_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
-                                           &arg);
+                                            &arg);
+#endif
   }
 }
 
+#if CONFIG_SUPERTX
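+// Supertx variant of vp10_encode_sb: the same per-plane subtract-and-encode
+// walk, but without the recursive inter transform split; under CONFIG_VAR_TX
+// the entropy contexts are simply taken at TX_4X4 granularity.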
+void vp10_encode_sb_supertx(MACROBLOCK *x, BLOCK_SIZE bsize) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  struct optimize_ctx ctx;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct encode_b_args arg = {x, &ctx, &mbmi->skip, NULL, NULL, 1};
+  int plane;
+
+  mbmi->skip = 1;
+  if (x->skip)
+    return;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const struct macroblockd_plane* const pd = &xd->plane[plane];
+#if CONFIG_VAR_TX
+    const TX_SIZE tx_size = TX_4X4;
+#else
+    const TX_SIZE tx_size = plane ? get_uv_tx_size(mbmi, pd) : mbmi->tx_size;
+#endif
+    vp10_subtract_plane(x, bsize, plane);
+    vp10_get_entropy_contexts(bsize, tx_size, pd,
+                              ctx.ta[plane], ctx.tl[plane]);
+    arg.ta = ctx.ta[plane];
+    arg.tl = ctx.tl[plane];
+    vp10_foreach_transformed_block_in_plane(xd, bsize, plane, encode_block,
+                                            &arg);
+  }
+}
+#endif  // CONFIG_SUPERTX
+
 void vp10_encode_block_intra(int plane, int block, int blk_row, int blk_col,
                              BLOCK_SIZE plane_bsize,
                              TX_SIZE tx_size, void *arg) {
@@ -957,12 +1240,9 @@
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   struct macroblock_plane *const p = &x->plane[plane];
   struct macroblockd_plane *const pd = &xd->plane[plane];
-  tran_low_t *coeff = BLOCK_OFFSET(p->coeff, block);
-  tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
   PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
-  TX_TYPE tx_type = get_tx_type(plane_type, xd, block);
-  const scan_order *const scan_order = get_scan(tx_size, tx_type);
+  const TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
   PREDICTION_MODE mode;
   const int bwl = b_width_log2_lookup[plane_bsize];
   const int bhl = b_height_log2_lookup[plane_bsize];
@@ -972,162 +1252,90 @@
   uint16_t *eob = &p->eobs[block];
   const int src_stride = p->src.stride;
   const int dst_stride = pd->dst.stride;
+  const int tx1d_size = get_tx1d_size(tx_size);
+  ENTROPY_CONTEXT *a = NULL, *l = NULL;
+  int ctx;
+
+  INV_TXFM_PARAM inv_txfm_param;
+
   dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
   src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
   src_diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
 
   mode = plane == 0 ? get_y_mode(xd->mi[0], block) : mbmi->uv_mode;
-  vp10_predict_intra_block(xd, bwl, bhl, tx_size, mode, dst, dst_stride,
-                          dst, dst_stride, blk_col, blk_row, plane);
-
+  vp10_predict_intra_block(xd, bwl, bhl, tx_size, mode, dst, dst_stride, dst,
+                           dst_stride, blk_col, blk_row, plane);
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    switch (tx_size) {
-      case TX_32X32:
-        if (!x->skip_recode) {
-          vpx_highbd_subtract_block(32, 32, src_diff, diff_stride,
-                                    src, src_stride, dst, dst_stride, xd->bd);
-          highbd_fwd_txfm_32x32(x->use_lp32x32fdct, src_diff, coeff,
-                                diff_stride, tx_type);
-          vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                      p->round, p->quant, p->quant_shift,
-                                      qcoeff, dqcoeff, pd->dequant, eob,
-                                      scan_order->scan, scan_order->iscan);
-        }
-        if (*eob)
-          vp10_highbd_inv_txfm_add_32x32(dqcoeff, dst, dst_stride, *eob, xd->bd,
-                                         tx_type);
-        break;
-      case TX_16X16:
-        if (!x->skip_recode) {
-          vpx_highbd_subtract_block(16, 16, src_diff, diff_stride,
-                                    src, src_stride, dst, dst_stride, xd->bd);
-          highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type);
-          vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
-                                p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, eob,
-                                scan_order->scan, scan_order->iscan);
-        }
-        if (*eob)
-          vp10_highbd_inv_txfm_add_16x16(dqcoeff, dst, dst_stride, *eob, xd->bd,
-                                         tx_type);
-        break;
-      case TX_8X8:
-        if (!x->skip_recode) {
-          vpx_highbd_subtract_block(8, 8, src_diff, diff_stride,
-                                    src, src_stride, dst, dst_stride, xd->bd);
-          highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type);
-          vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
-                                p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, eob,
-                                scan_order->scan, scan_order->iscan);
-        }
-        if (*eob)
-          vp10_highbd_inv_txfm_add_8x8(dqcoeff, dst, dst_stride, *eob, xd->bd,
-                                       tx_type);
-        break;
-      case TX_4X4:
-        if (!x->skip_recode) {
-          vpx_highbd_subtract_block(4, 4, src_diff, diff_stride,
-                                    src, src_stride, dst, dst_stride, xd->bd);
-          vp10_highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
-                                   xd->lossless[mbmi->segment_id]);
-          vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
-                                p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, eob,
-                                scan_order->scan, scan_order->iscan);
-        }
-
-        if (*eob)
-          // this is like vp10_short_idct4x4 but has a special case around
-          // eob<=1 which is significant (not just an optimization) for the
-          // lossless case.
-          vp10_highbd_inv_txfm_add_4x4(dqcoeff, dst, dst_stride, *eob, xd->bd,
-                                       tx_type, xd->lossless[mbmi->segment_id]);
-        break;
-      default:
-        assert(0);
-        return;
-    }
-    if (*eob)
-      *(args->skip) = 0;
-    return;
+    vpx_highbd_subtract_block(tx1d_size, tx1d_size, src_diff, diff_stride, src,
+                              src_stride, dst, dst_stride, xd->bd);
+  } else {
+    vpx_subtract_block(tx1d_size, tx1d_size, src_diff, diff_stride, src,
+                       src_stride, dst, dst_stride);
   }
+#else
+  vpx_subtract_block(tx1d_size, tx1d_size, src_diff, diff_stride, src,
+                     src_stride, dst, dst_stride);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
-  switch (tx_size) {
-    case TX_32X32:
-      if (!x->skip_recode) {
-        vpx_subtract_block(32, 32, src_diff, diff_stride,
-                           src, src_stride, dst, dst_stride);
-        fwd_txfm_32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride,
-                       tx_type);
-        vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
-                             p->quant, p->quant_shift, qcoeff, dqcoeff,
-                             pd->dequant, eob, scan_order->scan,
-                             scan_order->iscan);
-      }
-      if (*eob)
-        vp10_inv_txfm_add_32x32(dqcoeff, dst, dst_stride, *eob, tx_type);
-      break;
-    case TX_16X16:
-      if (!x->skip_recode) {
-        vpx_subtract_block(16, 16, src_diff, diff_stride,
-                           src, src_stride, dst, dst_stride);
-        fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type);
-        vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
-                       p->quant, p->quant_shift, qcoeff, dqcoeff,
-                       pd->dequant, eob, scan_order->scan,
-                       scan_order->iscan);
-      }
-      if (*eob)
-        vp10_inv_txfm_add_16x16(dqcoeff, dst, dst_stride, *eob, tx_type);
-      break;
-    case TX_8X8:
-      if (!x->skip_recode) {
-        vpx_subtract_block(8, 8, src_diff, diff_stride,
-                           src, src_stride, dst, dst_stride);
-        fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type);
-        vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
-                       p->quant_shift, qcoeff, dqcoeff,
-                       pd->dequant, eob, scan_order->scan,
-                       scan_order->iscan);
-      }
-      if (*eob)
-        vp10_inv_txfm_add_8x8(dqcoeff, dst, dst_stride, *eob, tx_type);
-      break;
-    case TX_4X4:
-      if (!x->skip_recode) {
-        vpx_subtract_block(4, 4, src_diff, diff_stride,
-                           src, src_stride, dst, dst_stride);
-        vp10_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
-                          xd->lossless[mbmi->segment_id]);
-        vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
-                       p->quant_shift, qcoeff, dqcoeff,
-                       pd->dequant, eob, scan_order->scan,
-                       scan_order->iscan);
-      }
+  a = &args->ta[blk_col];
+  l = &args->tl[blk_row];
+  ctx = combine_entropy_contexts(*a, *l);
 
-      if (*eob) {
-        // this is like vp10_short_idct4x4 but has a special case around eob<=1
-        // which is significant (not just an optimization) for the lossless
-        // case.
-        vp10_inv_txfm_add_4x4(dqcoeff, dst, dst_stride, *eob, tx_type,
-                              xd->lossless[mbmi->segment_id]);
-      }
-      break;
-    default:
-      assert(0);
-      break;
+  if (args->enable_optimize_b) {
+#if CONFIG_NEW_QUANT
+    vp10_xform_quant_fp_nuq(x, plane, block, blk_row, blk_col, plane_bsize,
+                            tx_size, ctx);
+#else  // CONFIG_NEW_QUANT
+    vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                     VP10_XFORM_QUANT_FP);
+#endif  // CONFIG_NEW_QUANT
+    if (p->eobs[block]) {
+      *a = *l = vp10_optimize_b(x, plane, block, tx_size, ctx) > 0;
+    } else {
+      *a = *l = 0;
+    }
+  } else {
+    vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                     VP10_XFORM_QUANT_B);
+    *a = *l = p->eobs[block] > 0;
   }
-  if (*eob)
+
+  if (*eob) {
+    // inverse transform
+    inv_txfm_param.tx_type = tx_type;
+    inv_txfm_param.tx_size = tx_size;
+    inv_txfm_param.eob = *eob;
+    inv_txfm_param.lossless = xd->lossless[mbmi->segment_id];
+#if CONFIG_VP9_HIGHBITDEPTH
+    inv_txfm_param.bd = xd->bd;
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      highbd_inv_txfm_add(dqcoeff, dst, dst_stride, &inv_txfm_param);
+    } else {
+      inv_txfm_add(dqcoeff, dst, dst_stride, &inv_txfm_param);
+    }
+#else
+    inv_txfm_add(dqcoeff, dst, dst_stride, &inv_txfm_param);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
     *(args->skip) = 0;
+  }
 }
 
-void vp10_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane) {
+void vp10_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
+                                   int enable_optimize_b) {
   const MACROBLOCKD *const xd = &x->e_mbd;
-  struct encode_b_args arg = {x, NULL, &xd->mi[0]->mbmi.skip};
+  ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE];
+  ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE];
 
+  struct encode_b_args arg = {x, NULL, &xd->mi[0]->mbmi.skip,
+                              ta, tl, enable_optimize_b};
+  if (enable_optimize_b) {
+    const struct macroblockd_plane* const pd = &xd->plane[plane];
+    const TX_SIZE tx_size = plane ? get_uv_tx_size(&xd->mi[0]->mbmi, pd) :
+        xd->mi[0]->mbmi.tx_size;
+    vp10_get_entropy_contexts(bsize, tx_size, pd, ta, tl);
+  }
   vp10_foreach_transformed_block_in_plane(xd, bsize, plane,
                                           vp10_encode_block_intra, &arg);
 }
diff --git a/vp10/encoder/encodemb.h b/vp10/encoder/encodemb.h
index 2e6516e..8beb578 100644
--- a/vp10/encoder/encodemb.h
+++ b/vp10/encoder/encodemb.h
@@ -18,22 +18,55 @@
 extern "C" {
 #endif
 
+struct optimize_ctx {
+  ENTROPY_CONTEXT ta[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
+  ENTROPY_CONTEXT tl[MAX_MB_PLANE][2 * MAX_MIB_SIZE];
+};
+
 struct encode_b_args {
   MACROBLOCK *x;
   struct optimize_ctx *ctx;
   int8_t *skip;
+  ENTROPY_CONTEXT *ta;
+  ENTROPY_CONTEXT *tl;
+  int8_t enable_optimize_b;
 };
+
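+// Selects the quantizer stage applied after the forward transform: FP is the
+// fast-path (rounding-only) quantizer, B the full zbin/round quantizer, DC
+// quantizes the DC coefficient only, and SKIP_QUANT skips the quantization
+// stage entirely.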
+typedef enum VP10_XFORM_QUANT {
+  VP10_XFORM_QUANT_FP = 0,
+  VP10_XFORM_QUANT_B = 1,
+  VP10_XFORM_QUANT_DC = 2,
+  VP10_XFORM_QUANT_SKIP_QUANT = 3,
+  VP10_XFORM_QUANT_LAST = 4
+} VP10_XFORM_QUANT;
+
 void vp10_encode_sb(MACROBLOCK *x, BLOCK_SIZE bsize);
+#if CONFIG_SUPERTX
+void vp10_encode_sb_supertx(MACROBLOCK *x, BLOCK_SIZE bsize);
+#endif  // CONFIG_SUPERTX
 void vp10_encode_sby_pass1(MACROBLOCK *x, BLOCK_SIZE bsize);
-void vp10_xform_quant_fp(MACROBLOCK *x, int plane, int block,
-                         int blk_row, int blk_col,
-                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
-void vp10_xform_quant_dc(MACROBLOCK *x, int plane, int block,
-                         int blk_row, int blk_col,
-                         BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
 void vp10_xform_quant(MACROBLOCK *x, int plane, int block,
                       int blk_row, int blk_col,
-                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size);
+                      BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                      VP10_XFORM_QUANT xform_quant_idx);
+#if CONFIG_NEW_QUANT
+void vp10_xform_quant_nuq(MACROBLOCK *x, int plane, int block, int blk_row,
+                          int blk_col, BLOCK_SIZE plane_bsize,
+                          TX_SIZE tx_size, int ctx);
+void vp10_xform_quant_dc_nuq(MACROBLOCK *x, int plane, int block, int blk_row,
+                             int blk_col, BLOCK_SIZE plane_bsize,
+                             TX_SIZE tx_size, int ctx);
+void vp10_xform_quant_fp_nuq(MACROBLOCK *x, int plane, int block, int blk_row,
+                             int blk_col, BLOCK_SIZE plane_bsize,
+                             TX_SIZE tx_size, int ctx);
+void vp10_xform_quant_dc_fp_nuq(MACROBLOCK *x, int plane, int block,
+                                int blk_row, int blk_col,
+                                BLOCK_SIZE plane_bsize, TX_SIZE tx_size,
+                                int ctx);
+#endif  // CONFIG_NEW_QUANT
+
+int vp10_optimize_b(MACROBLOCK *mb, int plane, int block,
+                    TX_SIZE tx_size, int ctx);
 
 void vp10_subtract_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 
@@ -41,15 +74,8 @@
                              BLOCK_SIZE plane_bsize,
                              TX_SIZE tx_size, void *arg);
 
-void vp10_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
-
-void vp10_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
-                       int diff_stride, TX_TYPE tx_type, int lossless);
-
-#if CONFIG_VP9_HIGHBITDEPTH
-void vp10_highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
-                              int diff_stride, TX_TYPE tx_type, int lossless);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+void vp10_encode_intra_block_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane,
+                                   int enable_optimize_b);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/encoder/encodemv.c b/vp10/encoder/encodemv.c
index 0736c65..727d6b5 100644
--- a/vp10/encoder/encodemv.c
+++ b/vp10/encoder/encodemv.c
@@ -31,7 +31,7 @@
   vp10_tokens_from_tree(mv_fp_encodings, vp10_mv_fp_tree);
 }
 
-static void encode_mv_component(vpx_writer* w, int comp,
+static void encode_mv_component(vp10_writer* w, int comp,
                                 const nmv_component* mvcomp, int usehp) {
   int offset;
   const int sign = comp < 0;
@@ -44,7 +44,7 @@
   assert(comp != 0);
 
   // Sign
-  vpx_write(w, sign, mvcomp->sign);
+  vp10_write(w, sign, mvcomp->sign);
 
   // Class
   vp10_write_token(w, vp10_mv_class_tree, mvcomp->classes,
@@ -58,7 +58,7 @@
     int i;
     const int n = mv_class + CLASS0_BITS - 1;  // number of bits
     for (i = 0; i < n; ++i)
-      vpx_write(w, (d >> i) & 1, mvcomp->bits[i]);
+      vp10_write(w, (d >> i) & 1, mvcomp->bits[i]);
   }
 
   // Fractional bits
@@ -68,7 +68,7 @@
 
   // High precision bit
   if (usehp)
-    vpx_write(w, hp,
+    vp10_write(w, hp,
               mv_class == MV_CLASS_0 ? mvcomp->class0_hp : mvcomp->hp);
 }
 
@@ -135,27 +135,16 @@
   }
 }
 
-static void update_mv(vpx_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
+static void update_mv(vp10_writer *w, const unsigned int ct[2], vpx_prob *cur_p,
                       vpx_prob upd_p) {
-#if CONFIG_MISC_FIXES
   (void) upd_p;
   vp10_cond_prob_diff_update(w, cur_p, ct);
-#else
-  const vpx_prob new_p = get_binary_prob(ct[0], ct[1]) | 1;
-  const int update = cost_branch256(ct, *cur_p) + vp10_cost_zero(upd_p) >
-                     cost_branch256(ct, new_p) + vp10_cost_one(upd_p) + 7 * 256;
-  vpx_write(w, update, upd_p);
-  if (update) {
-    *cur_p = new_p;
-    vpx_write_literal(w, new_p >> 1, 7);
-  }
-#endif
 }
 
 static void write_mv_update(const vpx_tree_index *tree,
                             vpx_prob probs[/*n - 1*/],
                             const unsigned int counts[/*n - 1*/],
-                            int n, vpx_writer *w) {
+                            int n, vp10_writer *w) {
   int i;
   unsigned int branch_ct[32][2];
 
@@ -167,10 +156,52 @@
     update_mv(w, branch_ct[i], &probs[i], MV_UPDATE_PROB);
 }
 
-void vp10_write_nmv_probs(VP10_COMMON *cm, int usehp, vpx_writer *w,
-                         nmv_context_counts *const counts) {
+void vp10_write_nmv_probs(VP10_COMMON *cm, int usehp, vp10_writer *w,
+                          nmv_context_counts *const nmv_counts) {
   int i, j;
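+  // With CONFIG_REF_MV each of the NMV_CONTEXTS motion vector contexts keeps
+  // its own probability model (selected per block via vp10_nmv_ctx()), so
+  // every model is updated in turn.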
+#if CONFIG_REF_MV
+  int nmv_ctx = 0;
+  for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) {
+    nmv_context *const mvc = &cm->fc->nmvc[nmv_ctx];
+    nmv_context_counts *const counts = &nmv_counts[nmv_ctx];
+    write_mv_update(vp10_mv_joint_tree, mvc->joints, counts->joints,
+                    MV_JOINTS, w);
+
+    vp10_cond_prob_diff_update(w, &mvc->zero_rmv, counts->zero_rmv);
+
+    for (i = 0; i < 2; ++i) {
+      nmv_component *comp = &mvc->comps[i];
+      nmv_component_counts *comp_counts = &counts->comps[i];
+
+      update_mv(w, comp_counts->sign, &comp->sign, MV_UPDATE_PROB);
+      write_mv_update(vp10_mv_class_tree, comp->classes, comp_counts->classes,
+                      MV_CLASSES, w);
+      write_mv_update(vp10_mv_class0_tree, comp->class0, comp_counts->class0,
+                      CLASS0_SIZE, w);
+      for (j = 0; j < MV_OFFSET_BITS; ++j)
+        update_mv(w, comp_counts->bits[j], &comp->bits[j], MV_UPDATE_PROB);
+    }
+
+    for (i = 0; i < 2; ++i) {
+      for (j = 0; j < CLASS0_SIZE; ++j)
+        write_mv_update(vp10_mv_fp_tree, mvc->comps[i].class0_fp[j],
+                        counts->comps[i].class0_fp[j], MV_FP_SIZE, w);
+
+      write_mv_update(vp10_mv_fp_tree, mvc->comps[i].fp, counts->comps[i].fp,
+                      MV_FP_SIZE, w);
+    }
+
+    if (usehp) {
+      for (i = 0; i < 2; ++i) {
+        update_mv(w, counts->comps[i].class0_hp, &mvc->comps[i].class0_hp,
+                  MV_UPDATE_PROB);
+        update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB);
+      }
+    }
+  }
+#else
   nmv_context *const mvc = &cm->fc->nmvc;
+  nmv_context_counts *const counts = nmv_counts;
 
   write_mv_update(vp10_mv_joint_tree, mvc->joints, counts->joints, MV_JOINTS, w);
 
@@ -203,16 +234,35 @@
       update_mv(w, counts->comps[i].hp, &mvc->comps[i].hp, MV_UPDATE_PROB);
     }
   }
+#endif
 }
 
-void vp10_encode_mv(VP10_COMP* cpi, vpx_writer* w,
+void vp10_encode_mv(VP10_COMP* cpi, vp10_writer* w,
                    const MV* mv, const MV* ref,
+#if CONFIG_REF_MV
+                   int is_compound,
+#endif
                    const nmv_context* mvctx, int usehp) {
   const MV diff = {mv->row - ref->row,
                    mv->col - ref->col};
   const MV_JOINT_TYPE j = vp10_get_mv_joint(&diff);
   usehp = usehp && vp10_use_mv_hp(ref);
 
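+  // With REF_MV (and without EXT_INTER), compound blocks signal an all-zero
+  // residual mv with the dedicated zero_rmv bit rather than through the
+  // joint tree; for single-reference blocks a zero residual must not reach
+  // this point.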
+#if CONFIG_REF_MV && !CONFIG_EXT_INTER
+  if (is_compound) {
+    vp10_write(w, (j == MV_JOINT_ZERO), mvctx->zero_rmv);
+    if (j == MV_JOINT_ZERO)
+      return;
+  } else {
+    if (j == MV_JOINT_ZERO)
+      assert(0);
+  }
+#endif
+
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+  (void)is_compound;
+#endif
+
   vp10_write_token(w, vp10_mv_joint_tree, mvctx->joints, &mv_joint_encodings[j]);
   if (mv_joint_vertical(j))
     encode_mv_component(w, diff.row, &mvctx->comps[0], usehp);
@@ -235,18 +285,132 @@
   build_nmv_component_cost_table(mvcost[1], &ctx->comps[1], usehp);
 }
 
+#if CONFIG_EXT_INTER
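+// With EXT_INTER, only the NEW* side(s) of a mode carry a coded motion
+// vector, so statistics are accumulated only for those: both mvs for
+// NEWMV/NEWFROMNEARMV/NEW_NEWMV, mvs[1] for NEAREST_NEWMV/NEAR_NEWMV, and
+// mvs[0] for NEW_NEARESTMV/NEW_NEARMV. NEWFROMNEARMV predicts from the
+// second (near) reference mv, hence mv_idx.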
 static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
                     const int_mv mvs[2],
-                    nmv_context_counts *counts) {
+#if CONFIG_REF_MV
+                    const int_mv pred_mvs[2],
+#endif
+                    nmv_context_counts *nmv_counts) {
   int i;
+  PREDICTION_MODE mode = mbmi->mode;
+  int mv_idx = (mode == NEWFROMNEARMV);
+#if !CONFIG_REF_MV
+  nmv_context_counts *counts = nmv_counts;
+#endif
+
+  if (mode == NEWMV || mode == NEWFROMNEARMV || mode == NEW_NEWMV) {
+    for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+      const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][mv_idx].as_mv;
+      const MV diff = {mvs[i].as_mv.row - ref->row,
+                       mvs[i].as_mv.col - ref->col};
+#if CONFIG_REF_MV
+      int nmv_ctx = vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[i]],
+                                 mbmi_ext->ref_mv_stack[mbmi->ref_frame[i]]);
+      nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+      (void)pred_mvs;
+#endif
+      vp10_inc_mv(&diff, counts, vp10_use_mv_hp(ref));
+    }
+  } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
+    const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[1]][0].as_mv;
+    const MV diff = {mvs[1].as_mv.row - ref->row,
+                     mvs[1].as_mv.col - ref->col};
+#if CONFIG_REF_MV
+    int nmv_ctx = vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[1]],
+                               mbmi_ext->ref_mv_stack[mbmi->ref_frame[1]]);
+    nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+    vp10_inc_mv(&diff, counts, vp10_use_mv_hp(ref));
+  } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+    const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[0]][0].as_mv;
+    const MV diff = {mvs[0].as_mv.row - ref->row,
+                     mvs[0].as_mv.col - ref->col};
+#if CONFIG_REF_MV
+    int nmv_ctx = vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[0]],
+                               mbmi_ext->ref_mv_stack[mbmi->ref_frame[0]]);
+    nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+    vp10_inc_mv(&diff, counts, vp10_use_mv_hp(ref));
+  }
+}
+
+static void inc_mvs_sub8x8(const MODE_INFO *mi,
+                           int block,
+                           const int_mv mvs[2],
+#if CONFIG_REF_MV
+                           const MB_MODE_INFO_EXT *mbmi_ext,
+#endif
+                           nmv_context_counts *nmv_counts) {
+  int i;
+  PREDICTION_MODE mode = mi->bmi[block].as_mode;
+#if CONFIG_REF_MV
+  const MB_MODE_INFO *mbmi = &mi->mbmi;
+#else
+  nmv_context_counts *counts = nmv_counts;
+#endif
+
+  if (mode == NEWMV || mode == NEWFROMNEARMV || mode == NEW_NEWMV) {
+    for (i = 0; i < 1 + has_second_ref(&mi->mbmi); ++i) {
+      const MV *ref = &mi->bmi[block].ref_mv[i].as_mv;
+      const MV diff = {mvs[i].as_mv.row - ref->row,
+                       mvs[i].as_mv.col - ref->col};
+#if CONFIG_REF_MV
+      int nmv_ctx = vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[i]],
+                                 mbmi_ext->ref_mv_stack[mbmi->ref_frame[i]]);
+      nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+      vp10_inc_mv(&diff, counts, vp10_use_mv_hp(ref));
+    }
+  } else if (mode == NEAREST_NEWMV || mode == NEAR_NEWMV) {
+    const MV *ref = &mi->bmi[block].ref_mv[1].as_mv;
+    const MV diff = {mvs[1].as_mv.row - ref->row,
+                     mvs[1].as_mv.col - ref->col};
+#if CONFIG_REF_MV
+    int nmv_ctx = vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[1]],
+                               mbmi_ext->ref_mv_stack[mbmi->ref_frame[1]]);
+    nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+    vp10_inc_mv(&diff, counts, vp10_use_mv_hp(ref));
+  } else if (mode == NEW_NEARESTMV || mode == NEW_NEARMV) {
+    const MV *ref = &mi->bmi[block].ref_mv[0].as_mv;
+    const MV diff = {mvs[0].as_mv.row - ref->row,
+                     mvs[0].as_mv.col - ref->col};
+#if CONFIG_REF_MV
+    int nmv_ctx = vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[0]],
+                               mbmi_ext->ref_mv_stack[mbmi->ref_frame[0]]);
+    nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+#endif
+    vp10_inc_mv(&diff, counts, vp10_use_mv_hp(ref));
+  }
+}
+#else
+static void inc_mvs(const MB_MODE_INFO *mbmi, const MB_MODE_INFO_EXT *mbmi_ext,
+                    const int_mv mvs[2],
+#if CONFIG_REF_MV
+                    const int_mv pred_mvs[2],
+#endif
+                    nmv_context_counts *nmv_counts) {
+  int i;
+#if !CONFIG_REF_MV
+  nmv_context_counts *counts = nmv_counts;
+#endif
 
   for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+#if CONFIG_REF_MV
+    int nmv_ctx = vp10_nmv_ctx(mbmi_ext->ref_mv_count[mbmi->ref_frame[i]],
+                               mbmi_ext->ref_mv_stack[mbmi->ref_frame[i]]);
+    nmv_context_counts *counts = &nmv_counts[nmv_ctx];
+    const MV *ref = &pred_mvs[i].as_mv;
+#else
     const MV *ref = &mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_mv;
+#endif
     const MV diff = {mvs[i].as_mv.row - ref->row,
                      mvs[i].as_mv.col - ref->col};
     vp10_inc_mv(&diff, counts, vp10_use_mv_hp(ref));
   }
 }
+#endif  // CONFIG_EXT_INTER
 
 void vp10_update_mv_count(ThreadData *td) {
   const MACROBLOCKD *xd = &td->mb.e_mbd;
@@ -262,13 +426,41 @@
     for (idy = 0; idy < 2; idy += num_4x4_h) {
       for (idx = 0; idx < 2; idx += num_4x4_w) {
         const int i = idy * 2 + idx;
+
+#if CONFIG_EXT_INTER
+        if (have_newmv_in_inter_mode(mi->bmi[i].as_mode))
+          inc_mvs_sub8x8(mi, i, mi->bmi[i].as_mv,
+#if CONFIG_REF_MV
+                         mbmi_ext,
+                         td->counts->mv);
+#else
+                         &td->counts->mv);
+#endif
+#else
         if (mi->bmi[i].as_mode == NEWMV)
-          inc_mvs(mbmi, mbmi_ext, mi->bmi[i].as_mv, &td->counts->mv);
+          inc_mvs(mbmi, mbmi_ext, mi->bmi[i].as_mv,
+#if CONFIG_REF_MV
+                  mi->bmi[i].pred_mv_s8,
+                  td->counts->mv);
+#else
+                  &td->counts->mv);
+#endif
+#endif  // CONFIG_EXT_INTER
       }
     }
   } else {
+#if CONFIG_EXT_INTER
+    if (have_newmv_in_inter_mode(mbmi->mode))
+#else
     if (mbmi->mode == NEWMV)
-      inc_mvs(mbmi, mbmi_ext, mbmi->mv, &td->counts->mv);
+#endif  // CONFIG_EXT_INTER
+      inc_mvs(mbmi, mbmi_ext, mbmi->mv,
+#if CONFIG_REF_MV
+              mbmi->pred_mv,
+              td->counts->mv);
+#else
+              &td->counts->mv);
+#endif
   }
 }
 
diff --git a/vp10/encoder/encodemv.h b/vp10/encoder/encodemv.h
index 006f6d7..406fbec 100644
--- a/vp10/encoder/encodemv.h
+++ b/vp10/encoder/encodemv.h
@@ -20,11 +20,14 @@
 
 void vp10_entropy_mv_init(void);
 
-void vp10_write_nmv_probs(VP10_COMMON *cm, int usehp, vpx_writer *w,
-                         nmv_context_counts *const counts);
+void vp10_write_nmv_probs(VP10_COMMON *cm, int usehp, vp10_writer *w,
+                          nmv_context_counts *const counts);
 
-void vp10_encode_mv(VP10_COMP *cpi, vpx_writer* w, const MV* mv, const MV* ref,
-                   const nmv_context* mvctx, int usehp);
+void vp10_encode_mv(VP10_COMP *cpi, vp10_writer* w, const MV* mv, const MV* ref,
+#if CONFIG_REF_MV
+                    int is_compound,
+#endif
+                    const nmv_context* mvctx, int usehp);
 
 void vp10_build_nmv_cost_table(int *mvjoint, int *mvcost[2],
                               const nmv_context* mvctx, int usehp);
diff --git a/vp10/encoder/encoder.c b/vp10/encoder/encoder.c
index e7fff82..5adba4c 100644
--- a/vp10/encoder/encoder.c
+++ b/vp10/encoder/encoder.c
@@ -28,6 +28,9 @@
 #include "vp10/encoder/aq_cyclicrefresh.h"
 #include "vp10/encoder/aq_variance.h"
 #include "vp10/encoder/bitstream.h"
+#if CONFIG_ANS
+#include "vp10/encoder/buf_ans.h"
+#endif
 #include "vp10/encoder/context_tree.h"
 #include "vp10/encoder/encodeframe.h"
 #include "vp10/encoder/encodemv.h"
@@ -36,6 +39,9 @@
 #include "vp10/encoder/firstpass.h"
 #include "vp10/encoder/mbgraph.h"
 #include "vp10/encoder/picklpf.h"
+#if CONFIG_LOOP_RESTORATION
+#include "vp10/encoder/pickrst.h"
+#endif  // CONFIG_LOOP_RESTORATION
 #include "vp10/encoder/ratectrl.h"
 #include "vp10/encoder/rd.h"
 #include "vp10/encoder/resize.h"
@@ -47,7 +53,7 @@
 #include "./vp10_rtcd.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
-#include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/psnr.h"
 #if CONFIG_INTERNAL_STATS
 #include "vpx_dsp/ssim.h"
 #endif
@@ -219,13 +225,53 @@
 void vp10_set_high_precision_mv(VP10_COMP *cpi, int allow_high_precision_mv) {
   MACROBLOCK *const mb = &cpi->td.mb;
   cpi->common.allow_high_precision_mv = allow_high_precision_mv;
+
+#if CONFIG_REF_MV
+  if (cpi->common.allow_high_precision_mv) {
+    int i;
+    for (i = 0; i < NMV_CONTEXTS; ++i) {
+      mb->mv_cost_stack[i] = mb->nmvcost_hp[i];
+      mb->mvsadcost = mb->nmvsadcost_hp;
+    }
+  } else {
+    int i;
+    for (i = 0; i < NMV_CONTEXTS; ++i) {
+      mb->mv_cost_stack[i] = mb->nmvcost[i];
+      mb->mvsadcost = mb->nmvsadcost;
+    }
+  }
+#else
   if (cpi->common.allow_high_precision_mv) {
     mb->mvcost = mb->nmvcost_hp;
     mb->mvsadcost = mb->nmvsadcost_hp;
   } else {
     mb->mvcost = mb->nmvcost;
     mb->mvsadcost = mb->nmvsadcost;
   }
+#endif
+}
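
vp10_set_high_precision_mv() above switches cost tables by pointer
assignment, so toggling MV precision mid-encode costs O(1) and no
copying. The same pattern in isolation (struct and field names are
invented for this sketch):

typedef struct {
  const int *active;    /* table consulted by the motion search */
  const int *lp_costs;  /* quarter-pel (low precision) costs */
  const int *hp_costs;  /* eighth-pel (high precision) costs */
} mv_cost_tables;

static void set_precision(mv_cost_tables *t, int allow_hp) {
  t->active = allow_hp ? t->hp_costs : t->lp_costs;  /* pointer swap only */
}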
+
+static BLOCK_SIZE select_sb_size(const VP10_COMP *const cpi) {
+#if CONFIG_EXT_PARTITION
+  if (cpi->oxcf.superblock_size == VPX_SUPERBLOCK_SIZE_64X64)
+    return BLOCK_64X64;
+
+  if (cpi->oxcf.superblock_size == VPX_SUPERBLOCK_SIZE_128X128)
+    return BLOCK_128X128;
+
+  assert(cpi->oxcf.superblock_size == VPX_SUPERBLOCK_SIZE_DYNAMIC);
+
+  assert(IMPLIES(cpi->common.tile_cols > 1,
+                 cpi->common.tile_width % MAX_MIB_SIZE == 0));
+  assert(IMPLIES(cpi->common.tile_rows > 1,
+                 cpi->common.tile_height % MAX_MIB_SIZE == 0));
+
+  // TODO(any): Possibly could improve this with a heuristic.
+  return BLOCK_128X128;
+#else
+  (void)cpi;
+  return BLOCK_64X64;
+#endif  //  CONFIG_EXT_PARTITION
 }
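
The IMPLIES() checks in select_sb_size() assert that multi-tile
configurations keep tile dimensions superblock-aligned. IMPLIES is
libvpx's logical-implication helper; to the best of our reading it
expands to nothing more than:

#define IMPLIES(a, b) (!(a) || (b))  /* "a implies b", i.e. a -> b */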
 
 static void setup_frame(VP10_COMP *cpi) {
@@ -249,6 +295,10 @@
     *cm->fc = cm->frame_contexts[cm->frame_context_idx];
     vp10_zero(cpi->interp_filter_selected[0]);
   }
+
+  cpi->vaq_refresh = 0;
+
+  set_sb_size(cm, select_sb_size(cpi));
 }
 
 static void vp10_enc_setup_mi(VP10_COMMON *cm) {
@@ -329,12 +379,16 @@
     vp10_entropy_mv_init();
     vp10_temporal_filter_init();
     vp10_encode_token_init();
+#if CONFIG_EXT_INTER
+    vp10_init_wedge_masks();
+#endif
     init_done = 1;
   }
 }
 
 static void dealloc_compressor_data(VP10_COMP *cpi) {
   VP10_COMMON *const cm = &cpi->common;
+  int i;
 
   vpx_free(cpi->mbmi_ext_base);
   cpi->mbmi_ext_base = NULL;
@@ -348,6 +402,19 @@
   vpx_free(cpi->coding_context.last_frame_seg_map_copy);
   cpi->coding_context.last_frame_seg_map_copy = NULL;
 
+#if CONFIG_REF_MV
+  for (i = 0; i < NMV_CONTEXTS; ++i) {
+    vpx_free(cpi->nmv_costs[i][0]);
+    vpx_free(cpi->nmv_costs[i][1]);
+    vpx_free(cpi->nmv_costs_hp[i][0]);
+    vpx_free(cpi->nmv_costs_hp[i][1]);
+    cpi->nmv_costs[i][0] = NULL;
+    cpi->nmv_costs[i][1] = NULL;
+    cpi->nmv_costs_hp[i][0] = NULL;
+    cpi->nmv_costs_hp[i][1] = NULL;
+  }
+#endif
+
   vpx_free(cpi->nmvcosts[0]);
   vpx_free(cpi->nmvcosts[1]);
   cpi->nmvcosts[0] = NULL;
@@ -374,13 +441,23 @@
   vpx_free(cpi->active_map.map);
   cpi->active_map.map = NULL;
 
+  // Free up-sampled reference buffers.
+  for (i = 0; i < MAX_REF_FRAMES; i++)
+    vpx_free_frame_buffer(&cpi->upsampled_ref_bufs[i].buf);
+
   vp10_free_ref_frame_buffers(cm->buffer_pool);
 #if CONFIG_VP9_POSTPROC
   vp10_free_postproc_buffers(cm);
-#endif
+#endif  // CONFIG_VP9_POSTPROC
+#if CONFIG_LOOP_RESTORATION
+  vp10_free_restoration_buffers(cm);
+#endif  // CONFIG_LOOP_RESTORATION
   vp10_free_context_buffers(cm);
 
   vpx_free_frame_buffer(&cpi->last_frame_uf);
+#if CONFIG_LOOP_RESTORATION
+  vpx_free_frame_buffer(&cpi->last_frame_db);
+#endif  // CONFIG_LOOP_RESTORATION
   vpx_free_frame_buffer(&cpi->scaled_source);
   vpx_free_frame_buffer(&cpi->scaled_last_source);
   vpx_free_frame_buffer(&cpi->alt_ref_buffer);
@@ -390,22 +467,46 @@
   cpi->tile_tok[0][0] = 0;
 
   vp10_free_pc_tree(&cpi->td);
+  vp10_free_var_tree(&cpi->td);
+
+  if (cpi->common.allow_screen_content_tools)
+    vpx_free(cpi->td.mb.palette_buffer);
 
   if (cpi->source_diff_var != NULL) {
     vpx_free(cpi->source_diff_var);
     cpi->source_diff_var = NULL;
   }
+#if CONFIG_ANS
+  vp10_buf_ans_free(&cpi->buf_ans);
+#endif  // CONFIG_ANS
 }
 
 static void save_coding_context(VP10_COMP *cpi) {
   CODING_CONTEXT *const cc = &cpi->coding_context;
   VP10_COMMON *cm = &cpi->common;
+#if CONFIG_REF_MV
+  int i;
+#endif
 
   // Stores a snapshot of key state variables which can subsequently be
   // restored with a call to vp10_restore_coding_context. These functions are
   // intended for use in a re-code loop in vp10_compress_frame where the
   // quantizer value is adjusted between loop iterations.
+#if CONFIG_REF_MV
+  for (i = 0; i < NMV_CONTEXTS; ++i) {
+    vp10_copy(cc->nmv_vec_cost[i], cpi->td.mb.nmv_vec_cost[i]);
+    memcpy(cc->nmv_costs[i][0], cpi->nmv_costs[i][0],
+           MV_VALS * sizeof(*cpi->nmv_costs[i][0]));
+    memcpy(cc->nmv_costs[i][1], cpi->nmv_costs[i][1],
+           MV_VALS * sizeof(*cpi->nmv_costs[i][1]));
+    memcpy(cc->nmv_costs_hp[i][0], cpi->nmv_costs_hp[i][0],
+           MV_VALS * sizeof(*cpi->nmv_costs_hp[i][0]));
+    memcpy(cc->nmv_costs_hp[i][1], cpi->nmv_costs_hp[i][1],
+           MV_VALS * sizeof(*cpi->nmv_costs_hp[i][1]));
+  }
+#else
   vp10_copy(cc->nmvjointcost,  cpi->td.mb.nmvjointcost);
+#endif
 
   memcpy(cc->nmvcosts[0], cpi->nmvcosts[0],
          MV_VALS * sizeof(*cpi->nmvcosts[0]));
@@ -416,10 +517,6 @@
   memcpy(cc->nmvcosts_hp[1], cpi->nmvcosts_hp[1],
          MV_VALS * sizeof(*cpi->nmvcosts_hp[1]));
 
-#if !CONFIG_MISC_FIXES
-  vp10_copy(cc->segment_pred_probs, cm->segp.pred_probs);
-#endif
-
   memcpy(cpi->coding_context.last_frame_seg_map_copy,
          cm->last_frame_seg_map, (cm->mi_rows * cm->mi_cols));
 
@@ -432,10 +529,27 @@
 static void restore_coding_context(VP10_COMP *cpi) {
   CODING_CONTEXT *const cc = &cpi->coding_context;
   VP10_COMMON *cm = &cpi->common;
+#if CONFIG_REF_MV
+  int i;
+#endif
 
   // Restore key state variables to the snapshot state stored in the
   // previous call to vp10_save_coding_context.
+#if CONFIG_REF_MV
+  for (i = 0; i < NMV_CONTEXTS; ++i) {
+    vp10_copy(cpi->td.mb.nmv_vec_cost[i], cc->nmv_vec_cost[i]);
+    memcpy(cpi->nmv_costs[i][0], cc->nmv_costs[i][0],
+           MV_VALS * sizeof(*cc->nmv_costs[i][0]));
+    memcpy(cpi->nmv_costs[i][1], cc->nmv_costs[i][1],
+           MV_VALS * sizeof(*cc->nmv_costs[i][1]));
+    memcpy(cpi->nmv_costs_hp[i][0], cc->nmv_costs_hp[i][0],
+           MV_VALS * sizeof(*cc->nmv_costs_hp[i][0]));
+    memcpy(cpi->nmv_costs_hp[i][1], cc->nmv_costs_hp[i][1],
+           MV_VALS * sizeof(*cc->nmv_costs_hp[i][1]));
+  }
+#else
   vp10_copy(cpi->td.mb.nmvjointcost, cc->nmvjointcost);
+#endif
 
   memcpy(cpi->nmvcosts[0], cc->nmvcosts[0], MV_VALS * sizeof(*cc->nmvcosts[0]));
   memcpy(cpi->nmvcosts[1], cc->nmvcosts[1], MV_VALS * sizeof(*cc->nmvcosts[1]));
@@ -444,10 +558,6 @@
   memcpy(cpi->nmvcosts_hp[1], cc->nmvcosts_hp[1],
          MV_VALS * sizeof(*cc->nmvcosts_hp[1]));
 
-#if !CONFIG_MISC_FIXES
-  vp10_copy(cm->segp.pred_probs, cc->segment_pred_probs);
-#endif
-
   memcpy(cm->last_frame_seg_map,
          cpi->coding_context.last_frame_seg_map_copy,
          (cm->mi_rows * cm->mi_cols));
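
save_coding_context() and restore_coding_context() are a strict mirror
pair: every vp10_copy()/memcpy() in the save path has a counterpart in
the restore path with source and destination swapped, which is what
makes the quantizer re-code loop safe. Reduced to its essentials
(types invented for this sketch):

#include <string.h>

typedef struct { int costs[8]; } coding_snapshot;  /* stand-in state */

static void save_ctx(coding_snapshot *snap, const int *live) {
  memcpy(snap->costs, live, sizeof(snap->costs));
}

static void restore_ctx(int *live, const coding_snapshot *snap) {
  memcpy(live, snap->costs, sizeof(snap->costs));  /* mirror of save_ctx */
}
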
@@ -606,9 +716,9 @@
     cpi->lookahead = vp10_lookahead_init(oxcf->width, oxcf->height,
                                         cm->subsampling_x, cm->subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
-                                      cm->use_highbitdepth,
+                                        cm->use_highbitdepth,
 #endif
-                                      oxcf->lag_in_frames);
+                                        oxcf->lag_in_frames);
   if (!cpi->lookahead)
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate lag buffers");
@@ -639,6 +749,19 @@
     vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                        "Failed to allocate last frame buffer");
 
+#if CONFIG_LOOP_RESTORATION
+  if (vpx_realloc_frame_buffer(&cpi->last_frame_db,
+                               cm->width, cm->height,
+                               cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                               cm->use_highbitdepth,
+#endif
+                               VP9_ENC_BORDER_IN_PIXELS, cm->byte_alignment,
+                               NULL, NULL, NULL))
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate last frame deblocked buffer");
+#endif  // CONFIG_LOOP_RESTORATION
+
   if (vpx_realloc_frame_buffer(&cpi->scaled_source,
                                cm->width, cm->height,
                                cm->subsampling_x, cm->subsampling_y,
@@ -687,6 +810,9 @@
     unsigned int tokens = get_token_alloc(cm->mb_rows, cm->mb_cols);
     CHECK_MEM_ERROR(cm, cpi->tile_tok[0][0],
         vpx_calloc(tokens, sizeof(*cpi->tile_tok[0][0])));
+#if CONFIG_ANS
+    vp10_buf_ans_alloc(&cpi->buf_ans, cm, tokens);
+#endif  // CONFIG_ANS
   }
 
   vp10_setup_pc_tree(&cpi->common, &cpi->td);
@@ -697,15 +823,63 @@
   vp10_rc_update_framerate(cpi);
 }
 
-static void set_tile_limits(VP10_COMP *cpi) {
+static void set_tile_info(VP10_COMP *cpi) {
   VP10_COMMON *const cm = &cpi->common;
 
+#if CONFIG_EXT_TILE
+#if CONFIG_EXT_PARTITION
+  if (cpi->oxcf.superblock_size != VPX_SUPERBLOCK_SIZE_64X64) {
+    cm->tile_width  = clamp(cpi->oxcf.tile_columns, 1, 32);
+    cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 32);
+    cm->tile_width  <<= MAX_MIB_SIZE_LOG2;
+    cm->tile_height <<= MAX_MIB_SIZE_LOG2;
+  } else {
+    cm->tile_width  = clamp(cpi->oxcf.tile_columns, 1, 64);
+    cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64);
+    cm->tile_width  <<= MAX_MIB_SIZE_LOG2 - 1;
+    cm->tile_height <<= MAX_MIB_SIZE_LOG2 - 1;
+  }
+#else
+  cm->tile_width  = clamp(cpi->oxcf.tile_columns, 1, 64);
+  cm->tile_height = clamp(cpi->oxcf.tile_rows, 1, 64);
+  cm->tile_width  <<= MAX_MIB_SIZE_LOG2;
+  cm->tile_height <<= MAX_MIB_SIZE_LOG2;
+#endif  // CONFIG_EXT_PARTITION
+
+  cm->tile_width  = VPXMIN(cm->tile_width, cm->mi_cols);
+  cm->tile_height = VPXMIN(cm->tile_height, cm->mi_rows);
+
+  assert(cm->tile_width >> MAX_MIB_SIZE_LOG2 <= 32);
+  assert(cm->tile_height >> MAX_MIB_SIZE_LOG2 <= 32);
+
+  // Get the number of tiles
+  cm->tile_cols = 1;
+  while (cm->tile_cols * cm->tile_width < cm->mi_cols)
+    ++cm->tile_cols;
+
+  cm->tile_rows = 1;
+  while (cm->tile_rows * cm->tile_height < cm->mi_rows)
+    ++cm->tile_rows;
+#else
   int min_log2_tile_cols, max_log2_tile_cols;
   vp10_get_tile_n_bits(cm->mi_cols, &min_log2_tile_cols, &max_log2_tile_cols);
 
   cm->log2_tile_cols = clamp(cpi->oxcf.tile_columns,
                              min_log2_tile_cols, max_log2_tile_cols);
   cm->log2_tile_rows = cpi->oxcf.tile_rows;
+
+  cm->tile_cols = 1 << cm->log2_tile_cols;
+  cm->tile_rows = 1 << cm->log2_tile_rows;
+
+  cm->tile_width = ALIGN_POWER_OF_TWO(cm->mi_cols, MAX_MIB_SIZE_LOG2);
+  cm->tile_width >>= cm->log2_tile_cols;
+  cm->tile_height = ALIGN_POWER_OF_TWO(cm->mi_rows, MAX_MIB_SIZE_LOG2);
+  cm->tile_height >>= cm->log2_tile_rows;
+
+  // round to integer multiples of max superblock size
+  cm->tile_width  = ALIGN_POWER_OF_TWO(cm->tile_width, MAX_MIB_SIZE_LOG2);
+  cm->tile_height = ALIGN_POWER_OF_TWO(cm->tile_height, MAX_MIB_SIZE_LOG2);
+#endif  // CONFIG_EXT_TILE
 }
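
In the non-EXT_TILE branch of set_tile_info() above, tile width is
derived by aligning mi_cols up to a superblock multiple, dividing by
the number of tile columns, then re-aligning the quotient. A
self-contained worked example; ALIGN_POWER_OF_TWO is reproduced here
so the sketch compiles, and the input values are arbitrary:

#include <stdio.h>

#define ALIGN_POWER_OF_TWO(value, n) \
  (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))

int main(void) {
  const int mi_cols = 100;       /* arbitrary frame width in mi units */
  const int log2_tile_cols = 1;  /* two tile columns */
  const int mib_size_log2 = 3;   /* 64x64 superblock = 8 mi units */
  int w = ALIGN_POWER_OF_TWO(mi_cols, mib_size_log2);  /* 104 */
  w >>= log2_tile_cols;                                /* 52 */
  w = ALIGN_POWER_OF_TWO(w, mib_size_log2);            /* 56 */
  printf("tile_width = %d mi units\n", w);
  return 0;
}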
 
 static void update_frame_size(VP10_COMP *cpi) {
@@ -718,13 +892,22 @@
   memset(cpi->mbmi_ext_base, 0,
          cm->mi_rows * cm->mi_cols * sizeof(*cpi->mbmi_ext_base));
 
-  set_tile_limits(cpi);
+  set_tile_info(cpi);
 }
 
 static void init_buffer_indices(VP10_COMP *cpi) {
+#if CONFIG_EXT_REFS
+  int fb_idx;
+  for (fb_idx = 0; fb_idx < LAST_REF_FRAMES; ++fb_idx)
+    cpi->lst_fb_idxes[fb_idx] = fb_idx;
+  cpi->gld_fb_idx = LAST_REF_FRAMES;
+  cpi->bwd_fb_idx = LAST_REF_FRAMES + 1;
+  cpi->alt_fb_idx = LAST_REF_FRAMES + 2;
+#else
   cpi->lst_fb_idx = 0;
   cpi->gld_fb_idx = 1;
   cpi->alt_fb_idx = 2;
+#endif  // CONFIG_EXT_REFS
 }
 
 static void init_config(struct VP10_COMP *cpi, VP10EncoderConfig *oxcf) {
@@ -913,6 +1096,19 @@
   sad_array[i] >>= 4; \
 }
 
+#if CONFIG_EXT_PARTITION
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad128x128)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad128x128_avg)
+MAKE_BFP_SAD3_WRAPPER(vpx_highbd_sad128x128x3)
+MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad128x128x8)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad128x128x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad128x64)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad128x64_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad128x64x4d)
+MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad64x128)
+MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad64x128_avg)
+MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad64x128x4d)
+#endif  // CONFIG_EXT_PARTITION
 MAKE_BFP_SAD_WRAPPER(vpx_highbd_sad32x16)
 MAKE_BFP_SADAVG_WRAPPER(vpx_highbd_sad32x16_avg)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad32x16x4d)
@@ -969,6 +1165,101 @@
 MAKE_BFP_SAD8_WRAPPER(vpx_highbd_sad4x4x8)
 MAKE_BFP_SAD4D_WRAPPER(vpx_highbd_sad4x4x4d)
 
+#if CONFIG_EXT_INTER
+#define HIGHBD_MBFP(BT, MSDF, MVF, MSVF)         \
+  cpi->fn_ptr[BT].msdf            = MSDF; \
+  cpi->fn_ptr[BT].mvf             = MVF;  \
+  cpi->fn_ptr[BT].msvf            = MSVF;
+
+#define MAKE_MBFP_SAD_WRAPPER(fnname) \
+static unsigned int fnname##_bits8(const uint8_t *src_ptr, \
+                                   int source_stride, \
+                                   const uint8_t *ref_ptr, \
+                                   int ref_stride, \
+                                   const uint8_t *m, \
+                                   int m_stride) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+                m, m_stride); \
+} \
+static unsigned int fnname##_bits10(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride, \
+                                    const uint8_t *m, \
+                                    int m_stride) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+                m, m_stride) >> 2; \
+} \
+static unsigned int fnname##_bits12(const uint8_t *src_ptr, \
+                                    int source_stride, \
+                                    const uint8_t *ref_ptr, \
+                                    int ref_stride, \
+                                    const uint8_t *m, \
+                                    int m_stride) {  \
+  return fnname(src_ptr, source_stride, ref_ptr, ref_stride, \
+                m, m_stride) >> 4; \
+}
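
The _bits10/_bits12 wrappers generated above shift the raw SAD right
by 2 and 4 because a b-bit sample difference spans 2^(b-8) times the
8-bit range: 10-bit SADs come out 4x larger and 12-bit SADs 16x
larger, and the shift puts every bit depth back on a common 8-bit
cost scale. The rule in isolation (the function-pointer type is
invented for this sketch):

#include <stdint.h>

typedef unsigned int (*sad_fn)(const uint8_t *src, int src_stride,
                               const uint8_t *ref, int ref_stride);

/* Normalize a high-bit-depth SAD to the 8-bit scale. */
static unsigned int sad_at_8bit_scale(sad_fn fn, const uint8_t *src,
                                      int src_stride, const uint8_t *ref,
                                      int ref_stride, int bit_depth) {
  const unsigned int raw = fn(src, src_stride, ref, ref_stride);
  return raw >> (bit_depth - 8);  /* >>0, >>2, >>4 for 8/10/12 bits */
}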
+
+#if CONFIG_EXT_PARTITION
+MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad128x128)
+MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad128x64)
+MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad64x128)
+#endif  // CONFIG_EXT_PARTITION
+MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad64x64)
+MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad64x32)
+MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad32x64)
+MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad32x32)
+MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad32x16)
+MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad16x32)
+MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad16x16)
+MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad16x8)
+MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad8x16)
+MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad8x8)
+MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad8x4)
+MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad4x8)
+MAKE_MBFP_SAD_WRAPPER(vpx_highbd_masked_sad4x4)
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_OBMC
+#define HIGHBD_OBFP(BT, OSDF, OVF, OSVF)                                   \
+  cpi->fn_ptr[BT].osdf            = OSDF;                                  \
+  cpi->fn_ptr[BT].ovf             = OVF;                                   \
+  cpi->fn_ptr[BT].osvf            = OSVF;
+
+#define MAKE_OBFP_SAD_WRAPPER(fnname)                                      \
+static unsigned int fnname##_bits8(const uint8_t *ref, int ref_stride,     \
+                                   const int32_t *wsrc, const int32_t *msk) { \
+  return fnname(ref, ref_stride, wsrc, msk);                               \
+}                                                                          \
+static unsigned int fnname##_bits10(const uint8_t *ref, int ref_stride,    \
+                                    const int32_t *wsrc, const int32_t *msk) { \
+  return fnname(ref, ref_stride, wsrc, msk) >> 2;                          \
+}                                                                          \
+static unsigned int fnname##_bits12(const uint8_t *ref, int ref_stride,    \
+                                    const int32_t *wsrc, const int32_t *msk) { \
+  return fnname(ref, ref_stride, wsrc, msk) >> 4;                          \
+}
+
+#if CONFIG_EXT_PARTITION
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad128x128)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad128x64)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad64x128)
+#endif  // CONFIG_EXT_PARTITION
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad64x64)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad64x32)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad32x64)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad32x32)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad32x16)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad16x32)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad16x16)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad16x8)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad8x16)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad8x8)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad8x4)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad4x8)
+MAKE_OBFP_SAD_WRAPPER(vpx_highbd_obmc_sad4x4)
+#endif  // CONFIG_OBMC
+
 static void  highbd_set_var_fns(VP10_COMP *const cpi) {
   VP10_COMMON *const cm = &cpi->common;
   if (cm->use_highbitdepth) {
@@ -1103,6 +1394,175 @@
                    vpx_highbd_sad4x4x3_bits8,
                    vpx_highbd_sad4x4x8_bits8,
                    vpx_highbd_sad4x4x4d_bits8)
+
+#if CONFIG_EXT_PARTITION
+        HIGHBD_BFP(BLOCK_128X128,
+                   vpx_highbd_sad128x128_bits8,
+                   vpx_highbd_sad128x128_avg_bits8,
+                   vpx_highbd_8_variance128x128,
+                   vpx_highbd_8_sub_pixel_variance128x128,
+                   vpx_highbd_8_sub_pixel_avg_variance128x128,
+                   vpx_highbd_sad128x128x3_bits8,
+                   vpx_highbd_sad128x128x8_bits8,
+                   vpx_highbd_sad128x128x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_128X64,
+                   vpx_highbd_sad128x64_bits8,
+                   vpx_highbd_sad128x64_avg_bits8,
+                   vpx_highbd_8_variance128x64,
+                   vpx_highbd_8_sub_pixel_variance128x64,
+                   vpx_highbd_8_sub_pixel_avg_variance128x64,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad128x64x4d_bits8)
+
+        HIGHBD_BFP(BLOCK_64X128,
+                   vpx_highbd_sad64x128_bits8,
+                   vpx_highbd_sad64x128_avg_bits8,
+                   vpx_highbd_8_variance64x128,
+                   vpx_highbd_8_sub_pixel_variance64x128,
+                   vpx_highbd_8_sub_pixel_avg_variance64x128,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad64x128x4d_bits8)
+#endif  // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_INTER
+#if CONFIG_EXT_PARTITION
+        HIGHBD_MBFP(BLOCK_128X128,
+                    vpx_highbd_masked_sad128x128_bits8,
+                    vpx_highbd_masked_variance128x128,
+                    vpx_highbd_masked_sub_pixel_variance128x128)
+        HIGHBD_MBFP(BLOCK_128X64,
+                    vpx_highbd_masked_sad128x64_bits8,
+                    vpx_highbd_masked_variance128x64,
+                    vpx_highbd_masked_sub_pixel_variance128x64)
+        HIGHBD_MBFP(BLOCK_64X128,
+                    vpx_highbd_masked_sad64x128_bits8,
+                    vpx_highbd_masked_variance64x128,
+                    vpx_highbd_masked_sub_pixel_variance64x128)
+#endif  // CONFIG_EXT_PARTITION
+        HIGHBD_MBFP(BLOCK_64X64,
+                    vpx_highbd_masked_sad64x64_bits8,
+                    vpx_highbd_masked_variance64x64,
+                    vpx_highbd_masked_sub_pixel_variance64x64)
+        HIGHBD_MBFP(BLOCK_64X32,
+                    vpx_highbd_masked_sad64x32_bits8,
+                    vpx_highbd_masked_variance64x32,
+                    vpx_highbd_masked_sub_pixel_variance64x32)
+        HIGHBD_MBFP(BLOCK_32X64,
+                    vpx_highbd_masked_sad32x64_bits8,
+                    vpx_highbd_masked_variance32x64,
+                    vpx_highbd_masked_sub_pixel_variance32x64)
+        HIGHBD_MBFP(BLOCK_32X32,
+                    vpx_highbd_masked_sad32x32_bits8,
+                    vpx_highbd_masked_variance32x32,
+                    vpx_highbd_masked_sub_pixel_variance32x32)
+        HIGHBD_MBFP(BLOCK_32X16,
+                    vpx_highbd_masked_sad32x16_bits8,
+                    vpx_highbd_masked_variance32x16,
+                    vpx_highbd_masked_sub_pixel_variance32x16)
+        HIGHBD_MBFP(BLOCK_16X32,
+                    vpx_highbd_masked_sad16x32_bits8,
+                    vpx_highbd_masked_variance16x32,
+                    vpx_highbd_masked_sub_pixel_variance16x32)
+        HIGHBD_MBFP(BLOCK_16X16,
+                    vpx_highbd_masked_sad16x16_bits8,
+                    vpx_highbd_masked_variance16x16,
+                    vpx_highbd_masked_sub_pixel_variance16x16)
+        HIGHBD_MBFP(BLOCK_8X16,
+                    vpx_highbd_masked_sad8x16_bits8,
+                    vpx_highbd_masked_variance8x16,
+                    vpx_highbd_masked_sub_pixel_variance8x16)
+        HIGHBD_MBFP(BLOCK_16X8,
+                    vpx_highbd_masked_sad16x8_bits8,
+                    vpx_highbd_masked_variance16x8,
+                    vpx_highbd_masked_sub_pixel_variance16x8)
+        HIGHBD_MBFP(BLOCK_8X8,
+                    vpx_highbd_masked_sad8x8_bits8,
+                    vpx_highbd_masked_variance8x8,
+                    vpx_highbd_masked_sub_pixel_variance8x8)
+        HIGHBD_MBFP(BLOCK_4X8,
+                    vpx_highbd_masked_sad4x8_bits8,
+                    vpx_highbd_masked_variance4x8,
+                    vpx_highbd_masked_sub_pixel_variance4x8)
+        HIGHBD_MBFP(BLOCK_8X4,
+                    vpx_highbd_masked_sad8x4_bits8,
+                    vpx_highbd_masked_variance8x4,
+                    vpx_highbd_masked_sub_pixel_variance8x4)
+        HIGHBD_MBFP(BLOCK_4X4,
+                    vpx_highbd_masked_sad4x4_bits8,
+                    vpx_highbd_masked_variance4x4,
+                    vpx_highbd_masked_sub_pixel_variance4x4)
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_OBMC
+#if CONFIG_EXT_PARTITION
+        HIGHBD_OBFP(BLOCK_128X128,
+                    vpx_highbd_obmc_sad128x128_bits8,
+                    vpx_highbd_obmc_variance128x128,
+                    vpx_highbd_obmc_sub_pixel_variance128x128)
+        HIGHBD_OBFP(BLOCK_128X64,
+                    vpx_highbd_obmc_sad128x64_bits8,
+                    vpx_highbd_obmc_variance128x64,
+                    vpx_highbd_obmc_sub_pixel_variance128x64)
+        HIGHBD_OBFP(BLOCK_64X128,
+                    vpx_highbd_obmc_sad64x128_bits8,
+                    vpx_highbd_obmc_variance64x128,
+                    vpx_highbd_obmc_sub_pixel_variance64x128)
+#endif  // CONFIG_EXT_PARTITION
+        HIGHBD_OBFP(BLOCK_64X64,
+                    vpx_highbd_obmc_sad64x64_bits8,
+                    vpx_highbd_obmc_variance64x64,
+                    vpx_highbd_obmc_sub_pixel_variance64x64)
+        HIGHBD_OBFP(BLOCK_64X32,
+                    vpx_highbd_obmc_sad64x32_bits8,
+                    vpx_highbd_obmc_variance64x32,
+                    vpx_highbd_obmc_sub_pixel_variance64x32)
+        HIGHBD_OBFP(BLOCK_32X64,
+                    vpx_highbd_obmc_sad32x64_bits8,
+                    vpx_highbd_obmc_variance32x64,
+                    vpx_highbd_obmc_sub_pixel_variance32x64)
+        HIGHBD_OBFP(BLOCK_32X32,
+                    vpx_highbd_obmc_sad32x32_bits8,
+                    vpx_highbd_obmc_variance32x32,
+                    vpx_highbd_obmc_sub_pixel_variance32x32)
+        HIGHBD_OBFP(BLOCK_32X16,
+                    vpx_highbd_obmc_sad32x16_bits8,
+                    vpx_highbd_obmc_variance32x16,
+                    vpx_highbd_obmc_sub_pixel_variance32x16)
+        HIGHBD_OBFP(BLOCK_16X32,
+                    vpx_highbd_obmc_sad16x32_bits8,
+                    vpx_highbd_obmc_variance16x32,
+                    vpx_highbd_obmc_sub_pixel_variance16x32)
+        HIGHBD_OBFP(BLOCK_16X16,
+                    vpx_highbd_obmc_sad16x16_bits8,
+                    vpx_highbd_obmc_variance16x16,
+                    vpx_highbd_obmc_sub_pixel_variance16x16)
+        HIGHBD_OBFP(BLOCK_8X16,
+                    vpx_highbd_obmc_sad8x16_bits8,
+                    vpx_highbd_obmc_variance8x16,
+                    vpx_highbd_obmc_sub_pixel_variance8x16)
+        HIGHBD_OBFP(BLOCK_16X8,
+                    vpx_highbd_obmc_sad16x8_bits8,
+                    vpx_highbd_obmc_variance16x8,
+                    vpx_highbd_obmc_sub_pixel_variance16x8)
+        HIGHBD_OBFP(BLOCK_8X8,
+                    vpx_highbd_obmc_sad8x8_bits8,
+                    vpx_highbd_obmc_variance8x8,
+                    vpx_highbd_obmc_sub_pixel_variance8x8)
+        HIGHBD_OBFP(BLOCK_4X8,
+                    vpx_highbd_obmc_sad4x8_bits8,
+                    vpx_highbd_obmc_variance4x8,
+                    vpx_highbd_obmc_sub_pixel_variance4x8)
+        HIGHBD_OBFP(BLOCK_8X4,
+                    vpx_highbd_obmc_sad8x4_bits8,
+                    vpx_highbd_obmc_variance8x4,
+                    vpx_highbd_obmc_sub_pixel_variance8x4)
+        HIGHBD_OBFP(BLOCK_4X4,
+                    vpx_highbd_obmc_sad4x4_bits8,
+                    vpx_highbd_obmc_variance4x4,
+                    vpx_highbd_obmc_sub_pixel_variance4x4)
+#endif  // CONFIG_OBMC
         break;
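
Each HIGHBD_BFP/MBFP/OBFP invocation above fills one row of the
per-block-size fn_ptr dispatch table, and the rectangular 128-wide
sizes register NULL for the x3/x8 SAD helpers, so callers of the
optional members must guard against NULL. A minimal model of the
table shape (types simplified and names invented for this sketch; the
real x3/x8 kernels have different signatures):

#include <stdint.h>

typedef unsigned int (*sad_fn_t)(const uint8_t *, int, const uint8_t *, int);

typedef struct {
  sad_fn_t sdf;    /* plain SAD: always registered */
  sad_fn_t sdx8f;  /* batched helper: NULL for e.g. BLOCK_128X64 above */
} fn_ptr_row;

static unsigned int sad_with_fallback(const fn_ptr_row *fp,
                                      const uint8_t *src,
                                      const uint8_t *ref, int stride) {
  /* Prefer the specialized kernel, but tolerate a missing entry. */
  const sad_fn_t fn = fp->sdx8f ? fp->sdx8f : fp->sdf;
  return fn(src, stride, ref, stride);
}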
 
       case VPX_BITS_10:
@@ -1235,6 +1695,175 @@
                    vpx_highbd_sad4x4x3_bits10,
                    vpx_highbd_sad4x4x8_bits10,
                    vpx_highbd_sad4x4x4d_bits10)
+
+#if CONFIG_EXT_PARTITION
+        HIGHBD_BFP(BLOCK_128X128,
+                   vpx_highbd_sad128x128_bits10,
+                   vpx_highbd_sad128x128_avg_bits10,
+                   vpx_highbd_10_variance128x128,
+                   vpx_highbd_10_sub_pixel_variance128x128,
+                   vpx_highbd_10_sub_pixel_avg_variance128x128,
+                   vpx_highbd_sad128x128x3_bits10,
+                   vpx_highbd_sad128x128x8_bits10,
+                   vpx_highbd_sad128x128x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_128X64,
+                   vpx_highbd_sad128x64_bits10,
+                   vpx_highbd_sad128x64_avg_bits10,
+                   vpx_highbd_10_variance128x64,
+                   vpx_highbd_10_sub_pixel_variance128x64,
+                   vpx_highbd_10_sub_pixel_avg_variance128x64,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad128x64x4d_bits10)
+
+        HIGHBD_BFP(BLOCK_64X128,
+                   vpx_highbd_sad64x128_bits10,
+                   vpx_highbd_sad64x128_avg_bits10,
+                   vpx_highbd_10_variance64x128,
+                   vpx_highbd_10_sub_pixel_variance64x128,
+                   vpx_highbd_10_sub_pixel_avg_variance64x128,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad64x128x4d_bits10)
+#endif  // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_INTER
+#if CONFIG_EXT_PARTITION
+        HIGHBD_MBFP(BLOCK_128X128,
+                    vpx_highbd_masked_sad128x128_bits10,
+                    vpx_highbd_10_masked_variance128x128,
+                    vpx_highbd_10_masked_sub_pixel_variance128x128)
+        HIGHBD_MBFP(BLOCK_128X64,
+                    vpx_highbd_masked_sad128x64_bits10,
+                    vpx_highbd_10_masked_variance128x64,
+                    vpx_highbd_10_masked_sub_pixel_variance128x64)
+        HIGHBD_MBFP(BLOCK_64X128,
+                    vpx_highbd_masked_sad64x128_bits10,
+                    vpx_highbd_10_masked_variance64x128,
+                    vpx_highbd_10_masked_sub_pixel_variance64x128)
+#endif  // CONFIG_EXT_PARTITION
+        HIGHBD_MBFP(BLOCK_64X64,
+                    vpx_highbd_masked_sad64x64_bits10,
+                    vpx_highbd_10_masked_variance64x64,
+                    vpx_highbd_10_masked_sub_pixel_variance64x64)
+        HIGHBD_MBFP(BLOCK_64X32,
+                    vpx_highbd_masked_sad64x32_bits10,
+                    vpx_highbd_10_masked_variance64x32,
+                    vpx_highbd_10_masked_sub_pixel_variance64x32)
+        HIGHBD_MBFP(BLOCK_32X64,
+                    vpx_highbd_masked_sad32x64_bits10,
+                    vpx_highbd_10_masked_variance32x64,
+                    vpx_highbd_10_masked_sub_pixel_variance32x64)
+        HIGHBD_MBFP(BLOCK_32X32,
+                    vpx_highbd_masked_sad32x32_bits10,
+                    vpx_highbd_10_masked_variance32x32,
+                    vpx_highbd_10_masked_sub_pixel_variance32x32)
+        HIGHBD_MBFP(BLOCK_32X16,
+                    vpx_highbd_masked_sad32x16_bits10,
+                    vpx_highbd_10_masked_variance32x16,
+                    vpx_highbd_10_masked_sub_pixel_variance32x16)
+        HIGHBD_MBFP(BLOCK_16X32,
+                    vpx_highbd_masked_sad16x32_bits10,
+                    vpx_highbd_10_masked_variance16x32,
+                    vpx_highbd_10_masked_sub_pixel_variance16x32)
+        HIGHBD_MBFP(BLOCK_16X16,
+                    vpx_highbd_masked_sad16x16_bits10,
+                    vpx_highbd_10_masked_variance16x16,
+                    vpx_highbd_10_masked_sub_pixel_variance16x16)
+        HIGHBD_MBFP(BLOCK_8X16,
+                    vpx_highbd_masked_sad8x16_bits10,
+                    vpx_highbd_10_masked_variance8x16,
+                    vpx_highbd_10_masked_sub_pixel_variance8x16)
+        HIGHBD_MBFP(BLOCK_16X8,
+                    vpx_highbd_masked_sad16x8_bits10,
+                    vpx_highbd_10_masked_variance16x8,
+                    vpx_highbd_10_masked_sub_pixel_variance16x8)
+        HIGHBD_MBFP(BLOCK_8X8,
+                    vpx_highbd_masked_sad8x8_bits10,
+                    vpx_highbd_10_masked_variance8x8,
+                    vpx_highbd_10_masked_sub_pixel_variance8x8)
+        HIGHBD_MBFP(BLOCK_4X8,
+                    vpx_highbd_masked_sad4x8_bits10,
+                    vpx_highbd_10_masked_variance4x8,
+                    vpx_highbd_10_masked_sub_pixel_variance4x8)
+        HIGHBD_MBFP(BLOCK_8X4,
+                    vpx_highbd_masked_sad8x4_bits10,
+                    vpx_highbd_10_masked_variance8x4,
+                    vpx_highbd_10_masked_sub_pixel_variance8x4)
+        HIGHBD_MBFP(BLOCK_4X4,
+                    vpx_highbd_masked_sad4x4_bits10,
+                    vpx_highbd_10_masked_variance4x4,
+                    vpx_highbd_10_masked_sub_pixel_variance4x4)
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_OBMC
+#if CONFIG_EXT_PARTITION
+        HIGHBD_OBFP(BLOCK_128X128,
+                    vpx_highbd_obmc_sad128x128_bits10,
+                    vpx_highbd_10_obmc_variance128x128,
+                    vpx_highbd_10_obmc_sub_pixel_variance128x128)
+        HIGHBD_OBFP(BLOCK_128X64,
+                    vpx_highbd_obmc_sad128x64_bits10,
+                    vpx_highbd_10_obmc_variance128x64,
+                    vpx_highbd_10_obmc_sub_pixel_variance128x64)
+        HIGHBD_OBFP(BLOCK_64X128,
+                    vpx_highbd_obmc_sad64x128_bits10,
+                    vpx_highbd_10_obmc_variance64x128,
+                    vpx_highbd_10_obmc_sub_pixel_variance64x128)
+#endif  // CONFIG_EXT_PARTITION
+        HIGHBD_OBFP(BLOCK_64X64,
+                    vpx_highbd_obmc_sad64x64_bits10,
+                    vpx_highbd_10_obmc_variance64x64,
+                    vpx_highbd_10_obmc_sub_pixel_variance64x64)
+        HIGHBD_OBFP(BLOCK_64X32,
+                    vpx_highbd_obmc_sad64x32_bits10,
+                    vpx_highbd_10_obmc_variance64x32,
+                    vpx_highbd_10_obmc_sub_pixel_variance64x32)
+        HIGHBD_OBFP(BLOCK_32X64,
+                    vpx_highbd_obmc_sad32x64_bits10,
+                    vpx_highbd_10_obmc_variance32x64,
+                    vpx_highbd_10_obmc_sub_pixel_variance32x64)
+        HIGHBD_OBFP(BLOCK_32X32,
+                    vpx_highbd_obmc_sad32x32_bits10,
+                    vpx_highbd_10_obmc_variance32x32,
+                    vpx_highbd_10_obmc_sub_pixel_variance32x32)
+        HIGHBD_OBFP(BLOCK_32X16,
+                    vpx_highbd_obmc_sad32x16_bits10,
+                    vpx_highbd_10_obmc_variance32x16,
+                    vpx_highbd_10_obmc_sub_pixel_variance32x16)
+        HIGHBD_OBFP(BLOCK_16X32,
+                    vpx_highbd_obmc_sad16x32_bits10,
+                    vpx_highbd_10_obmc_variance16x32,
+                    vpx_highbd_10_obmc_sub_pixel_variance16x32)
+        HIGHBD_OBFP(BLOCK_16X16,
+                    vpx_highbd_obmc_sad16x16_bits10,
+                    vpx_highbd_10_obmc_variance16x16,
+                    vpx_highbd_10_obmc_sub_pixel_variance16x16)
+        HIGHBD_OBFP(BLOCK_8X16,
+                    vpx_highbd_obmc_sad8x16_bits10,
+                    vpx_highbd_10_obmc_variance8x16,
+                    vpx_highbd_10_obmc_sub_pixel_variance8x16)
+        HIGHBD_OBFP(BLOCK_16X8,
+                    vpx_highbd_obmc_sad16x8_bits10,
+                    vpx_highbd_10_obmc_variance16x8,
+                    vpx_highbd_10_obmc_sub_pixel_variance16x8)
+        HIGHBD_OBFP(BLOCK_8X8,
+                    vpx_highbd_obmc_sad8x8_bits10,
+                    vpx_highbd_10_obmc_variance8x8,
+                    vpx_highbd_10_obmc_sub_pixel_variance8x8)
+        HIGHBD_OBFP(BLOCK_4X8,
+                    vpx_highbd_obmc_sad4x8_bits10,
+                    vpx_highbd_10_obmc_variance4x8,
+                    vpx_highbd_10_obmc_sub_pixel_variance4x8)
+        HIGHBD_OBFP(BLOCK_8X4,
+                    vpx_highbd_obmc_sad8x4_bits10,
+                    vpx_highbd_10_obmc_variance8x4,
+                    vpx_highbd_10_obmc_sub_pixel_variance8x4)
+        HIGHBD_OBFP(BLOCK_4X4,
+                    vpx_highbd_obmc_sad4x4_bits10,
+                    vpx_highbd_10_obmc_variance4x4,
+                    vpx_highbd_10_obmc_sub_pixel_variance4x4)
+#endif  // CONFIG_OBMC
         break;
 
       case VPX_BITS_12:
@@ -1367,6 +1996,176 @@
                    vpx_highbd_sad4x4x3_bits12,
                    vpx_highbd_sad4x4x8_bits12,
                    vpx_highbd_sad4x4x4d_bits12)
+
+#if CONFIG_EXT_PARTITION
+        HIGHBD_BFP(BLOCK_128X128,
+                   vpx_highbd_sad128x128_bits12,
+                   vpx_highbd_sad128x128_avg_bits12,
+                   vpx_highbd_12_variance128x128,
+                   vpx_highbd_12_sub_pixel_variance128x128,
+                   vpx_highbd_12_sub_pixel_avg_variance128x128,
+                   vpx_highbd_sad128x128x3_bits12,
+                   vpx_highbd_sad128x128x8_bits12,
+                   vpx_highbd_sad128x128x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_128X64,
+                   vpx_highbd_sad128x64_bits12,
+                   vpx_highbd_sad128x64_avg_bits12,
+                   vpx_highbd_12_variance128x64,
+                   vpx_highbd_12_sub_pixel_variance128x64,
+                   vpx_highbd_12_sub_pixel_avg_variance128x64,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad128x64x4d_bits12)
+
+        HIGHBD_BFP(BLOCK_64X128,
+                   vpx_highbd_sad64x128_bits12,
+                   vpx_highbd_sad64x128_avg_bits12,
+                   vpx_highbd_12_variance64x128,
+                   vpx_highbd_12_sub_pixel_variance64x128,
+                   vpx_highbd_12_sub_pixel_avg_variance64x128,
+                   NULL,
+                   NULL,
+                   vpx_highbd_sad64x128x4d_bits12)
+#endif  // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_INTER
+#if CONFIG_EXT_PARTITION
+        HIGHBD_MBFP(BLOCK_128X128,
+                    vpx_highbd_masked_sad128x128_bits12,
+                    vpx_highbd_12_masked_variance128x128,
+                    vpx_highbd_12_masked_sub_pixel_variance128x128)
+        HIGHBD_MBFP(BLOCK_128X64,
+                    vpx_highbd_masked_sad128x64_bits12,
+                    vpx_highbd_12_masked_variance128x64,
+                    vpx_highbd_12_masked_sub_pixel_variance128x64)
+        HIGHBD_MBFP(BLOCK_64X128,
+                    vpx_highbd_masked_sad64x128_bits12,
+                    vpx_highbd_12_masked_variance64x128,
+                    vpx_highbd_12_masked_sub_pixel_variance64x128)
+#endif  // CONFIG_EXT_PARTITION
+        HIGHBD_MBFP(BLOCK_64X64,
+                    vpx_highbd_masked_sad64x64_bits12,
+                    vpx_highbd_12_masked_variance64x64,
+                    vpx_highbd_12_masked_sub_pixel_variance64x64)
+        HIGHBD_MBFP(BLOCK_64X32,
+                    vpx_highbd_masked_sad64x32_bits12,
+                    vpx_highbd_12_masked_variance64x32,
+                    vpx_highbd_12_masked_sub_pixel_variance64x32)
+        HIGHBD_MBFP(BLOCK_32X64,
+                    vpx_highbd_masked_sad32x64_bits12,
+                    vpx_highbd_12_masked_variance32x64,
+                    vpx_highbd_12_masked_sub_pixel_variance32x64)
+        HIGHBD_MBFP(BLOCK_32X32,
+                    vpx_highbd_masked_sad32x32_bits12,
+                    vpx_highbd_12_masked_variance32x32,
+                    vpx_highbd_12_masked_sub_pixel_variance32x32)
+        HIGHBD_MBFP(BLOCK_32X16,
+                    vpx_highbd_masked_sad32x16_bits12,
+                    vpx_highbd_12_masked_variance32x16,
+                    vpx_highbd_12_masked_sub_pixel_variance32x16)
+        HIGHBD_MBFP(BLOCK_16X32,
+                    vpx_highbd_masked_sad16x32_bits12,
+                    vpx_highbd_12_masked_variance16x32,
+                    vpx_highbd_12_masked_sub_pixel_variance16x32)
+        HIGHBD_MBFP(BLOCK_16X16,
+                    vpx_highbd_masked_sad16x16_bits12,
+                    vpx_highbd_12_masked_variance16x16,
+                    vpx_highbd_12_masked_sub_pixel_variance16x16)
+        HIGHBD_MBFP(BLOCK_8X16,
+                    vpx_highbd_masked_sad8x16_bits12,
+                    vpx_highbd_12_masked_variance8x16,
+                    vpx_highbd_12_masked_sub_pixel_variance8x16)
+        HIGHBD_MBFP(BLOCK_16X8,
+                    vpx_highbd_masked_sad16x8_bits12,
+                    vpx_highbd_12_masked_variance16x8,
+                    vpx_highbd_12_masked_sub_pixel_variance16x8)
+        HIGHBD_MBFP(BLOCK_8X8,
+                    vpx_highbd_masked_sad8x8_bits12,
+                    vpx_highbd_12_masked_variance8x8,
+                    vpx_highbd_12_masked_sub_pixel_variance8x8)
+        HIGHBD_MBFP(BLOCK_4X8,
+                    vpx_highbd_masked_sad4x8_bits12,
+                    vpx_highbd_12_masked_variance4x8,
+                    vpx_highbd_12_masked_sub_pixel_variance4x8)
+        HIGHBD_MBFP(BLOCK_8X4,
+                    vpx_highbd_masked_sad8x4_bits12,
+                    vpx_highbd_12_masked_variance8x4,
+                    vpx_highbd_12_masked_sub_pixel_variance8x4)
+        HIGHBD_MBFP(BLOCK_4X4,
+                    vpx_highbd_masked_sad4x4_bits12,
+                    vpx_highbd_12_masked_variance4x4,
+                    vpx_highbd_12_masked_sub_pixel_variance4x4)
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_OBMC
+#if CONFIG_EXT_PARTITION
+        HIGHBD_OBFP(BLOCK_128X128,
+                    vpx_highbd_obmc_sad128x128_bits12,
+                    vpx_highbd_12_obmc_variance128x128,
+                    vpx_highbd_12_obmc_sub_pixel_variance128x128)
+        HIGHBD_OBFP(BLOCK_128X64,
+                    vpx_highbd_obmc_sad128x64_bits12,
+                    vpx_highbd_12_obmc_variance128x64,
+                    vpx_highbd_12_obmc_sub_pixel_variance128x64)
+        HIGHBD_OBFP(BLOCK_64X128,
+                    vpx_highbd_obmc_sad64x128_bits12,
+                    vpx_highbd_12_obmc_variance64x128,
+                    vpx_highbd_12_obmc_sub_pixel_variance64x128)
+#endif  // CONFIG_EXT_PARTITION
+        HIGHBD_OBFP(BLOCK_64X64,
+                    vpx_highbd_obmc_sad64x64_bits12,
+                    vpx_highbd_12_obmc_variance64x64,
+                    vpx_highbd_12_obmc_sub_pixel_variance64x64)
+        HIGHBD_OBFP(BLOCK_64X32,
+                    vpx_highbd_obmc_sad64x32_bits12,
+                    vpx_highbd_12_obmc_variance64x32,
+                    vpx_highbd_12_obmc_sub_pixel_variance64x32)
+        HIGHBD_OBFP(BLOCK_32X64,
+                    vpx_highbd_obmc_sad32x64_bits12,
+                    vpx_highbd_12_obmc_variance32x64,
+                    vpx_highbd_12_obmc_sub_pixel_variance32x64)
+        HIGHBD_OBFP(BLOCK_32X32,
+                    vpx_highbd_obmc_sad32x32_bits12,
+                    vpx_highbd_12_obmc_variance32x32,
+                    vpx_highbd_12_obmc_sub_pixel_variance32x32)
+        HIGHBD_OBFP(BLOCK_32X16,
+                    vpx_highbd_obmc_sad32x16_bits12,
+                    vpx_highbd_12_obmc_variance32x16,
+                    vpx_highbd_12_obmc_sub_pixel_variance32x16)
+        HIGHBD_OBFP(BLOCK_16X32,
+                    vpx_highbd_obmc_sad16x32_bits12,
+                    vpx_highbd_12_obmc_variance16x32,
+                    vpx_highbd_12_obmc_sub_pixel_variance16x32)
+        HIGHBD_OBFP(BLOCK_16X16,
+                    vpx_highbd_obmc_sad16x16_bits12,
+                    vpx_highbd_12_obmc_variance16x16,
+                    vpx_highbd_12_obmc_sub_pixel_variance16x16)
+        HIGHBD_OBFP(BLOCK_8X16,
+                    vpx_highbd_obmc_sad8x16_bits12,
+                    vpx_highbd_12_obmc_variance8x16,
+                    vpx_highbd_12_obmc_sub_pixel_variance8x16)
+        HIGHBD_OBFP(BLOCK_16X8,
+                    vpx_highbd_obmc_sad16x8_bits12,
+                    vpx_highbd_12_obmc_variance16x8,
+                    vpx_highbd_12_obmc_sub_pixel_variance16x8)
+        HIGHBD_OBFP(BLOCK_8X8,
+                    vpx_highbd_obmc_sad8x8_bits12,
+                    vpx_highbd_12_obmc_variance8x8,
+                    vpx_highbd_12_obmc_sub_pixel_variance8x8)
+        HIGHBD_OBFP(BLOCK_4X8,
+                    vpx_highbd_obmc_sad4x8_bits12,
+                    vpx_highbd_12_obmc_variance4x8,
+                    vpx_highbd_12_obmc_sub_pixel_variance4x8)
+        HIGHBD_OBFP(BLOCK_8X4,
+                    vpx_highbd_obmc_sad8x4_bits12,
+                    vpx_highbd_12_obmc_variance8x4,
+                    vpx_highbd_12_obmc_sub_pixel_variance8x4)
+        HIGHBD_OBFP(BLOCK_4X4,
+                    vpx_highbd_obmc_sad4x4_bits12,
+                    vpx_highbd_12_obmc_variance4x4,
+                    vpx_highbd_12_obmc_sub_pixel_variance4x4)
+#endif  // CONFIG_OBMC
         break;
 
       default:
@@ -1429,14 +2228,30 @@
     rc->baseline_gf_interval = (MIN_GF_INTERVAL + MAX_GF_INTERVAL) / 2;
   }
 
-  cpi->refresh_golden_frame = 0;
   cpi->refresh_last_frame = 1;
+  cpi->refresh_golden_frame = 0;
+#if CONFIG_EXT_REFS
+  cpi->refresh_bwd_ref_frame = 0;
+#endif  // CONFIG_EXT_REFS
+
   cm->refresh_frame_context =
-      oxcf->error_resilient_mode ? REFRESH_FRAME_CONTEXT_OFF :
-          oxcf->frame_parallel_decoding_mode ? REFRESH_FRAME_CONTEXT_FORWARD
-                                             : REFRESH_FRAME_CONTEXT_BACKWARD;
+      (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode) ?
+          REFRESH_FRAME_CONTEXT_FORWARD : REFRESH_FRAME_CONTEXT_BACKWARD;
   cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
 
+  cm->allow_screen_content_tools = (cpi->oxcf.content == VP9E_CONTENT_SCREEN);
+  if (cm->allow_screen_content_tools) {
+    MACROBLOCK *x = &cpi->td.mb;
+    if (x->palette_buffer == 0) {
+      CHECK_MEM_ERROR(cm, x->palette_buffer,
+                      vpx_memalign(16, sizeof(*x->palette_buffer)));
+    }
+    // Reallocate the pc_tree, as its contents depend on
+    // the state of cm->allow_screen_content_tools.
+    vp10_free_pc_tree(&cpi->td);
+    vp10_setup_pc_tree(&cpi->common, &cpi->td);
+  }
+
   vp10_reset_segment_features(cm);
   vp10_set_high_precision_mv(cpi, 0);
 
@@ -1487,13 +2302,19 @@
   cpi->alt_ref_source = NULL;
   rc->is_src_frame_alt_ref = 0;
 
+#if CONFIG_EXT_REFS
+  rc->is_bwd_ref_frame = 0;
+  rc->is_last_bipred_frame = 0;
+  rc->is_bipred_frame = 0;
+#endif  // CONFIG_EXT_REFS
+
 #if 0
   // Experimental RD Code
   cpi->frame_distortion = 0;
   cpi->last_frame_distortion = 0;
 #endif
 
-  set_tile_limits(cpi);
+  set_tile_info(cpi);
 
   cpi->ext_refresh_frame_flags_pending = 0;
   cpi->ext_refresh_frame_context_pending = 0;
@@ -1508,12 +2329,14 @@
 #endif
 #define log2f(x) (log (x) / (float) M_LOG2_E)
 
+#if !CONFIG_REF_MV
 static void cal_nmvjointsadcost(int *mvjointsadcost) {
   mvjointsadcost[0] = 600;
   mvjointsadcost[1] = 300;
   mvjointsadcost[2] = 300;
   mvjointsadcost[3] = 300;
 }
+#endif
 
 static void cal_nmvsadcosts(int *mvsadcost[2]) {
   int i = 1;
@@ -1545,6 +2368,14 @@
   } while (++i <= MV_MAX);
 }
 
+static INLINE void init_upsampled_ref_frame_bufs(VP10_COMP *cpi) {
+  int i;
+
+  for (i = 0; i < MAX_REF_FRAMES; ++i) {
+    cpi->upsampled_ref_bufs[i].ref_count = 0;
+    cpi->upsampled_ref_idx[i] = INVALID_IDX;
+  }
+}
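
init_upsampled_ref_frame_bufs() above resets a small ref-counted slot
pool: ref_count == 0 marks a free slot and INVALID_IDX means a
reference has no upsampled copy yet. The acquisition side of such a
pool, sketched with invented names:

#define INVALID_IDX_SKETCH (-1)

typedef struct { int ref_count; } upsampled_slot;

static int get_free_slot(upsampled_slot *slots, int n) {
  int i;
  for (i = 0; i < n; ++i) {
    if (slots[i].ref_count == 0) {
      slots[i].ref_count = 1;   /* claim the slot */
      return i;
    }
  }
  return INVALID_IDX_SKETCH;    /* pool exhausted */
}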
 
 VP10_COMP *vp10_create_compressor(VP10EncoderConfig *oxcf,
                                 BufferPool *const pool) {
@@ -1585,9 +2416,23 @@
   cm->current_video_frame = 0;
   cpi->partition_search_skippable_frame = 0;
   cpi->tile_data = NULL;
+  cpi->last_show_frame_buf_idx = INVALID_IDX;
 
   realloc_segmentation_maps(cpi);
 
+#if CONFIG_REF_MV
+  for (i = 0; i < NMV_CONTEXTS; ++i) {
+    CHECK_MEM_ERROR(cm, cpi->nmv_costs[i][0],
+                    vpx_calloc(MV_VALS, sizeof(*cpi->nmv_costs[i][0])));
+    CHECK_MEM_ERROR(cm, cpi->nmv_costs[i][1],
+                    vpx_calloc(MV_VALS, sizeof(*cpi->nmv_costs[i][1])));
+    CHECK_MEM_ERROR(cm, cpi->nmv_costs_hp[i][0],
+                    vpx_calloc(MV_VALS, sizeof(*cpi->nmv_costs_hp[i][0])));
+    CHECK_MEM_ERROR(cm, cpi->nmv_costs_hp[i][1],
+                    vpx_calloc(MV_VALS, sizeof(*cpi->nmv_costs_hp[i][1])));
+  }
+#endif
+
   CHECK_MEM_ERROR(cm, cpi->nmvcosts[0],
                   vpx_calloc(MV_VALS, sizeof(*cpi->nmvcosts[0])));
   CHECK_MEM_ERROR(cm, cpi->nmvcosts[1],
@@ -1628,7 +2473,6 @@
 
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
-  cpi->b_calculate_ssimg = 0;
   cpi->b_calculate_blockiness = 1;
   cpi->b_calculate_consistency = 1;
   cpi->total_inconsistency = 0;
@@ -1641,22 +2485,12 @@
   if (cpi->b_calculate_psnr) {
     cpi->total_sq_error = 0;
     cpi->total_samples = 0;
-
-    cpi->totalp_sq_error = 0;
-    cpi->totalp_samples = 0;
-
     cpi->tot_recode_hits = 0;
     cpi->summed_quality = 0;
     cpi->summed_weights = 0;
-    cpi->summedp_quality = 0;
-    cpi->summedp_weights = 0;
   }
 
-  if (cpi->b_calculate_ssimg) {
-    cpi->ssimg.worst= 100.0;
-  }
   cpi->fastssim.worst = 100.0;
-
   cpi->psnrhvs.worst = 100.0;
 
   if (cpi->b_calculate_blockiness) {
@@ -1670,20 +2504,28 @@
                                cpi->common.mi_rows * cpi->common.mi_cols));
     cpi->worst_consistency = 100.0;
   }
-
 #endif
 
   cpi->first_time_stamp_ever = INT64_MAX;
 
+#if CONFIG_REF_MV
+  for (i = 0; i < NMV_CONTEXTS; ++i) {
+    cpi->td.mb.nmvcost[i][0] = &cpi->nmv_costs[i][0][MV_MAX];
+    cpi->td.mb.nmvcost[i][1] = &cpi->nmv_costs[i][1][MV_MAX];
+    cpi->td.mb.nmvcost_hp[i][0] = &cpi->nmv_costs_hp[i][0][MV_MAX];
+    cpi->td.mb.nmvcost_hp[i][1] = &cpi->nmv_costs_hp[i][1][MV_MAX];
+  }
+#else
   cal_nmvjointsadcost(cpi->td.mb.nmvjointsadcost);
   cpi->td.mb.nmvcost[0] = &cpi->nmvcosts[0][MV_MAX];
   cpi->td.mb.nmvcost[1] = &cpi->nmvcosts[1][MV_MAX];
+  cpi->td.mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
+  cpi->td.mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
+#endif
   cpi->td.mb.nmvsadcost[0] = &cpi->nmvsadcosts[0][MV_MAX];
   cpi->td.mb.nmvsadcost[1] = &cpi->nmvsadcosts[1][MV_MAX];
   cal_nmvsadcosts(cpi->td.mb.nmvsadcost);
 
-  cpi->td.mb.nmvcost_hp[0] = &cpi->nmvcosts_hp[0][MV_MAX];
-  cpi->td.mb.nmvcost_hp[1] = &cpi->nmvcosts_hp[1][MV_MAX];
   cpi->td.mb.nmvsadcost_hp[0] = &cpi->nmvsadcosts_hp[0][MV_MAX];
   cpi->td.mb.nmvsadcost_hp[1] = &cpi->nmvsadcosts_hp[1][MV_MAX];
   cal_nmvsadcosts_hp(cpi->td.mb.nmvsadcost_hp);
@@ -1733,6 +2575,8 @@
     vp10_init_second_pass(cpi);
   }
 
+  init_upsampled_ref_frame_bufs(cpi);
+
   vp10_set_speed_features_framesize_independent(cpi);
   vp10_set_speed_features_framesize_dependent(cpi);
 
@@ -1752,6 +2596,21 @@
     cpi->fn_ptr[BT].sdx8f          = SDX8F; \
     cpi->fn_ptr[BT].sdx4df         = SDX4DF;
 
+#if CONFIG_EXT_PARTITION
+  BFP(BLOCK_128X128, vpx_sad128x128, vpx_sad128x128_avg,
+      vpx_variance128x128, vpx_sub_pixel_variance128x128,
+      vpx_sub_pixel_avg_variance128x128, vpx_sad128x128x3, vpx_sad128x128x8,
+      vpx_sad128x128x4d)
+
+  BFP(BLOCK_128X64, vpx_sad128x64, vpx_sad128x64_avg,
+      vpx_variance128x64, vpx_sub_pixel_variance128x64,
+      vpx_sub_pixel_avg_variance128x64, NULL, NULL, vpx_sad128x64x4d)
+
+  BFP(BLOCK_64X128, vpx_sad64x128, vpx_sad64x128_avg,
+      vpx_variance64x128, vpx_sub_pixel_variance64x128,
+      vpx_sub_pixel_avg_variance64x128, NULL, NULL, vpx_sad64x128x4d)
+#endif  // CONFIG_EXT_PARTITION
+
   BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg,
       vpx_variance32x16, vpx_sub_pixel_variance32x16,
       vpx_sub_pixel_avg_variance32x16, NULL, NULL, vpx_sad32x16x4d)
@@ -1811,6 +2670,90 @@
       vpx_sub_pixel_avg_variance4x4,
       vpx_sad4x4x3, vpx_sad4x4x8, vpx_sad4x4x4d)
 
+#if CONFIG_OBMC
+#define OBFP(BT, OSDF, OVF, OSVF)         \
+  cpi->fn_ptr[BT].osdf            = OSDF; \
+  cpi->fn_ptr[BT].ovf             = OVF;  \
+  cpi->fn_ptr[BT].osvf            = OSVF;
+
+#if CONFIG_EXT_PARTITION
+  OBFP(BLOCK_128X128, vpx_obmc_sad128x128, vpx_obmc_variance128x128,
+       vpx_obmc_sub_pixel_variance128x128)
+  OBFP(BLOCK_128X64, vpx_obmc_sad128x64, vpx_obmc_variance128x64,
+       vpx_obmc_sub_pixel_variance128x64)
+  OBFP(BLOCK_64X128, vpx_obmc_sad64x128, vpx_obmc_variance64x128,
+       vpx_obmc_sub_pixel_variance64x128)
+#endif  // CONFIG_EXT_PARTITION
+  OBFP(BLOCK_64X64, vpx_obmc_sad64x64, vpx_obmc_variance64x64,
+       vpx_obmc_sub_pixel_variance64x64)
+  OBFP(BLOCK_64X32, vpx_obmc_sad64x32, vpx_obmc_variance64x32,
+       vpx_obmc_sub_pixel_variance64x32)
+  OBFP(BLOCK_32X64, vpx_obmc_sad32x64, vpx_obmc_variance32x64,
+       vpx_obmc_sub_pixel_variance32x64)
+  OBFP(BLOCK_32X32, vpx_obmc_sad32x32, vpx_obmc_variance32x32,
+       vpx_obmc_sub_pixel_variance32x32)
+  OBFP(BLOCK_32X16, vpx_obmc_sad32x16, vpx_obmc_variance32x16,
+       vpx_obmc_sub_pixel_variance32x16)
+  OBFP(BLOCK_16X32, vpx_obmc_sad16x32, vpx_obmc_variance16x32,
+       vpx_obmc_sub_pixel_variance16x32)
+  OBFP(BLOCK_16X16, vpx_obmc_sad16x16, vpx_obmc_variance16x16,
+       vpx_obmc_sub_pixel_variance16x16)
+  OBFP(BLOCK_16X8, vpx_obmc_sad16x8, vpx_obmc_variance16x8,
+       vpx_obmc_sub_pixel_variance16x8)
+  OBFP(BLOCK_8X16, vpx_obmc_sad8x16, vpx_obmc_variance8x16,
+       vpx_obmc_sub_pixel_variance8x16)
+  OBFP(BLOCK_8X8, vpx_obmc_sad8x8, vpx_obmc_variance8x8,
+       vpx_obmc_sub_pixel_variance8x8)
+  OBFP(BLOCK_4X8, vpx_obmc_sad4x8, vpx_obmc_variance4x8,
+       vpx_obmc_sub_pixel_variance4x8)
+  OBFP(BLOCK_8X4, vpx_obmc_sad8x4, vpx_obmc_variance8x4,
+       vpx_obmc_sub_pixel_variance8x4)
+  OBFP(BLOCK_4X4, vpx_obmc_sad4x4, vpx_obmc_variance4x4,
+       vpx_obmc_sub_pixel_variance4x4)
+#endif  // CONFIG_OBMC
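
The OBFP entries above bind overlapped-block motion compensation
(OBMC) distortion kernels whose signatures, as MAKE_OBFP_SAD_WRAPPER
shows earlier, take a pre-weighted 32-bit source (wsrc) and companion
weights (msk) rather than raw pixels. A conceptual model of that
measurement; the 12-bit weight precision is an assumption of this
sketch, not a statement about the shipped kernels:

#include <stdint.h>
#include <stdlib.h>

static unsigned int obmc_sad_model(const uint8_t *ref, int ref_stride,
                                   const int32_t *wsrc, const int32_t *msk,
                                   int w, int h) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c)
      sad += abs(wsrc[c] - msk[c] * ref[c]);  /* weighted residual */
    wsrc += w;
    msk += w;
    ref += ref_stride;
  }
  return sad >> 12;  /* assumed weight precision */
}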
+
+#if CONFIG_EXT_INTER
+#define MBFP(BT, MSDF, MVF, MSVF)         \
+  cpi->fn_ptr[BT].msdf            = MSDF; \
+  cpi->fn_ptr[BT].mvf             = MVF;  \
+  cpi->fn_ptr[BT].msvf            = MSVF;
+
+#if CONFIG_EXT_PARTITION
+  MBFP(BLOCK_128X128, vpx_masked_sad128x128, vpx_masked_variance128x128,
+       vpx_masked_sub_pixel_variance128x128)
+  MBFP(BLOCK_128X64, vpx_masked_sad128x64, vpx_masked_variance128x64,
+       vpx_masked_sub_pixel_variance128x64)
+  MBFP(BLOCK_64X128, vpx_masked_sad64x128, vpx_masked_variance64x128,
+       vpx_masked_sub_pixel_variance64x128)
+#endif  // CONFIG_EXT_PARTITION
+  MBFP(BLOCK_64X64, vpx_masked_sad64x64, vpx_masked_variance64x64,
+       vpx_masked_sub_pixel_variance64x64)
+  MBFP(BLOCK_64X32, vpx_masked_sad64x32, vpx_masked_variance64x32,
+       vpx_masked_sub_pixel_variance64x32)
+  MBFP(BLOCK_32X64, vpx_masked_sad32x64, vpx_masked_variance32x64,
+       vpx_masked_sub_pixel_variance32x64)
+  MBFP(BLOCK_32X32, vpx_masked_sad32x32, vpx_masked_variance32x32,
+       vpx_masked_sub_pixel_variance32x32)
+  MBFP(BLOCK_32X16, vpx_masked_sad32x16, vpx_masked_variance32x16,
+       vpx_masked_sub_pixel_variance32x16)
+  MBFP(BLOCK_16X32, vpx_masked_sad16x32, vpx_masked_variance16x32,
+       vpx_masked_sub_pixel_variance16x32)
+  MBFP(BLOCK_16X16, vpx_masked_sad16x16, vpx_masked_variance16x16,
+       vpx_masked_sub_pixel_variance16x16)
+  MBFP(BLOCK_16X8, vpx_masked_sad16x8, vpx_masked_variance16x8,
+       vpx_masked_sub_pixel_variance16x8)
+  MBFP(BLOCK_8X16, vpx_masked_sad8x16, vpx_masked_variance8x16,
+       vpx_masked_sub_pixel_variance8x16)
+  MBFP(BLOCK_8X8, vpx_masked_sad8x8, vpx_masked_variance8x8,
+       vpx_masked_sub_pixel_variance8x8)
+  MBFP(BLOCK_4X8, vpx_masked_sad4x8, vpx_masked_variance4x8,
+       vpx_masked_sub_pixel_variance4x8)
+  MBFP(BLOCK_8X4, vpx_masked_sad8x4, vpx_masked_variance8x4,
+       vpx_masked_sub_pixel_variance8x4)
+  MBFP(BLOCK_4X4, vpx_masked_sad4x4, vpx_masked_variance4x4,
+       vpx_masked_sub_pixel_variance4x4)
+#endif  // CONFIG_EXT_INTER
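
The MBFP table filled above serves CONFIG_EXT_INTER's wedge/masked
compound prediction: the extra mask argument weights each pixel's
contribution to the distortion. A conceptual model, assuming 6-bit
mask weights (an assumption of this sketch, not the shipped kernels):

#include <stdint.h>
#include <stdlib.h>

static unsigned int masked_sad_model(const uint8_t *src, int src_stride,
                                     const uint8_t *ref, int ref_stride,
                                     const uint8_t *msk, int msk_stride,
                                     int w, int h) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < h; ++r) {
    for (c = 0; c < w; ++c)
      sad += msk[c] * abs(src[c] - ref[c]);  /* mask-weighted difference */
    src += src_stride;
    ref += ref_stride;
    msk += msk_stride;
  }
  return sad >> 6;  /* undo the assumed 6-bit mask scale */
}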
+
 #if CONFIG_VP9_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
 #endif
@@ -1823,11 +2766,15 @@
   vp10_init_quantizer(cpi);
 
   vp10_loop_filter_init(cm);
+#if CONFIG_LOOP_RESTORATION
+  vp10_loop_restoration_precal();
+#endif  // CONFIG_LOOP_RESTORATION
 
   cm->error.setjmp = 0;
 
   return cpi;
 }
+
 #define SNPRINT(H, T) \
   snprintf((H) + strlen(H), sizeof(H) - strlen(H), (T))
 
@@ -1865,14 +2812,8 @@
         const double total_psnr =
             vpx_sse_to_psnr((double)cpi->total_samples, peak,
                             (double)cpi->total_sq_error);
-        const double totalp_psnr =
-            vpx_sse_to_psnr((double)cpi->totalp_samples, peak,
-                            (double)cpi->totalp_sq_error);
         const double total_ssim = 100 * pow(cpi->summed_quality /
                                             cpi->summed_weights, 8.0);
-        const double totalp_ssim = 100 * pow(cpi->summedp_quality /
-                                             cpi->summedp_weights, 8.0);
-
         snprintf(headings, sizeof(headings),
                  "Bitrate\tAVGPsnr\tGLBPsnr\tAVPsnrP\tGLPsnrP\t"
                  "VPXSSIM\tVPSSIMP\tFASTSIM\tPSNRHVS\t"
@@ -1882,8 +2823,8 @@
                  "%7.3f\t%7.3f\t%7.3f\t%7.3f\t"
                  "%7.3f\t%7.3f\t%7.3f\t%7.3f",
                  dr, cpi->psnr.stat[ALL] / cpi->count, total_psnr,
-                 cpi->psnrp.stat[ALL] / cpi->count, totalp_psnr,
-                 total_ssim, totalp_ssim,
+                 cpi->psnr.stat[ALL] / cpi->count, total_psnr,
+                 total_ssim, total_ssim,
                  cpi->fastssim.stat[ALL] / cpi->count,
                  cpi->psnrhvs.stat[ALL] / cpi->count,
                  cpi->psnr.worst, cpi->worst_ssim, cpi->fastssim.worst,
@@ -1897,7 +2838,7 @@
 
         if (cpi->b_calculate_consistency) {
           double consistency =
-              vpx_sse_to_psnr((double)cpi->totalp_samples, peak,
+              vpx_sse_to_psnr((double)cpi->total_samples, peak,
                               (double)cpi->total_inconsistency);
 
           SNPRINT(headings, "\tConsist\tWstCons");
@@ -1905,12 +2846,6 @@
           SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
         }
 
-        if (cpi->b_calculate_ssimg) {
-          SNPRINT(headings, "\t  SSIMG\tWtSSIMG");
-          SNPRINT2(results, "\t%7.3f", cpi->ssimg.stat[ALL] / cpi->count);
-          SNPRINT2(results, "\t%7.3f", cpi->ssimg.worst);
-        }
-
         fprintf(f, "%s\t    Time  Rc-Err Abs Err\n", headings);
         fprintf(f, "%s\t%8.0f %7.2f %7.2f\n", results,
                 total_encode_time, rate_err, fabs(rate_err));
@@ -1946,8 +2881,11 @@
 
     // Deallocate allocated thread data.
     if (t < cpi->num_workers - 1) {
+      if (cpi->common.allow_screen_content_tools)
+        vpx_free(thread_data->td->mb.palette_buffer);
       vpx_free(thread_data->td->counts);
       vp10_free_pc_tree(thread_data->td);
+      vp10_free_var_tree(thread_data->td);
       vpx_free(thread_data->td);
     }
   }
@@ -2004,271 +2942,16 @@
 #endif
 }
 
-/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
- * and highbd_8_variance(). It should not.
- */
-static void encoder_variance(const uint8_t *a, int  a_stride,
-                             const uint8_t *b, int  b_stride,
-                             int  w, int  h, unsigned int *sse, int *sum) {
-  int i, j;
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void encoder_highbd_variance64(const uint8_t *a8, int  a_stride,
-                                      const uint8_t *b8, int  b_stride,
-                                      int w, int h, uint64_t *sse,
-                                      uint64_t *sum) {
-  int i, j;
-
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
-static void encoder_highbd_8_variance(const uint8_t *a8, int  a_stride,
-                                      const uint8_t *b8, int  b_stride,
-                                      int w, int h,
-                                      unsigned int *sse, int *sum) {
-  uint64_t sse_long = 0;
-  uint64_t sum_long = 0;
-  encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h,
-                            &sse_long, &sum_long);
-  *sse = (unsigned int)sse_long;
-  *sum = (int)sum_long;
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-static int64_t get_sse(const uint8_t *a, int a_stride,
-                       const uint8_t *b, int b_stride,
-                       int width, int height) {
-  const int dw = width % 16;
-  const int dh = height % 16;
-  int64_t total_sse = 0;
-  unsigned int sse = 0;
-  int sum = 0;
-  int x, y;
-
-  if (dw > 0) {
-    encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
-                     dw, height, &sse, &sum);
-    total_sse += sse;
-  }
-
-  if (dh > 0) {
-    encoder_variance(&a[(height - dh) * a_stride], a_stride,
-                     &b[(height - dh) * b_stride], b_stride,
-                     width - dw, dh, &sse, &sum);
-    total_sse += sse;
-  }
-
-  for (y = 0; y < height / 16; ++y) {
-    const uint8_t *pa = a;
-    const uint8_t *pb = b;
-    for (x = 0; x < width / 16; ++x) {
-      vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
-      total_sse += sse;
-
-      pa += 16;
-      pb += 16;
-    }
-
-    a += 16 * a_stride;
-    b += 16 * b_stride;
-  }
-
-  return total_sse;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
-                                    const uint8_t *b8, int b_stride,
-                                    int width, int height,
-                                    unsigned int input_shift) {
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  int64_t total_sse = 0;
-  int x, y;
-  for (y = 0; y < height; ++y) {
-    for (x = 0; x < width; ++x) {
-      int64_t diff;
-      diff = (a[x] >> input_shift) - (b[x] >> input_shift);
-      total_sse += diff * diff;
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-  return total_sse;
-}
-
-static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
-                              const uint8_t *b, int b_stride,
-                              int width, int height) {
-  int64_t total_sse = 0;
-  int x, y;
-  const int dw = width % 16;
-  const int dh = height % 16;
-  unsigned int sse = 0;
-  int sum = 0;
-  if (dw > 0) {
-    encoder_highbd_8_variance(&a[width - dw], a_stride,
-                              &b[width - dw], b_stride,
-                              dw, height, &sse, &sum);
-    total_sse += sse;
-  }
-  if (dh > 0) {
-    encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
-                              &b[(height - dh) * b_stride], b_stride,
-                              width - dw, dh, &sse, &sum);
-    total_sse += sse;
-  }
-  for (y = 0; y < height / 16; ++y) {
-    const uint8_t *pa = a;
-    const uint8_t *pb = b;
-    for (x = 0; x < width / 16; ++x) {
-      vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
-      total_sse += sse;
-      pa += 16;
-      pb += 16;
-    }
-    a += 16 * a_stride;
-    b += 16 * b_stride;
-  }
-  return total_sse;
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-typedef struct {
-  double psnr[4];       // total/y/u/v
-  uint64_t sse[4];      // total/y/u/v
-  uint32_t samples[4];  // total/y/u/v
-} PSNR_STATS;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b,
-                             PSNR_STATS *psnr,
-                             unsigned int bit_depth,
-                             unsigned int in_bit_depth) {
-  const int widths[3] =
-      {a->y_crop_width,  a->uv_crop_width,  a->uv_crop_width };
-  const int heights[3] =
-      {a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
-  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer,  a->v_buffer };
-  const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
-  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer,  b->v_buffer };
-  const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
-  int i;
-  uint64_t total_sse = 0;
-  uint32_t total_samples = 0;
-  const double peak = (double)((1 << in_bit_depth) - 1);
-  const unsigned int input_shift = bit_depth - in_bit_depth;
-
-  for (i = 0; i < 3; ++i) {
-    const int w = widths[i];
-    const int h = heights[i];
-    const uint32_t samples = w * h;
-    uint64_t sse;
-    if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
-      if (input_shift) {
-        sse = highbd_get_sse_shift(a_planes[i], a_strides[i],
-                                   b_planes[i], b_strides[i], w, h,
-                                   input_shift);
-      } else {
-        sse = highbd_get_sse(a_planes[i], a_strides[i],
-                             b_planes[i], b_strides[i], w, h);
-      }
-    } else {
-      sse = get_sse(a_planes[i], a_strides[i],
-                    b_planes[i], b_strides[i],
-                    w, h);
-    }
-    psnr->sse[1 + i] = sse;
-    psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
-
-    total_sse += sse;
-    total_samples += samples;
-  }
-
-  psnr->sse[0] = total_sse;
-  psnr->samples[0] = total_samples;
-  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
-                                  (double)total_sse);
-}
-
-#else  // !CONFIG_VP9_HIGHBITDEPTH
-
-static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
-                      PSNR_STATS *psnr) {
-  static const double peak = 255.0;
-  const int widths[3]        = {
-      a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
-  const int heights[3]       = {
-      a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
-  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
-  const int a_strides[3]     = {a->y_stride, a->uv_stride, a->uv_stride};
-  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
-  const int b_strides[3]     = {b->y_stride, b->uv_stride, b->uv_stride};
-  int i;
-  uint64_t total_sse = 0;
-  uint32_t total_samples = 0;
-
-  for (i = 0; i < 3; ++i) {
-    const int w = widths[i];
-    const int h = heights[i];
-    const uint32_t samples = w * h;
-    const uint64_t sse = get_sse(a_planes[i], a_strides[i],
-                                 b_planes[i], b_strides[i],
-                                 w, h);
-    psnr->sse[1 + i] = sse;
-    psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
-
-    total_sse += sse;
-    total_samples += samples;
-  }
-
-  psnr->sse[0] = total_sse;
-  psnr->samples[0] = total_samples;
-  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
-                                  (double)total_sse);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 static void generate_psnr_packet(VP10_COMP *cpi) {
   struct vpx_codec_cx_pkt pkt;
   int i;
   PSNR_STATS psnr;
 #if CONFIG_VP9_HIGHBITDEPTH
-  calc_highbd_psnr(cpi->Source, cpi->common.frame_to_show, &psnr,
-                   cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+  vpx_calc_highbd_psnr(cpi->Source, cpi->common.frame_to_show, &psnr,
+                       cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
 #else
-  calc_psnr(cpi->Source, cpi->common.frame_to_show, &psnr);
+  vpx_calc_psnr(cpi->Source, cpi->common.frame_to_show, &psnr);
 #endif
 
   for (i = 0; i < 4; ++i) {
@@ -2281,7 +2964,7 @@
 }
 
 int vp10_use_as_reference(VP10_COMP *cpi, int ref_frame_flags) {
-  if (ref_frame_flags > 7)
+  if (ref_frame_flags > ((1 << REFS_PER_FRAME) - 1))
     return -1;
 
   cpi->ref_frame_flags = ref_frame_flags;
@@ -2295,13 +2978,23 @@
   cpi->ext_refresh_frame_flags_pending = 1;
 }
 
-static YV12_BUFFER_CONFIG *get_vp10_ref_frame_buffer(VP10_COMP *cpi,
-                                VP9_REFFRAME ref_frame_flag) {
+static YV12_BUFFER_CONFIG *get_vp10_ref_frame_buffer(
+    VP10_COMP *cpi, VP9_REFFRAME ref_frame_flag) {
   MV_REFERENCE_FRAME ref_frame = NONE;
   if (ref_frame_flag == VP9_LAST_FLAG)
     ref_frame = LAST_FRAME;
+#if CONFIG_EXT_REFS
+  else if (ref_frame_flag == VP9_LAST2_FLAG)
+    ref_frame = LAST2_FRAME;
+  else if (ref_frame_flag == VP9_LAST3_FLAG)
+    ref_frame = LAST3_FRAME;
+#endif  // CONFIG_EXT_REFS
   else if (ref_frame_flag == VP9_GOLD_FLAG)
     ref_frame = GOLDEN_FRAME;
+#if CONFIG_EXT_REFS
+  else if (ref_frame_flag == VP9_BWD_FLAG)
+    ref_frame = BWDREF_FRAME;
+#endif  // CONFIG_EXT_REFS
   else if (ref_frame_flag == VP9_ALT_FLAG)
     ref_frame = ALTREF_FRAME;
 
@@ -2472,10 +3165,11 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                   YV12_BUFFER_CONFIG *dst, int bd) {
+                                   YV12_BUFFER_CONFIG *dst, int planes,
+                                   int bd) {
 #else
 static void scale_and_extend_frame(const YV12_BUFFER_CONFIG *src,
-                                   YV12_BUFFER_CONFIG *dst) {
+                                   YV12_BUFFER_CONFIG *dst, int planes) {
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   const int src_w = src->y_crop_width;
   const int src_h = src->y_crop_height;
@@ -2485,12 +3179,15 @@
   const int src_strides[3] = {src->y_stride, src->uv_stride, src->uv_stride};
   uint8_t *const dsts[3] = {dst->y_buffer, dst->u_buffer, dst->v_buffer};
   const int dst_strides[3] = {dst->y_stride, dst->uv_stride, dst->uv_stride};
-  const InterpKernel *const kernel = vp10_filter_kernels[EIGHTTAP];
+  const InterpFilterParams interp_filter_params =
+      vp10_get_interp_filter_params(EIGHTTAP_REGULAR);
+  const int16_t *kernel = interp_filter_params.filter_ptr;
+  const int taps = interp_filter_params.taps;
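+  // The flat filter table presumably stores 16 sub-pel phases of 'taps'
+  // coefficients each, so phase p starts at kernel[p * taps] (this matches
+  // the indexing used below).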
   int x, y, i;
 
   for (y = 0; y < dst_h; y += 16) {
     for (x = 0; x < dst_w; x += 16) {
-      for (i = 0; i < MAX_MB_PLANE; ++i) {
+      for (i = 0; i < planes; ++i) {
         const int factor = (i == 0 || i == 3 ? 1 : 2);
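+        // factor == 1 for the full-resolution Y plane, 2 for the chroma
+        // planes (4:2:0 subsampling assumed).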
         const int x_q4 = x * (16 / factor) * src_w / dst_w;
         const int y_q4 = y * (16 / factor) * src_h / dst_h;
@@ -2503,26 +3200,29 @@
 #if CONFIG_VP9_HIGHBITDEPTH
         if (src->flags & YV12_FLAG_HIGHBITDEPTH) {
           vpx_highbd_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
-                               kernel[x_q4 & 0xf], 16 * src_w / dst_w,
-                               kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+                               &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w,
+                               &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h,
                                16 / factor, 16 / factor, bd);
         } else {
-          vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
-                        kernel[x_q4 & 0xf], 16 * src_w / dst_w,
-                        kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+          vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
+                        &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w,
+                        &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h,
                         16 / factor, 16 / factor);
         }
 #else
-        vpx_convolve8(src_ptr, src_stride, dst_ptr, dst_stride,
-                      kernel[x_q4 & 0xf], 16 * src_w / dst_w,
-                      kernel[y_q4 & 0xf], 16 * src_h / dst_h,
+        vpx_scaled_2d(src_ptr, src_stride, dst_ptr, dst_stride,
+                      &kernel[(x_q4 & 0xf) * taps], 16 * src_w / dst_w,
+                      &kernel[(y_q4 & 0xf) * taps], 16 * src_h / dst_h,
                       16 / factor, 16 / factor);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
       }
     }
   }
 
-  vpx_extend_frame_borders(dst);
+  if (planes == 1)
+    vpx_extend_frame_borders_y(dst);
+  else
+    vpx_extend_frame_borders(dst);
 }
 
 static int scale_down(VP10_COMP *cpi, int q) {
@@ -2578,17 +3278,158 @@
   return force_recode;
 }
 
+static INLINE int get_free_upsampled_ref_buf(EncRefCntBuffer *ubufs) {
+  int i;
+
+  for (i = 0; i < MAX_REF_FRAMES; i++) {
+    if (!ubufs[i].ref_count) {
+      return i;
+    }
+  }
+  return INVALID_IDX;
+}
+
+// Up-sample 1 reference frame.
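+// The returned buffer index is presumably consumed by storing it in
+// cpi->upsampled_ref_idx[] and reference-counting it via uref_cnt_fb(),
+// mirroring how ref_cnt_fb() manages the regular frame buffers (see
+// vp10_update_reference_frames()).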
+static INLINE int upsample_ref_frame(VP10_COMP *cpi,
+                                     const YV12_BUFFER_CONFIG *const ref) {
+  VP10_COMMON * const cm = &cpi->common;
+  EncRefCntBuffer *ubufs = cpi->upsampled_ref_bufs;
+  int new_uidx = get_free_upsampled_ref_buf(ubufs);
+
+  if (new_uidx == INVALID_IDX) {
+    return INVALID_IDX;
+  } else {
+    YV12_BUFFER_CONFIG *upsampled_ref = &ubufs[new_uidx].buf;
+
+    // The buffer may be allocated for the Y plane only.
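+    // The up-sampled frame is 8x larger in each dimension (width << 3,
+    // height << 3 below), hence the 64x (<< 6) size comparison.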
+    if (upsampled_ref->buffer_alloc_sz < (ref->buffer_alloc_sz << 6))
+      if (vpx_realloc_frame_buffer(upsampled_ref,
+                                   (cm->width << 3), (cm->height << 3),
+                                   cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                   cm->use_highbitdepth,
+#endif
+                                   (VP9_ENC_BORDER_IN_PIXELS << 3),
+                                   cm->byte_alignment,
+                                   NULL, NULL, NULL))
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate up-sampled frame buffer");
+
+    // Currently, only the Y plane is up-sampled; U and V are not used.
+#if CONFIG_VP9_HIGHBITDEPTH
+    scale_and_extend_frame(ref, upsampled_ref, 1, (int)cm->bit_depth);
+#else
+    scale_and_extend_frame(ref, upsampled_ref, 1);
+#endif
+    return new_uidx;
+  }
+}
+
+#define DUMP_REF_FRAME_IMAGES    0
+
+#if DUMP_REF_FRAME_IMAGES == 1
+static int dump_one_image(VP10_COMMON *cm,
+                          const YV12_BUFFER_CONFIG *const ref_buf,
+                          char *file_name) {
+  int h;
+  FILE *f_ref = NULL;
+
+  if (ref_buf == NULL) {
+    printf("Frame data buffer is NULL.\n");
+    return VPX_CODEC_MEM_ERROR;
+  }
+
+  if ((f_ref = fopen(file_name, "wb")) == NULL) {
+    printf("Unable to open file %s to write.\n", file_name);
+    return VPX_CODEC_MEM_ERROR;
+  }
+
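+  // Write one raw frame: full-resolution Y followed by half-width,
+  // half-height U and V planes (4:2:0 layout assumed).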
+  // --- Y ---
+  for (h = 0; h < cm->height; ++h) {
+    fwrite(&ref_buf->y_buffer[h*ref_buf->y_stride],
+           1, cm->width, f_ref);
+  }
+  // --- U ---
+  for (h = 0; h < (cm->height >> 1); ++h) {
+    fwrite(&ref_buf->u_buffer[h*ref_buf->uv_stride],
+           1, (cm->width >> 1), f_ref);
+  }
+  // --- V ---
+  for (h = 0; h < (cm->height >> 1); ++h) {
+    fwrite(&ref_buf->v_buffer[h*ref_buf->uv_stride],
+           1, (cm->width >> 1), f_ref);
+  }
+
+  fclose(f_ref);
+
+  return VPX_CODEC_OK;
+}
+
+static void dump_ref_frame_images(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  MV_REFERENCE_FRAME ref_frame;
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    char file_name[256] = "";
+    snprintf(file_name, sizeof(file_name), "/tmp/enc_F%d_ref_%d.yuv",
+             cm->current_video_frame, ref_frame);
+    dump_one_image(
+        cm, get_ref_frame_buffer(cpi, ref_frame), file_name);
+  }
+}
+#endif  // DUMP_REF_FRAME_IMAGES == 1
+
 void vp10_update_reference_frames(VP10_COMP *cpi) {
   VP10_COMMON * const cm = &cpi->common;
   BufferPool *const pool = cm->buffer_pool;
+  const int use_upsampled_ref = cpi->sf.use_upsampled_references;
+  int new_uidx = 0;
+
+  // NOTE: Save the new show frame buffer index for --test-code=warn, i.e.,
+  //       to verify that there is no mismatch between encoder and decoder.
+  if (cm->show_frame)
+    cpi->last_show_frame_buf_idx = cm->new_fb_idx;
+
+  if (use_upsampled_ref) {
+#if CONFIG_EXT_REFS
+    if (cm->show_existing_frame) {
+      new_uidx = cpi->upsampled_ref_idx[cpi->existing_fb_idx_to_show];
+      // TODO(zoeliu): Once the following is confirmed, remove it.
+      assert(cpi->upsampled_ref_bufs[new_uidx].ref_count > 0);
+    } else {
+#endif  // CONFIG_EXT_REFS
+      // Up-sample the current encoded frame.
+      RefCntBuffer *bufs = pool->frame_bufs;
+      const YV12_BUFFER_CONFIG *const ref = &bufs[cm->new_fb_idx].buf;
+
+      new_uidx = upsample_ref_frame(cpi, ref);
+#if CONFIG_EXT_REFS
+    }
+#endif  // CONFIG_EXT_REFS
+  }
 
   // At this point the new frame has been encoded.
   // If any buffer copy / swapping is signaled it should be done here.
   if (cm->frame_type == KEY_FRAME) {
     ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
+#if CONFIG_EXT_REFS
+    ref_cnt_fb(pool->frame_bufs,
+               &cm->ref_frame_map[cpi->bwd_fb_idx], cm->new_fb_idx);
+#endif  // CONFIG_EXT_REFS
     ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
+
+    if (use_upsampled_ref) {
+      uref_cnt_fb(cpi->upsampled_ref_bufs,
+                  &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
+#if CONFIG_EXT_REFS
+      uref_cnt_fb(cpi->upsampled_ref_bufs,
+                  &cpi->upsampled_ref_idx[cpi->bwd_fb_idx], new_uidx);
+#endif  // CONFIG_EXT_REFS
+      uref_cnt_fb(cpi->upsampled_ref_bufs,
+                  &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
+    }
   } else if (vp10_preserve_existing_gf(cpi)) {
     // We have decided to preserve the previously existing golden frame as our
     // new ARF frame. However, in the short term in function
@@ -2602,10 +3443,39 @@
 
     ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->alt_fb_idx], cm->new_fb_idx);
+    if (use_upsampled_ref)
+      uref_cnt_fb(cpi->upsampled_ref_bufs,
+                  &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
 
     tmp = cpi->alt_fb_idx;
     cpi->alt_fb_idx = cpi->gld_fb_idx;
     cpi->gld_fb_idx = tmp;
+
+    // TODO(zoeliu): Do we need to copy cpi->interp_filter_selected[0] over to
+    // cpi->interp_filter_selected[GOLDEN_FRAME]?
+#if CONFIG_EXT_REFS
+  } else if (cpi->rc.is_last_bipred_frame) {
+    // Refresh the LAST_FRAME with the BWDREF_FRAME and retire the LAST3_FRAME
+    // by updating the virtual indices. Note that the frame BWDREF_FRAME
+    // currently points to should be retired; it must not be used before being
+    // refreshed.
+    int ref_frame, tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES-1];
+    for (ref_frame = LAST_REF_FRAMES - 1; ref_frame > 0; --ref_frame) {
+      cpi->lst_fb_idxes[ref_frame] = cpi->lst_fb_idxes[ref_frame - 1];
+
+      if (!cpi->rc.is_src_frame_alt_ref) {
+        memcpy(cpi->interp_filter_selected[ref_frame],
+               cpi->interp_filter_selected[ref_frame - 1],
+               sizeof(cpi->interp_filter_selected[ref_frame - 1]));
+      }
+    }
+    cpi->lst_fb_idxes[0] = cpi->bwd_fb_idx;
+    if (!cpi->rc.is_src_frame_alt_ref) {
+      memcpy(cpi->interp_filter_selected[0],
+             cpi->interp_filter_selected[BWDREF_FRAME],
+             sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
+    }
+    cpi->bwd_fb_idx = tmp;
+#endif  // CONFIG_EXT_REFS
   } else { /* For non key/golden frames */
     if (cpi->refresh_alt_ref_frame) {
       int arf_idx = cpi->alt_fb_idx;
@@ -2616,6 +3486,10 @@
 
       ref_cnt_fb(pool->frame_bufs,
                  &cm->ref_frame_map[arf_idx], cm->new_fb_idx);
+      if (use_upsampled_ref)
+        uref_cnt_fb(cpi->upsampled_ref_bufs,
+                    &cpi->upsampled_ref_idx[cpi->alt_fb_idx], new_uidx);
+
       memcpy(cpi->interp_filter_selected[ALTREF_FRAME],
              cpi->interp_filter_selected[0],
              sizeof(cpi->interp_filter_selected[0]));
@@ -2624,6 +3498,10 @@
     if (cpi->refresh_golden_frame) {
       ref_cnt_fb(pool->frame_bufs,
                  &cm->ref_frame_map[cpi->gld_fb_idx], cm->new_fb_idx);
+      if (use_upsampled_ref)
+        uref_cnt_fb(cpi->upsampled_ref_bufs,
+                    &cpi->upsampled_ref_idx[cpi->gld_fb_idx], new_uidx);
+
       if (!cpi->rc.is_src_frame_alt_ref)
         memcpy(cpi->interp_filter_selected[GOLDEN_FRAME],
                cpi->interp_filter_selected[0],
@@ -2633,24 +3511,135 @@
                cpi->interp_filter_selected[ALTREF_FRAME],
                sizeof(cpi->interp_filter_selected[ALTREF_FRAME]));
     }
+
+#if CONFIG_EXT_REFS
+    if (cpi->refresh_bwd_ref_frame) {
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[cpi->bwd_fb_idx], cm->new_fb_idx);
+      if (use_upsampled_ref)
+        uref_cnt_fb(cpi->upsampled_ref_bufs,
+                    &cpi->upsampled_ref_idx[cpi->bwd_fb_idx], new_uidx);
+
+      memcpy(cpi->interp_filter_selected[BWDREF_FRAME],
+             cpi->interp_filter_selected[0],
+             sizeof(cpi->interp_filter_selected[0]));
+    }
+#endif  // CONFIG_EXT_REFS
   }
 
   if (cpi->refresh_last_frame) {
+#if CONFIG_EXT_REFS
+    // NOTE(zoeliu): We have two layers of mapping: (1) from the per-frame
+    // reference to the reference frame buffer virtual index; and then (2)
+    // from the virtual index to the reference frame buffer physical index:
+    //
+    // LAST_FRAME,      ..., LAST3_FRAME,     ..., ALTREF_FRAME
+    //      |                     |                     |
+    //      v                     v                     v
+    // lst_fb_idxes[0], ..., lst_fb_idxes[2], ..., alt_fb_idx
+    //      |                     |                     |
+    //      v                     v                     v
+    // ref_frame_map[], ..., ref_frame_map[], ..., ref_frame_map[]
+    //
+    // When refresh_last_frame is set, the intent is to retire LAST3_FRAME,
+    // shift the other 2 LAST reference frames as follows:
+    // LAST_FRAME -> LAST2_FRAME -> LAST3_FRAME,
+    // and then refresh LAST_FRAME with the newly coded frame.
+    //
+    // To fulfill this, the decoder is notified to execute the following
+    // 2 steps:
+    //
+    // (a) Change ref_frame_map[] so that the virtual index of LAST3_FRAME
+    //     points to the newly coded frame, i.e.
+    //     ref_frame_map[lst_fb_idxes[2]] => new_fb_idx;
+    //
+    // (b) Change the 1st-layer mapping so that LAST_FRAME maps to the
+    //     original virtual index of LAST3_FRAME, with the other mappings
+    //     shifted as follows:
+    // LAST_FRAME,      LAST2_FRAME,     LAST3_FRAME
+    //      |                |                |
+    //      v                v                v
+    // lst_fb_idxes[2], lst_fb_idxes[0], lst_fb_idxes[1]
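+    //
+    // An illustrative example (values are made up): suppose
+    // lst_fb_idxes[] = {3, 5, 7} and new_fb_idx = 9 before the update.
+    // Step (a) sets ref_frame_map[7] => 9, and step (b) rotates the virtual
+    // indices to lst_fb_idxes[] = {7, 3, 5}, so that LAST_FRAME now maps to
+    // the newly coded frame while the old LAST/LAST2 frames become
+    // LAST2/LAST3.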
+    int ref_frame;
+
+    if (cm->frame_type == KEY_FRAME) {
+      for (ref_frame = 0; ref_frame < LAST_REF_FRAMES; ++ref_frame) {
+        ref_cnt_fb(pool->frame_bufs,
+                   &cm->ref_frame_map[cpi->lst_fb_idxes[ref_frame]],
+                   cm->new_fb_idx);
+
+        if (use_upsampled_ref)
+          uref_cnt_fb(
+              cpi->upsampled_ref_bufs,
+              &cpi->upsampled_ref_idx[cpi->lst_fb_idxes[ref_frame]],
+              new_uidx);
+      }
+    } else {
+      int tmp;
+
+      ref_cnt_fb(pool->frame_bufs,
+                 &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_REF_FRAMES-1]],
+                 cm->new_fb_idx);
+
+      if (use_upsampled_ref)
+        uref_cnt_fb(
+            cpi->upsampled_ref_bufs,
+            &cpi->upsampled_ref_idx[cpi->lst_fb_idxes[LAST_REF_FRAMES-1]],
+            new_uidx);
+
+      tmp = cpi->lst_fb_idxes[LAST_REF_FRAMES-1];
+      for (ref_frame = LAST_REF_FRAMES - 1; ref_frame > 0; --ref_frame) {
+        cpi->lst_fb_idxes[ref_frame] = cpi->lst_fb_idxes[ref_frame - 1];
+
+        if (!cpi->rc.is_src_frame_alt_ref) {
+          memcpy(cpi->interp_filter_selected[ref_frame],
+                 cpi->interp_filter_selected[ref_frame - 1],
+                 sizeof(cpi->interp_filter_selected[ref_frame - 1]));
+        }
+      }
+      cpi->lst_fb_idxes[0] = tmp;
+
+      if (!cpi->rc.is_src_frame_alt_ref) {
+        if (cm->show_existing_frame) {
+          memcpy(cpi->interp_filter_selected[LAST_FRAME],
+                 cpi->interp_filter_selected[BWDREF_FRAME],
+                 sizeof(cpi->interp_filter_selected[BWDREF_FRAME]));
+        } else {
+          memcpy(cpi->interp_filter_selected[LAST_FRAME],
+                 cpi->interp_filter_selected[0],
+                 sizeof(cpi->interp_filter_selected[0]));
+        }
+      }
+    }
+#else
     ref_cnt_fb(pool->frame_bufs,
                &cm->ref_frame_map[cpi->lst_fb_idx], cm->new_fb_idx);
-    if (!cpi->rc.is_src_frame_alt_ref)
+    if (use_upsampled_ref)
+      uref_cnt_fb(cpi->upsampled_ref_bufs,
+                  &cpi->upsampled_ref_idx[cpi->lst_fb_idx], new_uidx);
+    if (!cpi->rc.is_src_frame_alt_ref) {
       memcpy(cpi->interp_filter_selected[LAST_FRAME],
              cpi->interp_filter_selected[0],
              sizeof(cpi->interp_filter_selected[0]));
+    }
+#endif  // CONFIG_EXT_REFS
   }
+
+#if DUMP_REF_FRAME_IMAGES == 1
+  // Dump out all reference frame images.
+  dump_ref_frame_images(cpi);
+#endif  // DUMP_REF_FRAME_IMAGES
+
 #if CONFIG_VP9_TEMPORAL_DENOISING
   if (cpi->oxcf.noise_sensitivity > 0) {
     vp10_denoiser_update_frame_info(&cpi->denoiser,
                                    *cpi->Source,
                                    cpi->common.frame_type,
+                                   cpi->refresh_last_frame,
+#if CONFIG_EXT_REFS
+                                   cpi->refresh_bwd_ref_frame,
+#endif  // CONFIG_EXT_REFS
                                    cpi->refresh_alt_ref_frame,
-                                   cpi->refresh_golden_frame,
-                                   cpi->refresh_last_frame);
+                                   cpi->refresh_golden_frame);
   }
 #endif
 }
@@ -2667,13 +3656,20 @@
 
     vpx_usec_timer_start(&timer);
 
+#if CONFIG_LOOP_RESTORATION
+    vp10_pick_filter_restoration(cpi->Source, cpi, cpi->sf.lpf_pick);
+#else
     vp10_pick_filter_level(cpi->Source, cpi, cpi->sf.lpf_pick);
+#endif  // CONFIG_LOOP_RESTORATION
 
     vpx_usec_timer_mark(&timer);
     cpi->time_pick_lpf += vpx_usec_timer_elapsed(&timer);
   }
 
   if (lf->filter_level > 0) {
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION
+    vp10_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+#else
     if (cpi->num_workers > 1)
       vp10_loop_filter_frame_mt(cm->frame_to_show, cm, xd->plane,
                                lf->filter_level, 0, 0,
@@ -2681,7 +3677,15 @@
                                &cpi->lf_row_sync);
     else
       vp10_loop_filter_frame(cm->frame_to_show, cm, xd, lf->filter_level, 0, 0);
+#endif
   }
+#if CONFIG_LOOP_RESTORATION
+  if (cm->rst_info.restoration_type != RESTORE_NONE) {
+    vp10_loop_restoration_init(&cm->rst_internal, &cm->rst_info,
+                               cm->frame_type == KEY_FRAME);
+    vp10_loop_restoration_rows(cm->frame_to_show, cm, 0, cm->mi_rows, 0);
+  }
+#endif  // CONFIG_LOOP_RESTORATION
 
   vpx_extend_frame_inner_borders(cm->frame_to_show);
 }
@@ -2704,7 +3708,18 @@
 void vp10_scale_references(VP10_COMP *cpi) {
   VP10_COMMON *cm = &cpi->common;
   MV_REFERENCE_FRAME ref_frame;
-  const VP9_REFFRAME ref_mask[3] = {VP9_LAST_FLAG, VP9_GOLD_FLAG, VP9_ALT_FLAG};
+  const VP9_REFFRAME ref_mask[REFS_PER_FRAME] = {
+    VP9_LAST_FLAG,
+#if CONFIG_EXT_REFS
+    VP9_LAST2_FLAG,
+    VP9_LAST3_FLAG,
+#endif  // CONFIG_EXT_REFS
+    VP9_GOLD_FLAG,
+#if CONFIG_EXT_REFS
+    VP9_BWD_FLAG,
+#endif  // CONFIG_EXT_REFS
+    VP9_ALT_FLAG
+  };
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     // Need to convert from VP9_REFFRAME to index into ref_mask (subtract 1).
@@ -2740,7 +3755,8 @@
                                        cm->byte_alignment, NULL, NULL, NULL))
             vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
-          scale_and_extend_frame(ref, &new_fb_ptr->buf, (int)cm->bit_depth);
+          scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE,
+                                 (int)cm->bit_depth);
           cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
           alloc_frame_mvs(cm, new_fb);
         }
@@ -2765,11 +3781,37 @@
                                        cm->byte_alignment, NULL, NULL, NULL))
             vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                                "Failed to allocate frame buffer");
-          scale_and_extend_frame(ref, &new_fb_ptr->buf);
+          scale_and_extend_frame(ref, &new_fb_ptr->buf, MAX_MB_PLANE);
           cpi->scaled_ref_idx[ref_frame - 1] = new_fb;
           alloc_frame_mvs(cm, new_fb);
         }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+        if (cpi->sf.use_upsampled_references && (force_scaling ||
+            new_fb_ptr->buf.y_crop_width != cm->width ||
+            new_fb_ptr->buf.y_crop_height != cm->height)) {
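+          // The reference has just been (re)scaled, so any existing
+          // up-sampled copy no longer matches and is regenerated from the
+          // newly scaled frame.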
+          const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
+          EncRefCntBuffer *ubuf =
+              &cpi->upsampled_ref_bufs[cpi->upsampled_ref_idx[map_idx]];
+
+          if (vpx_realloc_frame_buffer(&ubuf->buf,
+                                       (cm->width << 3), (cm->height << 3),
+                                       cm->subsampling_x, cm->subsampling_y,
+#if CONFIG_VP9_HIGHBITDEPTH
+                                       cm->use_highbitdepth,
+#endif
+                                       (VP9_ENC_BORDER_IN_PIXELS << 3),
+                                       cm->byte_alignment,
+                                       NULL, NULL, NULL))
+            vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                               "Failed to allocate up-sampled frame buffer");
+#if CONFIG_VP9_HIGHBITDEPTH
+          scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, 1,
+                                 (int)cm->bit_depth);
+#else
+          scale_and_extend_frame(&new_fb_ptr->buf, &ubuf->buf, 1);
+#endif
+        }
       } else {
         const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
         RefCntBuffer *const buf = &pool->frame_bufs[buf_idx];
@@ -2791,10 +3833,17 @@
   if (cpi->oxcf.pass == 0) {
     // Only release scaled references under certain conditions:
     // if reference will be updated, or if scaled reference has same resolution.
-    int refresh[3];
+    int refresh[REFS_PER_FRAME];
     refresh[0] = (cpi->refresh_last_frame) ? 1 : 0;
+#if CONFIG_EXT_REFS
+    refresh[1] = refresh[2] = 0;
+    refresh[3] = (cpi->refresh_golden_frame) ? 1 : 0;
+    refresh[4] = (cpi->refresh_bwd_ref_frame) ? 1 : 0;
+    refresh[5] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
+#else
     refresh[1] = (cpi->refresh_golden_frame) ? 1 : 0;
     refresh[2] = (cpi->refresh_alt_ref_frame) ? 1 : 0;
+#endif  // CONFIG_EXT_REFS
     for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
       const int idx = cpi->scaled_ref_idx[i - 1];
       RefCntBuffer *const buf = idx != INVALID_IDX ?
@@ -2832,8 +3881,9 @@
   model_count[EOB_MODEL_TOKEN] = full_count[EOB_TOKEN];
 }
 
-static void full_to_model_counts(vp10_coeff_count_model *model_count,
-                                 vp10_coeff_count *full_count) {
+void vp10_full_to_model_counts(vp10_coeff_count_model *model_count,
+                               vp10_coeff_count *full_count) {
   int i, j, k, l;
 
   for (i = 0; i < PLANE_TYPES; ++i)
@@ -2851,7 +3901,7 @@
 
   vpx_clear_system_state();
 
-  recon_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+  recon_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 
   if (cpi->twopass.total_left_stats.coded_error != 0.0)
     fprintf(f, "%10u %dx%d  %10d %10d %d %d %10d %10d %10d %10d"
@@ -3019,7 +4069,7 @@
 
     // There has been a change in frame size.
     vp10_set_size_literal(cpi, oxcf->scaled_frame_width,
-                         oxcf->scaled_frame_height);
+                          oxcf->scaled_frame_height);
   }
 
   if (oxcf->pass == 0 &&
@@ -3067,7 +4117,7 @@
   init_motion_estimation(cpi);
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - 1];
+    RefBuffer *const ref_buf = &cm->frame_refs[ref_frame - LAST_FRAME];
     const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
 
     ref_buf->idx = buf_idx;
@@ -3096,9 +4146,28 @@
   set_ref_ptrs(cm, xd, LAST_FRAME, LAST_FRAME);
 }
 
+static void reset_use_upsampled_references(VP10_COMP *cpi) {
+  MV_REFERENCE_FRAME ref_frame;
+
+  // Reset the up-sampled reference buffer structure.
+  init_upsampled_ref_frame_bufs(cpi);
+
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const YV12_BUFFER_CONFIG *const ref = get_ref_frame_buffer(cpi,
+                                                               ref_frame);
+    int new_uidx = upsample_ref_frame(cpi, ref);
+
+    // Update the up-sampled reference index.
+    cpi->upsampled_ref_idx[get_ref_frame_map_idx(cpi, ref_frame)] =
+        new_uidx;
+    cpi->upsampled_ref_bufs[new_uidx].ref_count++;
+  }
+}
+
 static void encode_without_recode_loop(VP10_COMP *cpi) {
   VP10_COMMON *const cm = &cpi->common;
   int q = 0, bottom_index = 0, top_index = 0;  // Dummy variables.
+  const int use_upsampled_ref = cpi->sf.use_upsampled_references;
 
   vpx_clear_system_state();
 
@@ -3133,11 +4202,26 @@
   set_size_independent_vars(cpi);
   set_size_dependent_vars(cpi, &q, &bottom_index, &top_index);
 
+  // cpi->sf.use_upsampled_references can differ from frame to frame. Every
+  // time it changes from 0 to 1, the reference frames for this frame have to
+  // be up-sampled before encoding.
+  if (!use_upsampled_ref && cpi->sf.use_upsampled_references)
+    reset_use_upsampled_references(cpi);
+
   vp10_set_quantizer(cm, q);
   vp10_set_variance_partition_thresholds(cpi, q);
 
   setup_frame(cpi);
 
+#if CONFIG_ENTROPY
+  cm->do_subframe_update = cm->tile_cols == 1 && cm->tile_rows == 1;
+  vp10_copy(cm->starting_coef_probs, cm->fc->coef_probs);
+  vp10_copy(cpi->subframe_stats.enc_starting_coef_probs,
+            cm->fc->coef_probs);
+  cm->coef_probs_update_idx = 0;
+  vp10_copy(cpi->subframe_stats.coef_probs_buf[0], cm->fc->coef_probs);
+#endif  // CONFIG_ENTROPY
+
   suppress_active_map(cpi);
   // Variance adaptive and in frame q adjustment experiments are mutually
   // exclusive.
@@ -3180,9 +4264,16 @@
   int frame_over_shoot_limit;
   int frame_under_shoot_limit;
   int q = 0, q_low = 0, q_high = 0;
+  const int use_upsampled_ref = cpi->sf.use_upsampled_references;
 
   set_size_independent_vars(cpi);
 
+  // cpi->sf.use_upsampled_references can differ from frame to frame. Every
+  // time it changes from 0 to 1, the reference frames for this frame have to
+  // be up-sampled before encoding.
+  if (!use_upsampled_ref && cpi->sf.use_upsampled_references)
+    reset_use_upsampled_references(cpi);
+
   do {
     vpx_clear_system_state();
 
@@ -3215,7 +4306,7 @@
     }
 
     cpi->Source = vp10_scale_if_required(cm, cpi->un_scaled_source,
-                                      &cpi->scaled_source);
+                                         &cpi->scaled_source);
 
     if (cpi->unscaled_last_source != NULL)
       cpi->Last_Source = vp10_scale_if_required(cm, cpi->unscaled_last_source,
@@ -3233,6 +4324,43 @@
     if (loop_count == 0)
       setup_frame(cpi);
 
+#if CONFIG_ENTROPY
+    // Base q-index may have changed, so we need to assign proper default coef
+    // probs before every iteration.
+    if (frame_is_intra_only(cm) || cm->error_resilient_mode) {
+      int i;
+      vp10_default_coef_probs(cm);
+      if (cm->frame_type == KEY_FRAME || cm->error_resilient_mode ||
+          cm->reset_frame_context == RESET_FRAME_CONTEXT_ALL) {
+        for (i = 0; i < FRAME_CONTEXTS; ++i)
+          cm->frame_contexts[i] = *cm->fc;
+      } else if (cm->reset_frame_context == RESET_FRAME_CONTEXT_CURRENT) {
+        cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
+      }
+    }
+#endif  // CONFIG_ENTROPY
+
+#if CONFIG_ENTROPY
+    cm->do_subframe_update = cm->tile_cols == 1 && cm->tile_rows == 1;
+    if (loop_count == 0 || frame_is_intra_only(cm) ||
+        cm->error_resilient_mode) {
+      vp10_copy(cm->starting_coef_probs, cm->fc->coef_probs);
+      vp10_copy(cpi->subframe_stats.enc_starting_coef_probs,
+                cm->fc->coef_probs);
+    } else {
+      if (cm->do_subframe_update) {
+        vp10_copy(cm->fc->coef_probs,
+                  cpi->subframe_stats.enc_starting_coef_probs);
+        vp10_copy(cm->starting_coef_probs,
+                  cpi->subframe_stats.enc_starting_coef_probs);
+        vp10_zero(cpi->subframe_stats.coef_counts_buf);
+        vp10_zero(cpi->subframe_stats.eob_counts_buf);
+      }
+    }
+    cm->coef_probs_update_idx = 0;
+    vp10_copy(cpi->subframe_stats.coef_probs_buf[0], cm->fc->coef_probs);
+#endif  // CONFIG_ENTROPY
+
     // Variance adaptive and in frame q adjustment experiments are mutually
     // exclusive.
     if (cpi->oxcf.aq_mode == VARIANCE_AQ) {
@@ -3255,6 +4383,7 @@
     // to recode.
     if (cpi->sf.recode_loop >= ALLOW_RECODE_KFARFGF) {
       save_coding_context(cpi);
+
       vp10_pack_bitstream(cpi, dest, size);
 
       rc->projected_frame_size = (int)(*size) << 3;
@@ -3278,12 +4407,12 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
-          kf_err = vp10_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+          kf_err = vpx_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
         } else {
-          kf_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+          kf_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
         }
 #else
-        kf_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+        kf_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
         // Prevent possible divide by zero error below for perfect KF
@@ -3429,12 +4558,45 @@
 
 static int get_ref_frame_flags(const VP10_COMP *cpi) {
   const int *const map = cpi->common.ref_frame_map;
-  const int gold_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
-  const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
-  const int gold_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
-  int flags = VP9_ALT_FLAG | VP9_GOLD_FLAG | VP9_LAST_FLAG;
 
-  if (gold_is_last)
+#if CONFIG_EXT_REFS
+  const int last2_is_last =
+      map[cpi->lst_fb_idxes[1]] == map[cpi->lst_fb_idxes[0]];
+  const int last3_is_last =
+      map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[0]];
+  const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[0]];
+  const int bwd_is_last = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[0]];
+  const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idxes[0]];
+
+  const int last3_is_last2 =
+      map[cpi->lst_fb_idxes[2]] == map[cpi->lst_fb_idxes[1]];
+  const int gld_is_last2 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[1]];
+  const int bwd_is_last2 = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[1]];
+
+  const int gld_is_last3 = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idxes[2]];
+  const int bwd_is_last3 = map[cpi->bwd_fb_idx] == map[cpi->lst_fb_idxes[2]];
+
+  const int bwd_is_gld = map[cpi->bwd_fb_idx] == map[cpi->gld_fb_idx];
+
+  const int last2_is_alt = map[cpi->lst_fb_idxes[1]] == map[cpi->alt_fb_idx];
+  const int last3_is_alt = map[cpi->lst_fb_idxes[2]] == map[cpi->alt_fb_idx];
+  const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
+  const int bwd_is_alt = map[cpi->bwd_fb_idx] == map[cpi->alt_fb_idx];
+#else
+  const int gld_is_last = map[cpi->gld_fb_idx] == map[cpi->lst_fb_idx];
+  const int gld_is_alt = map[cpi->gld_fb_idx] == map[cpi->alt_fb_idx];
+  const int alt_is_last = map[cpi->alt_fb_idx] == map[cpi->lst_fb_idx];
+#endif  // CONFIG_EXT_REFS
+
+  int flags = VP9_REFFRAME_ALL;
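+  // When two reference slots alias the same physical buffer, the duplicate
+  // flag is masked off below so the same frame is not searched twice.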
+
+#if CONFIG_EXT_REFS
+  // Disable the use of BWDREF_FRAME for non-bipredictive frames.
+  if (!(cpi->rc.is_bipred_frame || cpi->rc.is_last_bipred_frame))
+    flags &= ~VP9_BWD_FLAG;
+#endif  // CONFIG_EXT_REFS
+
+  if (gld_is_last || gld_is_alt)
     flags &= ~VP9_GOLD_FLAG;
 
   if (cpi->rc.frames_till_gf_update_due == INT_MAX)
@@ -3443,8 +4605,20 @@
   if (alt_is_last)
     flags &= ~VP9_ALT_FLAG;
 
-  if (gold_is_alt)
-    flags &= ~VP9_ALT_FLAG;
+#if CONFIG_EXT_REFS
+  if (last2_is_last || last2_is_alt)
+    flags &= ~VP9_LAST2_FLAG;
+
+  if (last3_is_last || last3_is_last2 || last3_is_alt)
+    flags &= ~VP9_LAST3_FLAG;
+
+  if (gld_is_last2 || gld_is_last3)
+    flags &= ~VP9_GOLD_FLAG;
+
+  if ((bwd_is_last || bwd_is_last2 || bwd_is_last3 ||
+       bwd_is_gld || bwd_is_alt) && (flags & VP9_BWD_FLAG))
+    flags &= ~VP9_BWD_FLAG;
+#endif  // CONFIG_EXT_REFS
 
   return flags;
 }
@@ -3511,6 +4685,9 @@
       (cpi->rc.source_alt_ref_active && !cpi->refresh_alt_ref_frame);
   }
   cm->ref_frame_sign_bias[ALTREF_FRAME] = arf_sign_bias;
+#if CONFIG_EXT_REFS
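+  // BWDREF, like ALTREF, points to a frame ahead in display order, so it is
+  // presumed to share the ARF sign bias.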
+  cm->ref_frame_sign_bias[BWDREF_FRAME] = cm->ref_frame_sign_bias[ALTREF_FRAME];
+#endif  // CONFIG_EXT_REFS
 }
 
 static int setup_interp_filter_search_mask(VP10_COMP *cpi) {
@@ -3522,15 +4699,28 @@
       cpi->refresh_alt_ref_frame)
     return mask;
   for (ref = LAST_FRAME; ref <= ALTREF_FRAME; ++ref)
-    for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter)
+    for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter)
       ref_total[ref] += cpi->interp_filter_selected[ref][ifilter];
 
-  for (ifilter = EIGHTTAP; ifilter <= EIGHTTAP_SHARP; ++ifilter) {
+  for (ifilter = EIGHTTAP_REGULAR; ifilter < SWITCHABLE_FILTERS; ++ifilter) {
     if ((ref_total[LAST_FRAME] &&
         cpi->interp_filter_selected[LAST_FRAME][ifilter] == 0) &&
+#if CONFIG_EXT_REFS
+        (ref_total[LAST2_FRAME] == 0 ||
+         cpi->interp_filter_selected[LAST2_FRAME][ifilter] * 50
+         < ref_total[LAST2_FRAME]) &&
+        (ref_total[LAST3_FRAME] == 0 ||
+         cpi->interp_filter_selected[LAST3_FRAME][ifilter] * 50
+         < ref_total[LAST3_FRAME]) &&
+#endif  // CONFIG_EXT_REFS
         (ref_total[GOLDEN_FRAME] == 0 ||
          cpi->interp_filter_selected[GOLDEN_FRAME][ifilter] * 50
            < ref_total[GOLDEN_FRAME]) &&
+#if CONFIG_EXT_REFS
+        (ref_total[BWDREF_FRAME] == 0 ||
+         cpi->interp_filter_selected[BWDREF_FRAME][ifilter] * 50
+           < ref_total[BWDREF_FRAME]) &&
+#endif  // CONFIG_EXT_REFS
         (ref_total[ALTREF_FRAME] == 0 ||
          cpi->interp_filter_selected[ALTREF_FRAME][ifilter] * 50
            < ref_total[ALTREF_FRAME]))
@@ -3539,6 +4729,61 @@
   return mask;
 }
 
+#define DUMP_RECON_FRAMES 0
+
+#if DUMP_RECON_FRAMES == 1
+// NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+static void dump_filtered_recon_frames(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  const YV12_BUFFER_CONFIG *recon_buf = cm->frame_to_show;
+  int h;
+  char file_name[256] = "/tmp/enc_filtered_recon.yuv";
+  FILE *f_recon = NULL;
+
+  if (recon_buf == NULL || !cm->show_frame) {
+    printf("Frame %d is not ready or no show to dump.\n",
+           cm->current_video_frame);
+    return;
+  }
+
+  if (cm->current_video_frame == 0) {
+    if ((f_recon = fopen(file_name, "wb")) == NULL) {
+      printf("Unable to open file %s to write.\n", file_name);
+      return;
+    }
+  } else {
+    if ((f_recon = fopen(file_name, "ab")) == NULL) {
+      printf("Unable to open file %s to append.\n", file_name);
+      return;
+    }
+  }
+  printf("\nFrame=%5d, encode_update_type[%5d]=%1d, show_existing_frame=%d, "
+         "y_stride=%4d, uv_stride=%4d, width=%4d, height=%4d\n",
+         cm->current_video_frame, cpi->twopass.gf_group.index,
+         cpi->twopass.gf_group.update_type[cpi->twopass.gf_group.index],
+         cm->show_existing_frame,
+         recon_buf->y_stride, recon_buf->uv_stride, cm->width, cm->height);
+
+  // --- Y ---
+  for (h = 0; h < cm->height; ++h) {
+    fwrite(&recon_buf->y_buffer[h*recon_buf->y_stride],
+           1, cm->width, f_recon);
+  }
+  // --- U ---
+  for (h = 0; h < (cm->height >> 1); ++h) {
+    fwrite(&recon_buf->u_buffer[h*recon_buf->uv_stride],
+           1, (cm->width >> 1), f_recon);
+  }
+  // --- V ---
+  for (h = 0; h < (cm->height >> 1); ++h) {
+    fwrite(&recon_buf->v_buffer[h*recon_buf->uv_stride],
+           1, (cm->width >> 1), f_recon);
+  }
+
+  fclose(f_recon);
+}
+#endif  // DUMP_RECON_FRAMES
+
 static void encode_frame_to_data_rate(VP10_COMP *cpi,
                                       size_t *size,
                                       uint8_t *dest,
@@ -3554,6 +4799,65 @@
   // Set the arf sign bias for this frame.
   set_arf_sign_bias(cpi);
 
+#if CONFIG_EXT_REFS
+  // NOTE:
+  // (1) The setup of the ref_frame_flags is moved upfront, as it is
+  //     determined by the current frame properties;
+  // (2) The setup of the ref_frame_flags applies both to show_existing_frame
+  //     cases and to the other cases.
+  if (cm->current_video_frame > 0)
+    cpi->ref_frame_flags = get_ref_frame_flags(cpi);
+
+  if (cm->show_existing_frame) {
+    // NOTE(zoeliu): In BIDIR_PRED, the existing frame to show is the current
+    //               BWDREF_FRAME in the reference frame buffer.
+
+    cm->frame_type = INTER_FRAME;
+    cm->show_frame = 1;
+    cpi->frame_flags = *frame_flags;
+
+    cpi->refresh_last_frame = 0;
+    cpi->refresh_golden_frame = 0;
+    cpi->refresh_bwd_ref_frame = 0;
+    cpi->refresh_alt_ref_frame = 0;
+
+    cpi->rc.is_bwd_ref_frame = 0;
+    cpi->rc.is_last_bipred_frame = 0;
+    cpi->rc.is_bipred_frame = 0;
+
+    // Build the bitstream
+    vp10_pack_bitstream(cpi, dest, size);
+
+    // Set up frame to show to get ready for stats collection.
+    cm->frame_to_show = get_frame_new_buffer(cm);
+
+#if DUMP_RECON_FRAMES == 1
+    // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+    dump_filtered_recon_frames(cpi);
+#endif  // DUMP_RECON_FRAMES
+
+    // Update the LAST_FRAME in the reference frame buffer.
+    vp10_update_reference_frames(cpi);
+
+    // Update frame flags
+    cpi->frame_flags &= ~FRAMEFLAGS_GOLDEN;
+    cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
+    cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
+
+    *frame_flags = cpi->frame_flags & ~FRAMEFLAGS_KEY;
+
+    // Update the frame type
+    cm->last_frame_type = cm->frame_type;
+
+    cm->last_width = cm->width;
+    cm->last_height = cm->height;
+
+    ++cm->current_video_frame;
+
+    return;
+  }
+#endif  // CONFIG_EXT_REFS
+
   // Set default state for segment based loop filter update flags.
   cm->lf.mode_ref_delta_update = 0;
 
@@ -3581,7 +4885,7 @@
     // By default, encoder assumes decoder can use prev_mi.
     if (cm->error_resilient_mode) {
       cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
-      cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_OFF;
+      cm->refresh_frame_context = REFRESH_FRAME_CONTEXT_FORWARD;
     } else if (cm->intra_only) {
       // Only reset the current context.
       cm->reset_frame_context = RESET_FRAME_CONTEXT_CURRENT;
@@ -3619,13 +4923,14 @@
     vp10_write_yuv_frame_420(&cpi->denoiser.running_avg_y[INTRA_FRAME],
                             yuv_denoised_file);
   }
-#endif
-#endif
+#endif  // OUTPUT_YUV_DENOISED
+#endif  // CONFIG_VP9_TEMPORAL_DENOISING
+
 #ifdef OUTPUT_YUV_SKINMAP
   if (cpi->common.current_video_frame > 1) {
     vp10_compute_skin_map(cpi, yuv_skinmap_file);
   }
-#endif
+#endif  // OUTPUT_YUV_SKINMAP
 
   // Special case code to reduce pulsing when key frames are forced at a
   // fixed interval. Note the reconstruction error if it is the frame before
@@ -3633,19 +4938,20 @@
   if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cm->use_highbitdepth) {
-      cpi->ambient_err = vp10_highbd_get_y_sse(cpi->Source,
+      cpi->ambient_err = vpx_highbd_get_y_sse(cpi->Source,
                                               get_frame_new_buffer(cm));
     } else {
-      cpi->ambient_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+      cpi->ambient_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
     }
 #else
-    cpi->ambient_err = vp10_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+    cpi->ambient_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
   // If the encoder forced a KEY_FRAME decision
-  if (cm->frame_type == KEY_FRAME)
+  if (cm->frame_type == KEY_FRAME) {
     cpi->refresh_last_frame = 1;
+  }
 
   cm->frame_to_show = get_frame_new_buffer(cm);
   cm->frame_to_show->color_space = cm->color_space;
@@ -3653,32 +4959,50 @@
   cm->frame_to_show->render_width  = cm->render_width;
   cm->frame_to_show->render_height = cm->render_height;
 
+#if CONFIG_EXT_REFS
+  // TODO(zoeliu): For non-ref frames, loop filtering may need to be turned
+  // off.
+#endif  // CONFIG_EXT_REFS
+
   // Pick the loop filter level for the frame.
   loopfilter_frame(cpi, cm);
 
-  // build the bitstream
+  // Build the bitstream
   vp10_pack_bitstream(cpi, dest, size);
 
+#if DUMP_RECON_FRAMES == 1
+  // NOTE(zoeliu): For debug - Output the filtered reconstructed video.
+  if (cm->show_frame)
+    dump_filtered_recon_frames(cpi);
+#endif  // DUMP_RECON_FRAMES
+
+#if CONFIG_EXT_REFS
+  if (cpi->rc.is_last_bipred_frame) {
+    // NOTE: If the current frame is a LAST_BIPRED_FRAME, the BWDREF_FRAME
+    //       needs to be shown next.
+    cpi->existing_fb_idx_to_show = cpi->bwd_fb_idx;
+  }
+#endif  // CONFIG_EXT_REFS
+
   if (cm->seg.update_map)
     update_reference_segmentation_map(cpi);
 
   if (frame_is_intra_only(cm) == 0) {
     release_scaled_references(cpi);
   }
+
   vp10_update_reference_frames(cpi);
 
   for (t = TX_4X4; t <= TX_32X32; t++)
-    full_to_model_counts(cpi->td.counts->coef[t],
-                         cpi->td.rd_counts.coef_counts[t]);
+    vp10_full_to_model_counts(cpi->td.counts->coef[t],
+                              cpi->td.rd_counts.coef_counts[t]);
 
   if (cm->refresh_frame_context == REFRESH_FRAME_CONTEXT_BACKWARD) {
+#if CONFIG_ENTROPY
+    cm->partial_prob_update = 0;
+#endif  // CONFIG_ENTROPY
     vp10_adapt_coef_probs(cm);
-#if CONFIG_MISC_FIXES
     vp10_adapt_intra_frame_probs(cm);
-#else
-    if (!frame_is_intra_only(cm))
-      vp10_adapt_intra_frame_probs(cm);
-#endif
   }
 
   if (!frame_is_intra_only(cm)) {
@@ -3698,8 +5022,21 @@
   else
     cpi->frame_flags &= ~FRAMEFLAGS_ALTREF;
 
-  cpi->ref_frame_flags = get_ref_frame_flags(cpi);
+#if CONFIG_EXT_REFS
+  if (cpi->refresh_bwd_ref_frame == 1)
+    cpi->frame_flags |= FRAMEFLAGS_BWDREF;
+  else
+    cpi->frame_flags &= ~FRAMEFLAGS_BWDREF;
+#endif  // CONFIG_EXT_REFS
 
+#if !CONFIG_EXT_REFS
+  cpi->ref_frame_flags = get_ref_frame_flags(cpi);
+#endif  // !CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+  cm->last3_frame_type = cm->last2_frame_type;
+  cm->last2_frame_type = cm->last_frame_type;
+#endif  // CONFIG_EXT_REFS
   cm->last_frame_type = cm->frame_type;
 
   vp10_rc_postencode_update(cpi, *size);
@@ -3730,12 +5067,21 @@
     cm->last_show_frame = cm->show_frame;
 
   if (cm->show_frame) {
+#if CONFIG_EXT_REFS
+    // TODO(zoeliu): We may only swap mi and prev_mi for those frames that are
+    // being used as reference.
+#endif  // CONFIG_EXT_REFS
     vp10_swap_mi_and_prev_mi(cm);
     // Don't increment frame counters if this was an altref buffer
     // update not a real frame
     ++cm->current_video_frame;
   }
-  cm->prev_frame = cm->cur_frame;
+
+#if CONFIG_EXT_REFS
+  // NOTE: Do not refer to any frame that is not used as a reference.
+  if (cm->is_reference_frame)
+#endif  // CONFIG_EXT_REFS
+    cm->prev_frame = cm->cur_frame;
 }
 
 static void Pass0Encode(VP10_COMP *cpi, size_t *size, uint8_t *dest,
@@ -3753,7 +5099,11 @@
   cpi->allow_encode_breakout = ENCODE_BREAKOUT_ENABLED;
   encode_frame_to_data_rate(cpi, size, dest, frame_flags);
 
-  vp10_twopass_postencode_update(cpi);
+#if CONFIG_EXT_REFS
+  // Do not do the post-encoding update when show_existing_frame == 1.
+  if (!cpi->common.show_existing_frame)
+#endif  // CONFIG_EXT_REFS
+    vp10_twopass_postencode_update(cpi);
 }
 
 static void init_ref_frame_bufs(VP10_COMMON *cm) {
@@ -3869,8 +5219,11 @@
   return cm->frame_type == KEY_FRAME ||
          cpi->refresh_last_frame ||
          cpi->refresh_golden_frame ||
+#if CONFIG_EXT_REFS
+         cpi->refresh_bwd_ref_frame ||
+#endif  // CONFIG_EXT_REFS
          cpi->refresh_alt_ref_frame ||
-         cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF ||
+         !cm->error_resilient_mode ||
          cm->lf.mode_ref_delta_update ||
          cm->seg.update_map ||
          cm->seg.update_data;
@@ -3933,6 +5286,27 @@
   return arf_src_index;
 }
 
+#if CONFIG_EXT_REFS
+static int get_brf_src_index(VP10_COMP *cpi) {
+  int brf_src_index = 0;
+  const GF_GROUP *const gf_group = &cpi->twopass.gf_group;
+
+  // TODO(zoeliu): We need to add a check on the -bwd_ref command-line flag.
+  if (gf_group->bidir_pred_enabled[gf_group->index]) {
+    if (cpi->oxcf.pass == 2) {
+      if (gf_group->update_type[gf_group->index] == BRF_UPDATE)
+        brf_src_index = gf_group->brf_src_offset[gf_group->index];
+    } else {
+      // TODO(zoeliu): Revisit the setup for this scenario.
+      brf_src_index = cpi->rc.bipred_group_interval - 1;
+    }
+  }
+
+  return brf_src_index;
+}
+#endif  // CONFIG_EXT_REFS
+
 static void check_src_altref(VP10_COMP *cpi,
                              const struct lookahead_entry *source) {
   RATE_CONTROL *const rc = &cpi->rc;
@@ -3969,6 +5343,105 @@
   s->stat[ALL] += all;
   s->worst = VPXMIN(s->worst, all);
 }
+
+static void compute_internal_stats(VP10_COMP *cpi) {
+  VP10_COMMON *const cm = &cpi->common;
+  double samples = 0.0;
+  uint32_t in_bit_depth = 8;
+  uint32_t bit_depth = 8;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth) {
+    in_bit_depth = cpi->oxcf.input_bit_depth;
+    bit_depth = cm->bit_depth;
+  }
+#endif
+  if (cm->show_frame) {
+    const YV12_BUFFER_CONFIG *orig = cpi->Source;
+    const YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
+    double y, u, v, frame_all;
+
+    cpi->count++;
+    if (cpi->b_calculate_psnr) {
+      PSNR_STATS psnr;
+      double frame_ssim2 = 0.0, weight = 0.0;
+      vpx_clear_system_state();
+      // TODO(yaowu): unify these two versions into one.
+#if CONFIG_VP9_HIGHBITDEPTH
+      vpx_calc_highbd_psnr(orig, recon, &psnr, bit_depth, in_bit_depth);
+#else
+      vpx_calc_psnr(orig, recon, &psnr);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+      adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3],
+                        psnr.psnr[0], &cpi->psnr);
+      cpi->total_sq_error += psnr.sse[0];
+      cpi->total_samples += psnr.samples[0];
+      samples = psnr.samples[0];
+      // TODO(yaowu): unify these two versions into one.
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth)
+        frame_ssim2 = vpx_highbd_calc_ssim(orig, recon, &weight,
+                                           bit_depth, in_bit_depth);
+      else
+        frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
+#else
+      frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+      cpi->worst_ssim = VPXMIN(cpi->worst_ssim, frame_ssim2);
+      cpi->summed_quality += frame_ssim2 * weight;
+      cpi->summed_weights += weight;
+
+#if 0
+      {
+        // Stale debug code: y2, u2, v2 and frame_psnr2 are not computed in
+        // this function; fill them in before enabling this block.
+        FILE *f = fopen("q_used.stt", "a");
+        fprintf(f, "%5d : Y%7.3f:U%7.3f:V%7.3f:F%7.3f:S%7.3f\n",
+                cpi->common.current_video_frame, y2, u2, v2,
+                frame_psnr2, frame_ssim2);
+        fclose(f);
+      }
+#endif
+    }
+    if (cpi->b_calculate_blockiness) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!cm->use_highbitdepth)
+#endif
+      {
+        const double frame_blockiness = vp10_get_blockiness(
+            orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride,
+            orig->y_width, orig->y_height);
+        cpi->worst_blockiness =
+            VPXMAX(cpi->worst_blockiness, frame_blockiness);
+        cpi->total_blockiness += frame_blockiness;
+      }
+    }
+
+    // The consistency metric is computed independently of blockiness.
+    if (cpi->b_calculate_consistency) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (!cm->use_highbitdepth)
+#endif
+      {
+        const double this_inconsistency = vpx_get_ssim_metrics(
+            orig->y_buffer, orig->y_stride, recon->y_buffer, recon->y_stride,
+            orig->y_width, orig->y_height, cpi->ssim_vars, &cpi->metrics, 1);
+
+        const double peak = (double)((1 << in_bit_depth) - 1);
+        const double consistency = vpx_sse_to_psnr(
+            samples, peak, cpi->total_inconsistency);
+        if (consistency > 0.0)
+          cpi->worst_consistency =
+              VPXMIN(cpi->worst_consistency, consistency);
+        cpi->total_inconsistency += this_inconsistency;
+      }
+    }
+
+    frame_all = vpx_calc_fastssim(orig, recon, &y, &u, &v,
+                                  bit_depth, in_bit_depth);
+    adjust_image_stat(y, u, v, frame_all, &cpi->fastssim);
+    frame_all = vpx_psnrhvs(orig, recon, &y, &u, &v, bit_depth, in_bit_depth);
+    adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs);
+  }
+}
 #endif  // CONFIG_INTERNAL_STATS
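
For reference, the consistency figure above comes from vpx_sse_to_psnr(); a
minimal reimplementation of that mapping, assuming libvpx's usual 100 dB cap,
is:

#include <math.h>
#include <stdio.h>

#define MAX_PSNR 100.0  /* assumed cap, as in libvpx's psnr code */

/* PSNR = 10 * log10(samples * peak^2 / sse), capped as sse approaches 0. */
static double sse_to_psnr(double samples, double peak, double sse) {
  if (sse > 0.0) {
    const double psnr = 10.0 * log10(samples * peak * peak / sse);
    return psnr > MAX_PSNR ? MAX_PSNR : psnr;
  }
  return MAX_PSNR;
}

int main(void) {
  /* An 8-bit 1920x1080 luma plane with an accumulated SSE of 5e6. */
  printf("%.2f dB\n", sse_to_psnr(1920.0 * 1080.0, 255.0, 5e6));
  return 0;
}
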
 
 int vp10_get_compressed_data(VP10_COMP *cpi, unsigned int *frame_flags,
@@ -3983,6 +5456,9 @@
   struct lookahead_entry *last_source = NULL;
   struct lookahead_entry *source = NULL;
   int arf_src_index;
+#if CONFIG_EXT_REFS
+  int brf_src_index;
+#endif  // CONFIG_EXT_REFS
   int i;
 
   vpx_usec_timer_start(&cmptimer);
@@ -3999,17 +5475,68 @@
   // Normal defaults
   cm->reset_frame_context = RESET_FRAME_CONTEXT_NONE;
   cm->refresh_frame_context =
-      oxcf->error_resilient_mode ? REFRESH_FRAME_CONTEXT_OFF :
-          oxcf->frame_parallel_decoding_mode ? REFRESH_FRAME_CONTEXT_FORWARD
-                                             : REFRESH_FRAME_CONTEXT_BACKWARD;
+      (oxcf->error_resilient_mode || oxcf->frame_parallel_decoding_mode) ?
+          REFRESH_FRAME_CONTEXT_FORWARD : REFRESH_FRAME_CONTEXT_BACKWARD;
 
   cpi->refresh_last_frame = 1;
   cpi->refresh_golden_frame = 0;
+#if CONFIG_EXT_REFS
+  cpi->refresh_bwd_ref_frame = 0;
+#endif  // CONFIG_EXT_REFS
   cpi->refresh_alt_ref_frame = 0;
 
+#if CONFIG_EXT_REFS
+  if (oxcf->pass == 2 && cm->show_existing_frame) {
+    // Manage the source buffer and flush out the source frame that has
+    // already been coded; also prepare for the PSNR calculation if needed.
+    if ((source = vp10_lookahead_pop(cpi->lookahead, flush)) == NULL) {
+      *size = 0;
+      return -1;
+    }
+    cpi->Source = &source->img;
+
+    // TODO(zoeliu): Determine whether the frame rate needs adjusting here.
+    *time_stamp = source->ts_start;
+    *time_end = source->ts_end;
+
+    // Find a free buffer for the new frame, releasing the reference previously
+    // held.
+    if (cm->new_fb_idx != INVALID_IDX) {
+      --pool->frame_bufs[cm->new_fb_idx].ref_count;
+    }
+    cm->new_fb_idx = get_free_fb(cm);
+
+    if (cm->new_fb_idx == INVALID_IDX)
+      return -1;
+
+    // Clear down mmx registers
+    vpx_clear_system_state();
+
+    // Start with a 0 size frame.
+    *size = 0;
+
+    Pass2Encode(cpi, size, dest, frame_flags);
+
+    if (cpi->b_calculate_psnr)
+      generate_psnr_packet(cpi);
+
+#if CONFIG_INTERNAL_STATS
+    compute_internal_stats(cpi);
+    cpi->bytes += (int)(*size);
+#endif  // CONFIG_INTERNAL_STATS
+
+    // Clear down mmx registers
+    vpx_clear_system_state();
+
+    cm->show_existing_frame = 0;
+
+    return 0;
+  }
+#endif  // CONFIG_EXT_REFS
+
   // Should we encode an arf frame.
   arf_src_index = get_arf_src_index(cpi);
-
   if (arf_src_index) {
     for (i = 0; i <= arf_src_index; ++i) {
       struct lookahead_entry *e = vp10_lookahead_peek(cpi->lookahead, i);
@@ -4043,11 +5570,28 @@
       cpi->refresh_golden_frame = 0;
       cpi->refresh_last_frame = 0;
       rc->is_src_frame_alt_ref = 0;
-      rc->source_alt_ref_pending = 0;
-    } else {
-      rc->source_alt_ref_pending = 0;
+    }
+    rc->source_alt_ref_pending = 0;
+  }
+
+#if CONFIG_EXT_REFS
+  rc->is_bwd_ref_frame = 0;
+  brf_src_index = get_brf_src_index(cpi);
+  if (brf_src_index) {
+    assert(brf_src_index <= rc->frames_to_key);
+    if ((source = vp10_lookahead_peek(cpi->lookahead, brf_src_index)) != NULL) {
+      cm->show_frame = 0;
+      cm->intra_only = 0;
+
+      cpi->refresh_bwd_ref_frame = 1;
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+
+      rc->is_bwd_ref_frame = 1;
     }
   }
+#endif  // CONFIG_EXT_REFS
 
   if (!source) {
     // Get last frame source.
@@ -4096,9 +5640,8 @@
   vpx_clear_system_state();
 
   // adjust frame rates based on timestamps given
-  if (cm->show_frame) {
+  if (cm->show_frame)
     adjust_frame_rate(cpi, source);
-  }
 
   // Find a free buffer for the new frame, releasing the reference previously
   // held.
@@ -4147,7 +5690,7 @@
     Pass0Encode(cpi, size, dest, frame_flags);
   }
 
-  if (cm->refresh_frame_context != REFRESH_FRAME_CONTEXT_OFF)
+  if (!cm->error_resilient_mode)
     cm->frame_contexts[cm->frame_context_idx] = *cm->fc;
 
   // No frame encoded, or frame was dropped, release scaled references.
@@ -4166,178 +5709,25 @@
     generate_psnr_packet(cpi);
 
 #if CONFIG_INTERNAL_STATS
-
   if (oxcf->pass != 1) {
-    double samples = 0.0;
+    compute_internal_stats(cpi);
     cpi->bytes += (int)(*size);
-
-    if (cm->show_frame) {
-      cpi->count++;
-
-      if (cpi->b_calculate_psnr) {
-        YV12_BUFFER_CONFIG *orig = cpi->Source;
-        YV12_BUFFER_CONFIG *recon = cpi->common.frame_to_show;
-        YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
-        PSNR_STATS psnr;
-#if CONFIG_VP9_HIGHBITDEPTH
-        calc_highbd_psnr(orig, recon, &psnr, cpi->td.mb.e_mbd.bd,
-                         cpi->oxcf.input_bit_depth);
-#else
-        calc_psnr(orig, recon, &psnr);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-        adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3],
-                          psnr.psnr[0], &cpi->psnr);
-        cpi->total_sq_error += psnr.sse[0];
-        cpi->total_samples += psnr.samples[0];
-        samples = psnr.samples[0];
-
-        {
-          PSNR_STATS psnr2;
-          double frame_ssim2 = 0, weight = 0;
-#if CONFIG_VP9_POSTPROC
-          if (vpx_alloc_frame_buffer(&cm->post_proc_buffer,
-                                     recon->y_crop_width, recon->y_crop_height,
-                                     cm->subsampling_x, cm->subsampling_y,
-#if CONFIG_VP9_HIGHBITDEPTH
-                                     cm->use_highbitdepth,
-#endif
-                                     VP9_ENC_BORDER_IN_PIXELS,
-                                     cm->byte_alignment) < 0) {
-            vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                               "Failed to allocate post processing buffer");
-          }
-
-          vp10_deblock(cm->frame_to_show, &cm->post_proc_buffer,
-                      cm->lf.filter_level * 10 / 6);
-#endif
-          vpx_clear_system_state();
-
-#if CONFIG_VP9_HIGHBITDEPTH
-          calc_highbd_psnr(orig, pp, &psnr2, cpi->td.mb.e_mbd.bd,
-                           cpi->oxcf.input_bit_depth);
-#else
-          calc_psnr(orig, pp, &psnr2);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-          cpi->totalp_sq_error += psnr2.sse[0];
-          cpi->totalp_samples += psnr2.samples[0];
-          adjust_image_stat(psnr2.psnr[1], psnr2.psnr[2], psnr2.psnr[3],
-                            psnr2.psnr[0], &cpi->psnrp);
-
-#if CONFIG_VP9_HIGHBITDEPTH
-          if (cm->use_highbitdepth) {
-            frame_ssim2 = vpx_highbd_calc_ssim(orig, recon, &weight,
-                                               (int)cm->bit_depth);
-          } else {
-            frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
-          }
-#else
-          frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-          cpi->worst_ssim= VPXMIN(cpi->worst_ssim, frame_ssim2);
-          cpi->summed_quality += frame_ssim2 * weight;
-          cpi->summed_weights += weight;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-          if (cm->use_highbitdepth) {
-            frame_ssim2 = vpx_highbd_calc_ssim(
-                orig, &cm->post_proc_buffer, &weight, (int)cm->bit_depth);
-          } else {
-            frame_ssim2 = vpx_calc_ssim(orig, &cm->post_proc_buffer, &weight);
-          }
-#else
-          frame_ssim2 = vpx_calc_ssim(orig, &cm->post_proc_buffer, &weight);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-          cpi->summedp_quality += frame_ssim2 * weight;
-          cpi->summedp_weights += weight;
-#if 0
-          {
-            FILE *f = fopen("q_used.stt", "a");
-            fprintf(f, "%5d : Y%f7.3:U%f7.3:V%f7.3:F%f7.3:S%7.3f\n",
-                    cpi->common.current_video_frame, y2, u2, v2,
-                    frame_psnr2, frame_ssim2);
-            fclose(f);
-          }
-#endif
-        }
-      }
-      if (cpi->b_calculate_blockiness) {
-#if CONFIG_VP9_HIGHBITDEPTH
-        if (!cm->use_highbitdepth)
-#endif
-        {
-          double frame_blockiness = vp10_get_blockiness(
-              cpi->Source->y_buffer, cpi->Source->y_stride,
-              cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
-              cpi->Source->y_width, cpi->Source->y_height);
-          cpi->worst_blockiness =
-              VPXMAX(cpi->worst_blockiness, frame_blockiness);
-          cpi->total_blockiness += frame_blockiness;
-        }
-      }
-
-      if (cpi->b_calculate_consistency) {
-#if CONFIG_VP9_HIGHBITDEPTH
-        if (!cm->use_highbitdepth)
-#endif
-        {
-          double this_inconsistency = vpx_get_ssim_metrics(
-              cpi->Source->y_buffer, cpi->Source->y_stride,
-              cm->frame_to_show->y_buffer, cm->frame_to_show->y_stride,
-              cpi->Source->y_width, cpi->Source->y_height, cpi->ssim_vars,
-              &cpi->metrics, 1);
-
-          const double peak = (double)((1 << cpi->oxcf.input_bit_depth) - 1);
-          double consistency = vpx_sse_to_psnr(samples, peak,
-                                             (double)cpi->total_inconsistency);
-          if (consistency > 0.0)
-            cpi->worst_consistency =
-                VPXMIN(cpi->worst_consistency, consistency);
-          cpi->total_inconsistency += this_inconsistency;
-        }
-      }
-
-      if (cpi->b_calculate_ssimg) {
-        double y, u, v, frame_all;
-#if CONFIG_VP9_HIGHBITDEPTH
-        if (cm->use_highbitdepth) {
-          frame_all = vpx_highbd_calc_ssimg(cpi->Source, cm->frame_to_show, &y,
-                                            &u, &v, (int)cm->bit_depth);
-        } else {
-          frame_all = vpx_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u,
-                                     &v);
-        }
-#else
-        frame_all = vpx_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-        adjust_image_stat(y, u, v, frame_all, &cpi->ssimg);
-      }
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (!cm->use_highbitdepth)
-#endif
-      {
-        double y, u, v, frame_all;
-        frame_all = vpx_calc_fastssim(cpi->Source, cm->frame_to_show, &y, &u,
-                                      &v);
-        adjust_image_stat(y, u, v, frame_all, &cpi->fastssim);
-        /* TODO(JBB): add 10/12 bit support */
-      }
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (!cm->use_highbitdepth)
-#endif
-      {
-        double y, u, v, frame_all;
-        frame_all = vpx_psnrhvs(cpi->Source, cm->frame_to_show, &y, &u, &v);
-        adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs);
-      }
-    }
   }
-#endif
+#endif  // CONFIG_INTERNAL_STATS
 
   vpx_clear_system_state();
+
+#if CONFIG_EXT_REFS
+  if (cpi->rc.is_last_bipred_frame) {
+    // NOTE(zoeliu): If the current frame is the last bi-predictive frame,
+    //               the BWDREF_FRAME needs to be shown next.
+    cpi->rc.is_last_bipred_frame = 0;
+    cm->show_existing_frame = 1;
+  } else {
+    cm->show_existing_frame = 0;
+  }
+#endif  // CONFIG_EXT_REFS
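
Taken together with the show_existing_frame early-return block added near the
top of vp10_get_compressed_data(), the flag set here arms a simple two-call
sequence. A hedged, self-contained sketch of the caller-visible behaviour (all
names invented for illustration):

#include <stdio.h>

/* Illustrative only: after coding the last bi-predictive frame of a group,
 * the next call emits the already-coded BWDREF buffer as a "show existing
 * frame" and codes nothing new (*size stays 0 on that call). */
struct mock_enc { int show_existing_frame; int is_last_bipred_frame; };

static const char *mock_get_compressed_data(struct mock_enc *enc) {
  if (enc->show_existing_frame) {
    enc->show_existing_frame = 0;  /* consumed: display BWDREF, size == 0 */
    return "show existing BWDREF";
  }
  if (enc->is_last_bipred_frame) {
    enc->is_last_bipred_frame = 0;
    enc->show_existing_frame = 1;  /* arm the next call */
    return "code last bi-predictive frame";
  }
  return "code a regular frame";
}

int main(void) {
  struct mock_enc enc = { 0, 1 };
  for (int i = 0; i < 3; ++i) puts(mock_get_compressed_data(&enc));
  return 0;
}
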
+
   return 0;
 }
 
@@ -4371,6 +5761,15 @@
   }
 }
 
+int vp10_get_last_show_frame(VP10_COMP *cpi, YV12_BUFFER_CONFIG *frame) {
+  if (cpi->last_show_frame_buf_idx == INVALID_IDX)
+    return -1;
+
+  *frame =
+      cpi->common.buffer_pool->frame_bufs[cpi->last_show_frame_buf_idx].buf;
+  return 0;
+}
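
A hedged usage sketch for the new accessor; the surrounding caller is invented
for illustration, and note that the returned YV12_BUFFER_CONFIG aliases the
encoder's own buffer rather than copying it:

/* Illustrative fragment, assuming a valid VP10_COMP *cpi is in scope. */
YV12_BUFFER_CONFIG last_shown;
if (vp10_get_last_show_frame(cpi, &last_shown) == 0) {
  /* last_shown.y_buffer / last_shown.y_stride now point into the encoder's
   * frame buffer pool; copy the data out before the next encode call if it
   * must outlive this frame. */
} else {
  /* Returns -1 while last_show_frame_buf_idx is still INVALID_IDX, i.e.
   * before any frame has been shown. */
}
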
+
 int vp10_set_internal_size(VP10_COMP *cpi,
                           VPX_SCALING horiz_mode, VPX_SCALING vert_mode) {
   VP10_COMMON *cm = &cpi->common;
@@ -4429,28 +5828,6 @@
   return 0;
 }
 
-int64_t vp10_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                      const YV12_BUFFER_CONFIG *b) {
-  assert(a->y_crop_width == b->y_crop_width);
-  assert(a->y_crop_height == b->y_crop_height);
-
-  return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
-                 a->y_crop_width, a->y_crop_height);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-int64_t vp10_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b) {
-  assert(a->y_crop_width == b->y_crop_width);
-  assert(a->y_crop_height == b->y_crop_height);
-  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-
-  return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
-                        a->y_crop_width, a->y_crop_height);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 int vp10_get_quantizer(VP10_COMP *cpi) {
   return cpi->common.base_qindex;
 }
@@ -4458,10 +5835,15 @@
 void vp10_apply_encoding_flags(VP10_COMP *cpi, vpx_enc_frame_flags_t flags) {
   if (flags & (VP8_EFLAG_NO_REF_LAST | VP8_EFLAG_NO_REF_GF |
                VP8_EFLAG_NO_REF_ARF)) {
-    int ref = 7;
+    int ref = VP9_REFFRAME_ALL;
 
-    if (flags & VP8_EFLAG_NO_REF_LAST)
+    if (flags & VP8_EFLAG_NO_REF_LAST) {
       ref ^= VP9_LAST_FLAG;
+#if CONFIG_EXT_REFS
+      ref ^= VP9_LAST2_FLAG;
+      ref ^= VP9_LAST3_FLAG;
+#endif  // CONFIG_EXT_REFS
+    }
 
     if (flags & VP8_EFLAG_NO_REF_GF)
       ref ^= VP9_GOLD_FLAG;
@@ -4475,10 +5857,15 @@
   if (flags & (VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_UPD_GF |
                VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_FORCE_GF |
                VP8_EFLAG_FORCE_ARF)) {
-    int upd = 7;
+    int upd = VP9_REFFRAME_ALL;
 
-    if (flags & VP8_EFLAG_NO_UPD_LAST)
+    if (flags & VP8_EFLAG_NO_UPD_LAST) {
       upd ^= VP9_LAST_FLAG;
+#if CONFIG_EXT_REFS
+      upd ^= VP9_LAST2_FLAG;
+      upd ^= VP9_LAST3_FLAG;
+#endif  // CONFIG_EXT_REFS
+    }
 
     if (flags & VP8_EFLAG_NO_UPD_GF)
       upd ^= VP9_GOLD_FLAG;
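
The XOR-clearing above relies on every bit starting out set in
VP9_REFFRAME_ALL. A standalone sketch with assumed one-hot flag values (the
real constants live in the vp10 headers and may differ, especially under
CONFIG_EXT_REFS):

#include <stdio.h>

/* Assumed one-hot layout, for illustration only. */
#define MOCK_LAST_FLAG    (1 << 0)
#define MOCK_LAST2_FLAG   (1 << 1)
#define MOCK_LAST3_FLAG   (1 << 2)
#define MOCK_GOLD_FLAG    (1 << 3)
#define MOCK_BWD_FLAG     (1 << 4)
#define MOCK_ALT_FLAG     (1 << 5)
#define MOCK_REFFRAME_ALL 0x3f

int main(void) {
  int ref = MOCK_REFFRAME_ALL;
  /* VP8_EFLAG_NO_REF_LAST with EXT_REFS: drop all three LAST variants.
   * XOR acts as "clear" here only because the bits start out set. */
  ref ^= MOCK_LAST_FLAG;
  ref ^= MOCK_LAST2_FLAG;
  ref ^= MOCK_LAST3_FLAG;
  printf("remaining mask: 0x%x\n", ref);  /* GOLD | BWD | ALT = 0x38 */
  return 0;
}
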
diff --git a/vp10/encoder/encoder.h b/vp10/encoder/encoder.h
index 2cd1d3c..d7c62b2 100644
--- a/vp10/encoder/encoder.h
+++ b/vp10/encoder/encoder.h
@@ -23,6 +23,9 @@
 #include "vp10/common/onyxc_int.h"
 
 #include "vp10/encoder/aq_cyclicrefresh.h"
+#if CONFIG_ANS
+#include "vp10/encoder/buf_ans.h"
+#endif
 #include "vp10/encoder/context_tree.h"
 #include "vp10/encoder/encodemb.h"
 #include "vp10/encoder/firstpass.h"
@@ -34,6 +37,7 @@
 #include "vp10/encoder/rd.h"
 #include "vp10/encoder/speed_features.h"
 #include "vp10/encoder/tokenize.h"
+#include "vp10/encoder/variance_tree.h"
 
 #if CONFIG_VP9_TEMPORAL_DENOISING
 #include "vp10/encoder/denoiser.h"
@@ -55,8 +59,10 @@
   int nmvcosts[2][MV_VALS];
   int nmvcosts_hp[2][MV_VALS];
 
-#if !CONFIG_MISC_FIXES
-  vpx_prob segment_pred_probs[PREDICTION_PROBS];
+#if CONFIG_REF_MV
+  int nmv_vec_cost[NMV_CONTEXTS][MV_JOINTS];
+  int nmv_costs[NMV_CONTEXTS][2][MV_VALS];
+  int nmv_costs_hp[NMV_CONTEXTS][2][MV_VALS];
 #endif
 
   unsigned char *last_frame_seg_map_copy;
@@ -69,7 +75,6 @@
   FRAME_CONTEXT fc;
 } CODING_CONTEXT;
 
-
 typedef enum {
   // encode_breakout is disabled.
   ENCODE_BREAKOUT_DISABLED = 0,
@@ -105,7 +110,12 @@
 typedef enum {
   FRAMEFLAGS_KEY    = 1 << 0,
   FRAMEFLAGS_GOLDEN = 1 << 1,
+#if CONFIG_EXT_REFS
+  FRAMEFLAGS_BWDREF = 1 << 2,
+  FRAMEFLAGS_ALTREF = 1 << 3,
+#else
   FRAMEFLAGS_ALTREF = 1 << 2,
+#endif  // CONFIG_EXT_REFS
 } FRAMETYPE_FLAGS;
 
 typedef enum {
@@ -191,6 +201,9 @@
   // ----------------------------------------------------------------
 
   int enable_auto_arf;
+#if CONFIG_EXT_REFS
+  int enable_auto_brf;  // (b)ackward (r)ef (f)rame
+#endif  // CONFIG_EXT_REFS
 
   int encode_breakout;  // early breakout : for video conf recommend 800
 
@@ -233,6 +246,10 @@
   int color_range;
   int render_width;
   int render_height;
+
+#if CONFIG_EXT_PARTITION
+  vpx_superblock_size_t superblock_size;
+#endif  // CONFIG_EXT_PARTITION
 } VP10EncoderConfig;
 
 static INLINE int is_lossless_requested(const VP10EncoderConfig *cfg) {
@@ -249,7 +266,6 @@
 typedef struct RD_COUNTS {
   vp10_coeff_count coef_counts[TX_SIZES][PLANE_TYPES];
   int64_t comp_pred_diff[REFERENCE_MODES];
-  int64_t filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   int m_search_count;
   int ex_search_count;
 } RD_COUNTS;
@@ -261,7 +277,10 @@
 
   PICK_MODE_CONTEXT *leaf_tree;
   PC_TREE *pc_tree;
-  PC_TREE *pc_root;
+  PC_TREE *pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
+
+  VAR_TREE *var_tree;
+  VAR_TREE *var_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2 + 1];
 } ThreadData;
 
 struct EncWorkerData;
@@ -284,12 +303,41 @@
   double worst;
 } ImageStat;
 
+typedef struct {
+  int ref_count;
+  YV12_BUFFER_CONFIG buf;
+} EncRefCntBuffer;
+
+#if CONFIG_ENTROPY
+typedef struct SUBFRAME_STATS {
+  vp10_coeff_probs_model
+  coef_probs_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES];
+  vp10_coeff_count
+  coef_counts_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES];
+  unsigned int
+  eob_counts_buf[COEF_PROBS_BUFS]
+                [TX_SIZES][PLANE_TYPES][REF_TYPES][COEF_BANDS][COEFF_CONTEXTS];
+  vp10_coeff_probs_model enc_starting_coef_probs[TX_SIZES][PLANE_TYPES];
+} SUBFRAME_STATS;
+#endif  // CONFIG_ENTROPY
+
+typedef struct TileBufferEnc {
+  uint8_t *data;
+  size_t size;
+} TileBufferEnc;
+
 typedef struct VP10_COMP {
   QUANTS quants;
   ThreadData td;
   MB_MODE_INFO_EXT *mbmi_ext_base;
-  DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);
-  DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);
+  DECLARE_ALIGNED(16, int16_t, y_dequant[QINDEX_RANGE][8]);   // 8: SIMD width
+  DECLARE_ALIGNED(16, int16_t, uv_dequant[QINDEX_RANGE][8]);  // 8: SIMD width
+#if CONFIG_NEW_QUANT
+  DECLARE_ALIGNED(16, dequant_val_type_nuq,
+                  y_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]);
+  DECLARE_ALIGNED(16, dequant_val_type_nuq,
+                  uv_dequant_val_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]);
+#endif  // CONFIG_NEW_QUANT
   VP10_COMMON common;
   VP10EncoderConfig oxcf;
   struct lookahead_ctx    *lookahead;
@@ -302,19 +350,32 @@
   YV12_BUFFER_CONFIG *unscaled_last_source;
   YV12_BUFFER_CONFIG scaled_last_source;
 
-  TileDataEnc *tile_data;
-  int allocated_tiles;  // Keep track of memory allocated for tiles.
+  // Up-sampled reference buffers
+  EncRefCntBuffer upsampled_ref_bufs[MAX_REF_FRAMES];
+  int upsampled_ref_idx[MAX_REF_FRAMES];
 
   // For a still frame, this flag is set to 1 to skip partition search.
   int partition_search_skippable_frame;
 
   int scaled_ref_idx[MAX_REF_FRAMES];
+#if CONFIG_EXT_REFS
+  int lst_fb_idxes[LAST_REF_FRAMES];
+#else
   int lst_fb_idx;
+#endif  // CONFIG_EXT_REFS
   int gld_fb_idx;
+#if CONFIG_EXT_REFS
+  int bwd_fb_idx;  // BWD_REF_FRAME
+#endif  // CONFIG_EXT_REFS
   int alt_fb_idx;
 
+  int last_show_frame_buf_idx;  // buffer index of the last shown frame
+
   int refresh_last_frame;
   int refresh_golden_frame;
+#if CONFIG_EXT_REFS
+  int refresh_bwd_ref_frame;
+#endif  // CONFIG_EXT_REFS
   int refresh_alt_ref_frame;
 
   int ext_refresh_frame_flags_pending;
@@ -326,9 +387,9 @@
   int ext_refresh_frame_context;
 
   YV12_BUFFER_CONFIG last_frame_uf;
-
-  TOKENEXTRA *tile_tok[4][1 << 6];
-  unsigned int tok_count[4][1 << 6];
+#if CONFIG_LOOP_RESTORATION
+  YV12_BUFFER_CONFIG last_frame_db;
+#endif  // CONFIG_LOOP_RESTORATION
 
   // Ambient reconstruction err target for force key frames
   int64_t ambient_err;
@@ -337,6 +398,11 @@
 
   CODING_CONTEXT coding_context;
 
+#if CONFIG_REF_MV
+  int *nmv_costs[NMV_CONTEXTS][2];
+  int *nmv_costs_hp[NMV_CONTEXTS][2];
+#endif
+
   int *nmvcosts[2];
   int *nmvcosts_hp[2];
   int *nmvsadcosts[2];
@@ -372,7 +438,7 @@
   // clips, and 300 for < HD clips.
   int encode_breakout;
 
-  unsigned char *segmentation_map;
+  uint8_t *segmentation_map;
 
   // segment threshold for encode breakout
   int  segment_encode_breakout[MAX_SEGMENTS];
@@ -381,9 +447,9 @@
   ActiveMap active_map;
 
   fractional_mv_step_fp *find_fractional_mv_step;
-  vp10_full_search_fn_t full_search_sad;
+  vp10_full_search_fn_t full_search_sad;  // Currently unused.
   vp10_diamond_search_fn_t diamond_search_sad;
-  vp9_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
+  vp10_variance_fn_ptr_t fn_ptr[BLOCK_SIZES];
   uint64_t time_receive_data;
   uint64_t time_compress_data;
   uint64_t time_pick_lpf;
@@ -397,7 +463,6 @@
 
   YV12_BUFFER_CONFIG alt_ref_buffer;
 
-
 #if CONFIG_INTERNAL_STATS
   unsigned int mode_chosen_counts[MAX_MODES];
 
@@ -406,28 +471,19 @@
   uint64_t total_samples;
   ImageStat psnr;
 
-  uint64_t totalp_sq_error;
-  uint64_t totalp_samples;
-  ImageStat psnrp;
-
   double total_blockiness;
   double worst_blockiness;
 
   int    bytes;
   double summed_quality;
   double summed_weights;
-  double summedp_quality;
-  double summedp_weights;
   unsigned int tot_recode_hits;
   double worst_ssim;
 
-  ImageStat ssimg;
   ImageStat fastssim;
   ImageStat psnrhvs;
 
-  int b_calculate_ssimg;
   int b_calculate_blockiness;
-
   int b_calculate_consistency;
 
   double total_inconsistency;
@@ -456,19 +512,65 @@
 
   search_site_config ss_cfg;
 
-  int mbmode_cost[INTRA_MODES];
+  int mbmode_cost[BLOCK_SIZE_GROUPS][INTRA_MODES];
+#if CONFIG_REF_MV
+  int newmv_mode_cost[NEWMV_MODE_CONTEXTS][2];
+  int zeromv_mode_cost[ZEROMV_MODE_CONTEXTS][2];
+  int refmv_mode_cost[REFMV_MODE_CONTEXTS][2];
+  int drl_mode_cost0[DRL_MODE_CONTEXTS][2];
+#if CONFIG_EXT_INTER
+  int new2mv_mode_cost[2];
+#endif  // CONFIG_EXT_INTER
+#endif
+
   unsigned int inter_mode_cost[INTER_MODE_CONTEXTS][INTER_MODES];
+#if CONFIG_EXT_INTER
+  unsigned int inter_compound_mode_cost[INTER_MODE_CONTEXTS]
+                                       [INTER_COMPOUND_MODES];
+  unsigned int interintra_mode_cost[BLOCK_SIZE_GROUPS][INTERINTRA_MODES];
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+  int motvar_cost[BLOCK_SIZES][MOTION_VARIATIONS];
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
   int intra_uv_mode_cost[INTRA_MODES][INTRA_MODES];
   int y_mode_costs[INTRA_MODES][INTRA_MODES][INTRA_MODES];
   int switchable_interp_costs[SWITCHABLE_FILTER_CONTEXTS][SWITCHABLE_FILTERS];
+#if CONFIG_EXT_PARTITION_TYPES
+  int partition_cost[PARTITION_CONTEXTS][EXT_PARTITION_TYPES];
+#else
   int partition_cost[PARTITION_CONTEXTS][PARTITION_TYPES];
+#endif
+  int palette_y_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
+  int palette_uv_size_cost[PALETTE_BLOCK_SIZES][PALETTE_SIZES];
+  int palette_y_color_cost[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS]
+                                                 [PALETTE_COLORS];
+  int palette_uv_color_cost[PALETTE_MAX_SIZE - 1][PALETTE_COLOR_CONTEXTS]
+                                                  [PALETTE_COLORS];
+  int tx_size_cost[TX_SIZES - 1][TX_SIZE_CONTEXTS][TX_SIZES];
+#if CONFIG_EXT_TX
+  int inter_tx_type_costs[EXT_TX_SETS_INTER][EXT_TX_SIZES][TX_TYPES];
+  int intra_tx_type_costs[EXT_TX_SETS_INTRA][EXT_TX_SIZES][INTRA_MODES]
+                                                          [TX_TYPES];
+#else
+  int intra_tx_type_costs[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
+  int inter_tx_type_costs[EXT_TX_SIZES][TX_TYPES];
+#endif  // CONFIG_EXT_TX
+#if CONFIG_EXT_INTRA
+  int intra_filter_cost[INTRA_FILTERS + 1][INTRA_FILTERS];
+#endif  // CONFIG_EXT_INTRA
 
   int multi_arf_allowed;
   int multi_arf_enabled;
   int multi_arf_last_grp_enabled;
 
-  int intra_tx_type_costs[EXT_TX_SIZES][TX_TYPES][TX_TYPES];
-  int inter_tx_type_costs[EXT_TX_SIZES][TX_TYPES];
+  TileDataEnc *tile_data;
+  int allocated_tiles;  // Keep track of memory allocated for tiles.
+
+  TOKENEXTRA *tile_tok[MAX_TILE_ROWS][MAX_TILE_COLS];
+  unsigned int tok_count[MAX_TILE_ROWS][MAX_TILE_COLS];
+
+  TileBufferEnc tile_buffers[MAX_TILE_ROWS][MAX_TILE_COLS];
+
 #if CONFIG_VP9_TEMPORAL_DENOISING
   VP9_DENOISER denoiser;
 #endif
@@ -482,18 +584,37 @@
   int resize_count;
 
   // VAR_BASED_PARTITION thresholds
-  // 0 - threshold_64x64; 1 - threshold_32x32;
-  // 2 - threshold_16x16; 3 - vbp_threshold_8x8;
-  int64_t vbp_thresholds[4];
+  // 0 - threshold_128x128;
+  // 1 - threshold_64x64;
+  // 2 - threshold_32x32;
+  // 3 - threshold_16x16;
+  // 4 - threshold_8x8;
+  int64_t vbp_thresholds[5];
   int64_t vbp_threshold_minmax;
   int64_t vbp_threshold_sad;
   BLOCK_SIZE vbp_bsize_min;
 
+  // VARIANCE_AQ segment map refresh
+  int vaq_refresh;
+
   // Multi-threading
   int num_workers;
   VPxWorker *workers;
   struct EncWorkerData *tile_thr_data;
   VP9LfSync lf_row_sync;
+#if CONFIG_ENTROPY
+  SUBFRAME_STATS subframe_stats;
+  // TODO(yaowu): minimize the size of count buffers
+  SUBFRAME_STATS wholeframe_stats;
+  vp10_coeff_stats branch_ct_buf[COEF_PROBS_BUFS][TX_SIZES][PLANE_TYPES];
+#endif  // CONFIG_ENTROPY
+#if CONFIG_ANS
+  struct BufAnsCoder buf_ans;
+#endif
+#if CONFIG_EXT_REFS
+  int refresh_frame_mask;
+  int existing_fb_idx_to_show;
+#endif  // CONFIG_EXT_REFS
 } VP10_COMP;
 
 void vp10_initialize_enc(void);
@@ -517,6 +638,8 @@
 int vp10_get_preview_raw_frame(VP10_COMP *cpi, YV12_BUFFER_CONFIG *dest,
                               vp10_ppflags_t *flags);
 
+int vp10_get_last_show_frame(VP10_COMP *cpi, YV12_BUFFER_CONFIG *frame);
+
 int vp10_use_as_reference(VP10_COMP *cpi, int ref_frame_flags);
 
 void vp10_update_reference(VP10_COMP *cpi, int ref_frame_flags);
@@ -541,6 +664,9 @@
 
 int vp10_get_quantizer(struct VP10_COMP *cpi);
 
+void vp10_full_to_model_counts(vp10_coeff_count_model *model_count,
+                               vp10_coeff_count *full_count);
+
 static INLINE int frame_is_kf_gf_arf(const VP10_COMP *cpi) {
   return frame_is_intra_only(&cpi->common) ||
          cpi->refresh_alt_ref_frame ||
@@ -549,17 +675,25 @@
 
 static INLINE int get_ref_frame_map_idx(const VP10_COMP *cpi,
                                         MV_REFERENCE_FRAME ref_frame) {
-  if (ref_frame == LAST_FRAME) {
+#if CONFIG_EXT_REFS
+  if (ref_frame >= LAST_FRAME && ref_frame <= LAST3_FRAME)
+    return cpi->lst_fb_idxes[ref_frame - LAST_FRAME];
+#else
+  if (ref_frame == LAST_FRAME)
     return cpi->lst_fb_idx;
-  } else if (ref_frame == GOLDEN_FRAME) {
+#endif  // CONFIG_EXT_REFS
+  else if (ref_frame == GOLDEN_FRAME)
     return cpi->gld_fb_idx;
-  } else {
+#if CONFIG_EXT_REFS
+  else if (ref_frame == BWDREF_FRAME)
+    return cpi->bwd_fb_idx;
+#endif  // CONFIG_EXT_REFS
+  else
     return cpi->alt_fb_idx;
-  }
 }
 
 static INLINE int get_ref_frame_buf_idx(const VP10_COMP *const cpi,
-                                        int ref_frame) {
+                                        MV_REFERENCE_FRAME ref_frame) {
   const VP10_COMMON *const cm = &cpi->common;
   const int map_idx = get_ref_frame_map_idx(cpi, ref_frame);
   return (map_idx != INVALID_IDX) ? cm->ref_frame_map[map_idx] : INVALID_IDX;
@@ -573,7 +707,29 @@
       buf_idx != INVALID_IDX ? &cm->buffer_pool->frame_bufs[buf_idx].buf : NULL;
 }
 
-static INLINE int get_token_alloc(int mb_rows, int mb_cols) {
+static INLINE const YV12_BUFFER_CONFIG *get_upsampled_ref(
+    VP10_COMP *cpi, const MV_REFERENCE_FRAME ref_frame) {
+  // Use up-sampled reference frames.
+  const int buf_idx =
+      cpi->upsampled_ref_idx[get_ref_frame_map_idx(cpi, ref_frame)];
+  return &cpi->upsampled_ref_bufs[buf_idx].buf;
+}
+
+#if CONFIG_EXT_REFS
+static INLINE int enc_is_ref_frame_buf(VP10_COMP *cpi,
+                                       RefCntBuffer *frame_buf) {
+  MV_REFERENCE_FRAME ref_frame;
+  VP10_COMMON *const cm = &cpi->common;
+  for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
+    const int buf_idx = get_ref_frame_buf_idx(cpi, ref_frame);
+    if (buf_idx == INVALID_IDX) continue;
+    if (frame_buf == &cm->buffer_pool->frame_bufs[buf_idx]) break;
+  }
+  return (ref_frame <= ALTREF_FRAME);
+}
+#endif  // CONFIG_EXT_REFS
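
enc_is_ref_frame_buf() above depends on the loop variable surviving the break:
ref_frame <= ALTREF_FRAME after the loop holds exactly when a match was found.
A tiny demonstration of that search-then-post-test idiom:

#include <stdio.h>

int main(void) {
  const int targets[] = { 3, 7, 11 };
  const int n = 3;
  const int wanted = 7;
  int i;
  for (i = 0; i < n; ++i)
    if (targets[i] == wanted) break;  /* leave i at the match */
  /* i < n exactly when a match was found, mirroring the
   * ref_frame <= ALTREF_FRAME check after the loop above. */
  printf(i < n ? "found at %d\n" : "not found (%d)\n", i);
  return 0;
}
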
+
+static INLINE unsigned int get_token_alloc(int mb_rows, int mb_cols) {
   // TODO(JBB): double check we can't exceed this token count if we have a
   // 32x32 transform crossing a boundary at a multiple of 16.
   // mb_rows, cols are in units of 16 pixels. We assume 3 planes all at full
@@ -584,19 +740,13 @@
 
 // Get the allocated token size for a tile. It does the same calculation as in
 // the frame token allocation.
-static INLINE int allocated_tokens(TileInfo tile) {
+static INLINE unsigned int allocated_tokens(TileInfo tile) {
   int tile_mb_rows = (tile.mi_row_end - tile.mi_row_start + 1) >> 1;
   int tile_mb_cols = (tile.mi_col_end - tile.mi_col_start + 1) >> 1;
 
   return get_token_alloc(tile_mb_rows, tile_mb_cols);
 }
 
-int64_t vp10_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
-#if CONFIG_VP9_HIGHBITDEPTH
-int64_t vp10_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 void vp10_alloc_compressor_data(VP10_COMP *cpi);
 
 void vp10_scale_references(VP10_COMP *cpi);
@@ -620,6 +770,16 @@
          cpi->oxcf.enable_auto_arf;
 }
 
+// TODO(zoeliu): To set up cpi->oxcf.enable_auto_brf
+#if 0 && CONFIG_EXT_REFS
+static INLINE int is_bwdref_enabled(const VP10_COMP *const cpi) {
+  // NOTE(zoeliu): The enabling of bi-predictive frames depends on the use of
+  //               alt_ref, and now will be off when the alt_ref interval is
+  //               not sufficiently large.
+  return is_altref_enabled(cpi) && cpi->oxcf.enable_auto_brf;
+}
+#endif  // CONFIG_EXT_REFS
+
 static INLINE void set_ref_ptrs(VP10_COMMON *cm, MACROBLOCKD *xd,
                                 MV_REFERENCE_FRAME ref0,
                                 MV_REFERENCE_FRAME ref1) {
@@ -641,6 +801,18 @@
 
 #define LAYER_IDS_TO_IDX(sl, tl, num_tl) ((sl) * (num_tl) + (tl))
 
+// Update up-sampled reference frame index.
+static INLINE void uref_cnt_fb(EncRefCntBuffer *ubufs, int *uidx,
+                               int new_uidx) {
+  const int ref_index = *uidx;
+
+  if (ref_index >= 0 && ubufs[ref_index].ref_count > 0)
+    ubufs[ref_index].ref_count--;
+
+  *uidx = new_uidx;
+  ubufs[new_uidx].ref_count++;
+}
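
A self-contained run of the reference-count swap that uref_cnt_fb() performs;
the mock buffer type stands in for EncRefCntBuffer, which additionally holds a
YV12_BUFFER_CONFIG:

#include <stdio.h>

typedef struct { int ref_count; } MockRefCntBuffer;

static void mock_uref_cnt_fb(MockRefCntBuffer *ubufs, int *uidx,
                             int new_uidx) {
  const int ref_index = *uidx;
  if (ref_index >= 0 && ubufs[ref_index].ref_count > 0)
    ubufs[ref_index].ref_count--;  /* release the old up-sampled buffer */
  *uidx = new_uidx;
  ubufs[new_uidx].ref_count++;     /* retain the new one */
}

int main(void) {
  MockRefCntBuffer bufs[2] = { { 1 }, { 0 } };
  int idx = 0;
  mock_uref_cnt_fb(bufs, &idx, 1);
  printf("idx=%d counts=%d,%d\n", idx, bufs[0].ref_count, bufs[1].ref_count);
  /* -> idx=1 counts=0,1 */
  return 0;
}
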
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/encoder/ethread.c b/vp10/encoder/ethread.c
index ad47ccf..5434ee7 100644
--- a/vp10/encoder/ethread.c
+++ b/vp10/encoder/ethread.c
@@ -19,9 +19,6 @@
   for (i = 0; i < REFERENCE_MODES; i++)
     td->rd_counts.comp_pred_diff[i] += td_t->rd_counts.comp_pred_diff[i];
 
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-    td->rd_counts.filter_diff[i] += td_t->rd_counts.filter_diff[i];
-
   for (i = 0; i < TX_SIZES; i++)
     for (j = 0; j < PLANE_TYPES; j++)
       for (k = 0; k < REF_TYPES; k++)
@@ -40,8 +37,8 @@
 static int enc_worker_hook(EncWorkerData *const thread_data, void *unused) {
   VP10_COMP *const cpi = thread_data->cpi;
   const VP10_COMMON *const cm = &cpi->common;
-  const int tile_cols = 1 << cm->log2_tile_cols;
-  const int tile_rows = 1 << cm->log2_tile_rows;
+  const int tile_cols = cm->tile_cols;
+  const int tile_rows = cm->tile_rows;
   int t;
 
   (void) unused;
@@ -59,7 +56,7 @@
 
 void vp10_encode_tiles_mt(VP10_COMP *cpi) {
   VP10_COMMON *const cm = &cpi->common;
-  const int tile_cols = 1 << cm->log2_tile_cols;
+  const int tile_cols = cm->tile_cols;
   const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
   const int num_workers = VPXMIN(cpi->oxcf.max_threads, tile_cols);
   int i;
@@ -68,24 +65,23 @@
 
   // Only run once to create threads and allocate thread data.
   if (cpi->num_workers == 0) {
-    int allocated_workers = num_workers;
-
     CHECK_MEM_ERROR(cm, cpi->workers,
-                    vpx_malloc(allocated_workers * sizeof(*cpi->workers)));
+                    vpx_malloc(num_workers * sizeof(*cpi->workers)));
 
     CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
-                    vpx_calloc(allocated_workers,
+                    vpx_calloc(num_workers,
                     sizeof(*cpi->tile_thr_data)));
 
-    for (i = 0; i < allocated_workers; i++) {
+    for (i = 0; i < num_workers; i++) {
       VPxWorker *const worker = &cpi->workers[i];
-      EncWorkerData *thread_data = &cpi->tile_thr_data[i];
+      EncWorkerData *const thread_data = &cpi->tile_thr_data[i];
 
       ++cpi->num_workers;
       winterface->init(worker);
 
-      if (i < allocated_workers - 1) {
-        thread_data->cpi = cpi;
+      thread_data->cpi = cpi;
+
+      if (i < num_workers - 1) {
 
         // Allocate thread data.
         CHECK_MEM_ERROR(cm, thread_data->td,
@@ -97,6 +93,10 @@
         thread_data->td->pc_tree = NULL;
         vp10_setup_pc_tree(cm, thread_data->td);
 
+        // Set up variance tree if needed.
+        if (cpi->sf.partition_search_type == VAR_BASED_PARTITION)
+          vp10_setup_var_tree(cm, thread_data->td);
+
         // Allocate frame counters in thread data.
         CHECK_MEM_ERROR(cm, thread_data->td->counts,
                         vpx_calloc(1, sizeof(*thread_data->td->counts)));
@@ -107,7 +107,6 @@
                              "Tile encoder thread creation failed");
       } else {
         // Main thread acts as a worker and uses the thread data in cpi.
-        thread_data->cpi = cpi;
         thread_data->td = &cpi->td;
       }
 
@@ -133,6 +132,13 @@
       memcpy(thread_data->td->counts, &cpi->common.counts,
              sizeof(cpi->common.counts));
     }
+
+    // Allocate buffers used by palette coding mode.
+    if (cpi->common.allow_screen_content_tools && i < num_workers - 1) {
+      MACROBLOCK *x = &thread_data->td->mb;
+      CHECK_MEM_ERROR(cm, x->palette_buffer,
+                      vpx_memalign(16, sizeof(*x->palette_buffer)));
+    }
   }
 
   // Encode a frame
@@ -161,7 +167,7 @@
 
     // Accumulate counters.
     if (i < cpi->num_workers - 1) {
-      vp10_accumulate_frame_counts(cm, thread_data->td->counts, 0);
+      vp10_accumulate_frame_counts(cm, thread_data->td->counts);
       accumulate_rd_opt(&cpi->td, thread_data->td);
     }
   }
diff --git a/vp10/encoder/firstpass.c b/vp10/encoder/firstpass.c
index 7c5d3c0..add2510 100644
--- a/vp10/encoder/firstpass.c
+++ b/vp10/encoder/firstpass.c
@@ -60,7 +60,6 @@
 #define RC_FACTOR_MIN       0.75
 #define RC_FACTOR_MAX       1.75
 
-
 #define NCOUNT_INTRA_THRESH 8192
 #define NCOUNT_INTRA_FACTOR 3
 #define NCOUNT_FRAME_II_THRESH 5.0
@@ -394,7 +393,7 @@
   MV ref_mv_full = {ref_mv->row >> 3, ref_mv->col >> 3};
   int num00, tmp_err, n;
   const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
-  vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
+  vp10_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[bsize];
   const int new_mv_mode_penalty = NEW_MV_MODE_PENALTY;
 
   int step_param = 3;
@@ -497,7 +496,8 @@
   TileInfo tile;
   struct macroblock_plane *const p = x->plane;
   struct macroblockd_plane *const pd = xd->plane;
-  const PICK_MODE_CONTEXT *ctx = &cpi->td.pc_root->none;
+  const PICK_MODE_CONTEXT *ctx =
+      &cpi->td.pc_root[MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2]->none;
   int i;
 
   int recon_yoffset, recon_uvoffset;
@@ -569,7 +569,6 @@
     pd[i].dqcoeff = ctx->dqcoeff_pbuf[i][1];
     p[i].eobs = ctx->eobs_pbuf[i][1];
   }
-  x->skip_recode = 0;
 
   vp10_init_mv_probs(cm);
   vp10_initialize_rd_consts(cpi);
@@ -621,10 +620,13 @@
 
       // Do intra 16x16 prediction.
       xd->mi[0]->mbmi.segment_id = 0;
+#if CONFIG_SUPERTX
+      xd->mi[0]->mbmi.segment_id_supertx = 0;
+#endif  // CONFIG_SUPERTX
       xd->mi[0]->mbmi.mode = DC_PRED;
       xd->mi[0]->mbmi.tx_size = use_dc_pred ?
          (bsize >= BLOCK_16X16 ? TX_16X16 : TX_8X8) : TX_4X4;
-      vp10_encode_intra_block_plane(x, bsize, 0);
+      vp10_encode_intra_block_plane(x, bsize, 0, 0);
       this_error = vpx_get_mb_ss(x->plane[0].src_diff);
 
       // Keep a record of blocks that have almost no intra error residual
@@ -1050,8 +1052,13 @@
        ((twopass->this_frame_stats.intra_error /
          DOUBLE_DIVIDE_CHECK(twopass->this_frame_stats.coded_error)) > 2.0))) {
     if (gld_yv12 != NULL) {
+#if CONFIG_EXT_REFS
+      ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+                 cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]]);
+#else
       ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
                  cm->ref_frame_map[cpi->lst_fb_idx]);
+#endif  // CONFIG_EXT_REFS
     }
     twopass->sr_update_lag = 1;
   } else {
@@ -1061,14 +1068,25 @@
   vpx_extend_frame_borders(new_yv12);
 
   // The frame we just compressed now becomes the last frame.
+#if CONFIG_EXT_REFS
+  ref_cnt_fb(pool->frame_bufs,
+             &cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]],
+             cm->new_fb_idx);
+#else
   ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->lst_fb_idx],
              cm->new_fb_idx);
+#endif  // CONFIG_EXT_REFS
 
   // Special case for the first frame. Copy into the GF buffer as a second
   // reference.
   if (cm->current_video_frame == 0 && cpi->gld_fb_idx != INVALID_IDX) {
+#if CONFIG_EXT_REFS
+    ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
+               cm->ref_frame_map[cpi->lst_fb_idxes[LAST_FRAME - LAST_FRAME]]);
+#else
     ref_cnt_fb(pool->frame_bufs, &cm->ref_frame_map[cpi->gld_fb_idx],
                cm->ref_frame_map[cpi->lst_fb_idx]);
+#endif  // CONFIG_EXT_REFS
   }
 
   // Use this to see what the first pass reconstruction looks like.
@@ -1615,7 +1633,7 @@
   GF_GROUP *const gf_group = &twopass->gf_group;
   FIRSTPASS_STATS frame_stats;
   int i;
-  int frame_index = 1;
+  int frame_index = 0;
   int target_frame_size;
   int key_frame;
   const int max_bits = frame_max_bits(&cpi->rc, &cpi->oxcf);
@@ -1626,6 +1644,21 @@
   int mid_frame_idx;
   unsigned char arf_buffer_indices[MAX_ACTIVE_ARFS];
 
+#if CONFIG_EXT_REFS
+  // The use of bi-predictive frames is only enabled when the following 3
+  // conditions are met:
+  // (1) Alt-ref is enabled;
+  // (2) The bi-predictive group interval is at least 2; and
+  // (3) The bi-predictive group interval is strictly smaller than the
+  //     golden group interval.
+  const int is_bipred_enabled =
+      rc->source_alt_ref_pending && rc->bipred_group_interval &&
+      rc->bipred_group_interval <=
+          (rc->baseline_gf_interval - rc->source_alt_ref_pending);
+  int bipred_group_end = 0;
+  int bipred_frame_index = 0;
+#endif  // CONFIG_EXT_REFS
+
   key_frame = cpi->common.frame_type == KEY_FRAME;
 
   get_arf_buffer_indices(arf_buffer_indices);
@@ -1634,27 +1667,38 @@
   // is also the golden frame.
   if (!key_frame) {
     if (rc->source_alt_ref_active) {
-      gf_group->update_type[0] = OVERLAY_UPDATE;
-      gf_group->rf_level[0] = INTER_NORMAL;
-      gf_group->bit_allocation[0] = 0;
+      gf_group->update_type[frame_index] = OVERLAY_UPDATE;
+      gf_group->rf_level[frame_index] = INTER_NORMAL;
+      gf_group->bit_allocation[frame_index] = 0;
     } else {
-      gf_group->update_type[0] = GF_UPDATE;
-      gf_group->rf_level[0] = GF_ARF_STD;
-      gf_group->bit_allocation[0] = gf_arf_bits;
+      gf_group->update_type[frame_index] = GF_UPDATE;
+      gf_group->rf_level[frame_index] = GF_ARF_STD;
+      gf_group->bit_allocation[frame_index] = gf_arf_bits;
     }
-    gf_group->arf_update_idx[0] = arf_buffer_indices[0];
-    gf_group->arf_ref_idx[0] = arf_buffer_indices[0];
+    gf_group->arf_update_idx[frame_index] = arf_buffer_indices[0];
+    gf_group->arf_ref_idx[frame_index] = arf_buffer_indices[0];
 
     // Step over the golden frame / overlay frame
     if (EOF == input_stats(twopass, &frame_stats))
       return;
   }
 
+#if CONFIG_EXT_REFS
+  gf_group->bidir_pred_enabled[frame_index] = 0;
+  gf_group->brf_src_offset[frame_index] = 0;
+#endif  // CONFIG_EXT_REFS
+
   // Deduct the boost bits for arf (or gf if it is not a key frame)
   // from the group total.
   if (rc->source_alt_ref_pending || !key_frame)
     total_group_bits -= gf_arf_bits;
 
+  frame_index++;
+
+#if CONFIG_EXT_REFS
+  bipred_frame_index++;
+#endif  // CONFIG_EXT_REFS
+
   // Store the bits to spend on the ARF if there is one.
   if (rc->source_alt_ref_pending) {
     gf_group->update_type[frame_index] = ARF_UPDATE;
@@ -1668,6 +1712,13 @@
     gf_group->arf_ref_idx[frame_index] =
       arf_buffer_indices[cpi->multi_arf_last_grp_enabled &&
                          rc->source_alt_ref_active];
+
+#if CONFIG_EXT_REFS
+    gf_group->bidir_pred_enabled[frame_index] = 0;
+    gf_group->brf_src_offset[frame_index] = 0;
+    // NOTE: "bidir_pred_frame_index" stays unchanged for ARF_UPDATE frames.
+#endif  // CONFIG_EXT_REFS
+
     ++frame_index;
 
     if (cpi->multi_arf_enabled) {
@@ -1713,10 +1764,73 @@
     target_frame_size = clamp(target_frame_size, 0,
                               VPXMIN(max_bits, (int)total_group_bits));
 
-    gf_group->update_type[frame_index] = LF_UPDATE;
-    gf_group->rf_level[frame_index] = INTER_NORMAL;
+#if CONFIG_EXT_REFS
+    // NOTE: BIDIR_PRED is only enabled when the length of the bi-predictive
+    //       frame group interval is strictly smaller than that of the GOLDEN
+    //       FRAME group interval.
+    // TODO(zoeliu): Currently BIDIR_PRED is only enabled when alt-ref is on.
+    if (is_bipred_enabled && !bipred_group_end) {
+      const int cur_brf_src_offset = rc->bipred_group_interval - 1;
 
-    gf_group->bit_allocation[frame_index] = target_frame_size;
+      // --- BRF_UPDATE ---
+      if (bipred_frame_index == 1) {
+        gf_group->update_type[frame_index] = BRF_UPDATE;
+        gf_group->bidir_pred_enabled[frame_index] = 1;
+        gf_group->brf_src_offset[frame_index] = cur_brf_src_offset;
+      // --- LAST_BIPRED_UPDATE ---
+      } else if (bipred_frame_index == rc->bipred_group_interval) {
+        gf_group->update_type[frame_index] = LAST_BIPRED_UPDATE;
+        gf_group->bidir_pred_enabled[frame_index] = 1;
+        gf_group->brf_src_offset[frame_index] = 0;
+        // Reset the bi-predictive frame index.
+        bipred_frame_index = 0;
+      // --- BIPRED_UPDATE ---
+      } else {
+        gf_group->update_type[frame_index] = BIPRED_UPDATE;
+        gf_group->bidir_pred_enabled[frame_index] = 1;
+        gf_group->brf_src_offset[frame_index] = 0;
+      }
+
+      bipred_frame_index++;
+      // Check whether the next bi-predictive frame group would entirely be
+      // included within the current golden frame group.
+      if (bipred_frame_index == 1 && (i + 1 + cur_brf_src_offset) >=
+          (rc->baseline_gf_interval - rc->source_alt_ref_pending)) {
+        bipred_group_end = 1;
+      }
+    } else {
+#endif  // CONFIG_EXT_REFS
+      gf_group->update_type[frame_index] = LF_UPDATE;
+#if CONFIG_EXT_REFS
+      gf_group->bidir_pred_enabled[frame_index] = 0;
+      gf_group->brf_src_offset[frame_index] = 0;
+    }
+#endif  // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+    if (gf_group->update_type[frame_index] == BRF_UPDATE) {
+      // Boost the bits allocated to the BWDREF_FRAME.
+      gf_group->rf_level[frame_index] = INTER_HIGH;
+      gf_group->bit_allocation[frame_index] =
+          target_frame_size + (target_frame_size >> 2);
+    } else if (gf_group->update_type[frame_index] == LAST_BIPRED_UPDATE) {
+      // Reduce the bits allocated to LAST_BIPRED_UPDATE frames.
+      gf_group->rf_level[frame_index] = INTER_LOW;
+      gf_group->bit_allocation[frame_index] =
+          target_frame_size - (target_frame_size >> 1);
+    } else if (gf_group->update_type[frame_index] == BIPRED_UPDATE) {
+      // TODO(zoeliu): To investigate whether the allocated bits on
+      // BIPRED_UPDATE frames need to be further adjusted.
+      gf_group->rf_level[frame_index] = INTER_NORMAL;
+      gf_group->bit_allocation[frame_index] = target_frame_size;
+    } else {
+#endif  // CONFIG_EXT_REFS
+      gf_group->rf_level[frame_index] = INTER_NORMAL;
+      gf_group->bit_allocation[frame_index] = target_frame_size;
+#if CONFIG_EXT_REFS
+    }
+#endif  // CONFIG_EXT_REFS
+
     ++frame_index;
   }
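
Worked numbers for the shift-based adjustments above: a BRF_UPDATE frame gets
a 25% boost and a LAST_BIPRED_UPDATE frame is halved, relative to the nominal
target:

#include <stdio.h>

int main(void) {
  const int target = 4000;  /* nominal target_frame_size in bits */
  printf("BRF_UPDATE:         %d\n", target + (target >> 2)); /* 5000 */
  printf("LAST_BIPRED_UPDATE: %d\n", target - (target >> 1)); /* 2000 */
  printf("BIPRED_UPDATE:      %d\n", target);                 /* 4000 */
  return 0;
}
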
 
@@ -1742,6 +1856,10 @@
     gf_group->update_type[frame_index] = GF_UPDATE;
     gf_group->rf_level[frame_index] = GF_ARF_STD;
   }
+#if CONFIG_EXT_REFS
+  gf_group->bidir_pred_enabled[frame_index] = 0;
+  gf_group->brf_src_offset[frame_index] = 0;
+#endif  // CONFIG_EXT_REFS
 
   // Note whether multi-arf was enabled this group for next time.
   cpi->multi_arf_last_grp_enabled = cpi->multi_arf_enabled;
@@ -1832,6 +1950,7 @@
     int int_lbq =
       (int)(vp10_convert_qindex_to_q(rc->last_boosted_qindex,
                                      cpi->common.bit_depth));
+
     active_min_gf_interval = rc->min_gf_interval + VPXMIN(2, int_max_q / 200);
     if (active_min_gf_interval > rc->max_gf_interval)
       active_min_gf_interval = rc->max_gf_interval;
@@ -1953,6 +2072,13 @@
 
   rc->frames_till_gf_update_due = rc->baseline_gf_interval;
 
+#if CONFIG_EXT_REFS
+  rc->bipred_group_interval = BFG_INTERVAL;
+  // An interval below the minimum of 2 disables bi-predictive frame groups.
+  if (rc->bipred_group_interval < 2)
+    rc->bipred_group_interval = 0;
+#endif  // CONFIG_EXT_REFS
+
   // Reset the file position.
   reset_fpf_position(twopass, start_pos);
 
@@ -2394,33 +2520,85 @@
   TWO_PASS *const twopass = &cpi->twopass;
 
   cpi->rc.is_src_frame_alt_ref = 0;
+#if CONFIG_EXT_REFS
+  cpi->rc.is_bwd_ref_frame = 0;
+  cpi->rc.is_last_bipred_frame = 0;
+  cpi->rc.is_bipred_frame = 0;
+#endif  // CONFIG_EXT_REFS
+
   switch (twopass->gf_group.update_type[twopass->gf_group.index]) {
     case KF_UPDATE:
       cpi->refresh_last_frame = 1;
       cpi->refresh_golden_frame = 1;
+#if CONFIG_EXT_REFS
+      cpi->refresh_bwd_ref_frame = 1;
+#endif  // CONFIG_EXT_REFS
       cpi->refresh_alt_ref_frame = 1;
       break;
+
     case LF_UPDATE:
       cpi->refresh_last_frame = 1;
       cpi->refresh_golden_frame = 0;
+#if CONFIG_EXT_REFS
+      cpi->refresh_bwd_ref_frame = 0;
+#endif  // CONFIG_EXT_REFS
       cpi->refresh_alt_ref_frame = 0;
       break;
+
     case GF_UPDATE:
       cpi->refresh_last_frame = 1;
       cpi->refresh_golden_frame = 1;
+#if CONFIG_EXT_REFS
+      cpi->refresh_bwd_ref_frame = 0;
+#endif  // CONFIG_EXT_REFS
       cpi->refresh_alt_ref_frame = 0;
       break;
+
     case OVERLAY_UPDATE:
       cpi->refresh_last_frame = 0;
       cpi->refresh_golden_frame = 1;
+#if CONFIG_EXT_REFS
+      cpi->refresh_bwd_ref_frame = 0;
+#endif  // CONFIG_EXT_REFS
       cpi->refresh_alt_ref_frame = 0;
       cpi->rc.is_src_frame_alt_ref = 1;
       break;
+
     case ARF_UPDATE:
       cpi->refresh_last_frame = 0;
       cpi->refresh_golden_frame = 0;
+#if CONFIG_EXT_REFS
+      cpi->refresh_bwd_ref_frame = 0;
+#endif  // CONFIG_EXT_REFS
       cpi->refresh_alt_ref_frame = 1;
       break;
+
+#if CONFIG_EXT_REFS
+    case BRF_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 1;
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->rc.is_bwd_ref_frame = 1;
+      break;
+
+    case LAST_BIPRED_UPDATE:
+      cpi->refresh_last_frame = 0;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->rc.is_last_bipred_frame = 1;
+      break;
+
+    case BIPRED_UPDATE:
+      cpi->refresh_last_frame = 1;
+      cpi->refresh_golden_frame = 0;
+      cpi->refresh_bwd_ref_frame = 0;
+      cpi->refresh_alt_ref_frame = 0;
+      cpi->rc.is_bipred_frame = 1;
+      break;
+#endif  // CONFIG_EXT_REFS
+
     default:
       assert(0);
       break;
@@ -2510,6 +2688,7 @@
     rc->last_q[KEY_FRAME] = (tmp_q + cpi->oxcf.best_allowed_q) / 2;
     rc->avg_frame_qindex[KEY_FRAME] = rc->last_q[KEY_FRAME];
   }
+
   vp10_zero(this_frame);
   if (EOF == input_stats(twopass, &this_frame))
     return;
diff --git a/vp10/encoder/firstpass.h b/vp10/encoder/firstpass.h
index 68a8887..c9f4ad3 100644
--- a/vp10/encoder/firstpass.h
+++ b/vp10/encoder/firstpass.h
@@ -39,6 +39,13 @@
 } FIRSTPASS_MB_STATS;
 #endif
 
+#if CONFIG_EXT_REFS
+// Length of the bi-predictive frame group (BFG)
+// NOTE: Currently each BFG contains one backward ref (BWF) frame plus a certain
+//       number of bi-predictive frames.
+#define BFG_INTERVAL          2
+#endif  // CONFIG_EXT_REFS
+
 #define VLOW_MOTION_THRESHOLD 950
 
 typedef struct {
@@ -72,7 +79,14 @@
   GF_UPDATE = 2,
   ARF_UPDATE = 3,
   OVERLAY_UPDATE = 4,
+#if CONFIG_EXT_REFS
+  BRF_UPDATE = 5,  // Backward Reference Frame
+  LAST_BIPRED_UPDATE = 6,  // Last Bi-predictive Frame
+  BIPRED_UPDATE = 7,  // Bi-predictive Frame, but not the last one
+  FRAME_UPDATE_TYPES = 8
+#else
   FRAME_UPDATE_TYPES = 5
+#endif  // CONFIG_EXT_REFS
 } FRAME_UPDATE_TYPE;
 
 #define FC_ANIMATION_THRESH 0.15
@@ -89,6 +103,10 @@
   unsigned char arf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char arf_update_idx[(MAX_LAG_BUFFERS * 2) + 1];
   unsigned char arf_ref_idx[(MAX_LAG_BUFFERS * 2) + 1];
+#if CONFIG_EXT_REFS
+  unsigned char brf_src_offset[(MAX_LAG_BUFFERS * 2) + 1];
+  unsigned char bidir_pred_enabled[(MAX_LAG_BUFFERS * 2) + 1];
+#endif  // CONFIG_EXT_REFS
   int bit_allocation[(MAX_LAG_BUFFERS * 2) + 1];
 } GF_GROUP;
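
To visualise how the two new arrays are filled, here is an illustrative replay
of the BFG labelling loop from define_gf_group() for an assumed group of 8
frames with bipred_group_interval = 2; the real sequence also interleaves
GF/ARF/overlay slots and depends on rate-control state:

#include <stdio.h>

int main(void) {
  const int gf_interval = 8, bipred_interval = 2;  /* assumed values */
  int bipred_index = 1;
  for (int i = 0; i < gf_interval - 1; ++i) {
    const char *type;
    if (bipred_index == 1) {
      type = "BRF_UPDATE";          /* opens a bi-predictive group */
    } else if (bipred_index == bipred_interval) {
      type = "LAST_BIPRED_UPDATE";  /* closes the group */
      bipred_index = 0;
    } else {
      type = "BIPRED_UPDATE";       /* fills the middle of the group */
    }
    bipred_index++;
    printf("frame %d: %s\n", i, type);
  }
  return 0;
}
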
 
diff --git a/vp10/encoder/hybrid_fwd_txfm.c b/vp10/encoder/hybrid_fwd_txfm.c
new file mode 100644
index 0000000..a0e0fdc
--- /dev/null
+++ b/vp10/encoder/hybrid_fwd_txfm.c
@@ -0,0 +1,387 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp10_rtcd.h"
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vp10/common/idct.h"
+#include "vp10/encoder/hybrid_fwd_txfm.h"
+
+static INLINE void fdct32x32(int rd_transform, const int16_t *src,
+                             tran_low_t *dst, int src_stride) {
+  if (rd_transform)
+    vpx_fdct32x32_rd(src, dst, src_stride);
+  else
+    vpx_fdct32x32(src, dst, src_stride);
+}
+
+static void fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+                         int diff_stride, TX_TYPE tx_type, int lossless) {
+  if (lossless) {
+    assert(tx_type == DCT_DCT);
+    vp10_fwht4x4(src_diff, coeff, diff_stride);
+    return;
+  }
+
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      vp10_fht4x4(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX:
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+  }
+}
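
The grouped case labels in fwd_txfm_4x4() funnel many tx_types into a single
kernel; the same pattern repeats in the 8x8, 16x16, and 32x32 variants below.
A tiny standalone demo of that fall-through dispatch (mock names only):

#include <stdio.h>

enum mock_tx { MOCK_DCT_DCT, MOCK_ADST_DCT, MOCK_DCT_ADST, MOCK_ADST_ADST,
               MOCK_IDTX };

static const char *pick_kernel(enum mock_tx t) {
  switch (t) {
    case MOCK_DCT_DCT:
    case MOCK_ADST_DCT:
    case MOCK_DCT_ADST:
    case MOCK_ADST_ADST:
      return "vp10_fht4x4";      /* one hybrid kernel serves all four */
    case MOCK_IDTX:
      return "vp10_fwd_idtx_c";  /* identity transform */
    default:
      return "unreachable";
  }
}

int main(void) {
  printf("%s\n", pick_kernel(MOCK_ADST_DCT));  /* vp10_fht4x4 */
  return 0;
}
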
+
+static void fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
+                         int diff_stride, TX_TYPE tx_type,
+                         FWD_TXFM_OPT fwd_txfm_opt) {
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      if (fwd_txfm_opt == FWD_TXFM_OPT_NORMAL)
+        vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
+      else  // FWD_TXFM_OPT_DC
+        vpx_fdct8x8_1(src_diff, coeff, diff_stride);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      vp10_fht8x8(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX:
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+  }
+}
+
+static void fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
+                           int diff_stride, TX_TYPE tx_type,
+                           FWD_TXFM_OPT fwd_txfm_opt) {
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      if (fwd_txfm_opt == FWD_TXFM_OPT_NORMAL)
+        vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
+      else  // FWD_TXFM_OPT_DC
+        vpx_fdct16x16_1(src_diff, coeff, diff_stride);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      vp10_fht16x16(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX:
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+  }
+}
+
+static void fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
+                           tran_low_t *coeff, int diff_stride, TX_TYPE tx_type,
+                           FWD_TXFM_OPT fwd_txfm_opt) {
+  switch (tx_type) {
+    case DCT_DCT:
+      if (fwd_txfm_opt == FWD_TXFM_OPT_NORMAL)
+        fdct32x32(rd_transform, src_diff, coeff, diff_stride);
+      else  // FWD_TXFM_OPT_DC
+        vpx_fdct32x32_1(src_diff, coeff, diff_stride);
+      break;
+#if CONFIG_EXT_TX
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      vp10_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX:
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+      break;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_fwd_txfm_4x4(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TX_TYPE tx_type, int lossless,
+                                const int bd) {
+  if (lossless) {
+    assert(tx_type == DCT_DCT);
+    vp10_highbd_fwht4x4(src_diff, coeff, diff_stride);
+    return;
+  }
+
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_fwd_txfm2d_4x4(src_diff, coeff, diff_stride, tx_type, bd);
+      break;
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      vp10_highbd_fht4x4_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX:
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 4, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+  }
+}
+
+static void highbd_fwd_txfm_8x8(const int16_t *src_diff, tran_low_t *coeff,
+                                int diff_stride, TX_TYPE tx_type,
+                                FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  (void)fwd_txfm_opt;
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_fwd_txfm2d_8x8(src_diff, coeff, diff_stride, tx_type, bd);
+      break;
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      // Use the C version; these transform types are implemented only in C
+      vp10_highbd_fht8x8_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX:
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 8, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+  }
+}
+
+static void highbd_fwd_txfm_16x16(const int16_t *src_diff, tran_low_t *coeff,
+                                  int diff_stride, TX_TYPE tx_type,
+                                  FWD_TXFM_OPT fwd_txfm_opt, const int bd) {
+  (void)fwd_txfm_opt;
+  switch (tx_type) {
+    case DCT_DCT:
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+      vp10_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+      vp10_fwd_txfm2d_16x16(src_diff, coeff, diff_stride, tx_type, bd);
+      break;
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      // Use the C version; these transform types are implemented only in C
+      vp10_highbd_fht16x16_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX:
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 16, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+  }
+}
+
+static void highbd_fwd_txfm_32x32(int rd_transform, const int16_t *src_diff,
+                                  tran_low_t *coeff, int diff_stride,
+                                  TX_TYPE tx_type, FWD_TXFM_OPT fwd_txfm_opt,
+                                  const int bd) {
+  (void)rd_transform;
+  (void)fwd_txfm_opt;
+  switch (tx_type) {
+    case DCT_DCT:
+      vp10_fwd_txfm2d_32x32(src_diff, coeff, diff_stride, tx_type, bd);
+      break;
+#if CONFIG_EXT_TX
+    case ADST_DCT:
+    case DCT_ADST:
+    case ADST_ADST:
+    case FLIPADST_DCT:
+    case DCT_FLIPADST:
+    case FLIPADST_FLIPADST:
+    case ADST_FLIPADST:
+    case FLIPADST_ADST:
+    case V_DCT:
+    case H_DCT:
+    case V_ADST:
+    case H_ADST:
+    case V_FLIPADST:
+    case H_FLIPADST:
+      vp10_highbd_fht32x32_c(src_diff, coeff, diff_stride, tx_type);
+      break;
+    case IDTX:
+      vp10_fwd_idtx_c(src_diff, coeff, diff_stride, 32, tx_type);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+      break;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+              FWD_TXFM_PARAM *fwd_txfm_param) {
+  const int fwd_txfm_opt = fwd_txfm_param->fwd_txfm_opt;
+  const TX_TYPE tx_type = fwd_txfm_param->tx_type;
+  const TX_SIZE tx_size = fwd_txfm_param->tx_size;
+  const int rd_transform = fwd_txfm_param->rd_transform;
+  const int lossless = fwd_txfm_param->lossless;
+  switch (tx_size) {
+    case TX_32X32:
+      fwd_txfm_32x32(rd_transform, src_diff, coeff, diff_stride, tx_type,
+                     fwd_txfm_opt);
+      break;
+    case TX_16X16:
+      fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+      break;
+    case TX_8X8:
+      fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type, fwd_txfm_opt);
+      break;
+    case TX_4X4:
+      fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type, lossless);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+                     int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param) {
+  const int fwd_txfm_opt = fwd_txfm_param->fwd_txfm_opt;
+  const TX_TYPE tx_type = fwd_txfm_param->tx_type;
+  const TX_SIZE tx_size = fwd_txfm_param->tx_size;
+  const int rd_transform = fwd_txfm_param->rd_transform;
+  const int lossless = fwd_txfm_param->lossless;
+  const int bd = fwd_txfm_param->bd;
+  switch (tx_size) {
+    case TX_32X32:
+      highbd_fwd_txfm_32x32(rd_transform, src_diff, coeff, diff_stride, tx_type,
+                            fwd_txfm_opt, bd);
+      break;
+    case TX_16X16:
+      highbd_fwd_txfm_16x16(src_diff, coeff, diff_stride, tx_type,
+                            fwd_txfm_opt, bd);
+      break;
+    case TX_8X8:
+      highbd_fwd_txfm_8x8(src_diff, coeff, diff_stride, tx_type,
+                          fwd_txfm_opt, bd);
+      break;
+    case TX_4X4:
+      highbd_fwd_txfm_4x4(src_diff, coeff, diff_stride, tx_type,
+                          lossless, bd);
+      break;
+    default:
+      assert(0);
+      break;
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
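
For reference, a minimal caller sketch for the dispatch above (hedged: the buffers, dimensions, and the wrapper name example_fwd_txfm_8x8 are illustrative, not part of this change; only the FWD_TXFM_PARAM fields and fwd_txfm() come from this patch):

    // Forward-transform one 8x8 residual block with DCT_DCT.
    void example_fwd_txfm_8x8(const int16_t *src_diff, int diff_stride,
                              tran_low_t *coeff) {
      FWD_TXFM_PARAM param;
      param.tx_type = DCT_DCT;
      param.tx_size = TX_8X8;
      param.fwd_txfm_opt = FWD_TXFM_OPT_NORMAL;  // full transform, not DC-only
      param.rd_transform = 0;  // only consulted for TX_32X32
      param.lossless = 0;      // only routes TX_4X4 to the Walsh-Hadamard path
      // param.bd would also be set when CONFIG_VP9_HIGHBITDEPTH is on.
      fwd_txfm(src_diff, coeff, diff_stride, &param);
    }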
diff --git a/vp10/encoder/hybrid_fwd_txfm.h b/vp10/encoder/hybrid_fwd_txfm.h
new file mode 100644
index 0000000..cd028bc
--- /dev/null
+++ b/vp10/encoder/hybrid_fwd_txfm.h
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_HYBRID_FWD_TXFM_H_
+#define VP10_ENCODER_HYBRID_FWD_TXFM_H_
+
+#include "./vpx_config.h"
+
+typedef enum FWD_TXFM_OPT { FWD_TXFM_OPT_NORMAL, FWD_TXFM_OPT_DC } FWD_TXFM_OPT;
+
+typedef struct FWD_TXFM_PARAM {
+  TX_TYPE tx_type;
+  TX_SIZE tx_size;
+  FWD_TXFM_OPT fwd_txfm_opt;
+  int rd_transform;
+  int lossless;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int bd;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+} FWD_TXFM_PARAM;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void fwd_txfm(const int16_t *src_diff, tran_low_t *coeff, int diff_stride,
+              FWD_TXFM_PARAM *fwd_txfm_param);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void highbd_fwd_txfm(const int16_t *src_diff, tran_low_t *coeff,
+                     int diff_stride, FWD_TXFM_PARAM *fwd_txfm_param);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE int get_tx1d_size(TX_SIZE tx_size) {
+  switch (tx_size) {
+    case TX_32X32:
+      return 32;
+    case TX_16X16:
+      return 16;
+    case TX_8X8:
+      return 8;
+    case TX_4X4:
+      return 4;
+    default:
+      assert(0);
+      return -1;
+  }
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_HYBRID_FWD_TXFM_H_
diff --git a/vp10/encoder/lookahead.c b/vp10/encoder/lookahead.c
index 3185cb6..9e8f536 100644
--- a/vp10/encoder/lookahead.c
+++ b/vp10/encoder/lookahead.c
@@ -47,13 +47,13 @@
 
 
 struct lookahead_ctx *vp10_lookahead_init(unsigned int width,
-                                         unsigned int height,
-                                         unsigned int subsampling_x,
-                                         unsigned int subsampling_y,
+                                          unsigned int height,
+                                          unsigned int subsampling_x,
+                                          unsigned int subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
-                                         int use_highbitdepth,
+                                          int use_highbitdepth,
 #endif
-                                         unsigned int depth) {
+                                          unsigned int depth) {
   struct lookahead_ctx *ctx = NULL;
 
   // Clamp the lookahead queue depth
diff --git a/vp10/encoder/mbgraph.c b/vp10/encoder/mbgraph.c
index ed0f539..46cff80 100644
--- a/vp10/encoder/mbgraph.c
+++ b/vp10/encoder/mbgraph.c
@@ -25,13 +25,12 @@
 
 static unsigned int do_16x16_motion_iteration(VP10_COMP *cpi,
                                               const MV *ref_mv,
-                                              MV *dst_mv,
                                               int mb_row,
                                               int mb_col) {
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   const MV_SPEED_FEATURES *const mv_sf = &cpi->sf.mv;
-  const vp9_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
+  const vp10_variance_fn_ptr_t v_fn_ptr = cpi->fn_ptr[BLOCK_16X16];
 
   const int tmp_col_min = x->mv_col_min;
   const int tmp_col_max = x->mv_col_max;
@@ -51,8 +50,7 @@
 
   /*cpi->sf.search_method == HEX*/
   vp10_hex_search(x, &ref_full, step_param, x->errorperbit, 0,
-                 cond_cost_list(cpi, cost_list),
-                 &v_fn_ptr, 0, ref_mv, dst_mv);
+                  cond_cost_list(cpi, cost_list), &v_fn_ptr, 0, ref_mv);
 
   // Try sub-pixel MC
   // if (bestsme > error_thresh && bestsme < INT_MAX)
@@ -60,15 +58,24 @@
     int distortion;
     unsigned int sse;
     cpi->find_fractional_mv_step(
-        x, dst_mv, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
+        x, ref_mv, cpi->common.allow_high_precision_mv, x->errorperbit,
         &v_fn_ptr, 0, mv_sf->subpel_iters_per_step,
         cond_cost_list(cpi, cost_list),
         NULL, NULL,
-        &distortion, &sse, NULL, 0, 0);
+        &distortion, &sse, NULL, 0, 0, 0);
   }
 
+#if CONFIG_EXT_INTER
+  if (has_second_ref(&xd->mi[0]->mbmi))
+    xd->mi[0]->mbmi.mode = NEW_NEWMV;
+  else
+#endif  // CONFIG_EXT_INTER
   xd->mi[0]->mbmi.mode = NEWMV;
-  xd->mi[0]->mbmi.mv[0].as_mv = *dst_mv;
+
+  xd->mi[0]->mbmi.mv[0] = x->best_mv;
+#if CONFIG_EXT_INTER
+  xd->mi[0]->mbmi.ref_frame[1] = NONE;
+#endif  // CONFIG_EXT_INTER
 
   vp10_build_inter_predictors_sby(xd, mb_row, mb_col, BLOCK_16X16);
 
@@ -83,40 +90,40 @@
 }
 
 static int do_16x16_motion_search(VP10_COMP *cpi, const MV *ref_mv,
-                                  int_mv *dst_mv, int mb_row, int mb_col) {
+                                  int mb_row, int mb_col) {
   MACROBLOCK *const x = &cpi->td.mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   unsigned int err, tmp_err;
-  MV tmp_mv;
+  MV best_mv;
 
   // Try zero MV first
   // FIXME should really use something like near/nearest MV and/or MV prediction
   err = vpx_sad16x16(x->plane[0].src.buf, x->plane[0].src.stride,
                      xd->plane[0].pre[0].buf, xd->plane[0].pre[0].stride);
-  dst_mv->as_int = 0;
+  best_mv.col = best_mv.row = 0;
 
   // Test last reference frame using the previous best mv as the
   // starting point (best reference) for the search
-  tmp_err = do_16x16_motion_iteration(cpi, ref_mv, &tmp_mv, mb_row, mb_col);
+  tmp_err = do_16x16_motion_iteration(cpi, ref_mv, mb_row, mb_col);
   if (tmp_err < err) {
     err = tmp_err;
-    dst_mv->as_mv = tmp_mv;
+    best_mv = x->best_mv.as_mv;
   }
 
   // If the current best reference mv is not centered on 0,0 then do a 0,0
   // based search as well.
   if (ref_mv->row != 0 || ref_mv->col != 0) {
     unsigned int tmp_err;
-    MV zero_ref_mv = {0, 0}, tmp_mv;
+    MV zero_ref_mv = {0, 0};
 
-    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, &tmp_mv,
-                                        mb_row, mb_col);
+    tmp_err = do_16x16_motion_iteration(cpi, &zero_ref_mv, mb_row, mb_col);
     if (tmp_err < err) {
-      dst_mv->as_mv = tmp_mv;
       err = tmp_err;
+      best_mv = x->best_mv.as_mv;
     }
   }
 
+  x->best_mv.as_mv = best_mv;
   return err;
 }
 
@@ -204,8 +211,8 @@
     xd->plane[0].pre[0].stride = golden_ref->y_stride;
     g_motion_error = do_16x16_motion_search(cpi,
                                             prev_golden_ref_mv,
-                                            &stats->ref[GOLDEN_FRAME].m.mv,
                                             mb_row, mb_col);
+    stats->ref[GOLDEN_FRAME].m.mv = x->best_mv;
     stats->ref[GOLDEN_FRAME].err = g_motion_error;
   } else {
     stats->ref[GOLDEN_FRAME].err = INT_MAX;
diff --git a/vp10/encoder/mcomp.c b/vp10/encoder/mcomp.c
index 1ba2e2f..afbf3e9 100644
--- a/vp10/encoder/mcomp.c
+++ b/vp10/encoder/mcomp.c
@@ -24,6 +24,7 @@
 
 #include "vp10/encoder/encoder.h"
 #include "vp10/encoder/mcomp.h"
+#include "vp10/encoder/rdopt.h"
 
 // #define NEW_DIAMOND_SEARCH
 
@@ -80,24 +81,29 @@
   return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) * weight, 7);
 }
 
-static int mv_err_cost(const MV *mv, const MV *ref,
-                       const int *mvjcost, int *mvcost[2],
-                       int error_per_bit) {
+#define PIXEL_TRANSFORM_ERROR_SCALE 4
+static int mv_err_cost(const MV *mv, const MV *ref, const int *mvjcost,
+                       int *mvcost[2], int error_per_bit) {
   if (mvcost) {
-    const MV diff = { mv->row - ref->row,
-                      mv->col - ref->col };
-    return ROUND_POWER_OF_TWO(mv_cost(&diff, mvjcost, mvcost) *
-                                  error_per_bit, 13);
+    const MV diff = {mv->row - ref->row, mv->col - ref->col};
+    // This product sits at a 32-bit ceiling right now and any additional
+    // accuracy in either bit cost or error cost will cause it to overflow.
+    return ROUND_POWER_OF_TWO(
+        (unsigned)mv_cost(&diff, mvjcost, mvcost) * error_per_bit,
+        RDDIV_BITS + VP9_PROB_COST_SHIFT - RD_EPB_SHIFT +
+            PIXEL_TRANSFORM_ERROR_SCALE);
   }
   return 0;
 }
 
 static int mvsad_err_cost(const MACROBLOCK *x, const MV *mv, const MV *ref,
-                          int error_per_bit) {
-  const MV diff = { mv->row - ref->row,
-                    mv->col - ref->col };
-  return ROUND_POWER_OF_TWO(mv_cost(&diff, x->nmvjointsadcost,
-                                    x->nmvsadcost) * error_per_bit, 8);
+                          int sad_per_bit) {
+  const MV diff = { (mv->row - ref->row) * 8,
+                    (mv->col - ref->col) * 8 };
+  return ROUND_POWER_OF_TWO(
+      (unsigned)mv_cost(&diff, x->nmvjointsadcost, x->nmvsadcost) *
+          sad_per_bit,
+      VP9_PROB_COST_SHIFT);
 }
 
 void vp10_init_dsmotion_compensation(search_site_config *cfg, int stride) {
@@ -145,13 +151,15 @@
   cfg->searches_per_step = 8;
 }
 
-/* estimated cost of a motion vector (r,c) */
-#define MVC(r, c)                                       \
-    (mvcost ?                                           \
-     ((mvjcost[((r) != rr) * 2 + ((c) != rc)] +         \
-       mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) * \
-      error_per_bit + 4096) >> 13 : 0)
-
+/*
+ * To avoid the penalty of cache-line-crossing reads, preload the reference
+ * area into a small buffer that is aligned so that reads from it never
+ * cross a cache line. This reduces the CPU cycles spent reading ref data
+ * in the sub-pixel filter functions.
+ * TODO: Currently, since the sub-pixel search range here is -3 to 3, we copy
+ * a 22-row x 32-column area, which is enough for a 16x16 macroblock. Later,
+ * for SPLITMV, we could reduce the area.
+ */
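
A hedged sketch of the technique this comment describes (the helper name, the 32-byte alignment, and the copy loop are illustrative assumptions; only the 22x32 window size comes from the comment above):

    #include <stdint.h>
    #include <string.h>

    // Copy the 22x32 reference window into an aligned scratch buffer so the
    // sub-pel filters read from cache-line-aligned rows. The caller declares
    // something like: DECLARE_ALIGNED(32, uint8_t, buf[22 * 32]);
    static void preload_ref_area(const uint8_t *ref, int ref_stride,
                                 uint8_t *buf) {
      int r;
      for (r = 0; r < 22; ++r)
        memcpy(buf + r * 32, ref + r * ref_stride, 32);  // 32-byte rows
    }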
 
 // convert motion vector component to offset for sv[a]f calc
 static INLINE int sp(int x) {
@@ -165,13 +173,44 @@
 /* checks if (r, c) has better score than previous best */
 #define CHECK_BETTER(v, r, c) \
   if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    MV this_mv = {r, c};                                               \
+    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
     if (second_pred == NULL)                                           \
-      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
-                             src_stride, &sse);                        \
+      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c),      \
+                         sp(r), src_address, src_stride, &sse);        \
     else                                                               \
-      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), \
-                              z, src_stride, &sse, second_pred);       \
-    if ((v = MVC(r, c) + thismse) < besterr) {                         \
+      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c),     \
+                          sp(r), src_address, src_stride, &sse,        \
+                          second_pred);                                \
+    v += thismse;                                                      \
+    if (v < besterr) {                                                 \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {                                                             \
+    v = INT_MAX;                                                       \
+  }
+
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+
+static INLINE const uint8_t *upre(const uint8_t *buf, int stride,
+                                  int r, int c) {
+  return &buf[r * stride + c];
+}
+
+/* checks if (r, c) has better score than previous best */
+#define CHECK_BETTER1(v, r, c) \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    MV this_mv = {r, c};                                               \
+    thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,   \
+                                   upre(y, y_stride, r, c), y_stride,  \
+                                   second_pred, w, h, &sse);           \
+    v = mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit); \
+    v += thismse;                                                      \
+    if (v < besterr) {                                                 \
       besterr = v;                                                     \
       br = r;                                                          \
       bc = c;                                                          \
@@ -250,7 +289,7 @@
 // TODO(yunqingwang): SECOND_LEVEL_CHECKS_BEST was a rewrote of
 // SECOND_LEVEL_CHECKS, and SECOND_LEVEL_CHECKS should be rewritten
 // later in the same way.
-#define SECOND_LEVEL_CHECKS_BEST                        \
+#define SECOND_LEVEL_CHECKS_BEST(k)                     \
   {                                                     \
     unsigned int second;                                \
     int br0 = br;                                       \
@@ -261,21 +300,22 @@
     } else if (tr != br && tc == bc) {                  \
       kr = br - tr;                                     \
     }                                                   \
-    CHECK_BETTER(second, br0 + kr, bc0);                \
-    CHECK_BETTER(second, br0, bc0 + kc);                \
+    CHECK_BETTER##k(second, br0 + kr, bc0);             \
+    CHECK_BETTER##k(second, br0, bc0 + kc);             \
     if (br0 != br || bc0 != bc) {                       \
-      CHECK_BETTER(second, br0 + kr, bc0 + kc);         \
+      CHECK_BETTER##k(second, br0 + kr, bc0 + kc);      \
     }                                                   \
   }
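
The new (k) parameter makes SECOND_LEVEL_CHECKS_BEST pick its checker by token pasting: CHECK_BETTER##k expands to CHECK_BETTER0 (plain reference) or CHECK_BETTER1 (upsampled reference). A standalone illustration of the same dispatch pattern (the EX_* names are hypothetical):

    #define EX_CHECK0(v) ((v) + 0)    /* stand-in for CHECK_BETTER0 */
    #define EX_CHECK1(v) ((v) + 100)  /* stand-in for CHECK_BETTER1 */
    #define EX_DISPATCH(k, v) EX_CHECK##k(v)
    // EX_DISPATCH(0, 5) expands to EX_CHECK0(5) == 5;
    // EX_DISPATCH(1, 5) expands to EX_CHECK1(5) == 105.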
 
 #define SETUP_SUBPEL_SEARCH                                                \
-  const uint8_t *const z = x->plane[0].src.buf;                            \
+  const uint8_t *const src_address = x->plane[0].src.buf;                  \
   const int src_stride = x->plane[0].src.stride;                           \
   const MACROBLOCKD *xd = &x->e_mbd;                                       \
   unsigned int besterr = INT_MAX;                                          \
   unsigned int sse;                                                        \
   unsigned int whichdir;                                                   \
   int thismse;                                                             \
+  MV *bestmv = &x->best_mv.as_mv;                                          \
   const unsigned int halfiters = iters_per_step;                           \
   const unsigned int quarteriters = iters_per_step;                        \
   const unsigned int eighthiters = iters_per_step;                         \
@@ -283,8 +323,6 @@
   const int offset = bestmv->row * y_stride + bestmv->col;                 \
   const uint8_t *const y = xd->plane[0].pre[0].buf;                        \
                                                                            \
-  int rr = ref_mv->row;                                                    \
-  int rc = ref_mv->col;                                                    \
   int br = bestmv->row * 8;                                                \
   int bc = bestmv->col * 8;                                                \
   int hstep = 4;                                                           \
@@ -302,7 +340,7 @@
                                        const MV *bestmv,
                                        const MV *ref_mv,
                                        int error_per_bit,
-                                       const vp9_variance_fn_ptr_t *vfp,
+                                       const vp10_variance_fn_ptr_t *vfp,
                                        const uint8_t *const src,
                                        const int src_stride,
                                        const uint8_t *const y,
@@ -316,13 +354,13 @@
 #if CONFIG_VP9_HIGHBITDEPTH
   if (second_pred != NULL) {
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-      DECLARE_ALIGNED(16, uint16_t, comp_pred16[64 * 64]);
+      DECLARE_ALIGNED(16, uint16_t, comp_pred16[MAX_SB_SQUARE]);
       vpx_highbd_comp_avg_pred(comp_pred16, second_pred, w, h, y + offset,
                                y_stride);
       besterr = vfp->vf(CONVERT_TO_BYTEPTR(comp_pred16), w, src, src_stride,
                         sse1);
     } else {
-      DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+      DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
       vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
       besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
     }
@@ -334,7 +372,7 @@
 #else
   (void) xd;
   if (second_pred != NULL) {
-    DECLARE_ALIGNED(16, uint8_t, comp_pred[64 * 64]);
+    DECLARE_ALIGNED(16, uint8_t, comp_pred[MAX_SB_SQUARE]);
     vpx_comp_avg_pred(comp_pred, second_pred, w, h, y + offset, y_stride);
     besterr = vfp->vf(comp_pred, w, src, src_stride, sse1);
   } else {
@@ -374,11 +412,11 @@
 }
 
 int vp10_find_best_sub_pixel_tree_pruned_evenmore(
-    const MACROBLOCK *x,
-    MV *bestmv, const MV *ref_mv,
+    MACROBLOCK *x,
+    const MV *ref_mv,
     int allow_hp,
     int error_per_bit,
-    const vp9_variance_fn_ptr_t *vfp,
+    const vp10_variance_fn_ptr_t *vfp,
     int forced_stop,
     int iters_per_step,
     int *cost_list,
@@ -386,11 +424,11 @@
     int *distortion,
     unsigned int *sse1,
     const uint8_t *second_pred,
-    int w, int h) {
+    int w, int h, int use_upsampled_ref) {
   SETUP_SUBPEL_SEARCH;
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
-                               z, src_stride, y, y_stride, second_pred,
-                               w, h, offset, mvjcost, mvcost,
+                               src_address, src_stride, y, y_stride,
+                               second_pred, w, h, offset, mvjcost, mvcost,
                                sse1, distortion);
   (void) halfiters;
   (void) quarteriters;
@@ -399,6 +437,7 @@
   (void) allow_hp;
   (void) forced_stop;
   (void) hstep;
+  (void) use_upsampled_ref;
 
   if (cost_list &&
       cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
@@ -453,23 +492,26 @@
   return besterr;
 }
 
-int vp10_find_best_sub_pixel_tree_pruned_more(const MACROBLOCK *x,
-                                             MV *bestmv, const MV *ref_mv,
-                                             int allow_hp,
-                                             int error_per_bit,
-                                             const vp9_variance_fn_ptr_t *vfp,
-                                             int forced_stop,
-                                             int iters_per_step,
-                                             int *cost_list,
-                                             int *mvjcost, int *mvcost[2],
-                                             int *distortion,
-                                             unsigned int *sse1,
-                                             const uint8_t *second_pred,
-                                             int w, int h) {
+int vp10_find_best_sub_pixel_tree_pruned_more(MACROBLOCK *x,
+                                              const MV *ref_mv,
+                                              int allow_hp,
+                                              int error_per_bit,
+                                              const vp10_variance_fn_ptr_t *vfp,
+                                              int forced_stop,
+                                              int iters_per_step,
+                                              int *cost_list,
+                                              int *mvjcost, int *mvcost[2],
+                                              int *distortion,
+                                              unsigned int *sse1,
+                                              const uint8_t *second_pred,
+                                              int w, int h,
+                                              int use_upsampled_ref) {
   SETUP_SUBPEL_SEARCH;
+  (void) use_upsampled_ref;
+
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
-                               z, src_stride, y, y_stride, second_pred,
-                               w, h, offset, mvjcost, mvcost,
+                               src_address, src_stride, y, y_stride,
+                               second_pred, w, h, offset, mvjcost, mvcost,
                                sse1, distortion);
   if (cost_list &&
       cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
@@ -527,23 +569,25 @@
   return besterr;
 }
 
-int vp10_find_best_sub_pixel_tree_pruned(const MACROBLOCK *x,
-                                        MV *bestmv, const MV *ref_mv,
-                                        int allow_hp,
-                                        int error_per_bit,
-                                        const vp9_variance_fn_ptr_t *vfp,
-                                        int forced_stop,
-                                        int iters_per_step,
-                                        int *cost_list,
-                                        int *mvjcost, int *mvcost[2],
-                                        int *distortion,
-                                        unsigned int *sse1,
-                                        const uint8_t *second_pred,
-                                        int w, int h) {
+int vp10_find_best_sub_pixel_tree_pruned(MACROBLOCK *x,
+                                         const MV *ref_mv,
+                                         int allow_hp,
+                                         int error_per_bit,
+                                         const vp10_variance_fn_ptr_t *vfp,
+                                         int forced_stop,
+                                         int iters_per_step,
+                                         int *cost_list,
+                                         int *mvjcost, int *mvcost[2],
+                                         int *distortion,
+                                         unsigned int *sse1,
+                                         const uint8_t *second_pred,
+                                         int w, int h, int use_upsampled_ref) {
   SETUP_SUBPEL_SEARCH;
+  (void) use_upsampled_ref;
+
   besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
-                               z, src_stride, y, y_stride, second_pred,
-                               w, h, offset, mvjcost, mvcost,
+                               src_address, src_stride, y, y_stride,
+                               second_pred, w, h, offset, mvjcost, mvcost,
                                sse1, distortion);
   if (cost_list &&
       cost_list[0] != INT_MAX && cost_list[1] != INT_MAX &&
@@ -629,32 +673,82 @@
     {0, -1}, {0, 1}, {-1, 0}, {1, 0}
 };
 
-int vp10_find_best_sub_pixel_tree(const MACROBLOCK *x,
-                                 MV *bestmv, const MV *ref_mv,
-                                 int allow_hp,
-                                 int error_per_bit,
-                                 const vp9_variance_fn_ptr_t *vfp,
-                                 int forced_stop,
-                                 int iters_per_step,
-                                 int *cost_list,
-                                 int *mvjcost, int *mvcost[2],
-                                 int *distortion,
-                                 unsigned int *sse1,
-                                 const uint8_t *second_pred,
-                                 int w, int h) {
-  const uint8_t *const z = x->plane[0].src.buf;
-  const uint8_t *const src_address = z;
+static int upsampled_pref_error(const MACROBLOCKD *xd,
+                                const vp10_variance_fn_ptr_t *vfp,
+                                const uint8_t *const src, const int src_stride,
+                                const uint8_t *const y, int y_stride,
+                                const uint8_t *second_pred,
+                                int w, int h, unsigned int *sse) {
+  unsigned int besterr;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+    if (second_pred != NULL)
+      vpx_highbd_comp_avg_upsampled_pred(pred16, second_pred, w, h, y,
+                                         y_stride);
+    else
+      vpx_highbd_upsampled_pred(pred16, w, h, y, y_stride);
+
+    besterr = vfp->vf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride,
+                      sse);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+#else
+    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+    (void) xd;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    if (second_pred != NULL)
+      vpx_comp_avg_upsampled_pred(pred, second_pred, w, h, y,
+                                  y_stride);
+    else
+      vpx_upsampled_pred(pred, w, h, y, y_stride);
+
+    besterr = vfp->vf(pred, w, src, src_stride, sse);
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  return besterr;
+}
+
+static unsigned int upsampled_setup_center_error(
+    const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
+    int error_per_bit, const vp10_variance_fn_ptr_t *vfp,
+    const uint8_t *const src, const int src_stride,
+    const uint8_t *const y, int y_stride, const uint8_t *second_pred,
+    int w, int h, int offset, int *mvjcost, int *mvcost[2],
+    unsigned int *sse1, int *distortion) {
+  unsigned int besterr = upsampled_pref_error(xd, vfp, src, src_stride,
+                                              y + offset, y_stride, second_pred,
+                                              w, h, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  return besterr;
+}
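
Note the addressing change these helpers enable: an upsampled reference plane is stored at eight times the resolution, so eighth-pel motion coordinates index it directly (hence offset * 8 and y + tr * y_stride + tc in the search below), while the conventional path keeps integer-pel addressing plus a sub-pel filter phase (tr >> 3 with sp()). A hedged sketch of the two address computations (the helper name is illustrative):

    #include <stdint.h>

    static const uint8_t *ref_addr(const uint8_t *y, int y_stride,
                                   int tr, int tc, int use_upsampled_ref) {
      if (use_upsampled_ref)
        return y + tr * y_stride + tc;  // plane is already on the 1/8-pel grid
      else
        return y + (tr >> 3) * y_stride + (tc >> 3);  // integer pel; the
                                                      // 1/8-pel phase goes
                                                      // through sp()
    }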
+
+int vp10_find_best_sub_pixel_tree(MACROBLOCK *x,
+                                  const MV *ref_mv,
+                                  int allow_hp,
+                                  int error_per_bit,
+                                  const vp10_variance_fn_ptr_t *vfp,
+                                  int forced_stop,
+                                  int iters_per_step,
+                                  int *cost_list,
+                                  int *mvjcost, int *mvcost[2],
+                                  int *distortion,
+                                  unsigned int *sse1,
+                                  const uint8_t *second_pred,
+                                  int w, int h, int use_upsampled_ref) {
+  const uint8_t *const src_address = x->plane[0].src.buf;
   const int src_stride = x->plane[0].src.stride;
   const MACROBLOCKD *xd = &x->e_mbd;
   unsigned int besterr = INT_MAX;
   unsigned int sse;
-  int thismse;
+  unsigned int thismse;
   const int y_stride = xd->plane[0].pre[0].stride;
+  MV *bestmv = &x->best_mv.as_mv;
   const int offset = bestmv->row * y_stride + bestmv->col;
   const uint8_t *const y = xd->plane[0].pre[0].buf;
 
-  int rr = ref_mv->row;
-  int rc = ref_mv->col;
   int br = bestmv->row * 8;
   int bc = bestmv->col * 8;
   int hstep = 4;
@@ -677,10 +771,18 @@
   bestmv->row *= 8;
   bestmv->col *= 8;
 
-  besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
-                               z, src_stride, y, y_stride, second_pred,
-                               w, h, offset, mvjcost, mvcost,
-                               sse1, distortion);
+  // use_upsampled_ref can be 0 or 1
+  if (use_upsampled_ref)
+    besterr = upsampled_setup_center_error(xd, bestmv, ref_mv, error_per_bit,
+                                           vfp, src_address, src_stride, y,
+                                           y_stride, second_pred, w, h,
+                                           (offset * 8), mvjcost, mvcost, sse1,
+                                           distortion);
+  else
+    besterr = setup_center_error(xd, bestmv, ref_mv, error_per_bit, vfp,
+                                 src_address, src_stride, y, y_stride,
+                                 second_pred, w, h, offset, mvjcost, mvcost,
+                                 sse1, distortion);
 
   (void) cost_list;  // to silence compiler warning
 
@@ -690,16 +792,25 @@
       tr = br + search_step[idx].row;
       tc = bc + search_step[idx].col;
       if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
-        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
-        MV this_mv;
-        this_mv.row = tr;
-        this_mv.col = tc;
-        if (second_pred == NULL)
-          thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
-                             src_address, src_stride, &sse);
-        else
-          thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
-                              src_address, src_stride, &sse, second_pred);
+        MV this_mv = {tr, tc};
+
+        if (use_upsampled_ref) {
+          const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+          thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,
+                                         pre_address, y_stride, second_pred,
+                                         w, h, &sse);
+        } else {
+          const uint8_t *const pre_address = y + (tr >> 3) * y_stride +
+              (tc >> 3);
+          if (second_pred == NULL)
+            thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
+                               src_address, src_stride, &sse);
+          else
+            thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+                                src_address, src_stride, &sse, second_pred);
+        }
+
         cost_array[idx] = thismse +
             mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
 
@@ -721,14 +832,25 @@
     tc = bc + kc;
     tr = br + kr;
     if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
-      const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
       MV this_mv = {tr, tc};
-      if (second_pred == NULL)
-        thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
-                           src_address, src_stride, &sse);
-      else
-        thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
-                            src_address, src_stride, &sse, second_pred);
+
+      if (use_upsampled_ref) {
+        const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+        thismse = upsampled_pref_error(xd, vfp, src_address, src_stride,
+                                       pre_address, y_stride, second_pred, w, h,
+                                       &sse);
+      } else {
+        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+
+        if (second_pred == NULL)
+          thismse = vfp->svf(pre_address, y_stride, sp(tc), sp(tr),
+                             src_address, src_stride, &sse);
+        else
+          thismse = vfp->svaf(pre_address, y_stride, sp(tc), sp(tr),
+                              src_address, src_stride, &sse, second_pred);
+      }
+
       cost_array[4] = thismse +
           mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
 
@@ -750,20 +872,19 @@
       bc = tc;
     }
 
-    if (iters_per_step > 1 && best_idx != -1)
-      SECOND_LEVEL_CHECKS_BEST;
-
-    tr = br;
-    tc = bc;
+    if (iters_per_step > 1 && best_idx != -1) {
+      if (use_upsampled_ref) {
+        SECOND_LEVEL_CHECKS_BEST(1);
+      } else {
+        SECOND_LEVEL_CHECKS_BEST(0);
+      }
+    }
 
     search_step += 4;
     hstep >>= 1;
     best_idx = -1;
   }
 
-  // Each subsequent iteration checks at least one point in common with
-  // the last iteration could be 2 ( if diag selected) 1/4 pel
-
   // These lines ensure static analysis doesn't warn that
   // tr and tc aren't used after the above point.
   (void) tr;
@@ -779,7 +900,7 @@
   return besterr;
 }
 
-#undef MVC
+#undef PRE
 #undef CHECK_BETTER
 
 static INLINE int check_bounds(const MACROBLOCK *x, int row, int col,
@@ -813,17 +934,17 @@
 
 // Calculate and return a sad+mvcost list around an integer best pel.
 static INLINE void calc_int_cost_list(const MACROBLOCK *x,
-                                      const MV *ref_mv,
+                                      const MV *const ref_mv,
                                       int sadpb,
-                                      const vp9_variance_fn_ptr_t *fn_ptr,
+                                      const vp10_variance_fn_ptr_t *fn_ptr,
                                       const MV *best_mv,
                                       int *cost_list) {
   static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0];
   const MV fcenter_mv = {ref_mv->row >> 3, ref_mv->col >> 3};
-  int br = best_mv->row;
-  int bc = best_mv->col;
+  const int br = best_mv->row;
+  const int bc = best_mv->col;
   MV this_mv;
   int i;
   unsigned int sse;
@@ -841,9 +962,9 @@
       cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride,
                                     get_buf_from_mv(in_what, &this_mv),
                                     in_what->stride, &sse) +
-          // mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
-          mv_err_cost(&this_mv, &fcenter_mv, x->nmvjointcost, x->mvcost,
-                      x->errorperbit);
+                                    mv_err_cost(&this_mv, &fcenter_mv,
+                                                x->nmvjointcost, x->mvcost,
+                                                x->errorperbit);
     }
   } else {
     for (i = 0; i < 4; i++) {
@@ -855,9 +976,60 @@
         cost_list[i + 1] = fn_ptr->vf(what->buf, what->stride,
                                       get_buf_from_mv(in_what, &this_mv),
                                       in_what->stride, &sse) +
-            // mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
-            mv_err_cost(&this_mv, &fcenter_mv, x->nmvjointcost, x->mvcost,
-                        x->errorperbit);
+                                      mv_err_cost(&this_mv, &fcenter_mv,
+                                                  x->nmvjointcost, x->mvcost,
+                                                  x->errorperbit);
+    }
+  }
+}
+
+static INLINE void calc_int_sad_list(const MACROBLOCK *x,
+                                     const MV *const ref_mv,
+                                     int sadpb,
+                                     const vp10_variance_fn_ptr_t *fn_ptr,
+                                     const MV *best_mv, int *cost_list,
+                                     const int use_mvcost,
+                                     const int bestsad) {
+  static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &x->e_mbd.plane[0].pre[0];
+  const MV fcenter_mv = {ref_mv->row >> 3, ref_mv->col >> 3};
+  int i;
+  const int br = best_mv->row;
+  const int bc = best_mv->col;
+
+  if (cost_list[0] == INT_MAX) {
+    cost_list[0] = bestsad;
+    if (check_bounds(x, br, bc, 1)) {
+      for (i = 0; i < 4; i++) {
+        const MV this_mv = { br + neighbors[i].row,
+                             bc + neighbors[i].col };
+        cost_list[i + 1] = fn_ptr->sdf(what->buf, what->stride,
+                                       get_buf_from_mv(in_what, &this_mv),
+                                       in_what->stride);
+      }
+    } else {
+      for (i = 0; i < 4; i++) {
+        const MV this_mv = {br + neighbors[i].row,
+                            bc + neighbors[i].col};
+        if (!is_mv_in(x, &this_mv))
+          cost_list[i + 1] = INT_MAX;
+        else
+          cost_list[i + 1] = fn_ptr->sdf(what->buf, what->stride,
+                                         get_buf_from_mv(in_what, &this_mv),
+                                         in_what->stride);
+      }
+    }
+  } else {
+    if (use_mvcost) {
+      for (i = 0; i < 4; i++) {
+        const MV this_mv = {br + neighbors[i].row,
+                            bc + neighbors[i].col};
+        if (cost_list[i + 1] != INT_MAX) {
+          cost_list[i + 1] +=
+              mvsad_err_cost(x, &this_mv, &fcenter_mv, sadpb);
+        }
+      }
     }
   }
 }
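
calc_int_sad_list fills the same five-entry layout that calc_int_cost_list uses, just with the mvcost term deferred. A small hedged sketch of consuming that layout (the helper name is hypothetical; the index convention restates what this file documents):

    // cost_list[0] is the best integer pel; [1]..[4] are its one-away
    // neighbors in the fixed order left {0,-1}, bottom {1,0}, right {0,1},
    // top {-1,0}. Out-of-range neighbors stay INT_MAX and never win.
    static int best_neighbor_index(const int cost_list[5]) {
      int i, best = 1;
      for (i = 2; i < 5; ++i)
        if (cost_list[i] < cost_list[best]) best = i;
      return best;  // 1..4 => left/bottom/right/top
    }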
@@ -867,19 +1039,18 @@
 // candidates as indicated in the num_candidates and candidates arrays
 // passed into this function
 //
-static int vp10_pattern_search(const MACROBLOCK *x,
-                              MV *ref_mv,
-                              int search_param,
-                              int sad_per_bit,
-                              int do_init_search,
-                              int *cost_list,
-                              const vp9_variance_fn_ptr_t *vfp,
-                              int use_mvcost,
-                              const MV *center_mv,
-                              MV *best_mv,
-                              const int num_candidates[MAX_PATTERN_SCALES],
-                              const MV candidates[MAX_PATTERN_SCALES]
-                                                 [MAX_PATTERN_CANDIDATES]) {
+static int pattern_search(MACROBLOCK *x,
+                          MV *start_mv,
+                          int search_param,
+                          int sad_per_bit,
+                          int do_init_search,
+                          int *cost_list,
+                          const vp10_variance_fn_ptr_t *vfp,
+                          int use_mvcost,
+                          const MV *center_mv,
+                          const int num_candidates[MAX_PATTERN_SCALES],
+                          const MV candidates[MAX_PATTERN_SCALES]
+                                              [MAX_PATTERN_CANDIDATES]) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
     10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
@@ -887,6 +1058,7 @@
   int i, s, t;
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
+  const int last_is_4 = num_candidates[0] == 4;
   int br, bc;
   int bestsad = INT_MAX;
   int thissad;
@@ -894,193 +1066,20 @@
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
   int best_init_s = search_param_to_steps[search_param];
   // adjust ref_mv to make sure it is within MV range
-  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  br = ref_mv->row;
-  bc = ref_mv->col;
-
-  // Work out the start point for the search
-  bestsad = vfp->sdf(what->buf, what->stride,
-                     get_buf_from_mv(in_what, ref_mv), in_what->stride) +
-      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
-
-  // Search all possible scales upto the search param around the center point
-  // pick the scale of the point that is best as the starting scale of
-  // further steps around it.
-  if (do_init_search) {
-    s = best_init_s;
-    best_init_s = -1;
-    for (t = 0; t <= s; ++t) {
-      int best_site = -1;
-      if (check_bounds(x, br, bc, 1 << t)) {
-        for (i = 0; i < num_candidates[t]; i++) {
-          const MV this_mv = {br + candidates[t][i].row,
-                              bc + candidates[t][i].col};
-          thissad = vfp->sdf(what->buf, what->stride,
-                             get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride);
-          CHECK_BETTER
-        }
-      } else {
-        for (i = 0; i < num_candidates[t]; i++) {
-          const MV this_mv = {br + candidates[t][i].row,
-                              bc + candidates[t][i].col};
-          if (!is_mv_in(x, &this_mv))
-            continue;
-          thissad = vfp->sdf(what->buf, what->stride,
-                             get_buf_from_mv(in_what, &this_mv),
-                             in_what->stride);
-          CHECK_BETTER
-        }
-      }
-      if (best_site == -1) {
-        continue;
-      } else {
-        best_init_s = t;
-        k = best_site;
-      }
-    }
-    if (best_init_s != -1) {
-      br += candidates[best_init_s][k].row;
-      bc += candidates[best_init_s][k].col;
-    }
-  }
-
-  // If the center point is still the best, just skip this and move to
-  // the refinement step.
-  if (best_init_s != -1) {
-    int best_site = -1;
-    s = best_init_s;
-
-    do {
-      // No need to search all 6 points the 1st time if initial search was used
-      if (!do_init_search || s != best_init_s) {
-        if (check_bounds(x, br, bc, 1 << s)) {
-          for (i = 0; i < num_candidates[s]; i++) {
-            const MV this_mv = {br + candidates[s][i].row,
-                                bc + candidates[s][i].col};
-            thissad = vfp->sdf(what->buf, what->stride,
-                               get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride);
-            CHECK_BETTER
-          }
-        } else {
-          for (i = 0; i < num_candidates[s]; i++) {
-            const MV this_mv = {br + candidates[s][i].row,
-                                bc + candidates[s][i].col};
-            if (!is_mv_in(x, &this_mv))
-              continue;
-            thissad = vfp->sdf(what->buf, what->stride,
-                               get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride);
-            CHECK_BETTER
-          }
-        }
-
-        if (best_site == -1) {
-          continue;
-        } else {
-          br += candidates[s][best_site].row;
-          bc += candidates[s][best_site].col;
-          k = best_site;
-        }
-      }
-
-      do {
-        int next_chkpts_indices[PATTERN_CANDIDATES_REF];
-        best_site = -1;
-        next_chkpts_indices[0] = (k == 0) ? num_candidates[s] - 1 : k - 1;
-        next_chkpts_indices[1] = k;
-        next_chkpts_indices[2] = (k == num_candidates[s] - 1) ? 0 : k + 1;
-
-        if (check_bounds(x, br, bc, 1 << s)) {
-          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
-            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
-                                bc + candidates[s][next_chkpts_indices[i]].col};
-            thissad = vfp->sdf(what->buf, what->stride,
-                               get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride);
-            CHECK_BETTER
-          }
-        } else {
-          for (i = 0; i < PATTERN_CANDIDATES_REF; i++) {
-            const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
-                                bc + candidates[s][next_chkpts_indices[i]].col};
-            if (!is_mv_in(x, &this_mv))
-              continue;
-            thissad = vfp->sdf(what->buf, what->stride,
-                               get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride);
-            CHECK_BETTER
-          }
-        }
-
-        if (best_site != -1) {
-          k = next_chkpts_indices[best_site];
-          br += candidates[s][k].row;
-          bc += candidates[s][k].col;
-        }
-      } while (best_site != -1);
-    } while (s--);
-  }
-
-  // Returns the one-away integer pel sad values around the best as follows:
-  // cost_list[0]: cost at the best integer pel
-  // cost_list[1]: cost at delta {0, -1} (left)   from the best integer pel
-  // cost_list[2]: cost at delta { 1, 0} (bottom) from the best integer pel
-  // cost_list[3]: cost at delta { 0, 1} (right)  from the best integer pel
-  // cost_list[4]: cost at delta {-1, 0} (top)    from the best integer pel
-  if (cost_list) {
-    const MV best_mv = { br, bc };
-    calc_int_cost_list(x, &fcenter_mv, sad_per_bit, vfp, &best_mv, cost_list);
-  }
-  best_mv->row = br;
-  best_mv->col = bc;
-  return bestsad;
-}
-
-// A specialized function where the smallest scale search candidates
-// are 4 1-away neighbors, and cost_list is non-null
-// TODO(debargha): Merge this function with the one above. Also remove
-// use_mvcost option since it is always 1, to save unnecessary branches.
-static int vp10_pattern_search_sad(const MACROBLOCK *x,
-                                  MV *ref_mv,
-                                  int search_param,
-                                  int sad_per_bit,
-                                  int do_init_search,
-                                  int *cost_list,
-                                  const vp9_variance_fn_ptr_t *vfp,
-                                  int use_mvcost,
-                                  const MV *center_mv,
-                                  MV *best_mv,
-                                  const int num_candidates[MAX_PATTERN_SCALES],
-                                  const MV candidates[MAX_PATTERN_SCALES]
-                                                     [MAX_PATTERN_CANDIDATES]) {
-  const MACROBLOCKD *const xd = &x->e_mbd;
-  static const int search_param_to_steps[MAX_MVSEARCH_STEPS] = {
-    10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0,
-  };
-  int i, s, t;
-  const struct buf_2d *const what = &x->plane[0].src;
-  const struct buf_2d *const in_what = &xd->plane[0].pre[0];
-  int br, bc;
-  int bestsad = INT_MAX;
-  int thissad;
-  int k = -1;
-  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
-  int best_init_s = search_param_to_steps[search_param];
-  // adjust ref_mv to make sure it is within MV range
-  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
-  br = ref_mv->row;
-  bc = ref_mv->col;
+  clamp_mv(start_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min,
+           x->mv_row_max);
+  br = start_mv->row;
+  bc = start_mv->col;
   if (cost_list != NULL) {
     cost_list[0] = cost_list[1] = cost_list[2] = cost_list[3] = cost_list[4] =
         INT_MAX;
   }
 
   // Work out the start point for the search
-  bestsad = vfp->sdf(what->buf, what->stride,
-                     get_buf_from_mv(in_what, ref_mv), in_what->stride) +
-      mvsad_err_cost(x, ref_mv, &fcenter_mv, sad_per_bit);
+  bestsad =
+      vfp->sdf(what->buf, what->stride,
+               get_buf_from_mv(in_what, start_mv), in_what->stride) +
+      mvsad_err_cost(x, start_mv, &fcenter_mv, sad_per_bit);
 
   // Search all possible scales up to the search param around the center point
   // pick the scale of the point that is best as the starting scale of
@@ -1127,11 +1126,12 @@
   // If the center point is still the best, just skip this and move to
   // the refinement step.
   if (best_init_s != -1) {
-    int do_sad = (num_candidates[0] == 4 && cost_list != NULL);
+    const int last_s = (last_is_4 && cost_list != NULL);
     int best_site = -1;
     s = best_init_s;
 
-    for (; s >= do_sad; s--) {
+    for (; s >= last_s; s--) {
+      // No need to search all points the 1st time if initial search was used
       if (!do_init_search || s != best_init_s) {
         if (check_bounds(x, br, bc, 1 << s)) {
           for (i = 0; i < num_candidates[s]; i++) {
@@ -1210,9 +1210,9 @@
             const MV this_mv = {br + candidates[s][i].row,
                                 bc + candidates[s][i].col};
             cost_list[i + 1] =
-            thissad = vfp->sdf(what->buf, what->stride,
-                               get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride);
+                thissad = vfp->sdf(what->buf, what->stride,
+                                   get_buf_from_mv(in_what, &this_mv),
+                                   in_what->stride);
             CHECK_BETTER
           }
         } else {
@@ -1222,9 +1222,9 @@
             if (!is_mv_in(x, &this_mv))
               continue;
             cost_list[i + 1] =
-            thissad = vfp->sdf(what->buf, what->stride,
-                               get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride);
+                thissad = vfp->sdf(what->buf, what->stride,
+                                   get_buf_from_mv(in_what, &this_mv),
+                                   in_what->stride);
             CHECK_BETTER
           }
         }
@@ -1250,9 +1250,9 @@
             const MV this_mv = {br + candidates[s][next_chkpts_indices[i]].row,
                                 bc + candidates[s][next_chkpts_indices[i]].col};
             cost_list[next_chkpts_indices[i] + 1] =
-            thissad = vfp->sdf(what->buf, what->stride,
-                               get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride);
+                thissad = vfp->sdf(what->buf, what->stride,
+                                   get_buf_from_mv(in_what, &this_mv),
+                                   in_what->stride);
             CHECK_BETTER
           }
         } else {
@@ -1264,9 +1264,9 @@
               continue;
             }
             cost_list[next_chkpts_indices[i] + 1] =
-            thissad = vfp->sdf(what->buf, what->stride,
-                               get_buf_from_mv(in_what, &this_mv),
-                               in_what->stride);
+                thissad = vfp->sdf(what->buf, what->stride,
+                                   get_buf_from_mv(in_what, &this_mv),
+                                   in_what->stride);
             CHECK_BETTER
           }
         }
@@ -1280,57 +1280,29 @@
     }
   }
 
-  // Returns the one-away integer pel sad values around the best as follows:
-  // cost_list[0]: sad at the best integer pel
-  // cost_list[1]: sad at delta {0, -1} (left)   from the best integer pel
-  // cost_list[2]: sad at delta { 1, 0} (bottom) from the best integer pel
-  // cost_list[3]: sad at delta { 0, 1} (right)  from the best integer pel
-  // cost_list[4]: sad at delta {-1, 0} (top)    from the best integer pel
+  // Returns the one-away integer pel cost/sad around the best as follows:
+  // cost_list[0]: cost/sad at the best integer pel
+  // cost_list[1]: cost/sad at delta {0, -1} (left)   from the best integer pel
+  // cost_list[2]: cost/sad at delta { 1, 0} (bottom) from the best integer pel
+  // cost_list[3]: cost/sad at delta { 0, 1} (right)  from the best integer pel
+  // cost_list[4]: cost/sad at delta {-1, 0} (top)    from the best integer pel
   if (cost_list) {
-    static const MV neighbors[4] = {{0, -1}, {1, 0}, {0, 1}, {-1, 0}};
-    if (cost_list[0] == INT_MAX) {
-      cost_list[0] = bestsad;
-      if (check_bounds(x, br, bc, 1)) {
-        for (i = 0; i < 4; i++) {
-          const MV this_mv = { br + neighbors[i].row,
-                               bc + neighbors[i].col };
-          cost_list[i + 1] = vfp->sdf(what->buf, what->stride,
-                                     get_buf_from_mv(in_what, &this_mv),
-                                     in_what->stride);
-        }
-      } else {
-        for (i = 0; i < 4; i++) {
-          const MV this_mv = {br + neighbors[i].row,
-            bc + neighbors[i].col};
-          if (!is_mv_in(x, &this_mv))
-            cost_list[i + 1] = INT_MAX;
-          else
-            cost_list[i + 1] = vfp->sdf(what->buf, what->stride,
-                                       get_buf_from_mv(in_what, &this_mv),
-                                       in_what->stride);
-        }
-      }
+    const MV best_mv = { br, bc };
+    if (last_is_4) {
+      calc_int_sad_list(x, center_mv, sad_per_bit, vfp, &best_mv, cost_list,
+                        use_mvcost, bestsad);
     } else {
-      if (use_mvcost) {
-        for (i = 0; i < 4; i++) {
-          const MV this_mv = {br + neighbors[i].row,
-            bc + neighbors[i].col};
-          if (cost_list[i + 1] != INT_MAX) {
-            cost_list[i + 1] +=
-                mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
-          }
-        }
-      }
+      calc_int_cost_list(x, center_mv, sad_per_bit, vfp, &best_mv, cost_list);
     }
   }
-  best_mv->row = br;
-  best_mv->col = bc;
+  x->best_mv.as_mv.row = br;
+  x->best_mv.as_mv.col = bc;
   return bestsad;
 }
 
 int vp10_get_mvpred_var(const MACROBLOCK *x,
                        const MV *best_mv, const MV *center_mv,
-                       const vp9_variance_fn_ptr_t *vfp,
+                       const vp10_variance_fn_ptr_t *vfp,
                        int use_mvcost) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
@@ -1347,7 +1319,7 @@
 int vp10_get_mvpred_av_var(const MACROBLOCK *x,
                           const MV *best_mv, const MV *center_mv,
                           const uint8_t *second_pred,
-                          const vp9_variance_fn_ptr_t *vfp,
+                          const vp10_variance_fn_ptr_t *vfp,
                           int use_mvcost) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
@@ -1361,15 +1333,15 @@
                                  x->mvcost, x->errorperbit) : 0);
 }
 
-int vp10_hex_search(const MACROBLOCK *x,
-                   MV *ref_mv,
-                   int search_param,
-                   int sad_per_bit,
-                   int do_init_search,
-                   int *cost_list,
-                   const vp9_variance_fn_ptr_t *vfp,
-                   int use_mvcost,
-                   const MV *center_mv, MV *best_mv) {
+int vp10_hex_search(MACROBLOCK *x,
+                    MV *start_mv,
+                    int search_param,
+                    int sad_per_bit,
+                    int do_init_search,
+                    int *cost_list,
+                    const vp10_variance_fn_ptr_t *vfp,
+                    int use_mvcost,
+                    const MV *center_mv) {
   // First scale has 8-closest points, the rest have 6 points in hex shape
   // at increasing scales
   static const int hex_num_candidates[MAX_PATTERN_SCALES] = {
@@ -1390,22 +1362,20 @@
     {{-512, -1024}, {512, -1024}, {1024, 0}, {512, 1024}, { -512, 1024},
       { -1024, 0}},
   };
-  return vp10_pattern_search(x, ref_mv, search_param, sad_per_bit,
-                            do_init_search, cost_list, vfp, use_mvcost,
-                            center_mv, best_mv,
-                            hex_num_candidates, hex_candidates);
+  return pattern_search(x, start_mv, search_param, sad_per_bit,
+                        do_init_search, cost_list, vfp, use_mvcost,
+                        center_mv, hex_num_candidates, hex_candidates);
 }
 
-int vp10_bigdia_search(const MACROBLOCK *x,
-                      MV *ref_mv,
-                      int search_param,
-                      int sad_per_bit,
-                      int do_init_search,
-                      int *cost_list,
-                      const vp9_variance_fn_ptr_t *vfp,
-                      int use_mvcost,
-                      const MV *center_mv,
-                      MV *best_mv) {
+static int bigdia_search(MACROBLOCK *x,
+                         MV *start_mv,
+                         int search_param,
+                         int sad_per_bit,
+                         int do_init_search,
+                         int *cost_list,
+                         const vp10_variance_fn_ptr_t *vfp,
+                         int use_mvcost,
+                         const MV *center_mv) {
   // First scale has 4-closest points, the rest have 8 points in diamond
   // shape at increasing scales
   static const int bigdia_num_candidates[MAX_PATTERN_SCALES] = {
@@ -1432,25 +1402,23 @@
     {{-512, -512}, {0, -1024}, {512, -512}, {1024, 0}, {512, 512}, {0, 1024},
       {-512, 512}, {-1024, 0}},
   };
-  return vp10_pattern_search_sad(x, ref_mv, search_param, sad_per_bit,
-                                do_init_search, cost_list, vfp, use_mvcost,
-                                center_mv, best_mv,
-                                bigdia_num_candidates, bigdia_candidates);
+  return pattern_search(x, start_mv, search_param, sad_per_bit,
+                        do_init_search, cost_list, vfp, use_mvcost,
+                        center_mv, bigdia_num_candidates, bigdia_candidates);
 }
 
-int vp10_square_search(const MACROBLOCK *x,
-                      MV *ref_mv,
-                      int search_param,
-                      int sad_per_bit,
-                      int do_init_search,
-                      int *cost_list,
-                      const vp9_variance_fn_ptr_t *vfp,
-                      int use_mvcost,
-                      const MV *center_mv,
-                      MV *best_mv) {
+static int square_search(MACROBLOCK *x,
+                         MV *start_mv,
+                         int search_param,
+                         int sad_per_bit,
+                         int do_init_search,
+                         int *cost_list,
+                         const vp10_variance_fn_ptr_t *vfp,
+                         int use_mvcost,
+                         const MV *center_mv) {
   // All scales have 8 closest points in square shape
   static const int square_num_candidates[MAX_PATTERN_SCALES] = {
-    8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
+      8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
   };
   // Note that the largest candidate step at each scale is 2^scale
   static const MV square_candidates[MAX_PATTERN_SCALES]
@@ -1474,40 +1442,37 @@
     {{-1024, -1024}, {0, -1024}, {1024, -1024}, {1024, 0}, {1024, 1024},
       {0, 1024}, {-1024, 1024}, {-1024, 0}},
   };
-  return vp10_pattern_search(x, ref_mv, search_param, sad_per_bit,
-                            do_init_search, cost_list, vfp, use_mvcost,
-                            center_mv, best_mv,
-                            square_num_candidates, square_candidates);
+  return pattern_search(x, start_mv, search_param, sad_per_bit,
+                        do_init_search, cost_list, vfp, use_mvcost,
+                        center_mv, square_num_candidates, square_candidates);
 }
 
-int vp10_fast_hex_search(const MACROBLOCK *x,
-                        MV *ref_mv,
-                        int search_param,
-                        int sad_per_bit,
-                        int do_init_search,  // must be zero for fast_hex
-                        int *cost_list,
-                        const vp9_variance_fn_ptr_t *vfp,
-                        int use_mvcost,
-                        const MV *center_mv,
-                        MV *best_mv) {
+static int fast_hex_search(MACROBLOCK *x,
+                           MV *ref_mv,
+                           int search_param,
+                           int sad_per_bit,
+                           int do_init_search,  // must be zero for fast_hex
+                           int *cost_list,
+                           const vp10_variance_fn_ptr_t *vfp,
+                           int use_mvcost,
+                           const MV *center_mv) {
   return vp10_hex_search(
       x, ref_mv, VPXMAX(MAX_MVSEARCH_STEPS - 2, search_param), sad_per_bit,
-      do_init_search, cost_list, vfp, use_mvcost, center_mv, best_mv);
+      do_init_search, cost_list, vfp, use_mvcost, center_mv);
 }
 
-int vp10_fast_dia_search(const MACROBLOCK *x,
-                        MV *ref_mv,
-                        int search_param,
-                        int sad_per_bit,
-                        int do_init_search,
-                        int *cost_list,
-                        const vp9_variance_fn_ptr_t *vfp,
-                        int use_mvcost,
-                        const MV *center_mv,
-                        MV *best_mv) {
-  return vp10_bigdia_search(
+static int fast_dia_search(MACROBLOCK *x,
+                           MV *ref_mv,
+                           int search_param,
+                           int sad_per_bit,
+                           int do_init_search,
+                           int *cost_list,
+                           const vp10_variance_fn_ptr_t *vfp,
+                           int use_mvcost,
+                           const MV *center_mv) {
+  return bigdia_search(
       x, ref_mv, VPXMAX(MAX_MVSEARCH_STEPS - 2, search_param), sad_per_bit,
-      do_init_search, cost_list, vfp, use_mvcost, center_mv, best_mv);
+      do_init_search, cost_list, vfp, use_mvcost, center_mv);
 }
 
 #undef CHECK_BETTER
@@ -1517,7 +1482,7 @@
 static int exhuastive_mesh_search(const MACROBLOCK *x,
                                   MV *ref_mv, MV *best_mv,
                                   int range, int step, int sad_per_bit,
-                                  const vp9_variance_fn_ptr_t *fn_ptr,
+                                  const vp10_variance_fn_ptr_t *fn_ptr,
                                   const MV *center_mv) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
@@ -1600,11 +1565,11 @@
 }
 
 int vp10_diamond_search_sad_c(const MACROBLOCK *x,
-                             const search_site_config *cfg,
-                             MV *ref_mv, MV *best_mv, int search_param,
-                             int sad_per_bit, int *num00,
-                             const vp9_variance_fn_ptr_t *fn_ptr,
-                             const MV *center_mv) {
+                              const search_site_config *cfg,
+                              MV *ref_mv, MV *best_mv, int search_param,
+                              int sad_per_bit, int *num00,
+                              const vp10_variance_fn_ptr_t *fn_ptr,
+                              const MV *center_mv) {
   int i, j, step;
 
   const MACROBLOCKD *const xd = &x->e_mbd;
@@ -1820,10 +1785,10 @@
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
-  DECLARE_ALIGNED(16, int16_t, hbuf[128]);
-  DECLARE_ALIGNED(16, int16_t, vbuf[128]);
-  DECLARE_ALIGNED(16, int16_t, src_hbuf[64]);
-  DECLARE_ALIGNED(16, int16_t, src_vbuf[64]);
+  DECLARE_ALIGNED(16, int16_t, hbuf[2 * MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, int16_t, vbuf[2 * MAX_SB_SIZE]);
+  DECLARE_ALIGNED(16, int16_t, src_hbuf[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, int16_t, src_vbuf[MAX_SB_SQUARE]);
   int idx;
   const int bw = 4 << b_width_log2_lookup[bsize];
   const int bh = 4 << b_height_log2_lookup[bsize];
@@ -1953,12 +1918,12 @@
 /* do_refine: If last step (1-away) of n-step search doesn't pick the center
               point as the best match, we will do a final 1-away diamond
               refining search  */
-int vp10_full_pixel_diamond(const VP10_COMP *cpi, MACROBLOCK *x,
-                           MV *mvp_full, int step_param,
-                           int sadpb, int further_steps, int do_refine,
-                           int *cost_list,
-                           const vp9_variance_fn_ptr_t *fn_ptr,
-                           const MV *ref_mv, MV *dst_mv) {
+static int full_pixel_diamond(VP10_COMP *cpi, MACROBLOCK *x,
+                              MV *mvp_full, int step_param,
+                              int sadpb, int further_steps, int do_refine,
+                              int *cost_list,
+                              const vp10_variance_fn_ptr_t *fn_ptr,
+                              const MV *ref_mv) {
   MV temp_mv;
   int thissme, n, num00 = 0;
   int bestsme = cpi->diamond_search_sad(x, &cpi->ss_cfg, mvp_full, &temp_mv,
@@ -1966,7 +1931,7 @@
                                         fn_ptr, ref_mv);
   if (bestsme < INT_MAX)
     bestsme = vp10_get_mvpred_var(x, &temp_mv, ref_mv, fn_ptr, 1);
-  *dst_mv = temp_mv;
+  x->best_mv.as_mv = temp_mv;
 
   // If there won't be more n-step search, check to see if refining search is
   // needed.
@@ -1991,7 +1956,7 @@
 
       if (thissme < bestsme) {
         bestsme = thissme;
-        *dst_mv = temp_mv;
+        x->best_mv.as_mv = temp_mv;
       }
     }
   }
@@ -1999,20 +1964,20 @@
   // final 1-away diamond refining search
   if (do_refine) {
     const int search_range = 8;
-    MV best_mv = *dst_mv;
+    MV best_mv = x->best_mv.as_mv;
     thissme = vp10_refining_search_sad(x, &best_mv, sadpb, search_range,
                                        fn_ptr, ref_mv);
     if (thissme < INT_MAX)
       thissme = vp10_get_mvpred_var(x, &best_mv, ref_mv, fn_ptr, 1);
     if (thissme < bestsme) {
       bestsme = thissme;
-      *dst_mv = best_mv;
+      x->best_mv.as_mv = best_mv;
     }
   }
 
   // Return cost list.
   if (cost_list) {
-    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, dst_mv, cost_list);
+    calc_int_cost_list(x, ref_mv, sadpb, fn_ptr, &x->best_mv.as_mv, cost_list);
   }
   return bestsme;
 }
@@ -2023,8 +1988,9 @@
 // Runs a limited range exhaustive mesh search using a pattern set
 // according to the encode speed profile.
 static int full_pixel_exhaustive(VP10_COMP *cpi, MACROBLOCK *x,
-                                 MV *centre_mv_full, int sadpb,  int *cost_list,
-                                 const vp9_variance_fn_ptr_t *fn_ptr,
+                                 const MV *centre_mv_full, int sadpb,
+                                 int *cost_list,
+                                 const vp10_variance_fn_ptr_t *fn_ptr,
                                  const MV *ref_mv, MV *dst_mv) {
   const SPEED_FEATURES *const sf = &cpi->sf;
   MV temp_mv = {centre_mv_full->row, centre_mv_full->col};
@@ -2082,9 +2048,9 @@
 }
 
 int vp10_full_search_sad_c(const MACROBLOCK *x, const MV *ref_mv,
-                          int sad_per_bit, int distance,
-                          const vp9_variance_fn_ptr_t *fn_ptr,
-                          const MV *center_mv, MV *best_mv) {
+                           int sad_per_bit, int distance,
+                           const vp10_variance_fn_ptr_t *fn_ptr,
+                           const MV *center_mv, MV *best_mv) {
   int r, c;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
@@ -2115,9 +2081,9 @@
 }
 
 int vp10_full_search_sadx3(const MACROBLOCK *x, const MV *ref_mv,
-                          int sad_per_bit, int distance,
-                          const vp9_variance_fn_ptr_t *fn_ptr,
-                          const MV *center_mv, MV *best_mv) {
+                           int sad_per_bit, int distance,
+                           const vp10_variance_fn_ptr_t *fn_ptr,
+                           const MV *center_mv, MV *best_mv) {
   int r;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
@@ -2180,9 +2146,9 @@
 }
 
 int vp10_full_search_sadx8(const MACROBLOCK *x, const MV *ref_mv,
-                          int sad_per_bit, int distance,
-                          const vp9_variance_fn_ptr_t *fn_ptr,
-                          const MV *center_mv, MV *best_mv) {
+                           int sad_per_bit, int distance,
+                           const vp10_variance_fn_ptr_t *fn_ptr,
+                           const MV *center_mv, MV *best_mv) {
   int r;
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
@@ -2271,7 +2237,7 @@
 int vp10_refining_search_sad(const MACROBLOCK *x,
                             MV *ref_mv, int error_per_bit,
                             int search_range,
-                            const vp9_variance_fn_ptr_t *fn_ptr,
+                            const vp10_variance_fn_ptr_t *fn_ptr,
                             const MV *center_mv) {
   const MACROBLOCKD *const xd = &x->e_mbd;
   const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
@@ -2347,29 +2313,30 @@
 
 // This function is called when we do joint motion search in comp_inter_inter
 // mode.
-int vp10_refining_search_8p_c(const MACROBLOCK *x,
-                             MV *ref_mv, int error_per_bit,
-                             int search_range,
-                             const vp9_variance_fn_ptr_t *fn_ptr,
-                             const MV *center_mv,
-                             const uint8_t *second_pred) {
+int vp10_refining_search_8p_c(MACROBLOCK *x,
+                              int error_per_bit,
+                              int search_range,
+                              const vp10_variance_fn_ptr_t *fn_ptr,
+                              const MV *center_mv,
+                              const uint8_t *second_pred) {
   const MV neighbors[8] = {{-1, 0}, {0, -1}, {0, 1}, {1, 0},
                            {-1, -1}, {1, -1}, {-1, 1}, {1, 1}};
   const MACROBLOCKD *const xd = &x->e_mbd;
   const struct buf_2d *const what = &x->plane[0].src;
   const struct buf_2d *const in_what = &xd->plane[0].pre[0];
   const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
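+  // x->best_mv provides the starting point and is updated in place with the
+  // refined result.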
+  MV *best_mv = &x->best_mv.as_mv;
   unsigned int best_sad = fn_ptr->sdaf(what->buf, what->stride,
-      get_buf_from_mv(in_what, ref_mv), in_what->stride, second_pred) +
-      mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+      get_buf_from_mv(in_what, best_mv), in_what->stride, second_pred) +
+      mvsad_err_cost(x, best_mv, &fcenter_mv, error_per_bit);
   int i, j;
 
   for (i = 0; i < search_range; ++i) {
     int best_site = -1;
 
     for (j = 0; j < 8; ++j) {
-      const MV mv = {ref_mv->row + neighbors[j].row,
-                     ref_mv->col + neighbors[j].col};
+      const MV mv = {best_mv->row + neighbors[j].row,
+                     best_mv->col + neighbors[j].col};
 
       if (is_mv_in(x, &mv)) {
         unsigned int sad = fn_ptr->sdaf(what->buf, what->stride,
@@ -2387,6 +2354,579 @@
     if (best_site == -1) {
       break;
     } else {
+      best_mv->row += neighbors[best_site].row;
+      best_mv->col += neighbors[best_site].col;
+    }
+  }
+  return best_sad;
+}
+
+#define MIN_EX_SEARCH_LIMIT 128
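+// Gate for the follow-on exhaustive mesh search: allow it only while the
+// exhaustive-search count stays within max_exaustive_pct percent of all
+// motion searches this frame (floored at MIN_EX_SEARCH_LIMIT), and never on
+// frames used as an alt-ref source.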
+static int is_exhaustive_allowed(VP10_COMP *cpi, MACROBLOCK *x) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const int max_ex = VPXMAX(MIN_EX_SEARCH_LIMIT,
+      (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
+
+  return sf->allow_exhaustive_searches &&
+      (sf->exhaustive_searches_thresh < INT_MAX) &&
+      (*x->ex_search_count_ptr <= max_ex) &&
+      !cpi->rc.is_src_frame_alt_ref;
+}
+
+int vp10_full_pixel_search(VP10_COMP *cpi, MACROBLOCK *x,
+                           BLOCK_SIZE bsize, MV *mvp_full,
+                           int step_param, int error_per_bit,
+                           int *cost_list, const MV *ref_mv,
+                           int var_max, int rd) {
+  const SPEED_FEATURES *const sf = &cpi->sf;
+  const SEARCH_METHODS method = sf->mv.search_method;
+  vp10_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
+  int var = 0;
+
+  if (cost_list) {
+    cost_list[0] = INT_MAX;
+    cost_list[1] = INT_MAX;
+    cost_list[2] = INT_MAX;
+    cost_list[3] = INT_MAX;
+    cost_list[4] = INT_MAX;
+  }
+
+  // Keep track of number of searches (this frame in this thread).
+  ++(*x->m_search_count_ptr);
+
+  switch (method) {
+    case FAST_DIAMOND:
+      var = fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
+                            cost_list, fn_ptr, 1, ref_mv);
+      break;
+    case FAST_HEX:
+      var = fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
+                            cost_list, fn_ptr, 1, ref_mv);
+      break;
+    case HEX:
+      var = vp10_hex_search(x, mvp_full, step_param, error_per_bit, 1,
+                            cost_list, fn_ptr, 1, ref_mv);
+      break;
+    case SQUARE:
+      var = square_search(x, mvp_full, step_param, error_per_bit, 1,
+                          cost_list, fn_ptr, 1, ref_mv);
+      break;
+    case BIGDIA:
+      var = bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
+                          cost_list, fn_ptr, 1, ref_mv);
+      break;
+    case NSTEP:
+      var = full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
+                               MAX_MVSEARCH_STEPS - 1 - step_param,
+                               1, cost_list, fn_ptr, ref_mv);
+
+      // Should we allow a follow on exhaustive search?
+      if (is_exhaustive_allowed(cpi, x)) {
+        int64_t exhaustive_thr = sf->exhaustive_searches_thresh;
+        exhaustive_thr >>= 8 - (b_width_log2_lookup[bsize] +
+                                b_height_log2_lookup[bsize]);
+
+        // Threshold variance for an exhaustive full search.
+        if (var > exhaustive_thr) {
+          int var_ex;
+          MV tmp_mv_ex;
+          var_ex = full_pixel_exhaustive(cpi, x, &x->best_mv.as_mv,
+                                         error_per_bit, cost_list, fn_ptr,
+                                         ref_mv, &tmp_mv_ex);
+
+          if (var_ex < var) {
+            var = var_ex;
+            x->best_mv.as_mv = tmp_mv_ex;
+          }
+        }
+      }
+      break;
+    default:
+      assert(0 && "Invalid search method.");
+  }
+
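+  // The pattern searches above return a SAD-based cost; for RD, re-measure
+  // the chosen MV with the variance metric (NSTEP already does this inside
+  // full_pixel_diamond). The re-measure is skipped once var reaches var_max.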
+  if (method != NSTEP && rd && var < var_max)
+    var = vp10_get_mvpred_var(x, &x->best_mv.as_mv, ref_mv, fn_ptr, 1);
+
+  return var;
+}
+
+#if CONFIG_EXT_INTER
+/* returns subpixel variance error function */
+#define DIST(r, c) \
+    vfp->msvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
+              src_stride, mask, mask_stride, &sse)
+
+/* checks if (r, c) has better score than previous best */
+
+#define MVC(r, c)                                       \
+    (mvcost ?                                           \
+     ((mvjcost[((r) != rr) * 2 + ((c) != rc)] +         \
+       mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) * \
+      error_per_bit + 4096) >> 13 : 0)
+
+#define CHECK_BETTER(v, r, c) \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    thismse = (DIST(r, c));                                            \
+    if ((v = MVC(r, c) + thismse) < besterr) {                         \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {                                                             \
+    v = INT_MAX;                                                       \
+  }
+
+#undef CHECK_BETTER0
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+
+#undef CHECK_BETTER1
+#define CHECK_BETTER1(v, r, c) \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    thismse = upsampled_masked_pref_error(xd,                          \
+                                          mask, mask_stride,           \
+                                          vfp, z, src_stride,          \
+                                          upre(y, y_stride, r, c),     \
+                                          y_stride,                    \
+                                          w, h, &sse);    \
+    if ((v = MVC(r, c) + thismse) < besterr) {                         \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {                                                             \
+    v = INT_MAX;                                                       \
+  }
+
+int vp10_find_best_masked_sub_pixel_tree(const MACROBLOCK *x,
+                                         const uint8_t *mask, int mask_stride,
+                                         MV *bestmv, const MV *ref_mv,
+                                         int allow_hp,
+                                         int error_per_bit,
+                                         const vp10_variance_fn_ptr_t *vfp,
+                                         int forced_stop,
+                                         int iters_per_step,
+                                         int *mvjcost, int *mvcost[2],
+                                         int *distortion,
+                                         unsigned int *sse1, int is_second) {
+  const uint8_t *const z = x->plane[0].src.buf;
+  const int src_stride = x->plane[0].src.stride;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  unsigned int besterr = INT_MAX;
+  unsigned int sse;
+  int thismse;
+  unsigned int whichdir;
+  unsigned int halfiters = iters_per_step;
+  unsigned int quarteriters = iters_per_step;
+  unsigned int eighthiters = iters_per_step;
+
+  const int y_stride = xd->plane[0].pre[is_second].stride;
+  const int offset = bestmv->row * y_stride + bestmv->col;
+  const uint8_t *const y = xd->plane[0].pre[is_second].buf;
+
+  int rr = ref_mv->row;
+  int rc = ref_mv->col;
+  int br = bestmv->row * 8;
+  int bc = bestmv->col * 8;
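+  // Subpel positions are tracked in 1/8-pel units; hstep starts at half-pel
+  // (4/8) and is halved at each finer refinement level.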
+  int hstep = 4;
+  const int minc = VPXMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+  const int maxc = VPXMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+  const int minr = VPXMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+  const int maxr = VPXMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
+
+  int tr = br;
+  int tc = bc;
+
+  // central mv
+  bestmv->row *= 8;
+  bestmv->col *= 8;
+
+  // calculate central point error
+  besterr = vfp->mvf(y + offset, y_stride, z, src_stride, mask, mask_stride,
+                     sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+  // 1/2 pel
+  FIRST_LEVEL_CHECKS;
+  if (halfiters > 1) {
+    SECOND_LEVEL_CHECKS;
+  }
+  tr = br;
+  tc = bc;
+
+  // Note forced_stop: 0 - full, 1 - qtr only, 2 - half only
+  if (forced_stop != 2) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (quarteriters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+
+  if (allow_hp && vp10_use_mv_hp(ref_mv) && forced_stop == 0) {
+    hstep >>= 1;
+    FIRST_LEVEL_CHECKS;
+    if (eighthiters > 1) {
+      SECOND_LEVEL_CHECKS;
+    }
+    tr = br;
+    tc = bc;
+  }
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
+static unsigned int setup_masked_center_error(const uint8_t *mask,
+                                              int mask_stride,
+                                              const MV *bestmv,
+                                              const MV *ref_mv,
+                                              int error_per_bit,
+                                              const vp10_variance_fn_ptr_t *vfp,
+                                              const uint8_t *const src,
+                                              const int src_stride,
+                                              const uint8_t *const y,
+                                              int y_stride,
+                                              int offset,
+                                              int *mvjcost, int *mvcost[2],
+                                              unsigned int *sse1,
+                                              int *distortion) {
+  unsigned int besterr;
+  besterr = vfp->mvf(y + offset, y_stride, src, src_stride,
+                     mask, mask_stride, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  return besterr;
+}
+
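+// Prediction error against the 8x-upsampled reference: builds a w x h
+// prediction (highbitdepth-aware) and returns its masked variance vs. src.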
+static int upsampled_masked_pref_error(const MACROBLOCKD *xd,
+                                       const uint8_t *mask,
+                                       int mask_stride,
+                                       const vp10_variance_fn_ptr_t *vfp,
+                                       const uint8_t *const src,
+                                       const int src_stride,
+                                       const uint8_t *const y, int y_stride,
+                                       int w, int h, unsigned int *sse) {
+  unsigned int besterr;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+    vpx_highbd_upsampled_pred(pred16, w, h, y, y_stride);
+
+    besterr = vfp->mvf(CONVERT_TO_BYTEPTR(pred16), w, src, src_stride,
+                       mask, mask_stride, sse);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+#else
+    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+    (void) xd;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    vpx_upsampled_pred(pred, w, h, y, y_stride);
+
+    besterr = vfp->mvf(pred, w, src, src_stride,
+                       mask, mask_stride, sse);
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif
+  return besterr;
+}
+
+static unsigned int upsampled_setup_masked_center_error(
+    const MACROBLOCKD *xd,
+    const uint8_t *mask, int mask_stride,
+    const MV *bestmv, const MV *ref_mv,
+    int error_per_bit, const vp10_variance_fn_ptr_t *vfp,
+    const uint8_t *const src, const int src_stride,
+    const uint8_t *const y, int y_stride,
+    int w, int h, int offset, int *mvjcost, int *mvcost[2],
+    unsigned int *sse1, int *distortion) {
+  unsigned int besterr = upsampled_masked_pref_error(
+      xd, mask, mask_stride, vfp, src, src_stride,
+      y + offset, y_stride, w, h, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  return besterr;
+}
+
+int vp10_find_best_masked_sub_pixel_tree_up(VP10_COMP *cpi,
+                                            MACROBLOCK *x,
+                                            const uint8_t *mask,
+                                            int mask_stride,
+                                            int mi_row, int mi_col,
+                                            MV *bestmv, const MV *ref_mv,
+                                            int allow_hp,
+                                            int error_per_bit,
+                                            const vp10_variance_fn_ptr_t *vfp,
+                                            int forced_stop,
+                                            int iters_per_step,
+                                            int *mvjcost, int *mvcost[2],
+                                            int *distortion,
+                                            unsigned int *sse1,
+                                            int is_second,
+                                            int use_upsampled_ref) {
+  const uint8_t *const z = x->plane[0].src.buf;
+  const uint8_t *const src_address = z;
+  const int src_stride = x->plane[0].src.stride;
+  MACROBLOCKD *xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  unsigned int besterr = INT_MAX;
+  unsigned int sse;
+  unsigned int thismse;
+
+  int rr = ref_mv->row;
+  int rc = ref_mv->col;
+  int br = bestmv->row * 8;
+  int bc = bestmv->col * 8;
+  int hstep = 4;
+  int iter;
+  int round = 3 - forced_stop;
+  const int minc = VPXMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+  const int maxc = VPXMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+  const int minr = VPXMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+  const int maxr = VPXMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
+  int tr = br;
+  int tc = bc;
+  const MV *search_step = search_step_table;
+  int idx, best_idx = -1;
+  unsigned int cost_array[5];
+  int kr, kc;
+  const int w = 4 * num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  const int h = 4 * num_4x4_blocks_high_lookup[mbmi->sb_type];
+  int offset;
+  int y_stride;
+  const uint8_t *y;
+
+  const struct buf_2d backup_pred = pd->pre[is_second];
+  if (use_upsampled_ref) {
+    int ref = xd->mi[0]->mbmi.ref_frame[is_second];
+    const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
+    setup_pred_plane(&pd->pre[is_second], upsampled_ref->y_buffer,
+                     upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
+                     NULL, pd->subsampling_x, pd->subsampling_y);
+  }
+  y = pd->pre[is_second].buf;
+  y_stride = pd->pre[is_second].stride;
+  offset = bestmv->row * y_stride + bestmv->col;
+
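+  // round is the number of refinement levels to run (1/2, 1/4, then 1/8
+  // pel); drop the 1/8-pel level unless high-precision MVs are in use.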
+  if (!(allow_hp && vp10_use_mv_hp(ref_mv)))
+    if (round == 3)
+      round = 2;
+
+  bestmv->row *= 8;
+  bestmv->col *= 8;
+
+  // use_upsampled_ref can be 0 or 1
+  if (use_upsampled_ref)
+    besterr = upsampled_setup_masked_center_error(
+        xd, mask, mask_stride, bestmv, ref_mv, error_per_bit,
+        vfp, z, src_stride, y, y_stride,
+        w, h, (offset << 3),
+        mvjcost, mvcost, sse1, distortion);
+  else
+    besterr = setup_masked_center_error(
+        mask, mask_stride, bestmv, ref_mv, error_per_bit,
+        vfp, z, src_stride, y, y_stride,
+        offset, mvjcost, mvcost, sse1, distortion);
+
+  for (iter = 0; iter < round; ++iter) {
+    // Check vertical and horizontal sub-pixel positions.
+    for (idx = 0; idx < 4; ++idx) {
+      tr = br + search_step[idx].row;
+      tc = bc + search_step[idx].col;
+      if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+        MV this_mv = {tr, tc};
+
+        if (use_upsampled_ref) {
+          const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+          thismse = upsampled_masked_pref_error(xd,
+                                                mask, mask_stride,
+                                                vfp, src_address, src_stride,
+                                                pre_address, y_stride,
+                                                w, h, &sse);
+        } else {
+          const uint8_t *const pre_address = y + (tr >> 3) * y_stride +
+              (tc >> 3);
+          thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr),
+                              src_address, src_stride,
+                              mask, mask_stride, &sse);
+        }
+
+        cost_array[idx] = thismse +
+            mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+        if (cost_array[idx] < besterr) {
+          best_idx = idx;
+          besterr = cost_array[idx];
+          *distortion = thismse;
+          *sse1 = sse;
+        }
+      } else {
+        cost_array[idx] = INT_MAX;
+      }
+    }
+
+    // Check diagonal sub-pixel position
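+    // Pick the diagonal on the cheaper side of each axis, based on the four
+    // costs just computed.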
+    kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+    kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+    tc = bc + kc;
+    tr = br + kr;
+    if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+      MV this_mv = {tr, tc};
+
+      if (use_upsampled_ref) {
+        const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+        thismse = upsampled_masked_pref_error(xd,
+                                              mask, mask_stride,
+                                              vfp, src_address, src_stride,
+                                              pre_address, y_stride,
+                                              w, h, &sse);
+      } else {
+        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+
+        thismse = vfp->msvf(pre_address, y_stride, sp(tc), sp(tr),
+                            src_address, src_stride, mask, mask_stride, &sse);
+      }
+
+      cost_array[4] = thismse +
+          mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+      if (cost_array[4] < besterr) {
+        best_idx = 4;
+        besterr = cost_array[4];
+        *distortion = thismse;
+        *sse1 = sse;
+      }
+    } else {
+      cost_array[4] = INT_MAX;
+    }
+
+    if (best_idx < 4 && best_idx >= 0) {
+      br += search_step[best_idx].row;
+      bc += search_step[best_idx].col;
+    } else if (best_idx == 4) {
+      br = tr;
+      bc = tc;
+    }
+
+    if (iters_per_step > 1 && best_idx != -1) {
+      if (use_upsampled_ref) {
+        SECOND_LEVEL_CHECKS_BEST(1);
+      } else {
+        SECOND_LEVEL_CHECKS_BEST(0);
+      }
+    }
+
+    tr = br;
+    tc = bc;
+
+    search_step += 4;
+    hstep >>= 1;
+    best_idx = -1;
+  }
+
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if (use_upsampled_ref) {
+    pd->pre[is_second] = backup_pred;
+  }
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
+#undef DIST
+#undef MVC
+#undef CHECK_BETTER
+
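+// Masked variance at best_mv (full-pel), plus the MV rate cost relative to
+// center_mv when use_mvcost is set.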
+static int get_masked_mvpred_var(const MACROBLOCK *x,
+                                 const uint8_t *mask, int mask_stride,
+                                 const MV *best_mv, const MV *center_mv,
+                                 const vp10_variance_fn_ptr_t *vfp,
+                                 int use_mvcost, int is_second) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+  const MV mv = {best_mv->row * 8, best_mv->col * 8};
+  unsigned int unused;
+
+  return vfp->mvf(what->buf, what->stride,
+                  get_buf_from_mv(in_what, best_mv), in_what->stride,
+                  mask, mask_stride, &unused) +
+      (use_mvcost ?  mv_err_cost(&mv, center_mv, x->nmvjointcost,
+                                 x->mvcost, x->errorperbit) : 0);
+}
+
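+// Greedy 1-away refinement: repeatedly move to the best of the four
+// immediate neighbors (masked SAD plus MV cost) until no neighbor improves
+// or search_range iterations have run.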
+int masked_refining_search_sad(const MACROBLOCK *x,
+                               const uint8_t *mask, int mask_stride,
+                               MV *ref_mv, int error_per_bit,
+                               int search_range,
+                               const vp10_variance_fn_ptr_t *fn_ptr,
+                               const MV *center_mv, int is_second) {
+  const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  unsigned int best_sad = fn_ptr->msdf(what->buf, what->stride,
+                                       get_buf_from_mv(in_what, ref_mv),
+                                       in_what->stride, mask, mask_stride) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+  int i, j;
+
+  for (i = 0; i < search_range; i++) {
+    int best_site = -1;
+
+    for (j = 0; j < 4; j++) {
+      const MV mv = {ref_mv->row + neighbors[j].row,
+                     ref_mv->col + neighbors[j].col};
+      if (is_mv_in(x, &mv)) {
+        unsigned int sad = fn_ptr->msdf(what->buf, what->stride,
+            get_buf_from_mv(in_what, &mv), in_what->stride, mask, mask_stride);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
+            best_site = j;
+          }
+        }
+      }
+    }
+
+    if (best_site == -1) {
+      break;
+    } else {
       ref_mv->row += neighbors[best_site].row;
       ref_mv->col += neighbors[best_site].col;
     }
@@ -2394,94 +2934,688 @@
   return best_sad;
 }
 
-#define MIN_EX_SEARCH_LIMIT 128
-static int is_exhaustive_allowed(VP10_COMP *cpi, MACROBLOCK *x) {
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  const int max_ex = VPXMAX(MIN_EX_SEARCH_LIMIT,
-      (*x->m_search_count_ptr * sf->max_exaustive_pct) / 100);
+int masked_diamond_search_sad(const MACROBLOCK *x,
+                              const search_site_config *cfg,
+                              const uint8_t *mask, int mask_stride,
+                              MV *ref_mv, MV *best_mv,
+                              int search_param,
+                              int sad_per_bit, int *num00,
+                              const vp10_variance_fn_ptr_t *fn_ptr,
+                              const MV *center_mv, int is_second) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const what = &x->plane[0].src;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+  // search_param determines the length of the initial step and hence the
+  // number of iterations:
+  // 0 = initial step (MAX_FIRST_STEP) pel, 1 = (MAX_FIRST_STEP/2) pel,
+  // 2 = (MAX_FIRST_STEP/4) pel, etc.
+  const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
+  const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  const uint8_t *best_address, *in_what_ref;
+  int best_sad = INT_MAX;
+  int best_site = 0;
+  int last_site = 0;
+  int i, j, step;
 
-  return sf->allow_exhaustive_searches &&
-      (sf->exhaustive_searches_thresh < INT_MAX) &&
-      (*x->ex_search_count_ptr <= max_ex) &&
-      !cpi->rc.is_src_frame_alt_ref;
-}
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  in_what_ref = get_buf_from_mv(in_what, ref_mv);
+  best_address = in_what_ref;
+  *num00 = 0;
+  *best_mv = *ref_mv;
 
-int vp10_full_pixel_search(VP10_COMP *cpi, MACROBLOCK *x,
-                          BLOCK_SIZE bsize, MV *mvp_full,
-                          int step_param, int error_per_bit,
-                          int *cost_list,
-                          const MV *ref_mv, MV *tmp_mv,
-                          int var_max, int rd) {
-  const SPEED_FEATURES *const sf = &cpi->sf;
-  const SEARCH_METHODS method = sf->mv.search_method;
-  vp9_variance_fn_ptr_t *fn_ptr = &cpi->fn_ptr[bsize];
-  int var = 0;
-  if (cost_list) {
-    cost_list[0] = INT_MAX;
-    cost_list[1] = INT_MAX;
-    cost_list[2] = INT_MAX;
-    cost_list[3] = INT_MAX;
-    cost_list[4] = INT_MAX;
-  }
+  // Check the starting position
+  best_sad = fn_ptr->msdf(what->buf, what->stride,
+                          best_address, in_what->stride,
+                          mask, mask_stride) +
+      mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
 
-  // Keep track of number of searches (this frame in this thread).
-  ++(*x->m_search_count_ptr);
+  i = 1;
 
-  switch (method) {
-    case FAST_DIAMOND:
-      var = vp10_fast_dia_search(x, mvp_full, step_param, error_per_bit, 0,
-                                cost_list, fn_ptr, 1, ref_mv, tmp_mv);
-      break;
-    case FAST_HEX:
-      var = vp10_fast_hex_search(x, mvp_full, step_param, error_per_bit, 0,
-                                cost_list, fn_ptr, 1, ref_mv, tmp_mv);
-      break;
-    case HEX:
-      var = vp10_hex_search(x, mvp_full, step_param, error_per_bit, 1,
-                           cost_list, fn_ptr, 1, ref_mv, tmp_mv);
-      break;
-    case SQUARE:
-      var = vp10_square_search(x, mvp_full, step_param, error_per_bit, 1,
-                              cost_list, fn_ptr, 1, ref_mv, tmp_mv);
-      break;
-    case BIGDIA:
-      var = vp10_bigdia_search(x, mvp_full, step_param, error_per_bit, 1,
-                              cost_list, fn_ptr, 1, ref_mv, tmp_mv);
-      break;
-    case NSTEP:
-      var = vp10_full_pixel_diamond(cpi, x, mvp_full, step_param, error_per_bit,
-                                   MAX_MVSEARCH_STEPS - 1 - step_param,
-                                   1, cost_list, fn_ptr, ref_mv, tmp_mv);
-
-      // Should we allow a follow on exhaustive search?
-      if (is_exhaustive_allowed(cpi, x)) {
-        int64_t exhuastive_thr = sf->exhaustive_searches_thresh;
-        exhuastive_thr >>= 8 - (b_width_log2_lookup[bsize] +
-                                b_height_log2_lookup[bsize]);
-
-        // Threshold variance for an exhaustive full search.
-        if (var > exhuastive_thr) {
-            int var_ex;
-          MV tmp_mv_ex;
-          var_ex = full_pixel_exhaustive(cpi, x, tmp_mv,
-                                         error_per_bit, cost_list, fn_ptr,
-                                         ref_mv, &tmp_mv_ex);
-
-          if (var_ex < var) {
-            var = var_ex;
-            *tmp_mv = tmp_mv_ex;
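+  // Walk the precomputed search sites one scale at a time; each site's
+  // buffer offset is relative to the current best_address, so a candidate
+  // costs a single masked SAD.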
+  for (step = 0; step < tot_steps; step++) {
+    for (j = 0; j < cfg->searches_per_step; j++) {
+      const MV mv = {best_mv->row + ss[i].mv.row,
+                     best_mv->col + ss[i].mv.col};
+      if (is_mv_in(x, &mv)) {
+        int sad = fn_ptr->msdf(what->buf, what->stride,
+                               best_address + ss[i].offset, in_what->stride,
+                               mask, mask_stride);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
+            best_site = i;
           }
         }
       }
-      break;
 
-      break;
-    default:
-      assert(0 && "Invalid search method.");
+      i++;
+    }
+
+    if (best_site != last_site) {
+      best_mv->row += ss[best_site].mv.row;
+      best_mv->col += ss[best_site].mv.col;
+      best_address += ss[best_site].offset;
+      last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+      while (1) {
+        const MV this_mv = {best_mv->row + ss[best_site].mv.row,
+                            best_mv->col + ss[best_site].mv.col};
+        if (is_mv_in(x, &this_mv)) {
+          int sad = fn_ptr->msdf(what->buf, what->stride,
+                                 best_address + ss[best_site].offset,
+                                 in_what->stride, mask, mask_stride);
+          if (sad < best_sad) {
+            sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              best_mv->row += ss[best_site].mv.row;
+              best_mv->col += ss[best_site].mv.col;
+              best_address += ss[best_site].offset;
+              continue;
+            }
+          }
+        }
+        break;
+      }
+#endif
+    } else if (best_address == in_what_ref) {
+      (*num00)++;
+    }
+  }
+  return best_sad;
+}
+
+int vp10_masked_full_pixel_diamond(const VP10_COMP *cpi, MACROBLOCK *x,
+                                   const uint8_t *mask, int mask_stride,
+                                   MV *mvp_full, int step_param,
+                                   int sadpb, int further_steps, int do_refine,
+                                   const vp10_variance_fn_ptr_t *fn_ptr,
+                                   const MV *ref_mv, MV *dst_mv,
+                                   int is_second) {
+  MV temp_mv;
+  int thissme, n, num00 = 0;
+  int bestsme = masked_diamond_search_sad(x, &cpi->ss_cfg,
+                                          mask, mask_stride,
+                                          mvp_full, &temp_mv,
+                                          step_param, sadpb, &n,
+                                          fn_ptr, ref_mv, is_second);
+  if (bestsme < INT_MAX)
+    bestsme = get_masked_mvpred_var(x, mask, mask_stride, &temp_mv, ref_mv,
+                                    fn_ptr, 1, is_second);
+  *dst_mv = temp_mv;
+
+  // If there won't be more n-step search, check to see if refining search is
+  // needed.
+  if (n > further_steps)
+    do_refine = 0;
+
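+  // num00 counts upcoming step sizes for which the previous search already
+  // found the center best; those stages are skipped rather than re-searched.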
+  while (n < further_steps) {
+    ++n;
+
+    if (num00) {
+      num00--;
+    } else {
+      thissme = masked_diamond_search_sad(x, &cpi->ss_cfg,
+                                          mask, mask_stride,
+                                          mvp_full, &temp_mv,
+                                          step_param + n, sadpb, &num00,
+                                          fn_ptr, ref_mv, is_second);
+      if (thissme < INT_MAX)
+        thissme = get_masked_mvpred_var(x, mask, mask_stride,
+                                        &temp_mv, ref_mv, fn_ptr, 1,
+                                        is_second);
+
+      // check to see if refining search is needed.
+      if (num00 > further_steps - n)
+        do_refine = 0;
+
+      if (thissme < bestsme) {
+        bestsme = thissme;
+        *dst_mv = temp_mv;
+      }
+    }
   }
 
-  if (method != NSTEP && rd && var < var_max)
-    var = vp10_get_mvpred_var(x, tmp_mv, ref_mv, fn_ptr, 1);
-
-  return var;
+  // final 1-away diamond refining search
+  if (do_refine) {
+    const int search_range = 8;
+    MV best_mv = *dst_mv;
+    thissme = masked_refining_search_sad(x, mask, mask_stride,
+                                         &best_mv, sadpb, search_range,
+                                         fn_ptr, ref_mv, is_second);
+    if (thissme < INT_MAX)
+      thissme = get_masked_mvpred_var(x, mask, mask_stride,
+                                      &best_mv, ref_mv, fn_ptr, 1,
+                                      is_second);
+    if (thissme < bestsme) {
+      bestsme = thissme;
+      *dst_mv = best_mv;
+    }
+  }
+  return bestsme;
 }
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_OBMC
+/* returns subpixel variance error function */
+#define DIST(r, c)                                                     \
+  vfp->osvf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, mask, &sse)
+
+/* checks if (r, c) has better score than previous best */
+#define MVC(r, c)                                                      \
+  (mvcost ?                                                            \
+    ((mvjcost[((r) != rr) * 2 + ((c) != rc)] +                         \
+      mvcost[0][((r) - rr)] + mvcost[1][((c) - rc)]) *                 \
+      error_per_bit + 4096) >> 13 : 0)
+
+#define CHECK_BETTER(v, r, c) \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    thismse = (DIST(r, c));                                            \
+    if ((v = MVC(r, c) + thismse) < besterr) {                         \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {                                                             \
+    v = INT_MAX;                                                       \
+  }
+
+#undef CHECK_BETTER0
+#define CHECK_BETTER0(v, r, c) CHECK_BETTER(v, r, c)
+
+#undef CHECK_BETTER1
+#define CHECK_BETTER1(v, r, c) \
+  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {              \
+    thismse = upsampled_obmc_pref_error(xd,                            \
+                                        mask,                          \
+                                        vfp, z,                        \
+                                        upre(y, y_stride, r, c),       \
+                                        y_stride,                      \
+                                        w, h, &sse);                   \
+    if ((v = MVC(r, c) + thismse) < besterr) {                         \
+      besterr = v;                                                     \
+      br = r;                                                          \
+      bc = c;                                                          \
+      *distortion = thismse;                                           \
+      *sse1 = sse;                                                     \
+    }                                                                  \
+  } else {                                                             \
+    v = INT_MAX;                                                       \
+  }
+
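+// Seeds the subpel search: OBMC variance at the starting integer-pel MV,
+// plus its MV rate cost.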
+static unsigned int setup_obmc_center_error(const int32_t *mask,
+                                            const MV *bestmv,
+                                            const MV *ref_mv,
+                                            int error_per_bit,
+                                            const vp10_variance_fn_ptr_t *vfp,
+                                            const int32_t *const wsrc,
+                                            const uint8_t *const y,
+                                            int y_stride,
+                                            int offset,
+                                            int *mvjcost, int *mvcost[2],
+                                            unsigned int *sse1,
+                                            int *distortion) {
+  unsigned int besterr;
+  besterr = vfp->ovf(y + offset, y_stride, wsrc, mask, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  return besterr;
+}
+
+static int upsampled_obmc_pref_error(const MACROBLOCKD *xd,
+                                     const int32_t *mask,
+                                     const vp10_variance_fn_ptr_t *vfp,
+                                     const int32_t *const wsrc,
+                                     const uint8_t *const y, int y_stride,
+                                     int w, int h, unsigned int *sse) {
+  unsigned int besterr;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    DECLARE_ALIGNED(16, uint16_t, pred16[MAX_SB_SQUARE]);
+    vpx_highbd_upsampled_pred(pred16, w, h, y, y_stride);
+
+    besterr = vfp->ovf(CONVERT_TO_BYTEPTR(pred16), w, wsrc, mask, sse);
+  } else {
+    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+#else
+    DECLARE_ALIGNED(16, uint8_t, pred[MAX_SB_SQUARE]);
+    (void) xd;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    vpx_upsampled_pred(pred, w, h, y, y_stride);
+
+    besterr = vfp->ovf(pred, w, wsrc, mask, sse);
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif
+  return besterr;
+}
+
+static unsigned int upsampled_setup_obmc_center_error(
+                        const MACROBLOCKD *xd,
+                        const int32_t *mask,
+                        const MV *bestmv, const MV *ref_mv,
+                        int error_per_bit, const vp10_variance_fn_ptr_t *vfp,
+                        const int32_t *const wsrc,
+                        const uint8_t *const y, int y_stride,
+                        int w, int h, int offset, int *mvjcost, int *mvcost[2],
+                        unsigned int *sse1, int *distortion) {
+  unsigned int besterr = upsampled_obmc_pref_error(xd, mask, vfp, wsrc,
+                                                   y + offset, y_stride,
+                                                   w, h, sse1);
+  *distortion = besterr;
+  besterr += mv_err_cost(bestmv, ref_mv, mvjcost, mvcost, error_per_bit);
+  return besterr;
+}
+
+int vp10_find_best_obmc_sub_pixel_tree_up(VP10_COMP *cpi, MACROBLOCK *x,
+                                          const int32_t *wsrc,
+                                          const int32_t *mask,
+                                          int mi_row, int mi_col,
+                                          MV *bestmv, const MV *ref_mv,
+                                          int allow_hp, int error_per_bit,
+                                          const vp10_variance_fn_ptr_t *vfp,
+                                          int forced_stop, int iters_per_step,
+                                          int *mvjcost, int *mvcost[2],
+                                          int *distortion, unsigned int *sse1,
+                                          int is_second,
+                                          int use_upsampled_ref) {
+  const int32_t *const z = wsrc;
+  const int32_t *const src_address = z;
+  MACROBLOCKD *xd = &x->e_mbd;
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  unsigned int besterr = INT_MAX;
+  unsigned int sse;
+  unsigned int thismse;
+
+  int rr = ref_mv->row;
+  int rc = ref_mv->col;
+  int br = bestmv->row * 8;
+  int bc = bestmv->col * 8;
+  int hstep = 4;
+  int iter;
+  int round = 3 - forced_stop;
+  const int minc = VPXMAX(x->mv_col_min * 8, ref_mv->col - MV_MAX);
+  const int maxc = VPXMIN(x->mv_col_max * 8, ref_mv->col + MV_MAX);
+  const int minr = VPXMAX(x->mv_row_min * 8, ref_mv->row - MV_MAX);
+  const int maxr = VPXMIN(x->mv_row_max * 8, ref_mv->row + MV_MAX);
+  int tr = br;
+  int tc = bc;
+  const MV *search_step = search_step_table;
+  int idx, best_idx = -1;
+  unsigned int cost_array[5];
+  int kr, kc;
+  const int w = 4 * num_4x4_blocks_wide_lookup[mbmi->sb_type];
+  const int h = 4 * num_4x4_blocks_high_lookup[mbmi->sb_type];
+  int offset;
+  int y_stride;
+  const uint8_t *y;
+
+  const struct buf_2d backup_pred = pd->pre[is_second];
+  if (use_upsampled_ref) {
+    int ref = xd->mi[0]->mbmi.ref_frame[is_second];
+    const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
+    setup_pred_plane(&pd->pre[is_second], upsampled_ref->y_buffer,
+                     upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
+                     NULL, pd->subsampling_x, pd->subsampling_y);
+  }
+  y = pd->pre[is_second].buf;
+  y_stride = pd->pre[is_second].stride;
+  offset = bestmv->row * y_stride + bestmv->col;
+
+  if (!(allow_hp && vp10_use_mv_hp(ref_mv)))
+    if (round == 3)
+      round = 2;
+
+  bestmv->row *= 8;
+  bestmv->col *= 8;
+  // Compute the error at the starting position, using the up-sampled
+  // reference when one is available.
+  if (use_upsampled_ref)
+    besterr = upsampled_setup_obmc_center_error(
+        xd, mask, bestmv, ref_mv, error_per_bit,
+        vfp, z, y, y_stride,
+        w, h, (offset << 3),
+        mvjcost, mvcost, sse1, distortion);
+  else
+    besterr = setup_obmc_center_error(
+        mask, bestmv, ref_mv, error_per_bit,
+        vfp, z, y, y_stride,
+        offset, mvjcost, mvcost, sse1, distortion);
+
+  for (iter = 0; iter < round; ++iter) {
+    // Check vertical and horizontal sub-pixel positions.
+    for (idx = 0; idx < 4; ++idx) {
+      tr = br + search_step[idx].row;
+      tc = bc + search_step[idx].col;
+      if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+        MV this_mv = {tr, tc};
+
+        if (use_upsampled_ref) {
+          const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+          thismse = upsampled_obmc_pref_error(xd, mask,
+                                              vfp, src_address,
+                                              pre_address, y_stride,
+                                              w, h, &sse);
+        } else {
+          const uint8_t *const pre_address = y + (tr >> 3) * y_stride +
+              (tc >> 3);
+          thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr),
+                              src_address, mask, &sse);
+        }
+
+        cost_array[idx] = thismse +
+            mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+        if (cost_array[idx] < besterr) {
+          best_idx = idx;
+          besterr = cost_array[idx];
+          *distortion = thismse;
+          *sse1 = sse;
+        }
+      } else {
+        cost_array[idx] = INT_MAX;
+      }
+    }
+
+    // Check diagonal sub-pixel position
+    kc = (cost_array[0] <= cost_array[1] ? -hstep : hstep);
+    kr = (cost_array[2] <= cost_array[3] ? -hstep : hstep);
+
+    tc = bc + kc;
+    tr = br + kr;
+    if (tc >= minc && tc <= maxc && tr >= minr && tr <= maxr) {
+      MV this_mv = {tr, tc};
+
+      if (use_upsampled_ref) {
+        const uint8_t *const pre_address = y + tr * y_stride + tc;
+
+        thismse = upsampled_obmc_pref_error(xd, mask, vfp, src_address,
+                                            pre_address, y_stride,
+                                            w, h, &sse);
+      } else {
+        const uint8_t *const pre_address = y + (tr >> 3) * y_stride + (tc >> 3);
+
+        thismse = vfp->osvf(pre_address, y_stride, sp(tc), sp(tr),
+                            src_address, mask, &sse);
+      }
+
+      cost_array[4] = thismse +
+          mv_err_cost(&this_mv, ref_mv, mvjcost, mvcost, error_per_bit);
+
+      if (cost_array[4] < besterr) {
+        best_idx = 4;
+        besterr = cost_array[4];
+        *distortion = thismse;
+        *sse1 = sse;
+      }
+    } else {
+      cost_array[4] = INT_MAX;
+    }
+
+    if (best_idx < 4 && best_idx >= 0) {
+      br += search_step[best_idx].row;
+      bc += search_step[best_idx].col;
+    } else if (best_idx == 4) {
+      br = tr;
+      bc = tc;
+    }
+
+    if (iters_per_step > 1 && best_idx != -1) {
+      if (use_upsampled_ref) {
+        SECOND_LEVEL_CHECKS_BEST(1);
+      } else {
+        SECOND_LEVEL_CHECKS_BEST(0);
+      }
+    }
+
+    tr = br;
+    tc = bc;
+
+    search_step += 4;
+    hstep >>= 1;
+    best_idx = -1;
+  }
+
+  // These lines ensure static analysis doesn't warn that
+  // tr and tc aren't used after the above point.
+  (void) tr;
+  (void) tc;
+
+  bestmv->row = br;
+  bestmv->col = bc;
+
+  if (use_upsampled_ref) {
+    pd->pre[is_second] = backup_pred;
+  }
+
+  if ((abs(bestmv->col - ref_mv->col) > (MAX_FULL_PEL_VAL << 3)) ||
+      (abs(bestmv->row - ref_mv->row) > (MAX_FULL_PEL_VAL << 3)))
+    return INT_MAX;
+
+  return besterr;
+}
+
+#undef DIST
+#undef MVC
+#undef CHECK_BETTER
+
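+// Returns the OBMC variance of the prediction at best_mv, plus (optionally)
+// the rate cost of coding best_mv relative to center_mv.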
+static int get_obmc_mvpred_var(const MACROBLOCK *x,
+                               const int32_t *wsrc,
+                               const int32_t *mask,
+                               const MV *best_mv, const MV *center_mv,
+                               const vp10_variance_fn_ptr_t *vfp,
+                               int use_mvcost, int is_second) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+  const MV mv = {best_mv->row * 8, best_mv->col * 8};
+  unsigned int unused;
+
+  return vfp->ovf(get_buf_from_mv(in_what, best_mv), in_what->stride,
+                  wsrc, mask, &unused) +
+         (use_mvcost ?  mv_err_cost(&mv, center_mv, x->nmvjointcost,
+                                    x->mvcost, x->errorperbit) : 0);
+}
+
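+// Greedy one-pel refinement: repeatedly move to whichever of the four
+// immediate neighbours has the lowest OBMC SAD + MV cost, stopping when no
+// neighbour improves or search_range iterations have been used.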
+int obmc_refining_search_sad(const MACROBLOCK *x,
+                             const int32_t *wsrc,
+                             const int32_t *mask,
+                             MV *ref_mv, int error_per_bit,
+                             int search_range,
+                             const vp10_variance_fn_ptr_t *fn_ptr,
+                             const MV *center_mv, int is_second) {
+  const MV neighbors[4] = {{ -1, 0}, {0, -1}, {0, 1}, {1, 0}};
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  unsigned int best_sad =
+      fn_ptr->osdf(get_buf_from_mv(in_what, ref_mv), in_what->stride,
+                   wsrc, mask) +
+      mvsad_err_cost(x, ref_mv, &fcenter_mv, error_per_bit);
+  int i, j;
+
+  for (i = 0; i < search_range; i++) {
+    int best_site = -1;
+
+    for (j = 0; j < 4; j++) {
+      const MV mv = {ref_mv->row + neighbors[j].row,
+                     ref_mv->col + neighbors[j].col};
+      if (is_mv_in(x, &mv)) {
+        unsigned int sad = fn_ptr->osdf(get_buf_from_mv(in_what, &mv),
+                                        in_what->stride, wsrc, mask);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(x, &mv, &fcenter_mv, error_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
+            best_site = j;
+          }
+        }
+      }
+    }
+
+    if (best_site == -1) {
+      break;
+    } else {
+      ref_mv->row += neighbors[best_site].row;
+      ref_mv->col += neighbors[best_site].col;
+    }
+  }
+  return best_sad;
+}
+
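+// Diamond search on the OBMC SAD, walking the precomputed search-site
+// pattern with a step that shrinks each stage; *num00 counts the stages in
+// which the search never left the starting position.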
+int obmc_diamond_search_sad(const MACROBLOCK *x,
+                            const search_site_config *cfg,
+                            const int32_t *wsrc, const int32_t *mask,
+                            MV *ref_mv, MV *best_mv,
+                            int search_param,
+                            int sad_per_bit, int *num00,
+                            const vp10_variance_fn_ptr_t *fn_ptr,
+                            const MV *center_mv, int is_second) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const in_what = &xd->plane[0].pre[is_second];
+  // search_param determines the length of the initial step and hence the
+  // number of iterations:
+  // 0 = initial step (MAX_FIRST_STEP) pel
+  // 1 = (MAX_FIRST_STEP/2) pel, 2 = (MAX_FIRST_STEP/4) pel, etc.
+  const search_site *const ss = &cfg->ss[search_param * cfg->searches_per_step];
+  const int tot_steps = (cfg->ss_count / cfg->searches_per_step) - search_param;
+  const MV fcenter_mv = {center_mv->row >> 3, center_mv->col >> 3};
+  const uint8_t *best_address, *in_what_ref;
+  int best_sad = INT_MAX;
+  int best_site = 0;
+  int last_site = 0;
+  int i, j, step;
+
+  clamp_mv(ref_mv, x->mv_col_min, x->mv_col_max, x->mv_row_min, x->mv_row_max);
+  in_what_ref = in_what->buf + ref_mv->row * in_what->stride + ref_mv->col;
+  best_address = in_what_ref;
+  *num00 = 0;
+  *best_mv = *ref_mv;
+
+  // Check the starting position
+  best_sad = fn_ptr->osdf(best_address, in_what->stride, wsrc, mask) +
+             mvsad_err_cost(x, best_mv, &fcenter_mv, sad_per_bit);
+
+  i = 1;
+
+  for (step = 0; step < tot_steps; step++) {
+    for (j = 0; j < cfg->searches_per_step; j++) {
+      const MV mv = {best_mv->row + ss[i].mv.row,
+                     best_mv->col + ss[i].mv.col};
+      if (is_mv_in(x, &mv)) {
+        int sad = fn_ptr->osdf(best_address + ss[i].offset, in_what->stride,
+                               wsrc, mask);
+        if (sad < best_sad) {
+          sad += mvsad_err_cost(x, &mv, &fcenter_mv, sad_per_bit);
+          if (sad < best_sad) {
+            best_sad = sad;
+            best_site = i;
+          }
+        }
+      }
+
+      i++;
+    }
+
+    if (best_site != last_site) {
+      best_mv->row += ss[best_site].mv.row;
+      best_mv->col += ss[best_site].mv.col;
+      best_address += ss[best_site].offset;
+      last_site = best_site;
+#if defined(NEW_DIAMOND_SEARCH)
+      while (1) {
+        const MV this_mv = {best_mv->row + ss[best_site].mv.row,
+                            best_mv->col + ss[best_site].mv.col};
+        if (is_mv_in(x, &this_mv)) {
+          int sad = fn_ptr->osdf(best_address + ss[best_site].offset,
+                                 in_what->stride, wsrc, mask);
+          if (sad < best_sad) {
+            sad += mvsad_err_cost(x, &this_mv, &fcenter_mv, sad_per_bit);
+            if (sad < best_sad) {
+              best_sad = sad;
+              best_mv->row += ss[best_site].mv.row;
+              best_mv->col += ss[best_site].mv.col;
+              best_address += ss[best_site].offset;
+              continue;
+            }
+          }
+        }
+        break;
+      }
+#endif
+    } else if (best_address == in_what_ref) {
+      (*num00)++;
+    }
+  }
+  return best_sad;
+}
+
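+// Full-pel OBMC motion search: an initial diamond search followed by further
+// diamond passes starting at progressively smaller steps (skipping passes
+// num00 flagged as redundant) and, optionally, a final one-away refining
+// search around the best MV found.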
+int vp10_obmc_full_pixel_diamond(const VP10_COMP *cpi, MACROBLOCK *x,
+                                 const int32_t *wsrc, const int32_t *mask,
+                                 MV *mvp_full, int step_param,
+                                 int sadpb, int further_steps, int do_refine,
+                                 const vp10_variance_fn_ptr_t *fn_ptr,
+                                 const MV *ref_mv, MV *dst_mv,
+                                 int is_second) {
+  MV temp_mv;
+  int thissme, n, num00 = 0;
+  int bestsme = obmc_diamond_search_sad(x, &cpi->ss_cfg,
+                                        wsrc, mask,
+                                        mvp_full, &temp_mv,
+                                        step_param, sadpb, &n,
+                                        fn_ptr, ref_mv, is_second);
+  if (bestsme < INT_MAX)
+    bestsme = get_obmc_mvpred_var(x, wsrc, mask,
+                                  &temp_mv, ref_mv, fn_ptr, 1, is_second);
+  *dst_mv = temp_mv;
+
+  // If there won't be more n-step search, check to see if refining search is
+  // needed.
+  if (n > further_steps)
+    do_refine = 0;
+
+  while (n < further_steps) {
+    ++n;
+
+    if (num00) {
+      num00--;
+    } else {
+      thissme = obmc_diamond_search_sad(x, &cpi->ss_cfg,
+                                        wsrc, mask,
+                                        mvp_full, &temp_mv,
+                                        step_param + n, sadpb, &num00,
+                                        fn_ptr, ref_mv, is_second);
+      if (thissme < INT_MAX)
+        thissme = get_obmc_mvpred_var(x, wsrc, mask,
+                                      &temp_mv, ref_mv, fn_ptr, 1, is_second);
+
+      // check to see if refining search is needed.
+      if (num00 > further_steps - n)
+        do_refine = 0;
+
+      if (thissme < bestsme) {
+        bestsme = thissme;
+        *dst_mv = temp_mv;
+      }
+    }
+  }
+
+  // final 1-away diamond refining search
+  if (do_refine) {
+    const int search_range = 8;
+    MV best_mv = *dst_mv;
+    thissme = obmc_refining_search_sad(x, wsrc, mask,
+                                       &best_mv, sadpb, search_range,
+                                       fn_ptr, ref_mv, is_second);
+    if (thissme < INT_MAX)
+      thissme = get_obmc_mvpred_var(x, wsrc, mask,
+                                    &best_mv, ref_mv, fn_ptr, 1, is_second);
+    if (thissme < bestsme) {
+      bestsme = thissme;
+      *dst_mv = best_mv;
+    }
+  }
+  return bestsme;
+}
+#endif  // CONFIG_OBMC
diff --git a/vp10/encoder/mcomp.h b/vp10/encoder/mcomp.h
index 9d1ab2a..f97f6c7 100644
--- a/vp10/encoder/mcomp.h
+++ b/vp10/encoder/mcomp.h
@@ -53,12 +53,12 @@
 // Utility to compute variance + MV rate cost for a given MV
 int vp10_get_mvpred_var(const MACROBLOCK *x,
                        const MV *best_mv, const MV *center_mv,
-                       const vp9_variance_fn_ptr_t *vfp,
+                       const vp10_variance_fn_ptr_t *vfp,
                        int use_mvcost);
 int vp10_get_mvpred_av_var(const MACROBLOCK *x,
                           const MV *best_mv, const MV *center_mv,
                           const uint8_t *second_pred,
-                          const vp9_variance_fn_ptr_t *vfp,
+                          const vp10_variance_fn_ptr_t *vfp,
                           int use_mvcost);
 
 struct VP10_COMP;
@@ -69,7 +69,7 @@
 int vp10_refining_search_sad(const struct macroblock *x,
                             struct mv *ref_mv,
                             int sad_per_bit, int distance,
-                            const struct vp9_variance_vtable *fn_ptr,
+                            const vp10_variance_fn_ptr_t *fn_ptr,
                             const struct mv *center_mv);
 
 // Runs sequence of diamond searches in smaller steps for RD.
@@ -77,46 +77,39 @@
                            MV *mvp_full, int step_param,
                            int sadpb, int further_steps, int do_refine,
                            int *cost_list,
-                           const vp9_variance_fn_ptr_t *fn_ptr,
+                           const vp10_variance_fn_ptr_t *fn_ptr,
                            const MV *ref_mv, MV *dst_mv);
 
 // Perform integral projection based motion estimation.
 unsigned int vp10_int_pro_motion_estimation(const struct VP10_COMP *cpi,
-                                           MACROBLOCK *x,
-                                           BLOCK_SIZE bsize,
-                                           int mi_row, int mi_col);
+                                            MACROBLOCK *x,
+                                            BLOCK_SIZE bsize,
+                                            int mi_row, int mi_col);
 
-typedef int (integer_mv_pattern_search_fn) (
-    const MACROBLOCK *x,
-    MV *ref_mv,
-    int search_param,
-    int error_per_bit,
-    int do_init_search,
-    int *cost_list,
-    const vp9_variance_fn_ptr_t *vf,
-    int use_mvcost,
-    const MV *center_mv,
-    MV *best_mv);
 
-integer_mv_pattern_search_fn vp10_hex_search;
-integer_mv_pattern_search_fn vp10_bigdia_search;
-integer_mv_pattern_search_fn vp10_square_search;
-integer_mv_pattern_search_fn vp10_fast_hex_search;
-integer_mv_pattern_search_fn vp10_fast_dia_search;
+int vp10_hex_search(MACROBLOCK *x,
+                    MV *start_mv,
+                    int search_param,
+                    int sad_per_bit,
+                    int do_init_search,
+                    int *cost_list,
+                    const vp10_variance_fn_ptr_t *vfp,
+                    int use_mvcost,
+                    const MV *center_mv);
 
 typedef int (fractional_mv_step_fp) (
-    const MACROBLOCK *x,
-    MV *bestmv, const MV *ref_mv,
+    MACROBLOCK *x,
+    const MV *ref_mv,
     int allow_hp,
     int error_per_bit,
-    const vp9_variance_fn_ptr_t *vfp,
+    const vp10_variance_fn_ptr_t *vfp,
     int forced_stop,  // 0 - full, 1 - qtr only, 2 - half only
     int iters_per_step,
     int *cost_list,
     int *mvjcost, int *mvcost[2],
     int *distortion, unsigned int *sse1,
     const uint8_t *second_pred,
-    int w, int h);
+    int w, int h, int use_upsampled_ref);
 
 extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree;
 extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree_pruned;
@@ -124,40 +117,93 @@
 extern fractional_mv_step_fp vp10_find_best_sub_pixel_tree_pruned_evenmore;
 
 typedef int (*vp10_full_search_fn_t)(const MACROBLOCK *x,
-                                    const MV *ref_mv, int sad_per_bit,
-                                    int distance,
-                                    const vp9_variance_fn_ptr_t *fn_ptr,
-                                    const MV *center_mv, MV *best_mv);
-
-typedef int (*vp10_refining_search_fn_t)(const MACROBLOCK *x,
-                                        MV *ref_mv, int sad_per_bit,
-                                        int distance,
-                                        const vp9_variance_fn_ptr_t *fn_ptr,
-                                        const MV *center_mv);
+                                     const MV *ref_mv, int sad_per_bit,
+                                     int distance,
+                                     const vp10_variance_fn_ptr_t *fn_ptr,
+                                     const MV *center_mv, MV *best_mv);
 
 typedef int (*vp10_diamond_search_fn_t)(const MACROBLOCK *x,
-                                       const search_site_config *cfg,
-                                       MV *ref_mv, MV *best_mv,
-                                       int search_param, int sad_per_bit,
-                                       int *num00,
-                                       const vp9_variance_fn_ptr_t *fn_ptr,
-                                       const MV *center_mv);
+                                        const search_site_config *cfg,
+                                        MV *ref_mv, MV *best_mv,
+                                        int search_param, int sad_per_bit,
+                                        int *num00,
+                                        const vp10_variance_fn_ptr_t *fn_ptr,
+                                        const MV *center_mv);
 
-int vp10_refining_search_8p_c(const MACROBLOCK *x,
-                             MV *ref_mv, int error_per_bit,
-                             int search_range,
-                             const vp9_variance_fn_ptr_t *fn_ptr,
-                             const MV *center_mv, const uint8_t *second_pred);
+int vp10_refining_search_8p_c(MACROBLOCK *x,
+                              int error_per_bit,
+                              int search_range,
+                              const vp10_variance_fn_ptr_t *fn_ptr,
+                              const MV *center_mv, const uint8_t *second_pred);
 
 struct VP10_COMP;
 
 int vp10_full_pixel_search(struct VP10_COMP *cpi, MACROBLOCK *x,
-                          BLOCK_SIZE bsize, MV *mvp_full,
-                          int step_param, int error_per_bit,
-                          int *cost_list,
-                          const MV *ref_mv, MV *tmp_mv,
-                          int var_max, int rd);
+                           BLOCK_SIZE bsize, MV *mvp_full,
+                           int step_param, int error_per_bit,
+                           int *cost_list, const MV *ref_mv,
+                           int var_max, int rd);
 
+#if CONFIG_EXT_INTER
+int vp10_find_best_masked_sub_pixel_tree(const MACROBLOCK *x,
+                                         const uint8_t *mask, int mask_stride,
+                                         MV *bestmv, const MV *ref_mv,
+                                         int allow_hp,
+                                         int error_per_bit,
+                                         const vp10_variance_fn_ptr_t *vfp,
+                                         int forced_stop,
+                                         int iters_per_step,
+                                         int *mvjcost, int *mvcost[2],
+                                         int *distortion,
+                                         unsigned int *sse1,
+                                         int is_second);
+int vp10_find_best_masked_sub_pixel_tree_up(struct VP10_COMP *cpi,
+                                            MACROBLOCK *x,
+                                            const uint8_t *mask,
+                                            int mask_stride,
+                                            int mi_row, int mi_col,
+                                            MV *bestmv, const MV *ref_mv,
+                                            int allow_hp,
+                                            int error_per_bit,
+                                            const vp10_variance_fn_ptr_t *vfp,
+                                            int forced_stop,
+                                            int iters_per_step,
+                                            int *mvjcost, int *mvcost[2],
+                                            int *distortion,
+                                            unsigned int *sse1,
+                                            int is_second,
+                                            int use_upsampled_ref);
+int vp10_masked_full_pixel_diamond(const struct VP10_COMP *cpi, MACROBLOCK *x,
+                                   const uint8_t *mask, int mask_stride,
+                                   MV *mvp_full, int step_param,
+                                   int sadpb, int further_steps, int do_refine,
+                                   const vp10_variance_fn_ptr_t *fn_ptr,
+                                   const MV *ref_mv, MV *dst_mv,
+                                   int is_second);
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_OBMC
+int vp10_obmc_full_pixel_diamond(const struct VP10_COMP *cpi, MACROBLOCK *x,
+                                 const int32_t *wsrc,
+                                 const int32_t *mask,
+                                 MV *mvp_full, int step_param,
+                                 int sadpb, int further_steps, int do_refine,
+                                 const vp10_variance_fn_ptr_t *fn_ptr,
+                                 const MV *ref_mv, MV *dst_mv,
+                                 int is_second);
+int vp10_find_best_obmc_sub_pixel_tree_up(struct VP10_COMP *cpi, MACROBLOCK *x,
+                                          const int32_t *wsrc,
+                                          const int32_t *mask,
+                                          int mi_row, int mi_col,
+                                          MV *bestmv, const MV *ref_mv,
+                                          int allow_hp, int error_per_bit,
+                                          const vp10_variance_fn_ptr_t *vfp,
+                                          int forced_stop, int iters_per_step,
+                                          int *mvjcost, int *mvcost[2],
+                                          int *distortion, unsigned int *sse1,
+                                          int is_second,
+                                          int use_upsampled_ref);
+#endif  // CONFIG_OBMC
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/encoder/palette.c b/vp10/encoder/palette.c
new file mode 100644
index 0000000..cbc3582
--- /dev/null
+++ b/vp10/encoder/palette.c
@@ -0,0 +1,195 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include "vp10/encoder/palette.h"
+
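+// Squared Euclidean distance between a data point and a centroid; the
+// centroid is rounded per component to mirror the integer palette values it
+// will eventually be stored as.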
+static float calc_dist(const float *p1, const float *p2, int dim) {
+  float dist = 0;
+  int i = 0;
+
+  for (i = 0; i < dim; ++i) {
+    float diff = p1[i] - roundf(p2[i]);
+    dist += diff * diff;
+  }
+  return dist;
+}
+
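+// Assigns each of the n dim-dimensional points to its nearest centroid and
+// records the chosen centroid index per point.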
+void vp10_calc_indices(const float *data, const float *centroids,
+                       uint8_t *indices, int n, int k, int dim) {
+  int i, j;
+  float min_dist, this_dist;
+
+  for (i = 0; i < n; ++i) {
+    min_dist = calc_dist(data + i * dim, centroids, dim);
+    indices[i] = 0;
+    for (j = 1; j < k; ++j) {
+      this_dist = calc_dist(data + i * dim, centroids + j * dim, dim);
+      if (this_dist < min_dist) {
+        min_dist = this_dist;
+        indices[i] = j;
+      }
+    }
+  }
+}
+
+// Generate a random number in the range [0, 32768).
+static unsigned int lcg_rand16(unsigned int *state) {
+  *state = *state * 1103515245 + 12345;
+  return *state / 65536 % 32768;
+}
+
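+// Recomputes each centroid as the mean of the points assigned to it; an
+// empty cluster is re-seeded from a pseudo-randomly chosen data point.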
+static void calc_centroids(const float *data, float *centroids,
+                           const uint8_t *indices, int n, int k, int dim) {
+  int i, j, index;
+  int count[PALETTE_MAX_SIZE];
+  unsigned int rand_state = (unsigned int)data[0];
+
+  assert(n <= 32768);
+
+  memset(count, 0, sizeof(count[0]) * k);
+  memset(centroids, 0, sizeof(centroids[0]) * k * dim);
+
+  for (i = 0; i < n; ++i) {
+    index = indices[i];
+    assert(index < k);
+    ++count[index];
+    for (j = 0; j < dim; ++j) {
+      centroids[index * dim + j] += data[i * dim + j];
+    }
+  }
+
+  for (i = 0; i < k; ++i) {
+    if (count[i] == 0) {
+      memcpy(centroids + i * dim, data + (lcg_rand16(&rand_state) % n) * dim,
+             sizeof(centroids[0]) * dim);
+    } else {
+      const float norm = 1.0f / count[i];
+      for (j = 0; j < dim; ++j)
+        centroids[i * dim + j] *= norm;
+    }
+  }
+}
+
+static float calc_total_dist(const float *data, const float *centroids,
+                             const uint8_t *indices, int n, int k, int dim) {
+  float dist = 0;
+  int i;
+  (void) k;
+
+  for (i = 0; i < n; ++i)
+    dist += calc_dist(data + i * dim, centroids + indices[i] * dim, dim);
+
+  return dist;
+}
+
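+// k-means (Lloyd's algorithm): alternates assignment and centroid update for
+// at most max_itr iterations, rolls the last step back if total distortion
+// increased, and stops early once the centroids converge.  Returns the
+// number of iterations run.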
+int vp10_k_means(const float *data, float *centroids, uint8_t *indices,
+                 uint8_t *pre_indices, int n, int k, int dim, int max_itr) {
+  int i = 0;
+  float pre_dist, this_dist;
+  float pre_centroids[2 * PALETTE_MAX_SIZE];
+
+  vp10_calc_indices(data, centroids, indices, n, k, dim);
+  pre_dist = calc_total_dist(data, centroids, indices, n, k, dim);
+  memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * dim);
+  memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
+  while (i < max_itr) {
+    calc_centroids(data, centroids, indices, n, k, dim);
+    vp10_calc_indices(data, centroids, indices, n, k, dim);
+    this_dist = calc_total_dist(data, centroids, indices, n, k, dim);
+
+    if (this_dist > pre_dist) {
+      memcpy(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim);
+      memcpy(indices, pre_indices, sizeof(pre_indices[0]) * n);
+      break;
+    }
+    if (!memcmp(centroids, pre_centroids, sizeof(pre_centroids[0]) * k * dim))
+      break;
+
+    memcpy(pre_centroids, centroids, sizeof(pre_centroids[0]) * k * dim);
+    memcpy(pre_indices, indices, sizeof(pre_indices[0]) * n);
+    pre_dist = this_dist;
+    ++i;
+  }
+
+  return i;
+}
+
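+// In-place ascending insertion sort of n floats.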
+void vp10_insertion_sort(float *data, int n) {
+  int i, j, k;
+  float val;
+
+  if (n <= 1)
+    return;
+
+  for (i = 1; i < n; ++i) {
+    val = data[i];
+    j = 0;
+    while (j < i && val > data[j])
+      ++j;
+
+    if (j == i)
+      continue;
+
+    for (k = i; k > j; --k)
+      data[k] = data[k - 1];
+    data[j] = val;
+  }
+}
+
+int vp10_count_colors(const uint8_t *src, int stride, int rows, int cols) {
+  int n = 0, r, c, i, val_count[256];
+  uint8_t val;
+  memset(val_count, 0, sizeof(val_count));
+
+  for (r = 0; r < rows; ++r) {
+    for (c = 0; c < cols; ++c) {
+      val = src[r * stride + c];
+      ++val_count[val];
+    }
+  }
+
+  for (i = 0; i < 256; ++i) {
+    if (val_count[i]) {
+      ++n;
+    }
+  }
+
+  return n;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int vp10_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+                             int cols, int bit_depth) {
+  int n = 0, r, c, i;
+  uint16_t val;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  int val_count[1 << 12];
+
+  assert(bit_depth <= 12);
+  memset(val_count, 0, (1 << 12) * sizeof(val_count[0]));
+  for (r = 0; r < rows; ++r) {
+    for (c = 0; c < cols; ++c) {
+      val = src[r * stride + c];
+      ++val_count[val];
+    }
+  }
+
+  for (i = 0; i < (1 << bit_depth); ++i) {
+    if (val_count[i]) {
+      ++n;
+    }
+  }
+
+  return n;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vp10/encoder/palette.h b/vp10/encoder/palette.h
new file mode 100644
index 0000000..40d9ef9
--- /dev/null
+++ b/vp10/encoder/palette.h
@@ -0,0 +1,35 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_PALETTE_H_
+#define VP10_ENCODER_PALETTE_H_
+
+#include "vp10/common/blockd.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+void vp10_insertion_sort(float *data, int n);
+void vp10_calc_indices(const float *data, const float *centroids,
+                       uint8_t *indices, int n, int k, int dim);
+int vp10_k_means(const float *data, float *centroids, uint8_t *indices,
+                 uint8_t *pre_indices, int n, int k, int dim, int max_itr);
+int vp10_count_colors(const uint8_t *src, int stride, int rows, int cols);
+#if CONFIG_VP9_HIGHBITDEPTH
+int vp10_count_colors_highbd(const uint8_t *src8, int stride, int rows,
+                             int cols, int bit_depth);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_PALETTE_H_
diff --git a/vp10/encoder/picklpf.c b/vp10/encoder/picklpf.c
index 045e03d..9e0e72a 100644
--- a/vp10/encoder/picklpf.c
+++ b/vp10/encoder/picklpf.c
@@ -13,6 +13,7 @@
 
 #include "./vpx_scale_rtcd.h"
 
+#include "vpx_dsp/psnr.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
@@ -25,7 +26,7 @@
 #include "vp10/encoder/picklpf.h"
 #include "vp10/encoder/quantize.h"
 
-static int get_max_filter_level(const VP10_COMP *cpi) {
+int vp10_get_max_filter_level(const VP10_COMP *cpi) {
   if (cpi->oxcf.pass == 2) {
     return cpi->twopass.section_intra_rating > 8 ? MAX_LOOP_FILTER * 3 / 4
                                                  : MAX_LOOP_FILTER;
@@ -34,29 +35,34 @@
   }
 }
 
-
 static int64_t try_filter_frame(const YV12_BUFFER_CONFIG *sd,
                                 VP10_COMP *const cpi,
                                 int filt_level, int partial_frame) {
   VP10_COMMON *const cm = &cpi->common;
   int64_t filt_err;
 
+#if CONFIG_VAR_TX || CONFIG_EXT_PARTITION
+  vp10_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
+                         1, partial_frame);
+#else
   if (cpi->num_workers > 1)
     vp10_loop_filter_frame_mt(cm->frame_to_show, cm, cpi->td.mb.e_mbd.plane,
-                             filt_level, 1, partial_frame,
-                             cpi->workers, cpi->num_workers, &cpi->lf_row_sync);
+                              filt_level, 1, partial_frame,
+                              cpi->workers, cpi->num_workers,
+                              &cpi->lf_row_sync);
   else
     vp10_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filt_level,
-                          1, partial_frame);
+                           1, partial_frame);
+#endif
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth) {
-    filt_err = vp10_highbd_get_y_sse(sd, cm->frame_to_show);
+    filt_err = vpx_highbd_get_y_sse(sd, cm->frame_to_show);
   } else {
-    filt_err = vp10_get_y_sse(sd, cm->frame_to_show);
+    filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
   }
 #else
-  filt_err = vp10_get_y_sse(sd, cm->frame_to_show);
+  filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Re-instate the unfiltered frame
@@ -65,15 +71,16 @@
   return filt_err;
 }
 
-static int search_filter_level(const YV12_BUFFER_CONFIG *sd, VP10_COMP *cpi,
-                               int partial_frame) {
+int vp10_search_filter_level(const YV12_BUFFER_CONFIG *sd, VP10_COMP *cpi,
+                             int partial_frame, double *best_cost_ret) {
   const VP10_COMMON *const cm = &cpi->common;
   const struct loopfilter *const lf = &cm->lf;
   const int min_filter_level = 0;
-  const int max_filter_level = get_max_filter_level(cpi);
+  const int max_filter_level = vp10_get_max_filter_level(cpi);
   int filt_direction = 0;
   int64_t best_err;
   int filt_best;
+  MACROBLOCK *x = &cpi->td.mb;
 
   // Start the search at the previous frame filter level unless it is now out of
   // range.
@@ -113,11 +120,11 @@
       }
       // If value is close to the best so far then bias towards a lower loop
       // filter value.
-      if ((ss_err[filt_low] - bias) < best_err) {
+      if (ss_err[filt_low] < (best_err + bias)) {
         // Was it actually better than the previous best?
-        if (ss_err[filt_low] < best_err)
+        if (ss_err[filt_low] < best_err) {
           best_err = ss_err[filt_low];
-
+        }
         filt_best = filt_low;
       }
     }
@@ -127,7 +134,8 @@
       if (ss_err[filt_high] < 0) {
         ss_err[filt_high] = try_filter_frame(sd, cpi, filt_high, partial_frame);
       }
-      // Was it better than the previous best?
+      // Accept the higher filter level only if it is significantly better
+      // than the previous best; the bias works against raising the filter.
       if (ss_err[filt_high] < (best_err - bias)) {
         best_err = ss_err[filt_high];
         filt_best = filt_high;
@@ -144,9 +152,15 @@
     }
   }
 
+  // Update best error
+  best_err = ss_err[filt_best];
+
+  if (best_cost_ret)
+    *best_cost_ret = RDCOST_DBL(x->rdmult, x->rddiv, 0, best_err);
   return filt_best;
 }
 
+#if !CONFIG_LOOP_RESTORATION
 void vp10_pick_filter_level(const YV12_BUFFER_CONFIG *sd, VP10_COMP *cpi,
                            LPF_PICK_METHOD method) {
   VP10_COMMON *const cm = &cpi->common;
@@ -159,7 +173,7 @@
       lf->filter_level = 0;
   } else if (method >= LPF_PICK_FROM_Q) {
     const int min_filter_level = 0;
-    const int max_filter_level = get_max_filter_level(cpi);
+    const int max_filter_level = vp10_get_max_filter_level(cpi);
     const int q = vp10_ac_quant(cm->base_qindex, 0, cm->bit_depth);
     // These values were determined by linear fitting the result of the
     // searched level, filt_guess = q * 0.316206 + 3.87252
@@ -187,7 +201,15 @@
       filt_guess -= 4;
     lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
   } else {
-    lf->filter_level = search_filter_level(sd, cpi,
-                                           method == LPF_PICK_FROM_SUBIMAGE);
+    lf->filter_level = vp10_search_filter_level(
+        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, NULL);
   }
+
+#if CONFIG_EXT_TILE
+  // TODO(any): 0 loopfilter level is only necessary if individual tile
+  // decoding is required. We need to communicate this requirement to this
+  // code and force loop filter level 0 only if required.
+  lf->filter_level = 0;
+#endif  // CONFIG_EXT_TILE
 }
+#endif  // !CONFIG_LOOP_RESTORATION
diff --git a/vp10/encoder/picklpf.h b/vp10/encoder/picklpf.h
index 21a8758..29ec976 100644
--- a/vp10/encoder/picklpf.h
+++ b/vp10/encoder/picklpf.h
@@ -20,9 +20,11 @@
 
 struct yv12_buffer_config;
 struct VP10_COMP;
-
+int vp10_get_max_filter_level(const VP10_COMP *cpi);
+int vp10_search_filter_level(const YV12_BUFFER_CONFIG *sd, VP10_COMP *cpi,
+                             int partial_frame, double *best_cost_ret);
 void vp10_pick_filter_level(const struct yv12_buffer_config *sd,
-                           struct VP10_COMP *cpi, LPF_PICK_METHOD method);
+                            struct VP10_COMP *cpi, LPF_PICK_METHOD method);
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/encoder/pickrst.c b/vp10/encoder/pickrst.c
new file mode 100644
index 0000000..fa01062
--- /dev/null
+++ b/vp10/encoder/pickrst.c
@@ -0,0 +1,681 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <float.h>
+#include <limits.h>
+#include <math.h>
+
+#include "./vpx_scale_rtcd.h"
+
+#include "vpx_dsp/psnr.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_mem/vpx_mem.h"
+#include "vpx_ports/mem.h"
+
+#include "vp10/common/onyxc_int.h"
+#include "vp10/common/quant_common.h"
+
+#include "vp10/encoder/encoder.h"
+#include "vp10/encoder/quantize.h"
+#include "vp10/encoder/picklpf.h"
+#include "vp10/encoder/pickrst.h"
+
+static int64_t try_restoration_frame(const YV12_BUFFER_CONFIG *sd,
+                                     VP10_COMP *const cpi,
+                                     RestorationInfo *rsi,
+                                     int partial_frame) {
+  VP10_COMMON *const cm = &cpi->common;
+  int64_t filt_err;
+  vp10_loop_restoration_frame(cm->frame_to_show, cm,
+                              rsi, 1, partial_frame);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth) {
+    filt_err = vpx_highbd_get_y_sse(sd, cm->frame_to_show);
+  } else {
+    filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
+  }
+#else
+  filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // Re-instate the unfiltered frame
+  vpx_yv12_copy_y(&cpi->last_frame_db, cm->frame_to_show);
+  return filt_err;
+}
+
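+// RD search over the bilateral restoration levels for a given loop-filter
+// strength.  Returns the best level (-1 if no restoration beats
+// RESTORE_NONE) and, optionally, the corresponding RD cost.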
+static int search_bilateral_level(const YV12_BUFFER_CONFIG *sd,
+                                  VP10_COMP *cpi,
+                                  int filter_level, int partial_frame,
+                                  double *best_cost_ret) {
+  VP10_COMMON *const cm = &cpi->common;
+  int i, restoration_best;
+  int64_t err;
+  double best_cost;
+  double cost;
+  const int restoration_level_bits = vp10_restoration_level_bits(&cpi->common);
+  const int restoration_levels = 1 << restoration_level_bits;
+  MACROBLOCK *x = &cpi->td.mb;
+  int bits;
+  RestorationInfo rsi;
+
+  //  Make a copy of the unfiltered / processed recon buffer
+  vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+  vp10_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filter_level,
+                         1, partial_frame);
+  vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_db);
+
+  restoration_best = -1;
+  rsi.restoration_type = RESTORE_NONE;
+  err = try_restoration_frame(sd, cpi, &rsi, partial_frame);
+  bits = 0;
+  best_cost = RDCOST_DBL(x->rdmult, x->rddiv,
+                         (bits << (VP9_PROB_COST_SHIFT - 4)), err);
+  for (i = 0; i < restoration_levels; ++i) {
+    rsi.restoration_type = RESTORE_BILATERAL;
+    rsi.restoration_level = i;
+    err = try_restoration_frame(sd, cpi, &rsi, partial_frame);
+    // With RDCOST the rate is normally measured in bits * 256 and the
+    // distortion as a sum of squared errors * 64.  Here both terms are
+    // scaled in the same ratio, though not by exactly those factors.
+    bits = restoration_level_bits;
+    cost = RDCOST_DBL(x->rdmult, x->rddiv,
+                      (bits << (VP9_PROB_COST_SHIFT - 4)), err);
+    if (cost < best_cost) {
+      restoration_best = i;
+      best_cost = cost;
+    }
+  }
+  if (best_cost_ret) *best_cost_ret = best_cost;
+  vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+  return restoration_best;
+}
+
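+// Joint search over the loop-filter level and the bilateral restoration
+// level: the same bisection used by vp10_search_filter_level, but each
+// candidate filter level is scored together with its best restoration level.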
+static int search_filter_bilateral_level(const YV12_BUFFER_CONFIG *sd,
+                                         VP10_COMP *cpi,
+                                         int partial_frame,
+                                         int *restoration_level,
+                                         double *best_cost_ret) {
+  const VP10_COMMON *const cm = &cpi->common;
+  const struct loopfilter *const lf = &cm->lf;
+  const int min_filter_level = 0;
+  const int max_filter_level = vp10_get_max_filter_level(cpi);
+  int filt_direction = 0;
+  int filt_best, restoration_best;
+  double best_err;
+  int i;
+  int bilateral_lev;
+
+  // Start the search at the previous frame filter level unless it is now out of
+  // range.
+  int filt_mid = clamp(lf->filter_level, min_filter_level, max_filter_level);
+  int filter_step = filt_mid < 16 ? 4 : filt_mid / 4;
+  double ss_err[MAX_LOOP_FILTER + 1];
+
+  // Set each entry to -1
+  for (i = 0; i <= MAX_LOOP_FILTER; ++i)
+    ss_err[i] = -1.0;
+
+  bilateral_lev = search_bilateral_level(sd, cpi, filt_mid,
+                                         partial_frame, &best_err);
+  filt_best = filt_mid;
+  restoration_best = bilateral_lev;
+  ss_err[filt_mid] = best_err;
+
+  while (filter_step > 0) {
+    const int filt_high = VPXMIN(filt_mid + filter_step, max_filter_level);
+    const int filt_low = VPXMAX(filt_mid - filter_step, min_filter_level);
+
+    // Bias against raising loop filter in favor of lowering it.
+    double bias = (best_err / (1 << (15 - (filt_mid / 8)))) * filter_step;
+
+    if ((cpi->oxcf.pass == 2) && (cpi->twopass.section_intra_rating < 20))
+      bias = (bias * cpi->twopass.section_intra_rating) / 20;
+
+    // yx, bias less for large block size
+    if (cm->tx_mode != ONLY_4X4)
+      bias /= 2;
+
+    if (filt_direction <= 0 && filt_low != filt_mid) {
+      // Get Low filter error score
+      if (ss_err[filt_low] < 0) {
+        bilateral_lev = search_bilateral_level(
+            sd, cpi, filt_low, partial_frame, &ss_err[filt_low]);
+      }
+      // If value is close to the best so far then bias towards a lower loop
+      // filter value.
+      if (ss_err[filt_low] < (best_err + bias)) {
+        // Was it actually better than the previous best?
+        if (ss_err[filt_low] < best_err) {
+          best_err = ss_err[filt_low];
+        }
+        filt_best = filt_low;
+        restoration_best = bilateral_lev;
+      }
+    }
+
+    // Now look at filt_high
+    if (filt_direction >= 0 && filt_high != filt_mid) {
+      if (ss_err[filt_high] < 0) {
+        bilateral_lev = search_bilateral_level(
+            sd, cpi, filt_high, partial_frame, &ss_err[filt_high]);
+      }
+      // Accept the higher filter level only if it is significantly better
+      // than the previous best; the bias works against raising the filter.
+      if (ss_err[filt_high] < (best_err - bias)) {
+        best_err = ss_err[filt_high];
+        filt_best = filt_high;
+        restoration_best = bilateral_lev;
+      }
+    }
+
+    // Half the step distance if the best filter value was the same as last time
+    if (filt_best == filt_mid) {
+      filter_step /= 2;
+      filt_direction = 0;
+    } else {
+      filt_direction = (filt_best < filt_mid) ? -1 : 1;
+      filt_mid = filt_best;
+    }
+  }
+
+  // Update best error
+  best_err = ss_err[filt_best];
+
+  *restoration_level = restoration_best;
+  if (best_cost_ret) *best_cost_ret = best_err;
+  return filt_best;
+}
+
+static double find_average(uint8_t *src, int width, int height, int stride) {
+  uint64_t sum = 0;
+  double avg = 0;
+  int i, j;
+  for (i = 0; i < height; i++)
+    for (j = 0; j < width; j++)
+      sum += src[i * stride + j];
+  avg = (double)sum / (height * width);
+  return avg;
+}
+
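+// Accumulates the normal-equation statistics for the Wiener filter: M is the
+// cross-correlation vector between each degraded window and the co-located
+// source pixel, H the auto-correlation matrix of the window samples, both
+// taken about the mean of the degraded frame.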
+static void compute_stats(uint8_t *dgd, uint8_t *src, int width, int height,
+                          int dgd_stride, int src_stride,
+                          double *M, double *H) {
+  int i, j, k, l;
+  double Y[RESTORATION_WIN2];
+  const double avg = find_average(dgd, width, height, dgd_stride);
+
+  memset(M, 0, sizeof(*M) * RESTORATION_WIN2);
+  memset(H, 0, sizeof(*H) * RESTORATION_WIN2 * RESTORATION_WIN2);
+  for (i = RESTORATION_HALFWIN; i < height - RESTORATION_HALFWIN; i++) {
+    for (j = RESTORATION_HALFWIN; j < width - RESTORATION_HALFWIN; j++) {
+      const double X = (double)src[i * src_stride + j] - avg;
+      int idx = 0;
+      for (k = -RESTORATION_HALFWIN; k <= RESTORATION_HALFWIN; k++) {
+        for (l = -RESTORATION_HALFWIN; l <= RESTORATION_HALFWIN; l++) {
+          Y[idx] = (double)dgd[(i + l) * dgd_stride + (j + k)] - avg;
+          idx++;
+        }
+      }
+      for (k = 0; k < RESTORATION_WIN2; ++k) {
+        M[k] += Y[k] * X;
+        H[k * RESTORATION_WIN2 + k] += Y[k] * Y[k];
+        for (l = k + 1; l < RESTORATION_WIN2; ++l) {
+          double value = Y[k] * Y[l];
+          H[k * RESTORATION_WIN2 + l] += value;
+          H[l * RESTORATION_WIN2 + k] += value;
+        }
+      }
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static double find_average_highbd(uint16_t *src,
+                                  int width, int height, int stride) {
+  uint64_t sum = 0;
+  double avg = 0;
+  int i, j;
+  for (i = 0; i < height; i++)
+    for (j = 0; j < width; j++)
+      sum += src[i * stride + j];
+  avg = (double)sum / (height * width);
+  return avg;
+}
+
+static void compute_stats_highbd(
+    uint8_t *dgd8, uint8_t *src8, int width, int height,
+    int dgd_stride, int src_stride, double *M, double *H) {
+  int i, j, k, l;
+  double Y[RESTORATION_WIN2];
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *dgd = CONVERT_TO_SHORTPTR(dgd8);
+  const double avg = find_average_highbd(dgd, width, height, dgd_stride);
+
+  memset(M, 0, sizeof(*M) * RESTORATION_WIN2);
+  memset(H, 0, sizeof(*H) * RESTORATION_WIN2 * RESTORATION_WIN2);
+  for (i = RESTORATION_HALFWIN; i < height - RESTORATION_HALFWIN; i++) {
+    for (j = RESTORATION_HALFWIN; j < width - RESTORATION_HALFWIN; j++) {
+      const double X = (double)src[i * src_stride + j] - avg;
+      int idx = 0;
+      for (k = -RESTORATION_HALFWIN; k <= RESTORATION_HALFWIN; k++) {
+        for (l = -RESTORATION_HALFWIN; l <= RESTORATION_HALFWIN; l++) {
+          Y[idx] = (double)dgd[(i + l) * dgd_stride + (j + k)] - avg;
+          idx++;
+        }
+      }
+      for (k = 0; k < RESTORATION_WIN2; ++k) {
+        M[k] += Y[k] * X;
+        H[k * RESTORATION_WIN2 + k] += Y[k] * Y[k];
+        for (l = k + 1; l < RESTORATION_WIN2; ++l) {
+          double value = Y[k] * Y[l];
+          H[k * RESTORATION_WIN2 + l] += value;
+          H[l * RESTORATION_WIN2 + k] += value;
+        }
+      }
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+// Solves Ax = b, where x and b are column vectors
+static int linsolve(int n, double *A, int stride, double *b, double *x) {
+  int i, j, k;
+  double c;
+  // Pivoting: single pass moving rows with larger leading entries upward
+  for (i = n - 1; i > 0; i--) {
+    if (A[(i - 1) * stride] < A[i * stride]) {
+      for (j = 0; j < n; j++) {
+        c = A[i * stride + j];
+        A[i * stride + j] = A[(i - 1) * stride + j];
+        A[(i - 1) * stride + j] = c;
+      }
+      c = b[i];
+      b[i] = b[i - 1];
+      b[i - 1] = c;
+    }
+  }
+  // Forward elimination
+  for (k = 0; k < n - 1; k++) {
+    for (i = k; i < n - 1; i++) {
+      c = A[(i + 1) * stride + k] / A[k * stride + k];
+      for (j = 0; j < n; j++)
+        A[(i + 1) * stride + j] -= c * A[k * stride + j];
+      b[i + 1] -= c * b[k];
+    }
+  }
+  // Backward substitution
+  for (i = n - 1; i >= 0; i--) {
+    if (fabs(A[i * stride + i]) < 1e-10)
+      return 0;
+    c = 0;
+    for (j = i + 1; j <= n - 1; j++)
+      c += A[i * stride + j] * x[j];
+    x[i] = (b[i] - c) / A[i * stride + i];
+  }
+  return 1;
+}
+
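+// Maps a tap index onto its mirror image in the first half of the window so
+// that symmetric taps accumulate into a single coefficient.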
+static INLINE int wrap_index(int i) {
+  return (i >= RESTORATION_HALFWIN1 ? RESTORATION_WIN - 1 - i : i);
+}
+
+// Fix vector b, update vector a
+static void update_a_sep_sym(double **Mc, double **Hc, double *a, double *b) {
+  int i, j;
+  double S[RESTORATION_WIN];
+  double A[RESTORATION_WIN], B[RESTORATION_WIN2];
+  int w, w2;
+  memset(A, 0, sizeof(A));
+  memset(B, 0, sizeof(B));
+  for (i = 0; i < RESTORATION_WIN; i++) {
+    for (j = 0; j < RESTORATION_WIN; ++j) {
+      const int jj = wrap_index(j);
+      A[jj] += Mc[i][j] * b[i];
+    }
+  }
+  for (i = 0; i < RESTORATION_WIN; i++) {
+    for (j = 0; j < RESTORATION_WIN; j++) {
+      int k, l;
+      for (k = 0; k < RESTORATION_WIN; ++k)
+        for (l = 0; l < RESTORATION_WIN; ++l) {
+          const int kk = wrap_index(k);
+          const int ll = wrap_index(l);
+          B[ll * RESTORATION_HALFWIN1 + kk] +=
+              Hc[j * RESTORATION_WIN + i][k * RESTORATION_WIN2 + l] *
+              b[i] * b[j];
+        }
+    }
+  }
+  // Normalization enforcement in the system of equations itself
+  w = RESTORATION_WIN;
+  w2 = (w >> 1) + 1;
+  for (i = 0; i < w2 - 1; ++i)
+    A[i] -= A[w2 - 1] * 2 + B[i * w2 + w2 - 1]
+              - 2 * B[(w2 - 1) * w2 + (w2 - 1)];
+  for (i = 0; i < w2 - 1; ++i)
+    for (j = 0; j < w2 - 1; ++j)
+      B[i * w2 + j] -= 2 * (B[i * w2 + (w2 - 1)] + B[(w2 - 1) * w2 + j] -
+                            2 * B[(w2 - 1) * w2 + (w2 - 1)]);
+  if (linsolve(w2 - 1, B, w2, A, S)) {
+    S[w2 - 1] = 1.0;
+    for (i = w2; i < w; ++i) {
+      S[i] = S[w - 1 - i];
+      S[w2 - 1] -= 2 * S[i];
+    }
+    memcpy(a, S, w * sizeof(*a));
+  }
+}
+
+// Fix vector a, update vector b
+static void update_b_sep_sym(double **Mc, double **Hc, double *a, double *b) {
+  int i, j;
+  double S[RESTORATION_WIN];
+  double A[RESTORATION_WIN], B[RESTORATION_WIN2];
+  int w, w2;
+  memset(A, 0, sizeof(A));
+  memset(B, 0, sizeof(B));
+  for (i = 0; i < RESTORATION_WIN; i++) {
+    const int ii = wrap_index(i);
+    for (j = 0; j < RESTORATION_WIN; j++)
+      A[ii] += Mc[i][j] * a[j];
+  }
+
+  for (i = 0; i < RESTORATION_WIN; i++) {
+    for (j = 0; j < RESTORATION_WIN; j++) {
+      const int ii = wrap_index(i);
+      const int jj = wrap_index(j);
+      int k, l;
+      for (k = 0; k < RESTORATION_WIN; ++k)
+        for (l = 0; l < RESTORATION_WIN; ++l)
+          B[jj * RESTORATION_HALFWIN1 + ii] +=
+              Hc[i * RESTORATION_WIN + j][k * RESTORATION_WIN2 + l] *
+              a[k] * a[l];
+    }
+  }
+  // Normalization enforcement in the system of equations itself
+  w = RESTORATION_WIN;
+  w2 = RESTORATION_HALFWIN1;
+  for (i = 0; i < w2 - 1; ++i)
+    A[i] -= A[w2 - 1] * 2 + B[i * w2 + w2 - 1]
+              - 2 * B[(w2 - 1) * w2 + (w2 - 1)];
+  for (i = 0; i < w2 - 1; ++i)
+    for (j = 0; j < w2 - 1; ++j)
+      B[i * w2 + j] -= 2 * (B[i * w2 + (w2 - 1)] + B[(w2 - 1) * w2 + j] -
+                            2 * B[(w2 - 1) * w2 + (w2 - 1)]);
+  if (linsolve(w2 - 1, B, w2, A, S)) {
+    S[w2 - 1] = 1.0;
+    for (i = w2; i < w; ++i) {
+      S[i] = S[w - 1 - i];
+      S[w2 - 1] -= 2 * S[i];
+    }
+    memcpy(b, S, w * sizeof(*b));
+  }
+}
+
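+// Decomposes the 2-D Wiener solution into separable vertical (a) and
+// horizontal (b) filters by alternating minimization: each pass holds one
+// vector fixed and solves the reduced normal equations for the other, for a
+// fixed number of iterations.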
+static int wiener_decompose_sep_sym(double *M, double *H,
+                                    double *a, double *b) {
+  static const double init_filt[RESTORATION_WIN] = {
+    0.035623, -0.127154,  0.211436,  0.760190,  0.211436, -0.127154,  0.035623,
+  };
+  int i, j, iter;
+  double *Hc[RESTORATION_WIN2];
+  double *Mc[RESTORATION_WIN];
+  for (i = 0; i < RESTORATION_WIN; i++) {
+    Mc[i] = M + i * RESTORATION_WIN;
+    for (j = 0; j < RESTORATION_WIN; j++) {
+      Hc[i * RESTORATION_WIN + j] =
+          H + i * RESTORATION_WIN * RESTORATION_WIN2 + j * RESTORATION_WIN;
+    }
+  }
+  memcpy(a, init_filt, sizeof(*a) * RESTORATION_WIN);
+  memcpy(b, init_filt, sizeof(*b) * RESTORATION_WIN);
+
+  iter = 1;
+  while (iter < 10) {
+    update_a_sep_sym(Mc, Hc, a, b);
+    update_b_sep_sym(Mc, Hc, a, b);
+    iter++;
+  }
+  return 1;
+}
+
+// Computes the function x'*A*x - x'*b for the learned filters and compares
+// it against the identity filters; the final score is the difference between
+// the two function values.
+static double compute_score(double *M, double *H, int *vfilt, int *hfilt) {
+  double ab[RESTORATION_WIN * RESTORATION_WIN];
+  int i, k, l;
+  double P = 0, Q = 0;
+  double iP = 0, iQ = 0;
+  double Score, iScore;
+  int w;
+  double a[RESTORATION_WIN], b[RESTORATION_WIN];
+  w = RESTORATION_WIN;
+  a[RESTORATION_HALFWIN] = b[RESTORATION_HALFWIN] = 1.0;
+  for (i = 0; i < RESTORATION_HALFWIN; ++i) {
+    a[i] = a[RESTORATION_WIN - i - 1] =
+        (double)vfilt[i] / RESTORATION_FILT_STEP;
+    b[i] = b[RESTORATION_WIN - i - 1] =
+        (double)hfilt[i] / RESTORATION_FILT_STEP;
+    a[RESTORATION_HALFWIN] -= 2 * a[i];
+    b[RESTORATION_HALFWIN] -= 2 * b[i];
+  }
+  for (k = 0; k < w; ++k) {
+    for (l = 0; l < w; ++l) {
+      ab[k * w + l] = a[l] * b[k];
+    }
+  }
+  for (k = 0; k < w * w; ++k) {
+    P += ab[k] * M[k];
+    for (l = 0; l < w * w; ++l)
+      Q += ab[k] * H[k * w * w + l] * ab[l];
+  }
+  Score = Q - 2 * P;
+
+  iP = M[(w * w) >> 1];
+  iQ = H[((w * w) >> 1) * w * w + ((w * w) >> 1)];
+  iScore = iQ - 2 * iP;
+
+  return Score - iScore;
+}
+
+#define CLIP(x, lo, hi) ((x) < (lo) ? (lo) : (x) > (hi) ? (hi) : (x))
+#define RINT(x) ((x) < 0 ? (int)((x) - 0.5) : (int)((x) + 0.5))
+
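+// Quantizes the symmetric filter taps to RESTORATION_FILT_STEP fixed-point
+// precision and clamps each tap to the range that can be signaled.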
+static void quantize_sym_filter(double *f, int *fi) {
+  int i;
+  for (i = 0; i < RESTORATION_HALFWIN; ++i) {
+    fi[i] = RINT(f[i] * RESTORATION_FILT_STEP);
+  }
+  // Specialize for 7-tap filter
+  fi[0] = CLIP(fi[0], WIENER_FILT_TAP0_MINV, WIENER_FILT_TAP0_MAXV);
+  fi[1] = CLIP(fi[1], WIENER_FILT_TAP1_MINV, WIENER_FILT_TAP1_MAXV);
+  fi[2] = CLIP(fi[2], WIENER_FILT_TAP2_MINV, WIENER_FILT_TAP2_MAXV);
+}
+
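+// Derives a separable Wiener restoration filter from frame statistics and
+// keeps it only if its RD cost beats no restoration; returns 1 if the Wiener
+// filter is selected, 0 otherwise.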
+static int search_wiener_filter(const YV12_BUFFER_CONFIG *src,
+                                VP10_COMP *cpi,
+                                int filter_level,
+                                int partial_frame,
+                                int *vfilter, int *hfilter,
+                                double *best_cost_ret) {
+  VP10_COMMON *const cm = &cpi->common;
+  RestorationInfo rsi;
+  int64_t err;
+  int bits;
+  double cost_wiener, cost_norestore;
+  MACROBLOCK *x = &cpi->td.mb;
+  double M[RESTORATION_WIN2];
+  double H[RESTORATION_WIN2 * RESTORATION_WIN2];
+  double vfilterd[RESTORATION_WIN], hfilterd[RESTORATION_WIN];
+  const YV12_BUFFER_CONFIG *dgd = cm->frame_to_show;
+  const int width = cm->width;
+  const int height = cm->height;
+  const int src_stride = src->y_stride;
+  const int dgd_stride = dgd->y_stride;
+  double score;
+
+  assert(width == dgd->y_crop_width);
+  assert(height == dgd->y_crop_height);
+  assert(width == src->y_crop_width);
+  assert(height == src->y_crop_height);
+
+  //  Make a copy of the unfiltered / processed recon buffer
+  vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_uf);
+  vp10_loop_filter_frame(cm->frame_to_show, cm, &cpi->td.mb.e_mbd, filter_level,
+                         1, partial_frame);
+  vpx_yv12_copy_y(cm->frame_to_show, &cpi->last_frame_db);
+
+  rsi.restoration_type = RESTORE_NONE;
+  err = try_restoration_frame(src, cpi, &rsi, partial_frame);
+  bits = 0;
+  cost_norestore = RDCOST_DBL(x->rdmult, x->rddiv,
+                              (bits << (VP9_PROB_COST_SHIFT - 4)), err);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cm->use_highbitdepth)
+    compute_stats_highbd(dgd->y_buffer, src->y_buffer, width, height,
+                         dgd_stride, src_stride, M, H);
+  else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    compute_stats(dgd->y_buffer, src->y_buffer, width, height,
+                  dgd_stride, src_stride, M, H);
+
+  if (!wiener_decompose_sep_sym(M, H, vfilterd, hfilterd)) {
+    if (best_cost_ret) *best_cost_ret = DBL_MAX;
+    return 0;
+  }
+  quantize_sym_filter(vfilterd, vfilter);
+  quantize_sym_filter(hfilterd, hfilter);
+
+  // The filter score is the value of the function x'*A*x - x'*b for the
+  // learned filter relative to the identity filter. If there is no
+  // reduction in the function value, the filter is reverted to identity.
+  score = compute_score(M, H, vfilter, hfilter);
+  if (score > 0.0) {
+    int i;
+    for (i = 0; i < RESTORATION_HALFWIN; ++i)
+      vfilter[i] = hfilter[i] = 0;
+    rsi.restoration_type = RESTORE_NONE;
+    if (best_cost_ret) *best_cost_ret = cost_norestore;
+    vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+    return 0;
+  }
+
+  rsi.restoration_type = RESTORE_WIENER;
+  memcpy(rsi.vfilter, vfilter, sizeof(rsi.vfilter));
+  memcpy(rsi.hfilter, hfilter, sizeof(rsi.hfilter));
+  err = try_restoration_frame(src, cpi, &rsi, partial_frame);
+  bits = WIENER_FILT_BITS;
+  cost_wiener = RDCOST_DBL(x->rdmult, x->rddiv,
+                           (bits << (VP9_PROB_COST_SHIFT - 4)), err);
+
+  vpx_yv12_copy_y(&cpi->last_frame_uf, cm->frame_to_show);
+
+  if (cost_wiener < cost_norestore) {
+    if (best_cost_ret) *best_cost_ret = cost_wiener;
+    return 1;
+  } else {
+    if (best_cost_ret) *best_cost_ret = cost_norestore;
+    return 0;
+  }
+}
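+// Note: the decision above is a plain rate-distortion comparison; the
+// Wiener filter is kept only when
+//   RDCOST_DBL(rdmult, rddiv,
+//              WIENER_FILT_BITS << (VP9_PROB_COST_SHIFT - 4), err_wiener)
+//     < RDCOST_DBL(rdmult, rddiv, 0, err_norestore).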
+
+void vp10_pick_filter_restoration(
+    const YV12_BUFFER_CONFIG *sd, VP10_COMP *cpi, LPF_PICK_METHOD method) {
+  VP10_COMMON *const cm = &cpi->common;
+  struct loopfilter *const lf = &cm->lf;
+  int wiener_success = 0;
+  double cost_bilateral = DBL_MAX;
+  double cost_wiener = DBL_MAX;
+  double cost_norestore = DBL_MAX;
+
+  lf->sharpness_level =
+      cm->frame_type == KEY_FRAME ? 0 : cpi->oxcf.sharpness;
+
+  if (method == LPF_PICK_MINIMAL_LPF && lf->filter_level) {
+      lf->filter_level = 0;
+      cm->rst_info.restoration_type = RESTORE_NONE;
+  } else if (method >= LPF_PICK_FROM_Q) {
+    const int min_filter_level = 0;
+    const int max_filter_level = vp10_get_max_filter_level(cpi);
+    const int q = vp10_ac_quant(cm->base_qindex, 0, cm->bit_depth);
+    // These values were determined by fitting a linear model to the results
+    // of a full filter-level search: filt_guess = q * 0.316206 + 3.87252.
+#if CONFIG_VP9_HIGHBITDEPTH
+    int filt_guess;
+    switch (cm->bit_depth) {
+      case VPX_BITS_8:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+        break;
+      case VPX_BITS_10:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 4060632, 20);
+        break;
+      case VPX_BITS_12:
+        filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 16242526, 22);
+        break;
+      default:
+        assert(0 && "bit_depth should be VPX_BITS_8, VPX_BITS_10 "
+                    "or VPX_BITS_12");
+        return;
+    }
+#else
+    int filt_guess = ROUND_POWER_OF_TWO(q * 20723 + 1015158, 18);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    if (cm->frame_type == KEY_FRAME)
+      filt_guess -= 4;
+    lf->filter_level = clamp(filt_guess, min_filter_level, max_filter_level);
+    cm->rst_info.restoration_level = search_bilateral_level(
+        sd, cpi, lf->filter_level, method == LPF_PICK_FROM_SUBIMAGE,
+        &cost_bilateral);
+    wiener_success = search_wiener_filter(
+        sd, cpi, lf->filter_level, method == LPF_PICK_FROM_SUBIMAGE,
+        cm->rst_info.vfilter, cm->rst_info.hfilter, &cost_wiener);
+    if (cost_bilateral < cost_wiener) {
+      if (cm->rst_info.restoration_level != -1)
+        cm->rst_info.restoration_type = RESTORE_BILATERAL;
+      else
+        cm->rst_info.restoration_type = RESTORE_NONE;
+    } else {
+      if (wiener_success)
+        cm->rst_info.restoration_type = RESTORE_WIENER;
+      else
+        cm->rst_info.restoration_type = RESTORE_NONE;
+    }
+  } else {
+    int blf_filter_level = -1;
+    blf_filter_level = search_filter_bilateral_level(
+        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE,
+        &cm->rst_info.restoration_level, &cost_bilateral);
+    lf->filter_level = vp10_search_filter_level(
+        sd, cpi, method == LPF_PICK_FROM_SUBIMAGE, &cost_norestore);
+    wiener_success = search_wiener_filter(
+        sd, cpi, lf->filter_level, method == LPF_PICK_FROM_SUBIMAGE,
+        cm->rst_info.vfilter, cm->rst_info.hfilter, &cost_wiener);
+    if (cost_bilateral < cost_wiener) {
+      lf->filter_level = blf_filter_level;
+      if (cm->rst_info.restoration_level != -1)
+        cm->rst_info.restoration_type = RESTORE_BILATERAL;
+      else
+        cm->rst_info.restoration_type = RESTORE_NONE;
+    } else {
+      if (wiener_success)
+        cm->rst_info.restoration_type = RESTORE_WIENER;
+      else
+        cm->rst_info.restoration_type = RESTORE_NONE;
+    }
+    // printf("[%d] Costs %g %g (%d) %g (%d)\n", cm->rst_info.restoration_type,
+    //         cost_norestore, cost_bilateral, lf->filter_level, cost_wiener,
+    //         wiener_success);
+  }
+}
diff --git a/vp10/encoder/pickrst.h b/vp10/encoder/pickrst.h
new file mode 100644
index 0000000..8e2340d
--- /dev/null
+++ b/vp10/encoder/pickrst.h
@@ -0,0 +1,31 @@
+/*
+ *  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+
+#ifndef VP10_ENCODER_PICKRST_H_
+#define VP10_ENCODER_PICKRST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "vp10/encoder/encoder.h"
+
+struct yv12_buffer_config;
+struct VP10_COMP;
+
+void vp10_pick_filter_restoration(
+    const YV12_BUFFER_CONFIG *sd, VP10_COMP *cpi, LPF_PICK_METHOD method);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VP10_ENCODER_PICKRST_H_
diff --git a/vp10/encoder/quantize.c b/vp10/encoder/quantize.c
index 136efe3..36b4804 100644
--- a/vp10/encoder/quantize.c
+++ b/vp10/encoder/quantize.c
@@ -10,16 +10,925 @@
 
 #include <math.h>
 #include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/quantize.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
 #include "vp10/common/quant_common.h"
+#include "vp10/common/scan.h"
 #include "vp10/common/seg_common.h"
 
 #include "vp10/encoder/encoder.h"
 #include "vp10/encoder/quantize.h"
 #include "vp10/encoder/rd.h"
 
+#if CONFIG_NEW_QUANT
+static INLINE int quantize_coeff_nuq(const tran_low_t coeffv,
+                                     const int16_t quant,
+                                     const int16_t quant_shift,
+                                     const int16_t dequant,
+                                     const tran_low_t *cuml_bins_ptr,
+                                     const tran_low_t *dequant_val,
+                                     tran_low_t *qcoeff_ptr,
+                                     tran_low_t *dqcoeff_ptr) {
+  const int coeff = coeffv;
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int i, q;
+  int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+  for (i = 0; i < NUQ_KNOTS; i++) {
+    if (tmp < cuml_bins_ptr[i]) {
+      q = i;
+      break;
+    }
+  }
+  if (i == NUQ_KNOTS) {
+    tmp -= cuml_bins_ptr[NUQ_KNOTS - 1];
+    q = NUQ_KNOTS + (((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16);
+  }
+  if (q) {
+    *dqcoeff_ptr =
+        vp10_dequant_abscoeff_nuq(q, dequant, dequant_val);
+    *qcoeff_ptr  = (q ^ coeff_sign) - coeff_sign;
+    *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+  } else {
+    *qcoeff_ptr = 0;
+    *dqcoeff_ptr = 0;
+  }
+  return (q != 0);
+}
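+// Example (illustrative sketch; knot values are hypothetical): with
+// NUQ_KNOTS == 3 and cuml_bins_ptr == { 20, 40, 60 }, an absolute
+// coefficient of 35 falls below the second knot and quantizes to q == 1;
+// a coefficient of 75 exceeds the last knot, so the remainder 75 - 60 == 15
+// is quantized uniformly with quant/quant_shift and added on top of
+// NUQ_KNOTS.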
+
+static INLINE int quantize_coeff_bigtx_nuq(const tran_low_t coeffv,
+                                           const int16_t quant,
+                                           const int16_t quant_shift,
+                                           const int16_t dequant,
+                                           const tran_low_t *cuml_bins_ptr,
+                                           const tran_low_t *dequant_val,
+                                           tran_low_t *qcoeff_ptr,
+                                           tran_low_t *dqcoeff_ptr,
+                                           int logsizeby32) {
+  const int coeff = coeffv;
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int i, q;
+  int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+  for (i = 0; i < NUQ_KNOTS; i++) {
+    if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], 1 + logsizeby32)) {
+      q = i;
+      break;
+    }
+  }
+  if (i == NUQ_KNOTS) {
+    tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], 1 + logsizeby32);
+    q = NUQ_KNOTS +
+        (((((tmp * quant) >> 16) + tmp) * quant_shift) >> (15 - logsizeby32));
+  }
+  if (q) {
+    *dqcoeff_ptr =
+         ROUND_POWER_OF_TWO(vp10_dequant_abscoeff_nuq(q, dequant, dequant_val),
+                            1 + logsizeby32);
+    *qcoeff_ptr  = (q ^ coeff_sign) - coeff_sign;
+    *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+  } else {
+    *qcoeff_ptr = 0;
+    *dqcoeff_ptr = 0;
+  }
+  return (q != 0);
+}
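+// Note: in the large-transform variant above, logsizeby32 == 0 corresponds
+// to 32x32, so every knot boundary is halved via ROUND_POWER_OF_TWO(..., 1)
+// and the final quantizer shift drops from 16 to 15, mirroring the doubled
+// scale of the 32x32 transform.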
+
+static INLINE int quantize_coeff_fp_nuq(const tran_low_t coeffv,
+                                        const int16_t quant,
+                                        const int16_t dequant,
+                                        const tran_low_t *cuml_bins_ptr,
+                                        const tran_low_t *dequant_val,
+                                        tran_low_t *qcoeff_ptr,
+                                        tran_low_t *dqcoeff_ptr) {
+  const int coeff = coeffv;
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int i, q;
+  int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+  for (i = 0; i < NUQ_KNOTS; i++) {
+    if (tmp < cuml_bins_ptr[i]) {
+      q = i;
+      break;
+    }
+  }
+  if (i == NUQ_KNOTS) {
+    q = NUQ_KNOTS +
+        ((((int64_t)tmp - cuml_bins_ptr[NUQ_KNOTS - 1]) * quant) >> 16);
+  }
+  if (q) {
+    *dqcoeff_ptr =
+        vp10_dequant_abscoeff_nuq(q, dequant, dequant_val);
+    *qcoeff_ptr  = (q ^ coeff_sign) - coeff_sign;
+    *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+  } else {
+    *qcoeff_ptr = 0;
+    *dqcoeff_ptr = 0;
+  }
+  return (q != 0);
+}
+
+static INLINE int quantize_coeff_bigtx_fp_nuq(const tran_low_t coeffv,
+                                              const int16_t quant,
+                                              const int16_t dequant,
+                                              const tran_low_t *cuml_bins_ptr,
+                                              const tran_low_t *dequant_val,
+                                              tran_low_t *qcoeff_ptr,
+                                              tran_low_t *dqcoeff_ptr,
+                                              int logsizeby32) {
+  const int coeff = coeffv;
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int i, q;
+  int tmp = clamp(abs_coeff, INT16_MIN, INT16_MAX);
+  for (i = 0; i < NUQ_KNOTS; i++) {
+    if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], 1 + logsizeby32)) {
+      q = i;
+      break;
+    }
+  }
+  if (i == NUQ_KNOTS) {
+    q = NUQ_KNOTS +
+        ((((int64_t)tmp - ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1],
+                                             1 + logsizeby32)) * quant) >>
+         (15 - logsizeby32));
+  }
+  if (q) {
+    *dqcoeff_ptr =
+        ROUND_POWER_OF_TWO(vp10_dequant_abscoeff_nuq(q, dequant, dequant_val),
+                           1 + logsizeby32);
+    *qcoeff_ptr  = (q ^ coeff_sign) - coeff_sign;
+    *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+  } else {
+    *qcoeff_ptr = 0;
+    *dqcoeff_ptr = 0;
+  }
+  return (q != 0);
+}
+
+void quantize_dc_nuq(const tran_low_t *coeff_ptr,
+                     intptr_t n_coeffs,
+                     int skip_block,
+                     const int16_t quant,
+                     const int16_t quant_shift,
+                     const int16_t dequant,
+                     const tran_low_t *cuml_bins_ptr,
+                     const tran_low_t *dequant_val,
+                     tran_low_t *qcoeff_ptr,
+                     tran_low_t *dqcoeff_ptr,
+                     uint16_t *eob_ptr) {
+  int eob = -1;
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    const int rc = 0;
+    if (quantize_coeff_nuq(coeff_ptr[rc],
+                           quant,
+                           quant_shift,
+                           dequant,
+                           cuml_bins_ptr,
+                           dequant_val,
+                           qcoeff_ptr,
+                           dqcoeff_ptr))
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
+void quantize_dc_fp_nuq(const tran_low_t *coeff_ptr,
+                        intptr_t n_coeffs,
+                        int skip_block,
+                        const int16_t quant,
+                        const int16_t dequant,
+                        const tran_low_t *cuml_bins_ptr,
+                        const tran_low_t *dequant_val,
+                        tran_low_t *qcoeff_ptr,
+                        tran_low_t *dqcoeff_ptr,
+                        uint16_t *eob_ptr) {
+  int eob = -1;
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    const int rc = 0;
+    if (quantize_coeff_fp_nuq(coeff_ptr[rc],
+                              quant,
+                              dequant,
+                              cuml_bins_ptr,
+                              dequant_val,
+                              qcoeff_ptr,
+                              dqcoeff_ptr))
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
+void quantize_dc_32x32_nuq(const tran_low_t *coeff_ptr,
+                           intptr_t n_coeffs,
+                           int skip_block,
+                           const int16_t quant,
+                           const int16_t quant_shift,
+                           const int16_t dequant,
+                           const tran_low_t *cuml_bins_ptr,
+                           const tran_low_t *dequant_val,
+                           tran_low_t *qcoeff_ptr,
+                           tran_low_t *dqcoeff_ptr,
+                           uint16_t *eob_ptr) {
+  int eob = -1;
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    const int rc = 0;
+    if (quantize_coeff_bigtx_nuq(coeff_ptr[rc],
+                                 quant,
+                                 quant_shift,
+                                 dequant,
+                                 cuml_bins_ptr,
+                                 dequant_val,
+                                 qcoeff_ptr,
+                                 dqcoeff_ptr,
+                                 0))
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
+void quantize_dc_32x32_fp_nuq(const tran_low_t *coeff_ptr,
+                              intptr_t n_coeffs,
+                              int skip_block,
+                              const int16_t quant,
+                              const int16_t dequant,
+                              const tran_low_t *cuml_bins_ptr,
+                              const tran_low_t *dequant_val,
+                              tran_low_t *qcoeff_ptr,
+                              tran_low_t *dqcoeff_ptr,
+                              uint16_t *eob_ptr) {
+  int eob = -1;
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    const int rc = 0;
+    if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc],
+                                    quant,
+                                    dequant,
+                                    cuml_bins_ptr,
+                                    dequant_val,
+                                    qcoeff_ptr,
+                                    dqcoeff_ptr,
+                                    0))
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
+void quantize_nuq_c(const tran_low_t *coeff_ptr,
+                    intptr_t n_coeffs,
+                    int skip_block,
+                    const int16_t *quant_ptr,
+                    const int16_t *quant_shift_ptr,
+                    const int16_t *dequant_ptr,
+                    const cuml_bins_type_nuq *cuml_bins_ptr,
+                    const dequant_val_type_nuq *dequant_val,
+                    tran_low_t *qcoeff_ptr,
+                    tran_low_t *dqcoeff_ptr,
+                    uint16_t *eob_ptr,
+                    const int16_t *scan,
+                    const uint8_t *band) {
+  int eob = -1;
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    int i;
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      if (quantize_coeff_nuq(coeff_ptr[rc],
+                             quant_ptr[rc != 0],
+                             quant_shift_ptr[rc != 0],
+                             dequant_ptr[rc != 0],
+                             cuml_bins_ptr[band[i]],
+                             dequant_val[band[i]],
+                             &qcoeff_ptr[rc],
+                             &dqcoeff_ptr[rc]))
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+void quantize_fp_nuq_c(const tran_low_t *coeff_ptr,
+                       intptr_t n_coeffs,
+                       int skip_block,
+                       const int16_t *quant_ptr,
+                       const int16_t *dequant_ptr,
+                       const cuml_bins_type_nuq *cuml_bins_ptr,
+                       const dequant_val_type_nuq *dequant_val,
+                       tran_low_t *qcoeff_ptr,
+                       tran_low_t *dqcoeff_ptr,
+                       uint16_t *eob_ptr,
+                       const int16_t *scan,
+                       const uint8_t *band) {
+  int eob = -1;
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    int i;
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      if (quantize_coeff_fp_nuq(coeff_ptr[rc],
+                                quant_ptr[rc != 0],
+                                dequant_ptr[rc != 0],
+                                cuml_bins_ptr[band[i]],
+                                dequant_val[band[i]],
+                                &qcoeff_ptr[rc],
+                                &dqcoeff_ptr[rc]))
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+void quantize_32x32_nuq_c(const tran_low_t *coeff_ptr,
+                          intptr_t n_coeffs,
+                          int skip_block,
+                          const int16_t *quant_ptr,
+                          const int16_t *quant_shift_ptr,
+                          const int16_t *dequant_ptr,
+                          const cuml_bins_type_nuq *cuml_bins_ptr,
+                          const dequant_val_type_nuq *dequant_val,
+                          tran_low_t *qcoeff_ptr,
+                          tran_low_t *dqcoeff_ptr,
+                          uint16_t *eob_ptr,
+                          const int16_t *scan,
+                          const uint8_t *band) {
+  int eob = -1;
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    int i;
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      if (quantize_coeff_bigtx_nuq(coeff_ptr[rc],
+                                   quant_ptr[rc != 0],
+                                   quant_shift_ptr[rc != 0],
+                                   dequant_ptr[rc != 0],
+                                   cuml_bins_ptr[band[i]],
+                                   dequant_val[band[i]],
+                                   &qcoeff_ptr[rc],
+                                   &dqcoeff_ptr[rc],
+                                   0))
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+void quantize_32x32_fp_nuq_c(const tran_low_t *coeff_ptr,
+                             intptr_t n_coeffs,
+                             int skip_block,
+                             const int16_t *quant_ptr,
+                             const int16_t *dequant_ptr,
+                             const cuml_bins_type_nuq *cuml_bins_ptr,
+                             const dequant_val_type_nuq *dequant_val,
+                             tran_low_t *qcoeff_ptr,
+                             tran_low_t *dqcoeff_ptr,
+                             uint16_t *eob_ptr,
+                             const int16_t *scan,
+                             const uint8_t *band) {
+  int eob = -1;
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    int i;
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      if (quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc],
+                                      quant_ptr[rc != 0],
+                                      dequant_ptr[rc != 0],
+                                      cuml_bins_ptr[band[i]],
+                                      dequant_val[band[i]],
+                                      &qcoeff_ptr[rc],
+                                      &dqcoeff_ptr[rc],
+                                      0))
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+#endif  // CONFIG_NEW_QUANT
+
+void vp10_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+                        tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr) {
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  *eob_ptr = 0;
+}
+
+void vp10_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+                             const MACROBLOCKD_PLANE *pd,
+                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                             const scan_order *sc, const QUANT_PARAM *qparam) {
+  // obsolete skip_block
+  const int skip_block = 0;
+
+  if (qparam->log_scale == 0) {
+    vp10_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round_fp,
+                     p->quant_fp, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                     pd->dequant, eob_ptr, sc->scan, sc->iscan);
+  } else {
+    vp10_quantize_fp_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                           p->round_fp, p->quant_fp, p->quant_shift, qcoeff_ptr,
+                           dqcoeff_ptr, pd->dequant, eob_ptr, sc->scan,
+                           sc->iscan);
+  }
+}
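+// Example caller (illustrative sketch; the variable names are hypothetical):
+//   QUANT_PARAM qparam;
+//   qparam.log_scale = is_32x32_tx ? 1 : 0;
+//   vp10_quantize_fp_facade(coeff, n_coeffs, p, qcoeff, pd, dqcoeff, &eob,
+//                           sc, &qparam);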
+
+void vp10_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+                            const MACROBLOCKD_PLANE *pd,
+                            tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                            const scan_order *sc, const QUANT_PARAM *qparam) {
+  // obsolete skip_block
+  const int skip_block = 0;
+
+  if (qparam->log_scale == 0) {
+    vpx_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round, p->quant,
+                   p->quant_shift, qcoeff_ptr, dqcoeff_ptr, pd->dequant,
+                   eob_ptr, sc->scan, sc->iscan);
+  } else {
+    vpx_quantize_b_32x32(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+                         p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                         pd->dequant, eob_ptr, sc->scan, sc->iscan);
+  }
+}
+
+void vp10_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+                             const MACROBLOCKD_PLANE *pd,
+                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                             const scan_order *sc, const QUANT_PARAM *qparam) {
+  // obsolete skip_block
+  const int skip_block = 0;
+  (void)sc;
+  if (qparam->log_scale == 0) {
+    vpx_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
+                    p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr, pd->dequant[0],
+                    eob_ptr);
+  } else {
+    vpx_quantize_dc_32x32(coeff_ptr, skip_block, p->round, p->quant_fp[0],
+                          qcoeff_ptr, dqcoeff_ptr, pd->dequant[0], eob_ptr);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_quantize_fp_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc,
+    const QUANT_PARAM *qparam) {
+  // obsolete skip_block
+  const int skip_block = 0;
+
+  vp10_highbd_quantize_fp(coeff_ptr, n_coeffs, skip_block, p->zbin,
+                          p->round_fp, p->quant_fp, p->quant_shift,
+                          qcoeff_ptr, dqcoeff_ptr, pd->dequant, eob_ptr,
+                          sc->scan, sc->iscan, qparam->log_scale);
+}
+
+void vp10_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+                                   tran_low_t *qcoeff_ptr,
+                                   const MACROBLOCKD_PLANE *pd,
+                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                   const scan_order *sc,
+                                   const QUANT_PARAM *qparam) {
+  // obsolete skip_block
+  const int skip_block = 0;
+
+  vp10_highbd_quantize_b(coeff_ptr, n_coeffs, skip_block, p->zbin, p->round,
+                         p->quant, p->quant_shift, qcoeff_ptr, dqcoeff_ptr,
+                         pd->dequant, eob_ptr, sc->scan, sc->iscan,
+                         qparam->log_scale);
+}
+
+void vp10_highbd_quantize_dc_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc,
+    const QUANT_PARAM *qparam) {
+  // obsolete skip_block
+  const int skip_block = 0;
+
+  (void)sc;
+
+  vp10_highbd_quantize_dc(coeff_ptr, (int)n_coeffs, skip_block, p->round,
+                          p->quant_fp[0], qcoeff_ptr, dqcoeff_ptr,
+                          pd->dequant[0], eob_ptr, qparam->log_scale);
+}
+
+#if CONFIG_NEW_QUANT
+static INLINE int highbd_quantize_coeff_nuq(const tran_low_t coeffv,
+                                            const int16_t quant,
+                                            const int16_t quant_shift,
+                                            const int16_t dequant,
+                                            const tran_low_t *cuml_bins_ptr,
+                                            const tran_low_t *dequant_val,
+                                            tran_low_t *qcoeff_ptr,
+                                            tran_low_t *dqcoeff_ptr) {
+  const int coeff = coeffv;
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int i, q;
+  int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
+  for (i = 0; i < NUQ_KNOTS; i++) {
+    if (tmp < cuml_bins_ptr[i]) {
+      q = i;
+      break;
+    }
+  }
+  if (i == NUQ_KNOTS) {
+    tmp -= cuml_bins_ptr[NUQ_KNOTS - 1];
+    q = NUQ_KNOTS + (((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16);
+  }
+  if (q) {
+    *dqcoeff_ptr =
+        vp10_dequant_abscoeff_nuq(q, dequant, dequant_val);
+    *qcoeff_ptr  = (q ^ coeff_sign) - coeff_sign;
+    *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+  } else {
+    *qcoeff_ptr = 0;
+    *dqcoeff_ptr = 0;
+  }
+  return (q != 0);
+}
+
+static INLINE int highbd_quantize_coeff_fp_nuq(const tran_low_t coeffv,
+                                               const int16_t quant,
+                                               const int16_t dequant,
+                                               const tran_low_t *cuml_bins_ptr,
+                                               const tran_low_t *dequant_val,
+                                               tran_low_t *qcoeff_ptr,
+                                               tran_low_t *dqcoeff_ptr) {
+  const int coeff = coeffv;
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int i, q;
+  int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
+  for (i = 0; i < NUQ_KNOTS; i++) {
+    if (tmp < cuml_bins_ptr[i]) {
+      q = i;
+      break;
+    }
+  }
+  if (i == NUQ_KNOTS) {
+    q = NUQ_KNOTS +
+        (((tmp - cuml_bins_ptr[NUQ_KNOTS - 1]) * quant) >> 16);
+  }
+  if (q) {
+    *dqcoeff_ptr =
+        vp10_dequant_abscoeff_nuq(q, dequant, dequant_val);
+    *qcoeff_ptr  = (q ^ coeff_sign) - coeff_sign;
+    *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+  } else {
+    *qcoeff_ptr = 0;
+    *dqcoeff_ptr = 0;
+  }
+  return (q != 0);
+}
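+// Note: the high-bit-depth variants widen the working value to int64_t and
+// clamp against INT32_MIN/INT32_MAX rather than the int16 range used in the
+// low-bit-depth code, since 10- and 12-bit transforms can produce
+// coefficients that overflow 16 bits.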
+
+static INLINE int highbd_quantize_coeff_bigtx_fp_nuq(
+    const tran_low_t coeffv,
+    const int16_t quant,
+    const int16_t dequant,
+    const tran_low_t *cuml_bins_ptr,
+    const tran_low_t *dequant_val,
+    tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr,
+    int logsizeby32) {
+  const int coeff = coeffv;
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int i, q;
+  int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
+  for (i = 0; i < NUQ_KNOTS; i++) {
+    if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], 1 + logsizeby32)) {
+      q = i;
+      break;
+    }
+  }
+  if (i == NUQ_KNOTS) {
+    q = NUQ_KNOTS +
+        (((tmp - ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1],
+                                    1 + logsizeby32)) * quant) >>
+         (15 - logsizeby32));
+  }
+  if (q) {
+    *dqcoeff_ptr =
+        ROUND_POWER_OF_TWO(vp10_dequant_abscoeff_nuq(q, dequant, dequant_val),
+                           1 + logsizeby32);
+    *qcoeff_ptr  = (q ^ coeff_sign) - coeff_sign;
+    *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+  } else {
+    *qcoeff_ptr = 0;
+    *dqcoeff_ptr = 0;
+  }
+  return (q != 0);
+}
+
+static INLINE int highbd_quantize_coeff_bigtx_nuq(
+    const tran_low_t coeffv,
+    const int16_t quant,
+    const int16_t quant_shift,
+    const int16_t dequant,
+    const tran_low_t *cuml_bins_ptr,
+    const tran_low_t *dequant_val,
+    tran_low_t *qcoeff_ptr,
+    tran_low_t *dqcoeff_ptr,
+    int logsizeby32) {
+  const int coeff = coeffv;
+  const int coeff_sign = (coeff >> 31);
+  const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+  int i, q;
+  int64_t tmp = clamp(abs_coeff, INT32_MIN, INT32_MAX);
+  for (i = 0; i < NUQ_KNOTS; i++) {
+    if (tmp < ROUND_POWER_OF_TWO(cuml_bins_ptr[i], 1 + logsizeby32)) {
+      q = i;
+      break;
+    }
+  }
+  if (i == NUQ_KNOTS) {
+    tmp -= ROUND_POWER_OF_TWO(cuml_bins_ptr[NUQ_KNOTS - 1], 1 + logsizeby32);
+    q = NUQ_KNOTS +
+        (((((tmp * quant) >> 16) + tmp) * quant_shift) >> (15 - logsizeby32));
+  }
+  if (q) {
+    *dqcoeff_ptr =
+        ROUND_POWER_OF_TWO(vp10_dequant_abscoeff_nuq(q, dequant, dequant_val),
+                           1 + logsizeby32);
+    *qcoeff_ptr  = (q ^ coeff_sign) - coeff_sign;
+    *dqcoeff_ptr = *qcoeff_ptr < 0 ? -*dqcoeff_ptr : *dqcoeff_ptr;
+  } else {
+    *qcoeff_ptr = 0;
+    *dqcoeff_ptr = 0;
+  }
+  return (q != 0);
+}
+
+void highbd_quantize_dc_nuq(const tran_low_t *coeff_ptr,
+                            intptr_t n_coeffs,
+                            int skip_block,
+                            const int16_t quant,
+                            const int16_t quant_shift,
+                            const int16_t dequant,
+                            const tran_low_t *cuml_bins_ptr,
+                            const tran_low_t *dequant_val,
+                            tran_low_t *qcoeff_ptr,
+                            tran_low_t *dqcoeff_ptr,
+                            uint16_t *eob_ptr) {
+  int eob = -1;
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    const int rc = 0;
+    if (highbd_quantize_coeff_nuq(coeff_ptr[rc],
+                                  quant,
+                                  quant_shift,
+                                  dequant,
+                                  cuml_bins_ptr,
+                                  dequant_val,
+                                  qcoeff_ptr,
+                                  dqcoeff_ptr))
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_dc_fp_nuq(const tran_low_t *coeff_ptr,
+                               intptr_t n_coeffs,
+                               int skip_block,
+                               const int16_t quant,
+                               const int16_t dequant,
+                               const tran_low_t *cuml_bins_ptr,
+                               const tran_low_t *dequant_val,
+                               tran_low_t *qcoeff_ptr,
+                               tran_low_t *dqcoeff_ptr,
+                               uint16_t *eob_ptr) {
+  int eob = -1;
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    const int rc = 0;
+    if (highbd_quantize_coeff_fp_nuq(coeff_ptr[rc],
+                                     quant,
+                                     dequant,
+                                     cuml_bins_ptr,
+                                     dequant_val,
+                                     qcoeff_ptr,
+                                     dqcoeff_ptr))
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_nuq_c(const tran_low_t *coeff_ptr,
+                           intptr_t n_coeffs,
+                           int skip_block,
+                           const int16_t *quant_ptr,
+                           const int16_t *quant_shift_ptr,
+                           const int16_t *dequant_ptr,
+                           const cuml_bins_type_nuq *cuml_bins_ptr,
+                           const dequant_val_type_nuq *dequant_val,
+                           tran_low_t *qcoeff_ptr,
+                           tran_low_t *dqcoeff_ptr,
+                           uint16_t *eob_ptr,
+                           const int16_t *scan,
+                           const uint8_t *band) {
+  int eob = -1;
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    int i;
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      if (highbd_quantize_coeff_nuq(coeff_ptr[rc],
+                                    quant_ptr[rc != 0],
+                                    quant_shift_ptr[rc != 0],
+                                    dequant_ptr[rc != 0],
+                                    cuml_bins_ptr[band[i]],
+                                    dequant_val[band[i]],
+                                    &qcoeff_ptr[rc],
+                                    &dqcoeff_ptr[rc]))
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_32x32_nuq_c(const tran_low_t *coeff_ptr,
+                                 intptr_t n_coeffs,
+                                 int skip_block,
+                                 const int16_t *quant_ptr,
+                                 const int16_t *quant_shift_ptr,
+                                 const int16_t *dequant_ptr,
+                                 const cuml_bins_type_nuq *cuml_bins_ptr,
+                                 const dequant_val_type_nuq *dequant_val,
+                                 tran_low_t *qcoeff_ptr,
+                                 tran_low_t *dqcoeff_ptr,
+                                 uint16_t *eob_ptr,
+                                 const int16_t *scan,
+                                 const uint8_t *band) {
+  int eob = -1;
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    int i;
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      if (highbd_quantize_coeff_bigtx_nuq(coeff_ptr[rc],
+                                          quant_ptr[rc != 0],
+                                          quant_shift_ptr[rc != 0],
+                                          dequant_ptr[rc != 0],
+                                          cuml_bins_ptr[band[i]],
+                                          dequant_val[band[i]],
+                                          &qcoeff_ptr[rc],
+                                          &dqcoeff_ptr[rc],
+                                          0))
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_32x32_fp_nuq_c(const tran_low_t *coeff_ptr,
+                                    intptr_t n_coeffs,
+                                    int skip_block,
+                                    const int16_t *quant_ptr,
+                                    const int16_t *dequant_ptr,
+                                    const cuml_bins_type_nuq *cuml_bins_ptr,
+                                    const dequant_val_type_nuq *dequant_val,
+                                    tran_low_t *qcoeff_ptr,
+                                    tran_low_t *dqcoeff_ptr,
+                                    uint16_t *eob_ptr,
+                                    const int16_t *scan,
+                                    const uint8_t *band) {
+  int eob = -1;
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    int i;
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      if (highbd_quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc],
+                                             quant_ptr[rc != 0],
+                                             dequant_ptr[rc != 0],
+                                             cuml_bins_ptr[band[i]],
+                                             dequant_val[band[i]],
+                                             &qcoeff_ptr[rc],
+                                             &dqcoeff_ptr[rc],
+                                             0))
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_fp_nuq_c(const tran_low_t *coeff_ptr,
+                              intptr_t n_coeffs,
+                              int skip_block,
+                              const int16_t *quant_ptr,
+                              const int16_t *dequant_ptr,
+                              const cuml_bins_type_nuq *cuml_bins_ptr,
+                              const dequant_val_type_nuq *dequant_val,
+                              tran_low_t *qcoeff_ptr,
+                              tran_low_t *dqcoeff_ptr,
+                              uint16_t *eob_ptr,
+                              const int16_t *scan,
+                              const uint8_t *band) {
+  int eob = -1;
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    int i;
+    for (i = 0; i < n_coeffs; i++) {
+      const int rc = scan[i];
+      if (highbd_quantize_coeff_fp_nuq(coeff_ptr[rc],
+                                       quant_ptr[rc != 0],
+                                       dequant_ptr[rc != 0],
+                                       cuml_bins_ptr[band[i]],
+                                       dequant_val[band[i]],
+                                       &qcoeff_ptr[rc],
+                                       &dqcoeff_ptr[rc]))
+        eob = i;
+    }
+  }
+  *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_dc_32x32_nuq(const tran_low_t *coeff_ptr,
+                                  intptr_t n_coeffs,
+                                  int skip_block,
+                                  const int16_t quant,
+                                  const int16_t quant_shift,
+                                  const int16_t dequant,
+                                  const tran_low_t *cuml_bins_ptr,
+                                  const tran_low_t *dequant_val,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  uint16_t *eob_ptr) {
+  int eob = -1;
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    const int rc = 0;
+    if (highbd_quantize_coeff_bigtx_nuq(coeff_ptr[rc],
+                                        quant,
+                                        quant_shift,
+                                        dequant,
+                                        cuml_bins_ptr,
+                                        dequant_val,
+                                        qcoeff_ptr,
+                                        dqcoeff_ptr,
+                                        0))
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+
+void highbd_quantize_dc_32x32_fp_nuq(const tran_low_t *coeff_ptr,
+                                     intptr_t n_coeffs,
+                                     int skip_block,
+                                     const int16_t quant,
+                                     const int16_t dequant,
+                                     const tran_low_t *cuml_bins_ptr,
+                                     const tran_low_t *dequant_val,
+                                     tran_low_t *qcoeff_ptr,
+                                     tran_low_t *dqcoeff_ptr,
+                                     uint16_t *eob_ptr) {
+  int eob = -1;
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+  if (!skip_block) {
+    const int rc = 0;
+    if (highbd_quantize_coeff_bigtx_fp_nuq(coeff_ptr[rc],
+                                           quant,
+                                           dequant,
+                                           cuml_bins_ptr,
+                                           dequant_val,
+                                           qcoeff_ptr,
+                                           dqcoeff_ptr,
+                                           0))
+      eob = 0;
+  }
+  *eob_ptr = eob + 1;
+}
+#endif  // CONFIG_NEW_QUANT
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 void vp10_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                        int skip_block,
                        const int16_t *zbin_ptr, const int16_t *round_ptr,
@@ -73,9 +982,11 @@
                               const int16_t *dequant_ptr,
                               uint16_t *eob_ptr,
                               const int16_t *scan,
-                              const int16_t *iscan) {
+                              const int16_t *iscan, const int log_scale) {
   int i;
   int eob = -1;
+  const int scale = 1 << log_scale;
+  const int shift = 16 - log_scale;
   // TODO(jingning) Decide the need of these arguments after the
   // quantization process is completed.
   (void)zbin_ptr;
@@ -94,16 +1005,18 @@
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
       const int64_t tmp = abs_coeff + round_ptr[rc != 0];
-      const uint32_t abs_qcoeff = (uint32_t)((tmp * quant_ptr[rc != 0]) >> 16);
+      const uint32_t abs_qcoeff =
+          (uint32_t)((tmp * quant_ptr[rc != 0]) >> shift);
       qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0];
+      dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / scale;
       if (abs_qcoeff)
         eob = i;
     }
   }
   *eob_ptr = eob + 1;
 }
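+// Note: with the log_scale generalization above,
+//   dqcoeff = qcoeff * dequant / (1 << log_scale),
+// so log_scale == 1 reproduces the old 32x32 behavior of halving the
+// dequantized value and log_scale == 0 leaves the legacy path unchanged.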
-#endif
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
 // TODO(jingning) Refactor this file and combine functions with similar
 // operations.
@@ -148,74 +1061,100 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vp10_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr,
-                                    intptr_t n_coeffs, int skip_block,
-                                    const int16_t *zbin_ptr,
-                                    const int16_t *round_ptr,
-                                    const int16_t *quant_ptr,
-                                    const int16_t *quant_shift_ptr,
-                                    tran_low_t *qcoeff_ptr,
-                                    tran_low_t *dqcoeff_ptr,
-                                    const int16_t *dequant_ptr,
-                                    uint16_t *eob_ptr,
-                                    const int16_t *scan, const int16_t *iscan) {
-  int i, eob = -1;
-  (void)zbin_ptr;
-  (void)quant_shift_ptr;
+void vp10_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                              int skip_block, const int16_t *zbin_ptr,
+                              const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr,
+                              uint16_t *eob_ptr, const int16_t *scan,
+                              const int16_t *iscan, const int log_scale) {
+  int i, non_zero_count = (int)n_coeffs, eob = -1;
+  int zbins[2] = {zbin_ptr[0], zbin_ptr[1]};
+  int round[2] = {round_ptr[0], round_ptr[1]};
+  int nzbins[2];
+  int scale = 1;
+  int shift = 16;
   (void)iscan;
 
+  if (log_scale > 0) {
+    zbins[0] = ROUND_POWER_OF_TWO(zbin_ptr[0], log_scale);
+    zbins[1] = ROUND_POWER_OF_TWO(zbin_ptr[1], log_scale);
+    round[0] = ROUND_POWER_OF_TWO(round_ptr[0], log_scale);
+    round[1] = ROUND_POWER_OF_TWO(round_ptr[1], log_scale);
+    scale = 1 << log_scale;
+    shift = 16 - log_scale;
+  }
+
+  nzbins[0] = zbins[0] * -1;
+  nzbins[1] = zbins[1] * -1;
+
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
   if (!skip_block) {
-    for (i = 0; i < n_coeffs; i++) {
-      uint32_t abs_qcoeff = 0;
+    // Pre-scan pass
+    for (i = (int)n_coeffs - 1; i >= 0; i--) {
+      const int rc = scan[i];
+      const int coeff = coeff_ptr[rc];
+
+      if (coeff < zbins[rc != 0] && coeff > nzbins[rc != 0])
+        non_zero_count--;
+      else
+        break;
+    }
+
+    // Quantization pass: all coefficients with index >= non_zero_count are
+    // skippable. Note: non_zero_count can be zero.
+    for (i = 0; i < non_zero_count; i++) {
       const int rc = scan[i];
       const int coeff = coeff_ptr[rc];
       const int coeff_sign = (coeff >> 31);
       const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
 
-      if (abs_coeff >= (dequant_ptr[rc != 0] >> 2)) {
-        const int64_t tmp = abs_coeff
-                           + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1);
-        abs_qcoeff = (uint32_t) ((tmp * quant_ptr[rc != 0]) >> 15);
+      if (abs_coeff >= zbins[rc != 0]) {
+        const int64_t tmp1 = abs_coeff + round[rc != 0];
+        const int64_t tmp2 = ((tmp1 * quant_ptr[rc != 0]) >> 16) + tmp1;
+        const uint32_t abs_qcoeff =
+            (uint32_t)((tmp2 * quant_shift_ptr[rc != 0]) >> shift);
         qcoeff_ptr[rc] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
-        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / 2;
+        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[rc != 0] / scale;
+        if (abs_qcoeff)
+          eob = i;
       }
-
-      if (abs_qcoeff)
-        eob = i;
     }
   }
   *eob_ptr = eob + 1;
 }
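+// Note: the pre-scan above walks the scan order backwards and trims trailing
+// coefficients lying strictly inside (-zbin, zbin), so the quantization pass
+// only visits the first non_zero_count positions; with log_scale > 0 the
+// zbin/round values are first rescaled with ROUND_POWER_OF_TWO for the
+// larger transform.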
 #endif
 
-void vp10_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
-                                const int16_t *scan, const int16_t *iscan) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  struct macroblock_plane *p = &x->plane[plane];
-  struct macroblockd_plane *pd = &xd->plane[plane];
-
 #if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block),
-                          16, x->skip_block,
-                          p->zbin, p->round, p->quant, p->quant_shift,
-                          BLOCK_OFFSET(p->qcoeff, block),
-                          BLOCK_OFFSET(pd->dqcoeff, block),
-                          pd->dequant, &p->eobs[block],
-                          scan, iscan);
-    return;
+void vp10_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                             int n_coeffs, int skip_block,
+                             const int16_t *round_ptr, const int16_t quant,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                             const int16_t dequant_ptr, uint16_t *eob_ptr,
+                             const int log_scale) {
+  int eob = -1;
+
+  memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
+  memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
+
+  if (!skip_block) {
+    const int coeff = coeff_ptr[0];
+    const int coeff_sign = (coeff >> 31);
+    const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
+    const int64_t tmp = abs_coeff + round_ptr[0];
+    const uint32_t abs_qcoeff = (uint32_t)((tmp * quant) >> (16 - log_scale));
+    qcoeff_ptr[0] = (tran_low_t)((abs_qcoeff ^ coeff_sign) - coeff_sign);
+    dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant_ptr / (1 << log_scale);
+    if (abs_qcoeff)
+      eob = 0;
   }
-#endif
-  vpx_quantize_b(BLOCK_OFFSET(p->coeff, block),
-                 16, x->skip_block,
-                 p->zbin, p->round, p->quant, p->quant_shift,
-                 BLOCK_OFFSET(p->qcoeff, block),
-                 BLOCK_OFFSET(pd->dqcoeff, block),
-                 pd->dequant, &p->eobs[block], scan, iscan);
+  *eob_ptr = eob + 1;
 }
+#endif
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
   unsigned t;
@@ -252,16 +1191,16 @@
   VP10_COMMON *const cm = &cpi->common;
   QUANTS *const quants = &cpi->quants;
   int i, q, quant;
+#if CONFIG_NEW_QUANT
+  int dq;
+#endif
 
   for (q = 0; q < QINDEX_RANGE; q++) {
     const int qzbin_factor = get_qzbin_factor(q, cm->bit_depth);
     const int qrounding_factor = q == 0 ? 64 : 48;
 
     for (i = 0; i < 2; ++i) {
-      int qrounding_factor_fp = i == 0 ? 48 : 42;
-      if (q == 0)
-        qrounding_factor_fp = 64;
-
+      int qrounding_factor_fp = 64;
       // y
       quant = i == 0 ? vp10_dc_quant(q, cm->y_dc_delta_q, cm->bit_depth)
                      : vp10_ac_quant(q, 0, cm->bit_depth);
@@ -284,7 +1223,22 @@
       cpi->uv_dequant[q][i] = quant;
     }
 
-    for (i = 2; i < 8; i++) {
+#if CONFIG_NEW_QUANT
+    for (dq = 0; dq < QUANT_PROFILES; dq++) {
+      for (i = 0; i < COEF_BANDS; i++) {
+        const int quant = cpi->y_dequant[q][i != 0];
+        const int uvquant = cpi->uv_dequant[q][i != 0];
+        vp10_get_dequant_val_nuq(quant, q, i,
+                                 cpi->y_dequant_val_nuq[dq][q][i],
+                                 quants->y_cuml_bins_nuq[dq][q][i], dq);
+        vp10_get_dequant_val_nuq(uvquant, q, i,
+                                 cpi->uv_dequant_val_nuq[dq][q][i],
+                                 quants->uv_cuml_bins_nuq[dq][q][i], dq);
+      }
+    }
+#endif  // CONFIG_NEW_QUANT
+
+    for (i = 2; i < 8; i++) {  // 8: SIMD width
       quants->y_quant[q][i] = quants->y_quant[q][1];
       quants->y_quant_fp[q][i] = quants->y_quant_fp[q][1];
       quants->y_round_fp[q][i] = quants->y_round_fp[q][1];
@@ -304,14 +1258,17 @@
   }
 }
 
-void vp10_init_plane_quantizers(VP10_COMP *cpi, MACROBLOCK *x) {
+void vp10_init_plane_quantizers(const VP10_COMP *cpi, MACROBLOCK *x,
+                                const int segment_id) {
   const VP10_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
-  QUANTS *const quants = &cpi->quants;
-  const int segment_id = xd->mi[0]->mbmi.segment_id;
+  const QUANTS *const quants = &cpi->quants;
   const int qindex = vp10_get_qindex(&cm->seg, segment_id, cm->base_qindex);
   const int rdmult = vp10_compute_rd_mult(cpi, qindex + cm->y_dc_delta_q);
   int i;
+#if CONFIG_NEW_QUANT
+  int dq;
+#endif
 
   // Y
   x->plane[0].quant = quants->y_quant[qindex];
@@ -321,6 +1278,14 @@
   x->plane[0].zbin = quants->y_zbin[qindex];
   x->plane[0].round = quants->y_round[qindex];
   xd->plane[0].dequant = cpi->y_dequant[qindex];
+#if CONFIG_NEW_QUANT
+  for (dq = 0; dq < QUANT_PROFILES; dq++) {
+    x->plane[0].cuml_bins_nuq[dq] = (cuml_bins_type_nuq*)
+                                    quants->y_cuml_bins_nuq[dq][qindex];
+    xd->plane[0].dequant_val_nuq[dq] = (const dequant_val_type_nuq*)
+                                   cpi->y_dequant_val_nuq[dq][qindex];
+  }
+#endif  // CONFIG_NEW_QUANT
 
   x->plane[0].quant_thred[0] = x->plane[0].zbin[0] * x->plane[0].zbin[0];
   x->plane[0].quant_thred[1] = x->plane[0].zbin[1] * x->plane[0].zbin[1];
@@ -334,6 +1299,14 @@
     x->plane[i].zbin = quants->uv_zbin[qindex];
     x->plane[i].round = quants->uv_round[qindex];
     xd->plane[i].dequant = cpi->uv_dequant[qindex];
+#if CONFIG_NEW_QUANT
+    for (dq = 0; dq < QUANT_PROFILES; dq++) {
+      x->plane[i].cuml_bins_nuq[dq] = (cuml_bins_type_nuq*)
+                                      quants->uv_cuml_bins_nuq[dq][qindex];
+      xd->plane[i].dequant_val_nuq[dq] = (const dequant_val_type_nuq*)
+                                     cpi->uv_dequant_val_nuq[dq][qindex];
+    }
+#endif  // CONFIG_NEW_QUANT
 
     x->plane[i].quant_thred[0] = x->plane[i].zbin[0] * x->plane[i].zbin[0];
     x->plane[i].quant_thred[1] = x->plane[i].zbin[1] * x->plane[i].zbin[1];
@@ -342,14 +1315,15 @@
   x->skip_block = segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP);
   x->q_index = qindex;
 
-  x->errorperbit = rdmult >> 6;
-  x->errorperbit += (x->errorperbit == 0);
+  set_error_per_bit(x, rdmult);
 
   vp10_initialize_me_consts(cpi, x, x->q_index);
 }
 
 void vp10_frame_init_quantizer(VP10_COMP *cpi) {
-  vp10_init_plane_quantizers(cpi, &cpi->td.mb);
+  MACROBLOCK *const x = &cpi->td.mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  vp10_init_plane_quantizers(cpi, x, xd->mi[0]->mbmi.segment_id);
 }
 
 void vp10_set_quantizer(VP10_COMMON *cm, int q) {
diff --git a/vp10/encoder/quantize.h b/vp10/encoder/quantize.h
index b44088e..ca57c7e 100644
--- a/vp10/encoder/quantize.h
+++ b/vp10/encoder/quantize.h
@@ -12,13 +12,35 @@
 #define VP10_ENCODER_QUANTIZE_H_
 
 #include "./vpx_config.h"
+#include "vp10/common/scan.h"
 #include "vp10/encoder/block.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
+typedef struct QUANT_PARAM {
+  int log_scale;
+} QUANT_PARAM;
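+// log_scale selects the transform-size scaling used by the quantizer
+// facades in vp10/encoder/quantize.c: 0 for transforms up to 16x16 and 1
+// for 32x32.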
+
+typedef void (*VP10_QUANT_FACADE)(const tran_low_t *coeff_ptr,
+                                  intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+                                  tran_low_t *qcoeff_ptr,
+                                  const MACROBLOCKD_PLANE *pd,
+                                  tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                  const scan_order *sc,
+                                  const QUANT_PARAM *qparam);
+
 typedef struct {
+#if CONFIG_NEW_QUANT
+  DECLARE_ALIGNED(16, tran_low_t,
+                  y_cuml_bins_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]
+                               [NUQ_KNOTS]);
+  DECLARE_ALIGNED(16, tran_low_t,
+                  uv_cuml_bins_nuq[QUANT_PROFILES][QINDEX_RANGE][COEF_BANDS]
+                                [NUQ_KNOTS]);
+#endif  // CONFIG_NEW_QUANT
+  // 0: dc, 1: ac, 2-8: ac repeated to SIMD width
   DECLARE_ALIGNED(16, int16_t, y_quant[QINDEX_RANGE][8]);
   DECLARE_ALIGNED(16, int16_t, y_quant_shift[QINDEX_RANGE][8]);
   DECLARE_ALIGNED(16, int16_t, y_zbin[QINDEX_RANGE][8]);
@@ -37,15 +59,13 @@
   DECLARE_ALIGNED(16, int16_t, uv_round[QINDEX_RANGE][8]);
 } QUANTS;
 
-void vp10_regular_quantize_b_4x4(MACROBLOCK *x, int plane, int block,
-                                const int16_t *scan, const int16_t *iscan);
-
 struct VP10_COMP;
 struct VP10Common;
 
 void vp10_frame_init_quantizer(struct VP10_COMP *cpi);
 
-void vp10_init_plane_quantizers(struct VP10_COMP *cpi, MACROBLOCK *x);
+void vp10_init_plane_quantizers(const struct VP10_COMP *cpi, MACROBLOCK *x,
+                                int segment_id);
 
 void vp10_init_quantizer(struct VP10_COMP *cpi);
 
@@ -55,6 +75,146 @@
 
 int vp10_qindex_to_quantizer(int qindex);
 
+void vp10_quantize_skip(intptr_t n_coeffs, tran_low_t *qcoeff_ptr,
+                        tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr);
+
+void vp10_quantize_fp_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+                             const MACROBLOCKD_PLANE *pd,
+                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                             const scan_order *sc, const QUANT_PARAM *qparam);
+
+void vp10_quantize_b_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                            const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+                            const MACROBLOCKD_PLANE *pd,
+                            tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                            const scan_order *sc, const QUANT_PARAM *qparam);
+
+void vp10_quantize_dc_facade(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                             const MACROBLOCK_PLANE *p, tran_low_t *qcoeff_ptr,
+                             const MACROBLOCKD_PLANE *pd,
+                             tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                             const scan_order *sc, const QUANT_PARAM *qparam);
+
+#if CONFIG_NEW_QUANT
+void quantize_dc_nuq(const tran_low_t *coeff_ptr,
+                     intptr_t n_coeffs,
+                     int skip_block,
+                     const int16_t quant,
+                     const int16_t quant_shift,
+                     const int16_t dequant,
+                     const tran_low_t *cuml_bins_ptr,
+                     const tran_low_t *dequant_val,
+                     tran_low_t *qcoeff_ptr,
+                     tran_low_t *dqcoeff_ptr,
+                     uint16_t *eob_ptr);
+void quantize_dc_32x32_nuq(const tran_low_t *coeff_ptr,
+                           intptr_t n_coeffs,
+                           int skip_block,
+                           const int16_t quant,
+                           const int16_t quant_shift,
+                           const int16_t dequant,
+                           const tran_low_t *cuml_bins_ptr,
+                           const tran_low_t *dequant_val,
+                           tran_low_t *qcoeff_ptr,
+                           tran_low_t *dqcoeff_ptr,
+                           uint16_t *eob_ptr);
+void quantize_dc_fp_nuq(const tran_low_t *coeff_ptr,
+                        intptr_t n_coeffs,
+                        int skip_block,
+                        const int16_t quant,
+                        const int16_t dequant,
+                        const tran_low_t *cuml_bins_ptr,
+                        const tran_low_t *dequant_val,
+                        tran_low_t *qcoeff_ptr,
+                        tran_low_t *dqcoeff_ptr,
+                        uint16_t *eob_ptr);
+void quantize_dc_32x32_fp_nuq(const tran_low_t *coeff_ptr,
+                              intptr_t n_coeffs,
+                              int skip_block,
+                              const int16_t quant,
+                              const int16_t dequant,
+                              const tran_low_t *cuml_bins_ptr,
+                              const tran_low_t *dequant_val,
+                              tran_low_t *qcoeff_ptr,
+                              tran_low_t *dqcoeff_ptr,
+                              uint16_t *eob_ptr);
+#endif  // CONFIG_NEW_QUANT
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vp10_highbd_quantize_fp_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc,
+    const QUANT_PARAM *qparam);
+
+void vp10_highbd_quantize_b_facade(const tran_low_t *coeff_ptr,
+                                   intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+                                   tran_low_t *qcoeff_ptr,
+                                   const MACROBLOCKD_PLANE *pd,
+                                   tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr,
+                                   const scan_order *sc,
+                                   const QUANT_PARAM *qparam);
+
+void vp10_highbd_quantize_dc_facade(
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const MACROBLOCK_PLANE *p,
+    tran_low_t *qcoeff_ptr, const MACROBLOCKD_PLANE *pd,
+    tran_low_t *dqcoeff_ptr, uint16_t *eob_ptr, const scan_order *sc,
+    const QUANT_PARAM *qparam);
+
+void vp10_highbd_quantize_dc(const tran_low_t *coeff_ptr,
+                            int n_coeffs, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant_ptr, uint16_t *eob_ptr,
+                            const int log_scale);
+#if CONFIG_NEW_QUANT
+void highbd_quantize_dc_nuq(const tran_low_t *coeff_ptr,
+                            intptr_t n_coeffs,
+                            int skip_block,
+                            const int16_t quant,
+                            const int16_t quant_shift,
+                            const int16_t dequant,
+                            const tran_low_t *cuml_bins_ptr,
+                            const tran_low_t *dequant_val,
+                            tran_low_t *qcoeff_ptr,
+                            tran_low_t *dqcoeff_ptr,
+                            uint16_t *eob_ptr);
+void highbd_quantize_dc_32x32_nuq(const tran_low_t *coeff_ptr,
+                                  intptr_t n_coeffs,
+                                  int skip_block,
+                                  const int16_t quant,
+                                  const int16_t quant_shift,
+                                  const int16_t dequant,
+                                  const tran_low_t *cuml_bins_ptr,
+                                  const tran_low_t *dequant_val,
+                                  tran_low_t *qcoeff_ptr,
+                                  tran_low_t *dqcoeff_ptr,
+                                  uint16_t *eob_ptr);
+void highbd_quantize_dc_fp_nuq(const tran_low_t *coeff_ptr,
+                               intptr_t n_coeffs,
+                               int skip_block,
+                               const int16_t quant,
+                               const int16_t dequant,
+                               const tran_low_t *cuml_bins_ptr,
+                               const tran_low_t *dequant_val,
+                               tran_low_t *qcoeff_ptr,
+                               tran_low_t *dqcoeff_ptr,
+                               uint16_t *eob_ptr);
+void highbd_quantize_dc_32x32_fp_nuq(const tran_low_t *coeff_ptr,
+                                     intptr_t n_coeffs,
+                                     int skip_block,
+                                     const int16_t quant,
+                                     const int16_t dequant,
+                                     const tran_low_t *cuml_bins_ptr,
+                                     const tran_low_t *dequant_val,
+                                     tran_low_t *qcoeff_ptr,
+                                     tran_low_t *dqcoeff_ptr,
+                                     uint16_t *eob_ptr);
+
+#endif  // CONFIG_NEW_QUANT
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
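All quantizer facades declared above share the VP10_QUANT_FACADE signature, so callers can bind one at runtime through a function pointer. A hedged sketch of that dispatch (the selector helper is an assumption; only the typedef and the facades come from this change):

    /* Illustrative dispatch over the facade typedef; pick the variant
     * once, then invoke it like any other quantizer. */
    static VP10_QUANT_FACADE select_quant_facade(int use_fp, int dc_only) {
      if (dc_only) return vp10_quantize_dc_facade;
      return use_fp ? vp10_quantize_fp_facade : vp10_quantize_b_facade;
    }
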
diff --git a/vp10/encoder/ratectrl.c b/vp10/encoder/ratectrl.c
index 6068775..5dd42d4 100644
--- a/vp10/encoder/ratectrl.c
+++ b/vp10/encoder/ratectrl.c
@@ -240,11 +240,16 @@
   RATE_CONTROL *const rc = &cpi->rc;
 
   // Non-viewable frames are a special case and are treated as pure overhead.
-  if (!cm->show_frame) {
+#if CONFIG_EXT_REFS
+  // TODO(zoeliu): Explore further whether BWDREF_FRAME should be treated
+  //               differently, since it is a no-show frame.
+  if (!cm->show_frame && !rc->is_bwd_ref_frame)
+#else
+  if (!cm->show_frame)
+#endif  // CONFIG_EXT_REFS
     rc->bits_off_target -= encoded_frame_size;
-  } else {
+  else
     rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
-  }
 
   // Clip the buffer level to the maximum specified buffer size.
   rc->bits_off_target = VPXMIN(rc->bits_off_target, rc->maximum_buffer_size);
@@ -947,13 +952,23 @@
 int vp10_frame_type_qdelta(const VP10_COMP *cpi, int rf_level, int q) {
   static const double rate_factor_deltas[RATE_FACTOR_LEVELS] = {
     1.00,  // INTER_NORMAL
+#if CONFIG_EXT_REFS
+    0.80,  // INTER_LOW
+    1.25,  // INTER_HIGH
+#else
     1.00,  // INTER_HIGH
+#endif  // CONFIG_EXT_REFS
     1.50,  // GF_ARF_LOW
     1.75,  // GF_ARF_STD
     2.00,  // KF_STD
   };
   static const FRAME_TYPE frame_type[RATE_FACTOR_LEVELS] =
+#if CONFIG_EXT_REFS
+      { INTER_FRAME, INTER_FRAME, INTER_FRAME,
+        INTER_FRAME, INTER_FRAME, KEY_FRAME };
+#else
       {INTER_FRAME, INTER_FRAME, INTER_FRAME, INTER_FRAME, KEY_FRAME};
+#endif  // CONFIG_EXT_REFS
   const VP10_COMMON *const cm = &cpi->common;
   int qdelta = vp10_compute_qdelta_by_rate(&cpi->rc, frame_type[rf_level],
                                           q, rate_factor_deltas[rf_level],
@@ -1282,7 +1297,7 @@
     }
   }
 
-  // Keep record of last boosted (KF/KF/ARF) Q value.
+  // Keep record of last boosted (KF/GF/ARF) Q value.
   // If the current frame is coded at a lower Q then we also update it.
   // If all mbs in this group are skipped only update if the Q value is
   // better than that already stored.
@@ -1314,7 +1329,12 @@
 
   // Actual bits spent
   rc->total_actual_bits += rc->projected_frame_size;
+#if CONFIG_EXT_REFS
+  rc->total_target_bits += (cm->show_frame || rc->is_bwd_ref_frame) ?
+                            rc->avg_frame_bandwidth : 0;
+#else
   rc->total_target_bits += cm->show_frame ? rc->avg_frame_bandwidth : 0;
+#endif  // CONFIG_EXT_REFS
 
   rc->total_target_vs_actual = rc->total_actual_bits - rc->total_target_bits;
 
@@ -1328,7 +1348,12 @@
 
   if (cm->frame_type == KEY_FRAME)
     rc->frames_since_key = 0;
+
+#if CONFIG_EXT_REFS
+  if (cm->show_frame || rc->is_bwd_ref_frame) {
+#else
   if (cm->show_frame) {
+#endif  // CONFIG_EXT_REFS
     rc->frames_since_key++;
     rc->frames_to_key--;
   }
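Restated in isolation, the buffer-level rule in the first hunk above is small; this sketch mirrors the CONFIG_EXT_REFS branch (the standalone wrapper and its parameter list are illustrative):

    /* Sketch: a frame that is neither shown nor a backward reference only
     * drains the leaky bucket; any other frame also earns one frame's
     * bandwidth budget. Clamping to the maximum buffer size follows. */
    static void update_buffer_level_sketch(RATE_CONTROL *rc, int show_frame,
                                           int is_bwd_ref_frame,
                                           int64_t encoded_frame_size) {
      if (!show_frame && !is_bwd_ref_frame)
        rc->bits_off_target -= encoded_frame_size;
      else
        rc->bits_off_target += rc->avg_frame_bandwidth - encoded_frame_size;
      rc->bits_off_target = VPXMIN(rc->bits_off_target,
                                   rc->maximum_buffer_size);
    }
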
diff --git a/vp10/encoder/ratectrl.h b/vp10/encoder/ratectrl.h
index 0b9fd45..f70429b 100644
--- a/vp10/encoder/ratectrl.h
+++ b/vp10/encoder/ratectrl.h
@@ -28,6 +28,17 @@
 #define MAX_GF_INTERVAL     16
 #define FIXED_GF_INTERVAL   8    // Used in some testing modes only
 
+#if CONFIG_EXT_REFS
+typedef enum {
+  INTER_NORMAL = 0,
+  INTER_LOW = 1,
+  INTER_HIGH = 2,
+  GF_ARF_LOW = 3,
+  GF_ARF_STD = 4,
+  KF_STD = 5,
+  RATE_FACTOR_LEVELS = 6
+} RATE_FACTOR_LEVEL;
+#else
 typedef enum {
   INTER_NORMAL = 0,
   INTER_HIGH = 1,
@@ -36,6 +47,7 @@
   KF_STD = 4,
   RATE_FACTOR_LEVELS = 5
 } RATE_FACTOR_LEVEL;
+#endif  // CONFIG_EXT_REFS
 
 // Internal frame scaling level.
 typedef enum {
@@ -90,6 +102,17 @@
   int source_alt_ref_active;
   int is_src_frame_alt_ref;
 
+#if CONFIG_EXT_REFS
+  // Length of the bi-predictive frame group interval
+  int bipred_group_interval;
+
+  // NOTE: Different frame types may be allocated different numbers of bits,
+  //       with the aim of achieving the best overall RD performance.
+  int is_bwd_ref_frame;
+  int is_last_bipred_frame;
+  int is_bipred_frame;
+#endif  // CONFIG_EXT_REFS
+
   int avg_frame_bandwidth;  // Average frame size target for clip
   int min_frame_bandwidth;  // Minimum allocation used for any frame
   int max_frame_bandwidth;  // Maximum burst rate allowed for a frame.
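With the CONFIG_EXT_REFS levels numbered contiguously (0 through 5, RATE_FACTOR_LEVELS = 6), each level indexes directly into the delta table used by vp10_frame_type_qdelta() in ratectrl.c; a minimal restatement of that lookup:

    /* Sketch: rate factors per level, matching the CONFIG_EXT_REFS branch
     * of rate_factor_deltas[] shown earlier in this change. */
    static double rate_factor_delta(RATE_FACTOR_LEVEL level) {
      static const double deltas[RATE_FACTOR_LEVELS] = {
        1.00,  /* INTER_NORMAL */
        0.80,  /* INTER_LOW    */
        1.25,  /* INTER_HIGH   */
        1.50,  /* GF_ARF_LOW   */
        1.75,  /* GF_ARF_STD   */
        2.00   /* KF_STD       */
      };
      return deltas[level];
    }
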
diff --git a/vp10/encoder/rd.c b/vp10/encoder/rd.c
index f4fdb24..9e0a339 100644
--- a/vp10/encoder/rd.c
+++ b/vp10/encoder/rd.c
@@ -41,7 +41,6 @@
 #include "vp10/encoder/tokenize.h"
 
 #define RD_THRESH_POW      1.25
-#define RD_MULT_EPB_RATIO  64
 
 // Factor to weigh the rate for switchable interp filters.
 #define SWITCHABLE_INTERP_RATE_FACTOR 1
@@ -63,7 +62,10 @@
 // This table is used to correct for block size.
 // The factors here are << 2 (2 = x0.5, 32 = x8 etc).
 static const uint8_t rd_thresh_block_size_factor[BLOCK_SIZES] = {
-  2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32
+  2, 3, 3, 4, 6, 6, 8, 12, 12, 16, 24, 24, 32,
+#if CONFIG_EXT_PARTITION
+  48, 48, 64
+#endif  // CONFIG_EXT_PARTITION
 };
 
 static void fill_mode_costs(VP10_COMP *cpi) {
@@ -75,7 +77,10 @@
       vp10_cost_tokens(cpi->y_mode_costs[i][j], vp10_kf_y_mode_prob[i][j],
                       vp10_intra_mode_tree);
 
-  vp10_cost_tokens(cpi->mbmode_cost, fc->y_mode_prob[1], vp10_intra_mode_tree);
+  for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+    vp10_cost_tokens(cpi->mbmode_cost[i], fc->y_mode_prob[i],
+                     vp10_intra_mode_tree);
+
   for (i = 0; i < INTRA_MODES; ++i)
     vp10_cost_tokens(cpi->intra_uv_mode_cost[i],
                      fc->uv_mode_prob[i], vp10_intra_mode_tree);
@@ -84,6 +89,50 @@
     vp10_cost_tokens(cpi->switchable_interp_costs[i],
                     fc->switchable_interp_prob[i], vp10_switchable_interp_tree);
 
+  for (i = 0; i < PALETTE_BLOCK_SIZES; ++i) {
+    vp10_cost_tokens(cpi->palette_y_size_cost[i],
+                     vp10_default_palette_y_size_prob[i],
+                     vp10_palette_size_tree);
+    vp10_cost_tokens(cpi->palette_uv_size_cost[i],
+                     vp10_default_palette_uv_size_prob[i],
+                     vp10_palette_size_tree);
+  }
+
+  for (i = 0; i < PALETTE_MAX_SIZE - 1; ++i)
+    for (j = 0; j < PALETTE_COLOR_CONTEXTS; ++j) {
+      vp10_cost_tokens(cpi->palette_y_color_cost[i][j],
+                       vp10_default_palette_y_color_prob[i][j],
+                       vp10_palette_color_tree[i]);
+      vp10_cost_tokens(cpi->palette_uv_color_cost[i][j],
+                       vp10_default_palette_uv_color_prob[i][j],
+                       vp10_palette_color_tree[i]);
+    }
+
+  for (i = 0; i < TX_SIZES - 1; ++i)
+    for (j = 0; j < TX_SIZE_CONTEXTS; ++j)
+      vp10_cost_tokens(cpi->tx_size_cost[i][j], fc->tx_size_probs[i][j],
+                       vp10_tx_size_tree[i]);
+
+#if CONFIG_EXT_TX
+  for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
+    int s;
+    for (s = 1; s < EXT_TX_SETS_INTER; ++s) {
+      if (use_inter_ext_tx_for_txsize[s][i]) {
+        vp10_cost_tokens(cpi->inter_tx_type_costs[s][i],
+                         fc->inter_ext_tx_prob[s][i],
+                         vp10_ext_tx_inter_tree[s]);
+      }
+    }
+    for (s = 1; s < EXT_TX_SETS_INTRA; ++s) {
+      if (use_intra_ext_tx_for_txsize[s][i]) {
+        for (j = 0; j < INTRA_MODES; ++j)
+          vp10_cost_tokens(cpi->intra_tx_type_costs[s][i][j],
+                           fc->intra_ext_tx_prob[s][i][j],
+                           vp10_ext_tx_intra_tree[s]);
+      }
+    }
+  }
+#else
   for (i = TX_4X4; i < EXT_TX_SIZES; ++i) {
     for (j = 0; j < TX_TYPES; ++j)
       vp10_cost_tokens(cpi->intra_tx_type_costs[i][j],
@@ -95,10 +144,19 @@
                      fc->inter_ext_tx_prob[i],
                      vp10_ext_tx_tree);
   }
+#endif  // CONFIG_EXT_TX
+#if CONFIG_EXT_INTRA
+  for (i = 0; i < INTRA_FILTERS + 1; ++i)
+    vp10_cost_tokens(cpi->intra_filter_cost[i], fc->intra_filter_probs[i],
+                     vp10_intra_filter_tree);
+#endif  // CONFIG_EXT_INTRA
 }
 
-static void fill_token_costs(vp10_coeff_cost *c,
-                             vp10_coeff_probs_model (*p)[PLANE_TYPES]) {
+void vp10_fill_token_costs(vp10_coeff_cost *c,
+#if CONFIG_ANS
+                           coeff_cdf_model (*cdf)[PLANE_TYPES],
+#endif  // CONFIG_ANS
+                           vp10_coeff_probs_model (*p)[PLANE_TYPES]) {
   int i, j, k, l;
   TX_SIZE t;
   for (t = TX_4X4; t <= TX_32X32; ++t)
@@ -106,12 +164,20 @@
       for (j = 0; j < REF_TYPES; ++j)
         for (k = 0; k < COEF_BANDS; ++k)
           for (l = 0; l < BAND_COEFF_CONTEXTS(k); ++l) {
+#if CONFIG_ANS
+            const vpx_prob *const tree_probs = p[t][i][j][k][l];
+            vp10_cost_tokens_ans((int *)c[t][i][j][k][0][l], tree_probs,
+                                 cdf[t][i][j][k][l], 0);
+            vp10_cost_tokens_ans((int *)c[t][i][j][k][1][l], tree_probs,
+                                 cdf[t][i][j][k][l], 1);
+#else
             vpx_prob probs[ENTROPY_NODES];
             vp10_model_to_full_probs(p[t][i][j][k][l], probs);
             vp10_cost_tokens((int *)c[t][i][j][k][0][l], probs,
                             vp10_coef_tree);
             vp10_cost_tokens_skip((int *)c[t][i][j][k][1][l], probs,
                                  vp10_coef_tree);
+#endif  // CONFIG_ANS
             assert(c[t][i][j][k][0][l][EOB_TOKEN] ==
                    c[t][i][j][k][1][l][EOB_TOKEN]);
           }
@@ -157,7 +223,11 @@
   8, 8, 4, 4, 2, 2, 1, 0
 };
 static const int rd_frame_type_factor[FRAME_UPDATE_TYPES] = {
-  128, 144, 128, 128, 144
+  128, 144, 128, 128, 144,
+#if CONFIG_EXT_REFS
+  // TODO(zoeliu): Adjust the following factor values further.
+  128, 128, 128
+#endif  // CONFIG_EXT_REFS
 };
 
 int vp10_compute_rd_mult(const VP10_COMP *cpi, int qindex) {
@@ -219,7 +289,8 @@
   return VPXMAX((int)(pow(q, RD_THRESH_POW) * 5.12), 8);
 }
 
-void vp10_initialize_me_consts(VP10_COMP *cpi, MACROBLOCK *x, int qindex) {
+void vp10_initialize_me_consts(const VP10_COMP *cpi, MACROBLOCK *x,
+                               int qindex) {
 #if CONFIG_VP9_HIGHBITDEPTH
   switch (cpi->common.bit_depth) {
     case VPX_BITS_8:
@@ -276,6 +347,21 @@
   }
 }
 
+#if CONFIG_REF_MV
+void vp10_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame) {
+  MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
+  int nmv_ctx = vp10_nmv_ctx(mbmi_ext->ref_mv_count[ref_frame],
+                             mbmi_ext->ref_mv_stack[ref_frame]);
+  x->mvcost = x->mv_cost_stack[nmv_ctx];
+  x->nmvjointcost = x->nmv_vec_cost[nmv_ctx];
+  x->mvsadcost = x->mvcost;
+  x->nmvjointsadcost = x->nmvjointcost;
+
+  x->nmv_vec_cost[nmv_ctx][MV_JOINT_ZERO] =
+      x->zero_rmv_cost[nmv_ctx][1] - x->zero_rmv_cost[nmv_ctx][0];
+}
+#endif  // CONFIG_REF_MV
+
 void vp10_initialize_rd_consts(VP10_COMP *cpi) {
   VP10_COMMON *const cm = &cpi->common;
   MACROBLOCK *const x = &cpi->td.mb;
@@ -287,34 +373,113 @@
   rd->RDDIV = RDDIV_BITS;  // In bits (to multiply D by 128).
   rd->RDMULT = vp10_compute_rd_mult(cpi, cm->base_qindex + cm->y_dc_delta_q);
 
-  x->errorperbit = rd->RDMULT / RD_MULT_EPB_RATIO;
-  x->errorperbit += (x->errorperbit == 0);
-
-  x->select_tx_size = (cpi->sf.tx_size_search_method == USE_LARGESTALL &&
-                       cm->frame_type != KEY_FRAME) ? 0 : 1;
+  set_error_per_bit(x, rd->RDMULT);
 
   set_block_thresholds(cm, rd);
 
-  fill_token_costs(x->token_costs, cm->fc->coef_probs);
-
-  if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
-      cm->frame_type == KEY_FRAME) {
-    for (i = 0; i < PARTITION_CONTEXTS; ++i)
-      vp10_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
-                      vp10_partition_tree);
-  }
-
-  fill_mode_costs(cpi);
-
   if (!frame_is_intra_only(cm)) {
-    vp10_build_nmv_cost_table(x->nmvjointcost,
-                             cm->allow_high_precision_mv ? x->nmvcost_hp
-                                                         : x->nmvcost,
-                             &cm->fc->nmvc, cm->allow_high_precision_mv);
+#if CONFIG_REF_MV
+    int nmv_ctx;
 
-    for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
-      vp10_cost_tokens((int *)cpi->inter_mode_cost[i],
-                      cm->fc->inter_mode_probs[i], vp10_inter_mode_tree);
+    for (nmv_ctx = 0; nmv_ctx < NMV_CONTEXTS; ++nmv_ctx) {
+      vpx_prob tmp_prob = cm->fc->nmvc[nmv_ctx].joints[MV_JOINT_ZERO];
+      cm->fc->nmvc[nmv_ctx].joints[MV_JOINT_ZERO] = 1;
+
+      vp10_build_nmv_cost_table(
+          x->nmv_vec_cost[nmv_ctx],
+          cm->allow_high_precision_mv ? x->nmvcost_hp[nmv_ctx]
+                                      : x->nmvcost[nmv_ctx],
+          &cm->fc->nmvc[nmv_ctx], cm->allow_high_precision_mv);
+      cm->fc->nmvc[nmv_ctx].joints[MV_JOINT_ZERO] = tmp_prob;
+
+      x->nmv_vec_cost[nmv_ctx][MV_JOINT_ZERO] = 0;
+      x->zero_rmv_cost[nmv_ctx][0] =
+          vp10_cost_bit(cm->fc->nmvc[nmv_ctx].zero_rmv, 0);
+      x->zero_rmv_cost[nmv_ctx][1] =
+          vp10_cost_bit(cm->fc->nmvc[nmv_ctx].zero_rmv, 1);
+    }
+    x->mvcost = x->mv_cost_stack[0];
+    x->nmvjointcost = x->nmv_vec_cost[0];
+    x->mvsadcost = x->mvcost;
+    x->nmvjointsadcost = x->nmvjointcost;
+#else
+    vp10_build_nmv_cost_table(
+        x->nmvjointcost,
+        cm->allow_high_precision_mv ? x->nmvcost_hp : x->nmvcost, &cm->fc->nmvc,
+        cm->allow_high_precision_mv);
+#endif  // CONFIG_REF_MV
+  }
+  if (cpi->oxcf.pass != 1) {
+    vp10_fill_token_costs(x->token_costs,
+#if CONFIG_ANS
+                          cm->fc->coef_cdfs,
+#endif  // CONFIG_ANS
+                          cm->fc->coef_probs);
+
+    if (cpi->sf.partition_search_type != VAR_BASED_PARTITION ||
+        cm->frame_type == KEY_FRAME) {
+#if CONFIG_EXT_PARTITION_TYPES
+      vp10_cost_tokens(cpi->partition_cost[0], cm->fc->partition_prob[0],
+                       vp10_partition_tree);
+      for (i = 1; i < PARTITION_CONTEXTS; ++i)
+        vp10_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
+                         vp10_ext_partition_tree);
+#else
+      for (i = 0; i < PARTITION_CONTEXTS; ++i)
+        vp10_cost_tokens(cpi->partition_cost[i], cm->fc->partition_prob[i],
+                         vp10_partition_tree);
+#endif  // CONFIG_EXT_PARTITION_TYPES
+    }
+
+    fill_mode_costs(cpi);
+
+    if (!frame_is_intra_only(cm)) {
+#if CONFIG_REF_MV
+      for (i = 0; i < NEWMV_MODE_CONTEXTS; ++i) {
+        cpi->newmv_mode_cost[i][0] = vp10_cost_bit(cm->fc->newmv_prob[i], 0);
+        cpi->newmv_mode_cost[i][1] = vp10_cost_bit(cm->fc->newmv_prob[i], 1);
+      }
+
+      for (i = 0; i < ZEROMV_MODE_CONTEXTS; ++i) {
+        cpi->zeromv_mode_cost[i][0] = vp10_cost_bit(cm->fc->zeromv_prob[i], 0);
+        cpi->zeromv_mode_cost[i][1] = vp10_cost_bit(cm->fc->zeromv_prob[i], 1);
+      }
+
+      for (i = 0; i < REFMV_MODE_CONTEXTS; ++i) {
+        cpi->refmv_mode_cost[i][0] = vp10_cost_bit(cm->fc->refmv_prob[i], 0);
+        cpi->refmv_mode_cost[i][1] = vp10_cost_bit(cm->fc->refmv_prob[i], 1);
+      }
+
+      for (i = 0; i < DRL_MODE_CONTEXTS; ++i) {
+        cpi->drl_mode_cost0[i][0] = vp10_cost_bit(cm->fc->drl_prob[i], 0);
+        cpi->drl_mode_cost0[i][1] = vp10_cost_bit(cm->fc->drl_prob[i], 1);
+      }
+#if CONFIG_EXT_INTER
+      cpi->new2mv_mode_cost[0] = vp10_cost_bit(cm->fc->new2mv_prob, 0);
+      cpi->new2mv_mode_cost[1] = vp10_cost_bit(cm->fc->new2mv_prob, 1);
+#endif  // CONFIG_EXT_INTER
+#else
+      for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+        vp10_cost_tokens((int *)cpi->inter_mode_cost[i],
+                         cm->fc->inter_mode_probs[i], vp10_inter_mode_tree);
+#endif  // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+      for (i = 0; i < INTER_MODE_CONTEXTS; ++i)
+        vp10_cost_tokens((int *)cpi->inter_compound_mode_cost[i],
+                         cm->fc->inter_compound_mode_probs[i],
+                         vp10_inter_compound_mode_tree);
+      for (i = 0; i < BLOCK_SIZE_GROUPS; ++i)
+        vp10_cost_tokens((int *)cpi->interintra_mode_cost[i],
+                         cm->fc->interintra_mode_prob[i],
+                         vp10_interintra_mode_tree);
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+      for (i = BLOCK_8X8; i < BLOCK_SIZES; i++) {
+        vp10_cost_tokens((int *)cpi->motvar_cost[i],
+                         cm->fc->motvar_prob[i], vp10_motvar_tree);
+      }
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+    }
   }
 }
 
@@ -412,16 +577,15 @@
         (((uint64_t)qstep * qstep << (n_log2 + 10)) + (var >> 1)) / var;
     const int xsq_q10 = (int)VPXMIN(xsq_q10_64, MAX_XSQ_Q10);
     model_rd_norm(xsq_q10, &r_q10, &d_q10);
-    *rate = ((r_q10 << n_log2) + 2) >> 2;
+    *rate = ROUND_POWER_OF_TWO(r_q10 << n_log2, 10 - VP9_PROB_COST_SHIFT);
     *dist = (var * (int64_t)d_q10 + 512) >> 10;
   }
 }
 
-void vp10_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
-                              const struct macroblockd_plane *pd,
-                              ENTROPY_CONTEXT t_above[16],
-                              ENTROPY_CONTEXT t_left[16]) {
-  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+static void get_entropy_contexts_plane(
+    BLOCK_SIZE plane_bsize, TX_SIZE tx_size, const struct macroblockd_plane *pd,
+    ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
+    ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) {
   const int num_4x4_w = num_4x4_blocks_wide_lookup[plane_bsize];
   const int num_4x4_h = num_4x4_blocks_high_lookup[plane_bsize];
   const ENTROPY_CONTEXT *const above = pd->above_context;
@@ -457,6 +621,14 @@
   }
 }
 
+void vp10_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
+                              const struct macroblockd_plane *pd,
+                              ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
+                              ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]) {
+  const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+  get_entropy_contexts_plane(plane_bsize, tx_size, pd, t_above, t_left);
+}
+
 void vp10_mv_pred(VP10_COMP *cpi, MACROBLOCK *x,
                  uint8_t *ref_y_buffer, int ref_y_stride,
                  int ref_frame, BLOCK_SIZE block_size) {
@@ -559,13 +731,36 @@
           &cm->buffer_pool->frame_bufs[scaled_idx].buf : NULL;
 }
 
+#if CONFIG_DUAL_FILTER
+int vp10_get_switchable_rate(const VP10_COMP *cpi,
+                             const MACROBLOCKD *const xd) {
+  const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  int inter_filter_cost = 0;
+  int dir;
+
+  for (dir = 0; dir < 2; ++dir) {
+    if (has_subpel_mv_component(xd->mi[0], xd, dir) ||
+        (mbmi->ref_frame[1] > INTRA_FRAME &&
+         has_subpel_mv_component(xd->mi[0], xd, dir + 2))) {
+      const int ctx = vp10_get_pred_context_switchable_interp(xd, dir);
+      inter_filter_cost +=
+          cpi->switchable_interp_costs[ctx][mbmi->interp_filter[dir]];
+    }
+  }
+  return SWITCHABLE_INTERP_RATE_FACTOR * inter_filter_cost;
+}
+#else
 int vp10_get_switchable_rate(const VP10_COMP *cpi,
                              const MACROBLOCKD *const xd) {
   const MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
   const int ctx = vp10_get_pred_context_switchable_interp(xd);
+#if CONFIG_EXT_INTERP
+  if (!vp10_is_interp_needed(xd)) return 0;
+#endif  // CONFIG_EXT_INTERP
   return SWITCHABLE_INTERP_RATE_FACTOR *
-             cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
+      cpi->switchable_interp_costs[ctx][mbmi->interp_filter];
 }
+#endif  // CONFIG_DUAL_FILTER
 
 void vp10_set_rd_speed_thresholds(VP10_COMP *cpi) {
   int i;
@@ -578,66 +773,296 @@
 
   if (sf->adaptive_rd_thresh) {
     rd->thresh_mult[THR_NEARESTMV] = 300;
-    rd->thresh_mult[THR_NEARESTG] = 300;
+#if CONFIG_EXT_REFS
+    rd->thresh_mult[THR_NEARESTL2] = 300;
+    rd->thresh_mult[THR_NEARESTL3] = 300;
+    rd->thresh_mult[THR_NEARESTB] = 300;
+#endif  // CONFIG_EXT_REFS
     rd->thresh_mult[THR_NEARESTA] = 300;
+    rd->thresh_mult[THR_NEARESTG] = 300;
   } else {
     rd->thresh_mult[THR_NEARESTMV] = 0;
-    rd->thresh_mult[THR_NEARESTG] = 0;
+#if CONFIG_EXT_REFS
+    rd->thresh_mult[THR_NEARESTL2] = 0;
+    rd->thresh_mult[THR_NEARESTL3] = 0;
+    rd->thresh_mult[THR_NEARESTB] = 0;
+#endif  // CONFIG_EXT_REFS
     rd->thresh_mult[THR_NEARESTA] = 0;
+    rd->thresh_mult[THR_NEARESTG] = 0;
   }
 
   rd->thresh_mult[THR_DC] += 1000;
 
   rd->thresh_mult[THR_NEWMV] += 1000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_NEWL2] += 1000;
+  rd->thresh_mult[THR_NEWL3] += 1000;
+  rd->thresh_mult[THR_NEWB] += 1000;
+#endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_NEWA] += 1000;
   rd->thresh_mult[THR_NEWG] += 1000;
 
   rd->thresh_mult[THR_NEARMV] += 1000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_NEARL2] += 1000;
+  rd->thresh_mult[THR_NEARL3] += 1000;
+  rd->thresh_mult[THR_NEARB] += 1000;
+#endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_NEARA] += 1000;
-  rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
-  rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;
+  rd->thresh_mult[THR_NEARG] += 1000;
+
+#if CONFIG_EXT_INTER
+  rd->thresh_mult[THR_NEWFROMNEARMV] += 1000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_NEWFROMNEARL2] += 1000;
+  rd->thresh_mult[THR_NEWFROMNEARL3] += 1000;
+  rd->thresh_mult[THR_NEWFROMNEARB] += 1000;
+#endif  // CONFIG_EXT_REFS
+  rd->thresh_mult[THR_NEWFROMNEARA] += 1000;
+  rd->thresh_mult[THR_NEWFROMNEARG] += 1000;
+#endif  // CONFIG_EXT_INTER
+
+  rd->thresh_mult[THR_ZEROMV] += 2000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_ZEROL2] += 2000;
+  rd->thresh_mult[THR_ZEROL3] += 2000;
+  rd->thresh_mult[THR_ZEROB] += 2000;
+#endif  // CONFIG_EXT_REFS
+  rd->thresh_mult[THR_ZEROG] += 2000;
+  rd->thresh_mult[THR_ZEROA] += 2000;
 
   rd->thresh_mult[THR_TM] += 1000;
 
+#if CONFIG_EXT_INTER
+
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLA] += 1000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2A] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3A] += 1000;
+#endif  // CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTGA] += 1000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTLB] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL2B] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTL3B] += 1000;
+  rd->thresh_mult[THR_COMP_NEAREST_NEARESTGB] += 1000;
+#endif  // CONFIG_EXT_REFS
+
+#else  // CONFIG_EXT_INTER
+
+  rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEARESTL2A] += 1000;
+  rd->thresh_mult[THR_COMP_NEARESTL3A] += 1000;
+#endif  // CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEARESTLB] += 1000;
+  rd->thresh_mult[THR_COMP_NEARESTL2B] += 1000;
+  rd->thresh_mult[THR_COMP_NEARESTL3B] += 1000;
+  rd->thresh_mult[THR_COMP_NEARESTGB] += 1000;
+#endif  // CONFIG_EXT_REFS
+
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_EXT_INTER
+
+  rd->thresh_mult[THR_COMP_NEAREST_NEARLA] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTLA] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARLA] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLA] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLA] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLA] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARLA] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWLA] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROLA] += 2500;
+
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEAREST_NEARL2A] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTL2A] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARL2A] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWL2A] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTL2A] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL2A] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARL2A] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWL2A] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROL2A] += 2500;
+
+  rd->thresh_mult[THR_COMP_NEAREST_NEARL3A] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTL3A] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARL3A] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWL3A] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTL3A] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL3A] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARL3A] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWL3A] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROL3A] += 2500;
+#endif  // CONFIG_EXT_REFS
+
+  rd->thresh_mult[THR_COMP_NEAREST_NEARGA] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTGA] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARGA] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWGA] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTGA] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWGA] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARGA] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWGA] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROGA] += 2500;
+
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEAREST_NEARLB] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTLB] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARLB] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWLB] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTLB] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWLB] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARLB] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWLB] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROLB] += 2500;
+
+  rd->thresh_mult[THR_COMP_NEAREST_NEARL2B] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTL2B] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARL2B] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWL2B] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTL2B] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL2B] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARL2B] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWL2B] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROL2B] += 2500;
+
+  rd->thresh_mult[THR_COMP_NEAREST_NEARL3B] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTL3B] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARL3B] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWL3B] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTL3B] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWL3B] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARL3B] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWL3B] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROL3B] += 2500;
+
+  rd->thresh_mult[THR_COMP_NEAREST_NEARGB] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARESTGB] += 1200;
+  rd->thresh_mult[THR_COMP_NEAR_NEARGB] += 1200;
+  rd->thresh_mult[THR_COMP_NEAREST_NEWGB] += 1500;
+  rd->thresh_mult[THR_COMP_NEW_NEARESTGB] += 1500;
+  rd->thresh_mult[THR_COMP_NEAR_NEWGB] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEARGB] += 1700;
+  rd->thresh_mult[THR_COMP_NEW_NEWGB] += 2000;
+  rd->thresh_mult[THR_COMP_ZERO_ZEROGB] += 2500;
+#endif  // CONFIG_EXT_REFS
+
+#else  // CONFIG_EXT_INTER
+
   rd->thresh_mult[THR_COMP_NEARLA] += 1500;
   rd->thresh_mult[THR_COMP_NEWLA] += 2000;
-  rd->thresh_mult[THR_NEARG] += 1000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEARL2A] += 1500;
+  rd->thresh_mult[THR_COMP_NEWL2A] += 2000;
+  rd->thresh_mult[THR_COMP_NEARL3A] += 1500;
+  rd->thresh_mult[THR_COMP_NEWL3A] += 2000;
+#endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_COMP_NEARGA] += 1500;
   rd->thresh_mult[THR_COMP_NEWGA] += 2000;
 
-  rd->thresh_mult[THR_ZEROMV] += 2000;
-  rd->thresh_mult[THR_ZEROG] += 2000;
-  rd->thresh_mult[THR_ZEROA] += 2000;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_NEARLB] += 1500;
+  rd->thresh_mult[THR_COMP_NEWLB] += 2000;
+  rd->thresh_mult[THR_COMP_NEARL2B] += 1500;
+  rd->thresh_mult[THR_COMP_NEWL2B] += 2000;
+  rd->thresh_mult[THR_COMP_NEARL3B] += 1500;
+  rd->thresh_mult[THR_COMP_NEWL3B] += 2000;
+  rd->thresh_mult[THR_COMP_NEARGB] += 1500;
+  rd->thresh_mult[THR_COMP_NEWGB] += 2000;
+#endif  // CONFIG_EXT_REFS
+
   rd->thresh_mult[THR_COMP_ZEROLA] += 2500;
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_ZEROL2A] += 2500;
+  rd->thresh_mult[THR_COMP_ZEROL3A] += 2500;
+#endif  // CONFIG_EXT_REFS
   rd->thresh_mult[THR_COMP_ZEROGA] += 2500;
 
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_ZEROLB] += 2500;
+  rd->thresh_mult[THR_COMP_ZEROL2B] += 2500;
+  rd->thresh_mult[THR_COMP_ZEROL3B] += 2500;
+  rd->thresh_mult[THR_COMP_ZEROGB] += 2500;
+#endif  // CONFIG_EXT_REFS
+
+#endif  // CONFIG_EXT_INTER
+
   rd->thresh_mult[THR_H_PRED] += 2000;
   rd->thresh_mult[THR_V_PRED] += 2000;
-  rd->thresh_mult[THR_D45_PRED ] += 2500;
   rd->thresh_mult[THR_D135_PRED] += 2500;
-  rd->thresh_mult[THR_D117_PRED] += 2500;
-  rd->thresh_mult[THR_D153_PRED] += 2500;
   rd->thresh_mult[THR_D207_PRED] += 2500;
+  rd->thresh_mult[THR_D153_PRED] += 2500;
   rd->thresh_mult[THR_D63_PRED] += 2500;
+  rd->thresh_mult[THR_D117_PRED] += 2500;
+  rd->thresh_mult[THR_D45_PRED] += 2500;
+
+#if CONFIG_EXT_INTER
+  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL   ] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARL   ] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEWL    ] += 2000;
+
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL2  ] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL2] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARL2  ] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEWL2   ] += 2000;
+
+  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROL3  ] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTL3] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARL3  ] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEWL3   ] += 2000;
+#endif  // CONFIG_EXT_REFS
+
+  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROG   ] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTG] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARG   ] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEWG    ] += 2000;
+
+#if CONFIG_EXT_REFS
+  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROB   ] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTB] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARB   ] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEWB    ] += 2000;
+#endif  // CONFIG_EXT_REFS
+
+  rd->thresh_mult[THR_COMP_INTERINTRA_ZEROA   ] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARESTA] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEARA   ] += 1500;
+  rd->thresh_mult[THR_COMP_INTERINTRA_NEWA    ] += 2000;
+#endif  // CONFIG_EXT_INTER
 }
 
 void vp10_set_rd_speed_thresholds_sub8x8(VP10_COMP *cpi) {
-  static const int thresh_mult[2][MAX_REFS] =
-      {{2500, 2500, 2500, 4500, 4500, 2500},
-       {2000, 2000, 2000, 4000, 4000, 2000}};
+  static const int thresh_mult[2][MAX_REFS] = {
+#if CONFIG_EXT_REFS
+    { 2500, 2500, 2500, 2500, 2500, 2500, 4500, 4500,
+      4500, 4500, 4500, 4500, 4500, 4500, 2500 },
+    { 2000, 2000, 2000, 2000, 2000, 2000, 4000, 4000,
+      4000, 4000, 4000, 4000, 4000, 4000, 2000 }
+#else
+    { 2500, 2500, 2500, 4500, 4500, 2500 },
+    { 2000, 2000, 2000, 4000, 4000, 2000 }
+#endif  // CONFIG_EXT_REFS
+  };
   RD_OPT *const rd = &cpi->rd;
   const int idx = cpi->oxcf.mode == BEST;
   memcpy(rd->thresh_mult_sub8x8, thresh_mult[idx], sizeof(thresh_mult[idx]));
 }
 
-void vp10_update_rd_thresh_fact(int (*factor_buf)[MAX_MODES], int rd_thresh,
-                               int bsize, int best_mode_index) {
+void vp10_update_rd_thresh_fact(const VP10_COMMON *const cm,
+                                int (*factor_buf)[MAX_MODES], int rd_thresh,
+                                int bsize, int best_mode_index) {
   if (rd_thresh > 0) {
     const int top_mode = bsize < BLOCK_8X8 ? MAX_REFS : MAX_MODES;
     int mode;
     for (mode = 0; mode < top_mode; ++mode) {
       const BLOCK_SIZE min_size = VPXMAX(bsize - 1, BLOCK_4X4);
-      const BLOCK_SIZE max_size = VPXMIN(bsize + 2, BLOCK_64X64);
+      const BLOCK_SIZE max_size = VPXMIN(bsize + 2, cm->sb_size);
       BLOCK_SIZE bs;
       for (bs = min_size; bs <= max_size; ++bs) {
         int *const fact = &factor_buf[bs][mode];
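vp10_update_rd_thresh_fact() now clamps its block-size neighborhood to cm->sb_size rather than a hard BLOCK_64X64, so the adaptation also covers the larger superblocks added by CONFIG_EXT_PARTITION. The inner per-factor step is not changed by this patch; a sketch of it, assuming the VP9-era body carries over unchanged:

    /* Assumed inner update (its body is outside this hunk): the winning
     * mode's factor decays geometrically while every other factor climbs
     * toward the rd_thresh * RD_THRESH_MAX_FACT cap. */
    static void update_one_factor(int *fact, int is_best_mode, int rd_thresh) {
      if (is_best_mode)
        *fact -= (*fact >> 4);
      else
        *fact = VPXMIN(*fact + RD_THRESH_INC,
                       rd_thresh * RD_THRESH_MAX_FACT);
    }
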
diff --git a/vp10/encoder/rd.h b/vp10/encoder/rd.h
index cd58bf8..3b5f470 100644
--- a/vp10/encoder/rd.h
+++ b/vp10/encoder/rd.h
@@ -13,19 +13,29 @@
 
 #include <limits.h>
 
+#if CONFIG_ANS
+#include "vp10/common/ans.h"
+#endif  // CONFIG_ANS
 #include "vp10/common/blockd.h"
 
 #include "vp10/encoder/block.h"
 #include "vp10/encoder/context_tree.h"
+#include "vp10/encoder/cost.h"
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 #define RDDIV_BITS          7
+#define RD_EPB_SHIFT        6
 
 #define RDCOST(RM, DM, R, D) \
-  (((128 + ((int64_t)R) * (RM)) >> 8) + (D << DM))
+  (ROUND_POWER_OF_TWO(((int64_t)R) * (RM), VP9_PROB_COST_SHIFT) + (D << DM))
+
+#define RDCOST_DBL(RM, DM, R, D)                                   \
+  (((((double)(R)) * (RM)) / (double)(1 << VP9_PROB_COST_SHIFT)) + \
+   ((double)(D) * (1 << (DM))))
+
 #define QIDX_SKIP_THRESH     115
 
 #define MV_COST_WEIGHT      108
@@ -33,8 +43,29 @@
 
 #define INVALID_MV 0x80008000
 
+#if CONFIG_EXT_REFS
+
+#if CONFIG_EXT_INTER
+#define MAX_MODES 144
+#else  // CONFIG_EXT_INTER
+#define MAX_MODES 66
+#endif  // CONFIG_EXT_INTER
+
+#else  // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_INTER
+#define MAX_MODES 57
+#else  // CONFIG_EXT_INTER
 #define MAX_MODES 30
+#endif  // CONFIG_EXT_INTER
+
+#endif  // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+#define MAX_REFS  15
+#else
 #define MAX_REFS  6
+#endif  // CONFIG_EXT_REFS
 
 #define RD_THRESH_MAX_FACT 64
 #define RD_THRESH_INC      1
@@ -43,36 +74,214 @@
 // const MODE_DEFINITION vp10_mode_order[MAX_MODES] used in the rd code.
 typedef enum {
   THR_NEARESTMV,
+#if CONFIG_EXT_REFS
+  THR_NEARESTL2,
+  THR_NEARESTL3,
+  THR_NEARESTB,
+#endif  // CONFIG_EXT_REFS
   THR_NEARESTA,
   THR_NEARESTG,
 
   THR_DC,
 
   THR_NEWMV,
+#if CONFIG_EXT_REFS
+  THR_NEWL2,
+  THR_NEWL3,
+  THR_NEWB,
+#endif  // CONFIG_EXT_REFS
   THR_NEWA,
   THR_NEWG,
 
   THR_NEARMV,
+#if CONFIG_EXT_REFS
+  THR_NEARL2,
+  THR_NEARL3,
+  THR_NEARB,
+#endif  // CONFIG_EXT_REFS
   THR_NEARA,
   THR_NEARG,
 
+#if CONFIG_EXT_INTER
+  THR_NEWFROMNEARMV,
+#if CONFIG_EXT_REFS
+  THR_NEWFROMNEARL2,
+  THR_NEWFROMNEARL3,
+  THR_NEWFROMNEARB,
+#endif  // CONFIG_EXT_REFS
+  THR_NEWFROMNEARA,
+  THR_NEWFROMNEARG,
+#endif  // CONFIG_EXT_INTER
+
   THR_ZEROMV,
+#if CONFIG_EXT_REFS
+  THR_ZEROL2,
+  THR_ZEROL3,
+  THR_ZEROB,
+#endif  // CONFIG_EXT_REFS
   THR_ZEROG,
   THR_ZEROA,
 
+#if CONFIG_EXT_INTER
+
+  THR_COMP_NEAREST_NEARESTLA,
+#if CONFIG_EXT_REFS
+  THR_COMP_NEAREST_NEARESTL2A,
+  THR_COMP_NEAREST_NEARESTL3A,
+#endif  // CONFIG_EXT_REFS
+  THR_COMP_NEAREST_NEARESTGA,
+#if CONFIG_EXT_REFS
+  THR_COMP_NEAREST_NEARESTLB,
+  THR_COMP_NEAREST_NEARESTL2B,
+  THR_COMP_NEAREST_NEARESTL3B,
+  THR_COMP_NEAREST_NEARESTGB,
+#endif  // CONFIG_EXT_REFS
+
+#else  // CONFIG_EXT_INTER
+
   THR_COMP_NEARESTLA,
+#if CONFIG_EXT_REFS
+  THR_COMP_NEARESTL2A,
+  THR_COMP_NEARESTL3A,
+#endif  // CONFIG_EXT_REFS
   THR_COMP_NEARESTGA,
+#if CONFIG_EXT_REFS
+  THR_COMP_NEARESTLB,
+  THR_COMP_NEARESTL2B,
+  THR_COMP_NEARESTL3B,
+  THR_COMP_NEARESTGB,
+#endif  // CONFIG_EXT_REFS
+
+#endif  // CONFIG_EXT_INTER
 
   THR_TM,
 
+#if CONFIG_EXT_INTER
+
+  THR_COMP_NEAR_NEARESTLA,
+  THR_COMP_NEAREST_NEARLA,
+  THR_COMP_NEAR_NEARLA,
+  THR_COMP_NEW_NEARESTLA,
+  THR_COMP_NEAREST_NEWLA,
+  THR_COMP_NEW_NEARLA,
+  THR_COMP_NEAR_NEWLA,
+  THR_COMP_NEW_NEWLA,
+  THR_COMP_ZERO_ZEROLA,
+
+#if CONFIG_EXT_REFS
+  THR_COMP_NEAR_NEARESTL2A,
+  THR_COMP_NEAREST_NEARL2A,
+  THR_COMP_NEAR_NEARL2A,
+  THR_COMP_NEW_NEARESTL2A,
+  THR_COMP_NEAREST_NEWL2A,
+  THR_COMP_NEW_NEARL2A,
+  THR_COMP_NEAR_NEWL2A,
+  THR_COMP_NEW_NEWL2A,
+  THR_COMP_ZERO_ZEROL2A,
+
+  THR_COMP_NEAR_NEARESTL3A,
+  THR_COMP_NEAREST_NEARL3A,
+  THR_COMP_NEAR_NEARL3A,
+  THR_COMP_NEW_NEARESTL3A,
+  THR_COMP_NEAREST_NEWL3A,
+  THR_COMP_NEW_NEARL3A,
+  THR_COMP_NEAR_NEWL3A,
+  THR_COMP_NEW_NEWL3A,
+  THR_COMP_ZERO_ZEROL3A,
+#endif  // CONFIG_EXT_REFS
+
+  THR_COMP_NEAR_NEARESTGA,
+  THR_COMP_NEAREST_NEARGA,
+  THR_COMP_NEAR_NEARGA,
+  THR_COMP_NEW_NEARESTGA,
+  THR_COMP_NEAREST_NEWGA,
+  THR_COMP_NEW_NEARGA,
+  THR_COMP_NEAR_NEWGA,
+  THR_COMP_NEW_NEWGA,
+  THR_COMP_ZERO_ZEROGA,
+
+#if CONFIG_EXT_REFS
+  THR_COMP_NEAR_NEARESTLB,
+  THR_COMP_NEAREST_NEARLB,
+  THR_COMP_NEAR_NEARLB,
+  THR_COMP_NEW_NEARESTLB,
+  THR_COMP_NEAREST_NEWLB,
+  THR_COMP_NEW_NEARLB,
+  THR_COMP_NEAR_NEWLB,
+  THR_COMP_NEW_NEWLB,
+  THR_COMP_ZERO_ZEROLB,
+
+  THR_COMP_NEAR_NEARESTL2B,
+  THR_COMP_NEAREST_NEARL2B,
+  THR_COMP_NEAR_NEARL2B,
+  THR_COMP_NEW_NEARESTL2B,
+  THR_COMP_NEAREST_NEWL2B,
+  THR_COMP_NEW_NEARL2B,
+  THR_COMP_NEAR_NEWL2B,
+  THR_COMP_NEW_NEWL2B,
+  THR_COMP_ZERO_ZEROL2B,
+
+  THR_COMP_NEAR_NEARESTL3B,
+  THR_COMP_NEAREST_NEARL3B,
+  THR_COMP_NEAR_NEARL3B,
+  THR_COMP_NEW_NEARESTL3B,
+  THR_COMP_NEAREST_NEWL3B,
+  THR_COMP_NEW_NEARL3B,
+  THR_COMP_NEAR_NEWL3B,
+  THR_COMP_NEW_NEWL3B,
+  THR_COMP_ZERO_ZEROL3B,
+
+  THR_COMP_NEAR_NEARESTGB,
+  THR_COMP_NEAREST_NEARGB,
+  THR_COMP_NEAR_NEARGB,
+  THR_COMP_NEW_NEARESTGB,
+  THR_COMP_NEAREST_NEWGB,
+  THR_COMP_NEW_NEARGB,
+  THR_COMP_NEAR_NEWGB,
+  THR_COMP_NEW_NEWGB,
+  THR_COMP_ZERO_ZEROGB,
+#endif  // CONFIG_EXT_REFS
+
+#else  // CONFIG_EXT_INTER
+
   THR_COMP_NEARLA,
   THR_COMP_NEWLA,
+#if CONFIG_EXT_REFS
+  THR_COMP_NEARL2A,
+  THR_COMP_NEWL2A,
+  THR_COMP_NEARL3A,
+  THR_COMP_NEWL3A,
+#endif  // CONFIG_EXT_REFS
   THR_COMP_NEARGA,
   THR_COMP_NEWGA,
 
+#if CONFIG_EXT_REFS
+  THR_COMP_NEARLB,
+  THR_COMP_NEWLB,
+  THR_COMP_NEARL2B,
+  THR_COMP_NEWL2B,
+  THR_COMP_NEARL3B,
+  THR_COMP_NEWL3B,
+  THR_COMP_NEARGB,
+  THR_COMP_NEWGB,
+#endif  // CONFIG_EXT_REFS
+
   THR_COMP_ZEROLA,
+#if CONFIG_EXT_REFS
+  THR_COMP_ZEROL2A,
+  THR_COMP_ZEROL3A,
+#endif  // CONFIG_EXT_REFS
   THR_COMP_ZEROGA,
 
+#if CONFIG_EXT_REFS
+  THR_COMP_ZEROLB,
+  THR_COMP_ZEROL2B,
+  THR_COMP_ZEROL3B,
+  THR_COMP_ZEROGB,
+#endif  // CONFIG_EXT_REFS
+
+#endif  // CONFIG_EXT_INTER
+
   THR_H_PRED,
   THR_V_PRED,
   THR_D135_PRED,
@@ -81,14 +290,68 @@
   THR_D63_PRED,
   THR_D117_PRED,
   THR_D45_PRED,
+
+#if CONFIG_EXT_INTER
+  THR_COMP_INTERINTRA_ZEROL,
+  THR_COMP_INTERINTRA_NEARESTL,
+  THR_COMP_INTERINTRA_NEARL,
+  THR_COMP_INTERINTRA_NEWL,
+
+#if CONFIG_EXT_REFS
+  THR_COMP_INTERINTRA_ZEROL2,
+  THR_COMP_INTERINTRA_NEARESTL2,
+  THR_COMP_INTERINTRA_NEARL2,
+  THR_COMP_INTERINTRA_NEWL2,
+
+  THR_COMP_INTERINTRA_ZEROL3,
+  THR_COMP_INTERINTRA_NEARESTL3,
+  THR_COMP_INTERINTRA_NEARL3,
+  THR_COMP_INTERINTRA_NEWL3,
+#endif  // CONFIG_EXT_REFS
+
+  THR_COMP_INTERINTRA_ZEROG,
+  THR_COMP_INTERINTRA_NEARESTG,
+  THR_COMP_INTERINTRA_NEARG,
+  THR_COMP_INTERINTRA_NEWG,
+
+#if CONFIG_EXT_REFS
+  THR_COMP_INTERINTRA_ZEROB,
+  THR_COMP_INTERINTRA_NEARESTB,
+  THR_COMP_INTERINTRA_NEARB,
+  THR_COMP_INTERINTRA_NEWB,
+#endif  // CONFIG_EXT_REFS
+
+  THR_COMP_INTERINTRA_ZEROA,
+  THR_COMP_INTERINTRA_NEARESTA,
+  THR_COMP_INTERINTRA_NEARA,
+  THR_COMP_INTERINTRA_NEWA,
+#endif  // CONFIG_EXT_INTER
 } THR_MODES;
 
 typedef enum {
   THR_LAST,
+#if CONFIG_EXT_REFS
+  THR_LAST2,
+  THR_LAST3,
+  THR_BWDR,
+#endif  // CONFIG_EXT_REFS
   THR_GOLD,
   THR_ALTR,
+
   THR_COMP_LA,
+#if CONFIG_EXT_REFS
+  THR_COMP_L2A,
+  THR_COMP_L3A,
+#endif  // CONFIG_EXT_REFS
   THR_COMP_GA,
+
+#if CONFIG_EXT_REFS
+  THR_COMP_LB,
+  THR_COMP_L2B,
+  THR_COMP_L3B,
+  THR_COMP_GB,
+#endif  // CONFIG_EXT_REFS
+
   THR_INTRA,
 } THR_MODES_SUB8X8;
 
@@ -104,8 +367,6 @@
 
   int64_t prediction_type_threshes[MAX_REF_FRAMES][REFERENCE_MODES];
 
-  int64_t filter_threshes[MAX_REF_FRAMES][SWITCHABLE_FILTER_CONTEXTS];
-
   int RDMULT;
   int RDDIV;
 } RD_OPT;
@@ -130,7 +391,7 @@
 
 void vp10_initialize_rd_consts(struct VP10_COMP *cpi);
 
-void vp10_initialize_me_consts(struct VP10_COMP *cpi,
+void vp10_initialize_me_consts(const struct VP10_COMP *cpi,
                                MACROBLOCK *x, int qindex);
 
 void vp10_model_rd_from_var_lapndz(unsigned int var, unsigned int n,
@@ -151,17 +412,28 @@
 
 void vp10_init_me_luts(void);
 
+#if CONFIG_REF_MV
+void vp10_set_mvcost(MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame);
+#endif  // CONFIG_REF_MV
+
 void vp10_get_entropy_contexts(BLOCK_SIZE bsize, TX_SIZE tx_size,
                               const struct macroblockd_plane *pd,
-                              ENTROPY_CONTEXT t_above[16],
-                              ENTROPY_CONTEXT t_left[16]);
+                              ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE],
+                              ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE]);
 
 void vp10_set_rd_speed_thresholds(struct VP10_COMP *cpi);
 
 void vp10_set_rd_speed_thresholds_sub8x8(struct VP10_COMP *cpi);
 
-void vp10_update_rd_thresh_fact(int (*fact)[MAX_MODES], int rd_thresh,
-                               int bsize, int best_mode_index);
+void vp10_update_rd_thresh_fact(const VP10_COMMON *const cm,
+                                int (*fact)[MAX_MODES], int rd_thresh,
+                                int bsize, int best_mode_index);
+
+void vp10_fill_token_costs(vp10_coeff_cost *c,
+#if CONFIG_ANS
+                           coeff_cdf_model (*cdf)[PLANE_TYPES],
+#endif  // CONFIG_ANS
+                           vp10_coeff_probs_model (*p)[PLANE_TYPES]);
 
 static INLINE int rd_less_than_thresh(int64_t best_rd, int thresh,
                                       int thresh_fact) {
@@ -172,6 +444,11 @@
                  uint8_t *ref_y_buffer, int ref_y_stride,
                  int ref_frame, BLOCK_SIZE block_size);
 
+static INLINE void set_error_per_bit(MACROBLOCK *x, int rdmult) {
+  x->errorperbit = rdmult >> RD_EPB_SHIFT;
+  x->errorperbit += (x->errorperbit == 0);
+}
+
 void vp10_setup_pred_block(const MACROBLOCKD *xd,
                           struct buf_2d dst[MAX_MB_PLANE],
                           const YV12_BUFFER_CONFIG *src,
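The RDCOST macro above now rounds through ROUND_POWER_OF_TWO at VP9_PROB_COST_SHIFT instead of the old (128 + ...) >> 8 form, and error-per-bit derivation is centralized in set_error_per_bit(). A usage sketch (the comparison helper is illustrative):

    /* Sketch: rank two candidates by RD cost with the updated macro;
     * rm is rd->RDMULT and dm is rd->RDDIV (RDDIV_BITS). */
    static int first_is_cheaper(int rm, int dm, int rate0, int64_t dist0,
                                int rate1, int64_t dist1) {
      return RDCOST(rm, dm, rate0, dist0) <= RDCOST(rm, dm, rate1, dist1);
    }
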
diff --git a/vp10/encoder/rdopt.c b/vp10/encoder/rdopt.c
index c62da96..45c10cd 100644
--- a/vp10/encoder/rdopt.c
+++ b/vp10/encoder/rdopt.c
@@ -15,6 +15,7 @@
 #include "./vpx_dsp_rtcd.h"
 
 #include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 #include "vpx_ports/system_state.h"
@@ -35,26 +36,83 @@
 #include "vp10/encoder/encodemb.h"
 #include "vp10/encoder/encodemv.h"
 #include "vp10/encoder/encoder.h"
+#include "vp10/encoder/hybrid_fwd_txfm.h"
 #include "vp10/encoder/mcomp.h"
+#include "vp10/encoder/palette.h"
 #include "vp10/encoder/quantize.h"
 #include "vp10/encoder/ratectrl.h"
 #include "vp10/encoder/rd.h"
 #include "vp10/encoder/rdopt.h"
 #include "vp10/encoder/aq_variance.h"
 
+#if CONFIG_DUAL_FILTER
+#if CONFIG_EXT_INTERP
+static const int filter_sets[25][2] = {
+    {0, 0}, {0, 1}, {0, 2}, {0, 3}, {0, 4},
+    {1, 0}, {1, 1}, {1, 2}, {1, 3}, {1, 4},
+    {2, 0}, {2, 1}, {2, 2}, {2, 3}, {2, 4},
+    {3, 0}, {3, 1}, {3, 2}, {3, 3}, {3, 4},
+    {4, 0}, {4, 1}, {4, 2}, {4, 3}, {4, 4},
+};
+#else
+static const int filter_sets[9][2] = {
+    {0, 0}, {0, 1}, {0, 2},
+    {1, 0}, {1, 1}, {1, 2},
+    {2, 0}, {2, 1}, {2, 2},
+};
+#endif  // CONFIG_EXT_INTERP
+#endif  // CONFIG_DUAL_FILTER
+
+#if CONFIG_EXT_REFS
+
+#define LAST_FRAME_MODE_MASK    ((1 << INTRA_FRAME) | (1 << LAST2_FRAME) | \
+                                 (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | \
+                                 (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
+#define LAST2_FRAME_MODE_MASK   ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | \
+                                 (1 << LAST3_FRAME) | (1 << GOLDEN_FRAME) | \
+                                 (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
+#define LAST3_FRAME_MODE_MASK   ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | \
+                                 (1 << LAST2_FRAME) | (1 << GOLDEN_FRAME) | \
+                                 (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
+#define GOLDEN_FRAME_MODE_MASK  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | \
+                                 (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | \
+                                 (1 << BWDREF_FRAME) | (1 << ALTREF_FRAME))
+#define BWDREF_FRAME_MODE_MASK  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | \
+                                 (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | \
+                                 (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME))
+#define ALTREF_FRAME_MODE_MASK  ((1 << INTRA_FRAME) | (1 << LAST_FRAME) | \
+                                 (1 << LAST2_FRAME) | (1 << LAST3_FRAME) | \
+                                 (1 << GOLDEN_FRAME) | (1 << BWDREF_FRAME))
+
+#else
+
 #define LAST_FRAME_MODE_MASK    ((1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME) | \
                                  (1 << INTRA_FRAME))
 #define GOLDEN_FRAME_MODE_MASK  ((1 << LAST_FRAME) | (1 << ALTREF_FRAME) | \
                                  (1 << INTRA_FRAME))
-#define ALT_REF_MODE_MASK       ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | \
+#define ALTREF_FRAME_MODE_MASK  ((1 << LAST_FRAME) | (1 << GOLDEN_FRAME) | \
                                  (1 << INTRA_FRAME))
 
+#endif  // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
+#define SECOND_REF_FRAME_MASK   ((1 << ALTREF_FRAME) | (1 << BWDREF_FRAME) | \
+                                 0x01)
+#else
 #define SECOND_REF_FRAME_MASK   ((1 << ALTREF_FRAME) | 0x01)
+#endif  // CONFIG_EXT_REFS
 
 #define MIN_EARLY_TERM_INDEX    3
 #define NEW_MV_DISCOUNT_FACTOR  8
 
-const double ext_tx_th = 0.99;
+#if CONFIG_EXT_INTRA
+#define ANGLE_FAST_SEARCH 1
+#define ANGLE_SKIP_THRESH 10
+#define FILTER_FAST_SEARCH 1
+#endif  // CONFIG_EXT_INTRA
+
+const double ADST_FLIP_SVM[8] = {-6.6623, -2.8062, -3.2531, 3.1671,  // vert
+                                 -7.7051, -3.2234, -3.6193, 3.4533};  // horz
 
 typedef struct {
   PREDICTION_MODE mode;
@@ -66,9 +124,10 @@
 } REF_DEFINITION;
 
 struct rdcost_block_args {
+  const VP10_COMP *cpi;
   MACROBLOCK *x;
-  ENTROPY_CONTEXT t_above[16];
-  ENTROPY_CONTEXT t_left[16];
+  ENTROPY_CONTEXT t_above[2 * MAX_MIB_SIZE];
+  ENTROPY_CONTEXT t_left[2 * MAX_MIB_SIZE];
   int this_rate;
   int64_t this_dist;
   int64_t this_sse;
@@ -83,36 +142,213 @@
 #define LAST_NEW_MV_INDEX 6
 static const MODE_DEFINITION vp10_mode_order[MAX_MODES] = {
   {NEARESTMV, {LAST_FRAME,   NONE}},
+#if CONFIG_EXT_REFS
+  {NEARESTMV, {LAST2_FRAME,  NONE}},
+  {NEARESTMV, {LAST3_FRAME,  NONE}},
+  {NEARESTMV, {BWDREF_FRAME, NONE}},
+#endif  // CONFIG_EXT_REFS
   {NEARESTMV, {ALTREF_FRAME, NONE}},
   {NEARESTMV, {GOLDEN_FRAME, NONE}},
 
   {DC_PRED,   {INTRA_FRAME,  NONE}},
 
   {NEWMV,     {LAST_FRAME,   NONE}},
+#if CONFIG_EXT_REFS
+  {NEWMV,     {LAST2_FRAME,  NONE}},
+  {NEWMV,     {LAST3_FRAME,  NONE}},
+  {NEWMV,     {BWDREF_FRAME, NONE}},
+#endif  // CONFIG_EXT_REFS
   {NEWMV,     {ALTREF_FRAME, NONE}},
   {NEWMV,     {GOLDEN_FRAME, NONE}},
 
   {NEARMV,    {LAST_FRAME,   NONE}},
+#if CONFIG_EXT_REFS
+  {NEARMV,    {LAST2_FRAME,  NONE}},
+  {NEARMV,    {LAST3_FRAME,  NONE}},
+  {NEARMV,    {BWDREF_FRAME, NONE}},
+#endif  // CONFIG_EXT_REFS
   {NEARMV,    {ALTREF_FRAME, NONE}},
   {NEARMV,    {GOLDEN_FRAME, NONE}},
 
+#if CONFIG_EXT_INTER
+  {NEWFROMNEARMV,    {LAST_FRAME,   NONE}},
+#if CONFIG_EXT_REFS
+  {NEWFROMNEARMV,    {LAST2_FRAME,  NONE}},
+  {NEWFROMNEARMV,    {LAST3_FRAME,  NONE}},
+  {NEWFROMNEARMV,    {BWDREF_FRAME, NONE}},
+#endif  // CONFIG_EXT_REFS
+  {NEWFROMNEARMV,    {ALTREF_FRAME, NONE}},
+  {NEWFROMNEARMV,    {GOLDEN_FRAME, NONE}},
+#endif  // CONFIG_EXT_INTER
+
   {ZEROMV,    {LAST_FRAME,   NONE}},
+#if CONFIG_EXT_REFS
+  {ZEROMV,    {LAST2_FRAME,  NONE}},
+  {ZEROMV,    {LAST3_FRAME,  NONE}},
+  {ZEROMV,    {BWDREF_FRAME, NONE}},
+#endif  // CONFIG_EXT_REFS
   {ZEROMV,    {GOLDEN_FRAME, NONE}},
   {ZEROMV,    {ALTREF_FRAME, NONE}},
 
+  // TODO(zoeliu): May need to reconsider the order of the modes to check.
+
+#if CONFIG_EXT_INTER
+  {NEAREST_NEARESTMV, {LAST_FRAME,   ALTREF_FRAME}},
+#if CONFIG_EXT_REFS
+  {NEAREST_NEARESTMV, {LAST2_FRAME,  ALTREF_FRAME}},
+  {NEAREST_NEARESTMV, {LAST3_FRAME,  ALTREF_FRAME}},
+#endif  // CONFIG_EXT_REFS
+  {NEAREST_NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},
+#if CONFIG_EXT_REFS
+  {NEAREST_NEARESTMV, {LAST_FRAME,   BWDREF_FRAME}},
+  {NEAREST_NEARESTMV, {LAST2_FRAME,  BWDREF_FRAME}},
+  {NEAREST_NEARESTMV, {LAST3_FRAME,  BWDREF_FRAME}},
+  {NEAREST_NEARESTMV, {GOLDEN_FRAME, BWDREF_FRAME}},
+#endif  // CONFIG_EXT_REFS
+
+#else  // CONFIG_EXT_INTER
+
   {NEARESTMV, {LAST_FRAME,   ALTREF_FRAME}},
+#if CONFIG_EXT_REFS
+  {NEARESTMV, {LAST2_FRAME,  ALTREF_FRAME}},
+  {NEARESTMV, {LAST3_FRAME,  ALTREF_FRAME}},
+#endif  // CONFIG_EXT_REFS
   {NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},
+#if CONFIG_EXT_REFS
+  {NEARESTMV, {LAST_FRAME,   BWDREF_FRAME}},
+  {NEARESTMV, {LAST2_FRAME,  BWDREF_FRAME}},
+  {NEARESTMV, {LAST3_FRAME,  BWDREF_FRAME}},
+  {NEARESTMV, {GOLDEN_FRAME, BWDREF_FRAME}},
+#endif  // CONFIG_EXT_REFS
+#endif  // CONFIG_EXT_INTER
 
   {TM_PRED,   {INTRA_FRAME,  NONE}},
 
+#if CONFIG_EXT_INTER
+  {NEAR_NEARESTMV, {LAST_FRAME,   ALTREF_FRAME}},
+  {NEAREST_NEARMV, {LAST_FRAME,   ALTREF_FRAME}},
+  {NEAR_NEARMV,    {LAST_FRAME,   ALTREF_FRAME}},
+  {NEW_NEARESTMV,  {LAST_FRAME,   ALTREF_FRAME}},
+  {NEAREST_NEWMV,  {LAST_FRAME,   ALTREF_FRAME}},
+  {NEW_NEARMV,     {LAST_FRAME,   ALTREF_FRAME}},
+  {NEAR_NEWMV,     {LAST_FRAME,   ALTREF_FRAME}},
+  {NEW_NEWMV,      {LAST_FRAME,   ALTREF_FRAME}},
+  {ZERO_ZEROMV,    {LAST_FRAME,   ALTREF_FRAME}},
+
+#if CONFIG_EXT_REFS
+  {NEAR_NEARESTMV, {LAST2_FRAME,  ALTREF_FRAME}},
+  {NEAREST_NEARMV, {LAST2_FRAME,  ALTREF_FRAME}},
+  {NEAR_NEARMV,    {LAST2_FRAME,  ALTREF_FRAME}},
+  {NEW_NEARESTMV,  {LAST2_FRAME,  ALTREF_FRAME}},
+  {NEAREST_NEWMV,  {LAST2_FRAME,  ALTREF_FRAME}},
+  {NEW_NEARMV,     {LAST2_FRAME,  ALTREF_FRAME}},
+  {NEAR_NEWMV,     {LAST2_FRAME,  ALTREF_FRAME}},
+  {NEW_NEWMV,      {LAST2_FRAME,  ALTREF_FRAME}},
+  {ZERO_ZEROMV,    {LAST2_FRAME,  ALTREF_FRAME}},
+
+  {NEAR_NEARESTMV, {LAST3_FRAME,  ALTREF_FRAME}},
+  {NEAREST_NEARMV, {LAST3_FRAME,  ALTREF_FRAME}},
+  {NEAR_NEARMV,    {LAST3_FRAME,  ALTREF_FRAME}},
+  {NEW_NEARESTMV,  {LAST3_FRAME,  ALTREF_FRAME}},
+  {NEAREST_NEWMV,  {LAST3_FRAME,  ALTREF_FRAME}},
+  {NEW_NEARMV,     {LAST3_FRAME,  ALTREF_FRAME}},
+  {NEAR_NEWMV,     {LAST3_FRAME,  ALTREF_FRAME}},
+  {NEW_NEWMV,      {LAST3_FRAME,  ALTREF_FRAME}},
+  {ZERO_ZEROMV,    {LAST3_FRAME,  ALTREF_FRAME}},
+#endif  // CONFIG_EXT_REFS
+
+  {NEAR_NEARESTMV, {GOLDEN_FRAME, ALTREF_FRAME}},
+  {NEAREST_NEARMV, {GOLDEN_FRAME, ALTREF_FRAME}},
+  {NEAR_NEARMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
+  {NEW_NEARESTMV,  {GOLDEN_FRAME, ALTREF_FRAME}},
+  {NEAREST_NEWMV,  {GOLDEN_FRAME, ALTREF_FRAME}},
+  {NEW_NEARMV,     {GOLDEN_FRAME, ALTREF_FRAME}},
+  {NEAR_NEWMV,     {GOLDEN_FRAME, ALTREF_FRAME}},
+  {NEW_NEWMV,      {GOLDEN_FRAME, ALTREF_FRAME}},
+  {ZERO_ZEROMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
+
+#if CONFIG_EXT_REFS
+  {NEAR_NEARESTMV, {LAST_FRAME,   BWDREF_FRAME}},
+  {NEAREST_NEARMV, {LAST_FRAME,   BWDREF_FRAME}},
+  {NEAR_NEARMV,    {LAST_FRAME,   BWDREF_FRAME}},
+  {NEW_NEARESTMV,  {LAST_FRAME,   BWDREF_FRAME}},
+  {NEAREST_NEWMV,  {LAST_FRAME,   BWDREF_FRAME}},
+  {NEW_NEARMV,     {LAST_FRAME,   BWDREF_FRAME}},
+  {NEAR_NEWMV,     {LAST_FRAME,   BWDREF_FRAME}},
+  {NEW_NEWMV,      {LAST_FRAME,   BWDREF_FRAME}},
+  {ZERO_ZEROMV,    {LAST_FRAME,   BWDREF_FRAME}},
+
+  {NEAR_NEARESTMV, {LAST2_FRAME,  BWDREF_FRAME}},
+  {NEAREST_NEARMV, {LAST2_FRAME,  BWDREF_FRAME}},
+  {NEAR_NEARMV,    {LAST2_FRAME,  BWDREF_FRAME}},
+  {NEW_NEARESTMV,  {LAST2_FRAME,  BWDREF_FRAME}},
+  {NEAREST_NEWMV,  {LAST2_FRAME,  BWDREF_FRAME}},
+  {NEW_NEARMV,     {LAST2_FRAME,  BWDREF_FRAME}},
+  {NEAR_NEWMV,     {LAST2_FRAME,  BWDREF_FRAME}},
+  {NEW_NEWMV,      {LAST2_FRAME,  BWDREF_FRAME}},
+  {ZERO_ZEROMV,    {LAST2_FRAME,  BWDREF_FRAME}},
+
+  {NEAR_NEARESTMV, {LAST3_FRAME,  BWDREF_FRAME}},
+  {NEAREST_NEARMV, {LAST3_FRAME,  BWDREF_FRAME}},
+  {NEAR_NEARMV,    {LAST3_FRAME,  BWDREF_FRAME}},
+  {NEW_NEARESTMV,  {LAST3_FRAME,  BWDREF_FRAME}},
+  {NEAREST_NEWMV,  {LAST3_FRAME,  BWDREF_FRAME}},
+  {NEW_NEARMV,     {LAST3_FRAME,  BWDREF_FRAME}},
+  {NEAR_NEWMV,     {LAST3_FRAME,  BWDREF_FRAME}},
+  {NEW_NEWMV,      {LAST3_FRAME,  BWDREF_FRAME}},
+  {ZERO_ZEROMV,    {LAST3_FRAME,  BWDREF_FRAME}},
+
+  {NEAR_NEARESTMV, {GOLDEN_FRAME, BWDREF_FRAME}},
+  {NEAREST_NEARMV, {GOLDEN_FRAME, BWDREF_FRAME}},
+  {NEAR_NEARMV,    {GOLDEN_FRAME, BWDREF_FRAME}},
+  {NEW_NEARESTMV,  {GOLDEN_FRAME, BWDREF_FRAME}},
+  {NEAREST_NEWMV,  {GOLDEN_FRAME, BWDREF_FRAME}},
+  {NEW_NEARMV,     {GOLDEN_FRAME, BWDREF_FRAME}},
+  {NEAR_NEWMV,     {GOLDEN_FRAME, BWDREF_FRAME}},
+  {NEW_NEWMV,      {GOLDEN_FRAME, BWDREF_FRAME}},
+  {ZERO_ZEROMV,    {GOLDEN_FRAME, BWDREF_FRAME}},
+#endif  // CONFIG_EXT_REFS
+
+#else  // CONFIG_EXT_INTER
+
   {NEARMV,    {LAST_FRAME,   ALTREF_FRAME}},
   {NEWMV,     {LAST_FRAME,   ALTREF_FRAME}},
+#if CONFIG_EXT_REFS
+  {NEARMV,    {LAST2_FRAME,  ALTREF_FRAME}},
+  {NEWMV,     {LAST2_FRAME,  ALTREF_FRAME}},
+  {NEARMV,    {LAST3_FRAME,  ALTREF_FRAME}},
+  {NEWMV,     {LAST3_FRAME,  ALTREF_FRAME}},
+#endif  // CONFIG_EXT_REFS
   {NEARMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
   {NEWMV,     {GOLDEN_FRAME, ALTREF_FRAME}},
 
+#if CONFIG_EXT_REFS
+  {NEARMV,    {LAST_FRAME,   BWDREF_FRAME}},
+  {NEWMV,     {LAST_FRAME,   BWDREF_FRAME}},
+  {NEARMV,    {LAST2_FRAME,  BWDREF_FRAME}},
+  {NEWMV,     {LAST2_FRAME,  BWDREF_FRAME}},
+  {NEARMV,    {LAST3_FRAME,  BWDREF_FRAME}},
+  {NEWMV,     {LAST3_FRAME,  BWDREF_FRAME}},
+  {NEARMV,    {GOLDEN_FRAME, BWDREF_FRAME}},
+  {NEWMV,     {GOLDEN_FRAME, BWDREF_FRAME}},
+#endif  // CONFIG_EXT_REFS
+
   {ZEROMV,    {LAST_FRAME,   ALTREF_FRAME}},
+#if CONFIG_EXT_REFS
+  {ZEROMV,    {LAST2_FRAME,  ALTREF_FRAME}},
+  {ZEROMV,    {LAST3_FRAME,  ALTREF_FRAME}},
+#endif  // CONFIG_EXT_REFS
   {ZEROMV,    {GOLDEN_FRAME, ALTREF_FRAME}},
 
+#if CONFIG_EXT_REFS
+  {ZEROMV,    {LAST_FRAME,   BWDREF_FRAME}},
+  {ZEROMV,    {LAST2_FRAME,  BWDREF_FRAME}},
+  {ZEROMV,    {LAST3_FRAME,  BWDREF_FRAME}},
+  {ZEROMV,    {GOLDEN_FRAME, BWDREF_FRAME}},
+#endif  // CONFIG_EXT_REFS
+
+#endif  // CONFIG_EXT_INTER
+
   {H_PRED,    {INTRA_FRAME,  NONE}},
   {V_PRED,    {INTRA_FRAME,  NONE}},
   {D135_PRED, {INTRA_FRAME,  NONE}},
@@ -121,61 +357,444 @@
   {D63_PRED,  {INTRA_FRAME,  NONE}},
   {D117_PRED, {INTRA_FRAME,  NONE}},
   {D45_PRED,  {INTRA_FRAME,  NONE}},
+
+#if CONFIG_EXT_INTER
+  {ZEROMV,    {LAST_FRAME,   INTRA_FRAME}},
+  {NEARESTMV, {LAST_FRAME,   INTRA_FRAME}},
+  {NEARMV,    {LAST_FRAME,   INTRA_FRAME}},
+  {NEWMV,     {LAST_FRAME,   INTRA_FRAME}},
+
+#if CONFIG_EXT_REFS
+  {ZEROMV,    {LAST2_FRAME,  INTRA_FRAME}},
+  {NEARESTMV, {LAST2_FRAME,  INTRA_FRAME}},
+  {NEARMV,    {LAST2_FRAME,  INTRA_FRAME}},
+  {NEWMV,     {LAST2_FRAME,  INTRA_FRAME}},
+
+  {ZEROMV,    {LAST3_FRAME,  INTRA_FRAME}},
+  {NEARESTMV, {LAST3_FRAME,  INTRA_FRAME}},
+  {NEARMV,    {LAST3_FRAME,  INTRA_FRAME}},
+  {NEWMV,     {LAST3_FRAME,  INTRA_FRAME}},
+#endif  // CONFIG_EXT_REFS
+
+  {ZEROMV,    {GOLDEN_FRAME, INTRA_FRAME}},
+  {NEARESTMV, {GOLDEN_FRAME, INTRA_FRAME}},
+  {NEARMV,    {GOLDEN_FRAME, INTRA_FRAME}},
+  {NEWMV,     {GOLDEN_FRAME, INTRA_FRAME}},
+
+#if CONFIG_EXT_REFS
+  {ZEROMV,    {BWDREF_FRAME, INTRA_FRAME}},
+  {NEARESTMV, {BWDREF_FRAME, INTRA_FRAME}},
+  {NEARMV,    {BWDREF_FRAME, INTRA_FRAME}},
+  {NEWMV,     {BWDREF_FRAME, INTRA_FRAME}},
+#endif  // CONFIG_EXT_REFS
+
+  {ZEROMV,    {ALTREF_FRAME, INTRA_FRAME}},
+  {NEARESTMV, {ALTREF_FRAME, INTRA_FRAME}},
+  {NEARMV,    {ALTREF_FRAME, INTRA_FRAME}},
+  {NEWMV,     {ALTREF_FRAME, INTRA_FRAME}},
+#endif  // CONFIG_EXT_INTER
 };
 
 static const REF_DEFINITION vp10_ref_order[MAX_REFS] = {
   {{LAST_FRAME,   NONE}},
+#if CONFIG_EXT_REFS
+  {{LAST2_FRAME,  NONE}},
+  {{LAST3_FRAME,  NONE}},
+  {{BWDREF_FRAME, NONE}},
+#endif  // CONFIG_EXT_REFS
   {{GOLDEN_FRAME, NONE}},
   {{ALTREF_FRAME, NONE}},
+
   {{LAST_FRAME,   ALTREF_FRAME}},
+#if CONFIG_EXT_REFS
+  {{LAST2_FRAME,  ALTREF_FRAME}},
+  {{LAST3_FRAME,  ALTREF_FRAME}},
+#endif  // CONFIG_EXT_REFS
   {{GOLDEN_FRAME, ALTREF_FRAME}},
+
+#if CONFIG_EXT_REFS
+  {{LAST_FRAME,   BWDREF_FRAME}},
+  {{LAST2_FRAME,  BWDREF_FRAME}},
+  {{LAST3_FRAME,  BWDREF_FRAME}},
+  {{GOLDEN_FRAME, BWDREF_FRAME}},
+#endif  // CONFIG_EXT_REFS
+
   {{INTRA_FRAME,  NONE}},
 };
 
-static void swap_block_ptr(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
-                           int m, int n, int min_plane, int max_plane) {
-  int i;
+static INLINE int write_uniform_cost(int n, int v) {
+  int l = get_unsigned_bits(n), m = (1 << l) - n;
+  if (l == 0)
+    return 0;
+  if (v < m)
+    return (l - 1) * vp10_cost_bit(128, 0);
+  else
+    return l * vp10_cost_bit(128, 0);
+}
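write_uniform_cost() above prices a symbol from what is effectively a truncated binary (near-uniform) code: with l = get_unsigned_bits(n) and m = (1 << l) - n, the first m values cost l - 1 bits and the rest cost l, each bit priced at vp10_cost_bit(128, 0). A standalone sketch of the underlying bit counts, with an equivalent choice of l (uniform_code_bits is a hypothetical helper, not part of this change):

    /* Hypothetical reference helper: raw codeword length, in bits, of the
     * truncated-binary code that write_uniform_cost() prices. */
    static int uniform_code_bits(int n, int v) {
      int l = 0, m;
      while ((1 << l) < n) ++l;   /* smallest l with 2^l >= n */
      if (l == 0) return 0;       /* n == 1: the value is implicit */
      m = (1 << l) - n;
      return v < m ? l - 1 : l;   /* short vs. long codewords */
    }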
 
-  for (i = min_plane; i < max_plane; ++i) {
-    struct macroblock_plane *const p = &x->plane[i];
-    struct macroblockd_plane *const pd = &x->e_mbd.plane[i];
+// Constants for the prune-1 and prune-2 decision boundaries.
+#define FAST_EXT_TX_CORR_MID 0.0
+#define FAST_EXT_TX_EDST_MID 0.1
+#define FAST_EXT_TX_CORR_MARGIN 0.5
+#define FAST_EXT_TX_EDST_MARGIN 0.3
 
-    p->coeff    = ctx->coeff_pbuf[i][m];
-    p->qcoeff   = ctx->qcoeff_pbuf[i][m];
-    pd->dqcoeff = ctx->dqcoeff_pbuf[i][m];
-    p->eobs     = ctx->eobs_pbuf[i][m];
+static const TX_TYPE_1D vtx_tab[TX_TYPES] = {
+  DCT_1D,
+  ADST_1D,
+  DCT_1D,
+  ADST_1D,
+#if CONFIG_EXT_TX
+  FLIPADST_1D,
+  DCT_1D,
+  FLIPADST_1D,
+  ADST_1D,
+  FLIPADST_1D,
+  IDTX_1D,
+  DCT_1D,
+  IDTX_1D,
+  ADST_1D,
+  IDTX_1D,
+  FLIPADST_1D,
+  IDTX_1D,
+#endif  // CONFIG_EXT_TX
+};
 
-    ctx->coeff_pbuf[i][m]   = ctx->coeff_pbuf[i][n];
-    ctx->qcoeff_pbuf[i][m]  = ctx->qcoeff_pbuf[i][n];
-    ctx->dqcoeff_pbuf[i][m] = ctx->dqcoeff_pbuf[i][n];
-    ctx->eobs_pbuf[i][m]    = ctx->eobs_pbuf[i][n];
+static const TX_TYPE_1D htx_tab[TX_TYPES] = {
+  DCT_1D,
+  DCT_1D,
+  ADST_1D,
+  ADST_1D,
+#if CONFIG_EXT_TX
+  DCT_1D,
+  FLIPADST_1D,
+  FLIPADST_1D,
+  FLIPADST_1D,
+  ADST_1D,
+  IDTX_1D,
+  IDTX_1D,
+  DCT_1D,
+  IDTX_1D,
+  ADST_1D,
+  IDTX_1D,
+  FLIPADST_1D,
+#endif  // CONFIG_EXT_TX
+};
 
-    ctx->coeff_pbuf[i][n]   = p->coeff;
-    ctx->qcoeff_pbuf[i][n]  = p->qcoeff;
-    ctx->dqcoeff_pbuf[i][n] = pd->dqcoeff;
-    ctx->eobs_pbuf[i][n]    = p->eobs;
+static void get_energy_distribution_fine(const VP10_COMP *cpi,
+                                         BLOCK_SIZE bsize,
+                                         uint8_t *src, int src_stride,
+                                         uint8_t *dst, int dst_stride,
+                                         double *hordist, double *verdist) {
+  int bw = 4 << (b_width_log2_lookup[bsize]);
+  int bh = 4 << (b_height_log2_lookup[bsize]);
+  unsigned int esq[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  unsigned int var[16];
+  double total = 0;
+
+  const int f_index = bsize - BLOCK_16X16;
+  if (f_index < 0) {
+    int i, j, index;
+    int w_shift = bw == 8 ? 1 : 2;
+    int h_shift = bh == 8 ? 1 : 2;
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cpi->common.use_highbitdepth) {
+      uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+      uint16_t *dst16 = CONVERT_TO_SHORTPTR(dst);
+      for (i = 0; i < bh; ++i)
+        for (j = 0; j < bw; ++j) {
+          index = (j >> w_shift) + ((i >> h_shift) << 2);
+          esq[index] += (src16[j + i * src_stride] -
+                        dst16[j + i * dst_stride]) *
+                        (src16[j + i * src_stride] -
+                        dst16[j + i * dst_stride]);
+        }
+    } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+      for (i = 0; i < bh; ++i)
+        for (j = 0; j < bw; ++j) {
+          index = (j >> w_shift) + ((i >> h_shift) << 2);
+          esq[index] += (src[j + i * src_stride] - dst[j + i * dst_stride]) *
+                        (src[j + i * src_stride] - dst[j + i * dst_stride]);
+        }
+#if CONFIG_VP9_HIGHBITDEPTH
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  } else {
+    var[0] = cpi->fn_ptr[f_index].vf(src, src_stride,
+                                     dst, dst_stride, &esq[0]);
+    var[1] = cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride,
+                                     dst + bw / 4, dst_stride, &esq[1]);
+    var[2] = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride,
+                                     dst + bw / 2, dst_stride, &esq[2]);
+    var[3] = cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride,
+                                     dst + 3 * bw / 4, dst_stride, &esq[3]);
+    src += bh / 4 * src_stride;
+    dst += bh / 4 * dst_stride;
+
+    var[4] = cpi->fn_ptr[f_index].vf(src, src_stride,
+                                     dst, dst_stride, &esq[4]);
+    var[5] = cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride,
+                                     dst + bw / 4, dst_stride, &esq[5]);
+    var[6] = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride,
+                                     dst + bw / 2, dst_stride, &esq[6]);
+    var[7] = cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride,
+                                     dst + 3 * bw / 4, dst_stride, &esq[7]);
+    src += bh / 4 * src_stride;
+    dst += bh / 4 * dst_stride;
+
+    var[8] = cpi->fn_ptr[f_index].vf(src, src_stride,
+                                     dst, dst_stride, &esq[8]);
+    var[9] = cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride,
+                                     dst + bw / 4, dst_stride, &esq[9]);
+    var[10] = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride,
+                                      dst + bw / 2, dst_stride, &esq[10]);
+    var[11] = cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride,
+                                      dst + 3 * bw / 4, dst_stride, &esq[11]);
+    src += bh / 4 * src_stride;
+    dst += bh / 4 * dst_stride;
+
+    var[12] = cpi->fn_ptr[f_index].vf(src, src_stride,
+                                      dst, dst_stride, &esq[12]);
+    var[13] = cpi->fn_ptr[f_index].vf(src + bw / 4, src_stride,
+                                      dst + bw / 4, dst_stride, &esq[13]);
+    var[14] = cpi->fn_ptr[f_index].vf(src + bw / 2, src_stride,
+                                      dst + bw / 2, dst_stride, &esq[14]);
+    var[15] = cpi->fn_ptr[f_index].vf(src + 3 * bw / 4, src_stride,
+                                      dst + 3 * bw / 4, dst_stride, &esq[15]);
+  }
+
+  total = esq[0] + esq[1] + esq[2] + esq[3] +
+          esq[4] + esq[5] + esq[6] + esq[7] +
+          esq[8] + esq[9] + esq[10] + esq[11] +
+          esq[12] + esq[13] + esq[14] + esq[15];
+  if (total > 0) {
+    const double e_recip = 1.0 / total;
+    hordist[0] = ((double)esq[0] + (double)esq[4] + (double)esq[8] +
+                  (double)esq[12]) * e_recip;
+    hordist[1] = ((double)esq[1] + (double)esq[5] + (double)esq[9] +
+                  (double)esq[13]) * e_recip;
+    hordist[2] = ((double)esq[2] + (double)esq[6] + (double)esq[10] +
+                  (double)esq[14]) * e_recip;
+    verdist[0] = ((double)esq[0] + (double)esq[1] + (double)esq[2] +
+                  (double)esq[3]) * e_recip;
+    verdist[1] = ((double)esq[4] + (double)esq[5] + (double)esq[6] +
+                  (double)esq[7]) * e_recip;
+    verdist[2] = ((double)esq[8] + (double)esq[9] + (double)esq[10] +
+                  (double)esq[11]) * e_recip;
+  } else {
+    hordist[0] = verdist[0] = 0.25;
+    hordist[1] = verdist[1] = 0.25;
+    hordist[2] = verdist[2] = 0.25;
+  }
+  (void) var[0];
+  (void) var[1];
+  (void) var[2];
+  (void) var[3];
+  (void) var[4];
+  (void) var[5];
+  (void) var[6];
+  (void) var[7];
+  (void) var[8];
+  (void) var[9];
+  (void) var[10];
+  (void) var[11];
+  (void) var[12];
+  (void) var[13];
+  (void) var[14];
+  (void) var[15];
+}
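get_energy_distribution_fine() splits the prediction residual into a 4x4 grid of sub-blocks, accumulates squared error per cell (by direct summation for blocks below 16x16, via the variance function table otherwise), and reduces the grid to row/column marginals; only three of the four entries per direction are stored, since each marginal sums to 1. A sketch of the reduction step in isolation (energy_marginals is hypothetical; esq is laid out as esq[row * 4 + col], as in the code above):

    /* Hypothetical reference helper: reduce 4x4 per-cell energies to the
     * horizontal/vertical marginals computed above. */
    static void energy_marginals(const unsigned int esq[16],
                                 double hordist[3], double verdist[3]) {
      double total = 0;
      int i;
      for (i = 0; i < 16; ++i) total += esq[i];
      for (i = 0; i < 3; ++i) {
        /* A zero-energy block falls back to a flat 0.25 distribution. */
        hordist[i] = total > 0
            ? (esq[i] + esq[i + 4] + esq[i + 8] + esq[i + 12]) / total : 0.25;
        verdist[i] = total > 0
            ? (esq[4 * i] + esq[4 * i + 1] + esq[4 * i + 2] + esq[4 * i + 3])
                / total : 0.25;
      }
    }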
+
+static int adst_vs_flipadst(const VP10_COMP *cpi, BLOCK_SIZE bsize,
+                            uint8_t *src, int src_stride,
+                            uint8_t *dst, int dst_stride,
+                            double *hdist, double *vdist) {
+  int prune_bitmask = 0;
+  double svm_proj_h = 0, svm_proj_v = 0;
+  get_energy_distribution_fine(cpi, bsize, src, src_stride,
+                               dst, dst_stride, hdist, vdist);
+
+  svm_proj_v = vdist[0] * ADST_FLIP_SVM[0] +
+               vdist[1] * ADST_FLIP_SVM[1] +
+               vdist[2] * ADST_FLIP_SVM[2] + ADST_FLIP_SVM[3];
+  svm_proj_h = hdist[0] * ADST_FLIP_SVM[4] +
+               hdist[1] * ADST_FLIP_SVM[5] +
+               hdist[2] * ADST_FLIP_SVM[6] + ADST_FLIP_SVM[7];
+  if (svm_proj_v > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN)
+    prune_bitmask |= 1 << FLIPADST_1D;
+  else if (svm_proj_v < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN)
+    prune_bitmask |= 1 << ADST_1D;
+
+  if (svm_proj_h > FAST_EXT_TX_EDST_MID + FAST_EXT_TX_EDST_MARGIN)
+    prune_bitmask |= 1 << (FLIPADST_1D + 8);
+  else if (svm_proj_h < FAST_EXT_TX_EDST_MID - FAST_EXT_TX_EDST_MARGIN)
+    prune_bitmask |= 1 << (ADST_1D + 8);
+
+  return prune_bitmask;
+}
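The bitmask returned by adst_vs_flipadst() uses one byte per direction: bit t (t a TX_TYPE_1D) marks the vertical 1-D transform t as pruned, and bit t + 8 marks the horizontal transform; do_tx_type_search() below reads the mask back through vtx_tab[]/htx_tab[]. The convention, spelled out (prune_skips_1d is a hypothetical helper):

    /* Hypothetical helper: low byte of the mask = vertical prunes,
     * high byte = horizontal prunes, matching do_tx_type_search(). */
    static int prune_skips_1d(int prune, TX_TYPE_1D vert, TX_TYPE_1D horz) {
      return ((prune >> vert) & 1) || ((prune >> (horz + 8)) & 1);
    }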
+
+#if CONFIG_EXT_TX
+static void get_horver_correlation(int16_t *diff, int stride,
+                                   int w, int h,
+                                   double *hcorr, double *vcorr) {
+  // Returns the horizontal (*hcorr) and vertical (*vcorr) correlation
+  // coefficients of the residual block.
+  const int num = (h - 1) * (w - 1);
+  double num_r;
+  int i, j;
+  int64_t xy_sum = 0, xz_sum = 0;
+  int64_t x_sum = 0, y_sum = 0, z_sum = 0;
+  int64_t x2_sum = 0, y2_sum = 0, z2_sum = 0;
+  double x_var_n, y_var_n, z_var_n, xy_var_n, xz_var_n;
+  *hcorr = *vcorr = 1;
+
+  assert(num > 0);
+  num_r = 1.0 / num;
+  for (i = 1; i < h; ++i) {
+    for (j = 1; j < w; ++j) {
+      const int16_t x = diff[i * stride + j];
+      const int16_t y = diff[i * stride + j - 1];
+      const int16_t z = diff[(i - 1) * stride + j];
+      xy_sum += x * y;
+      xz_sum += x * z;
+      x_sum += x;
+      y_sum += y;
+      z_sum += z;
+      x2_sum += x * x;
+      y2_sum += y * y;
+      z2_sum += z * z;
+    }
+  }
+  x_var_n =  x2_sum - (x_sum * x_sum) * num_r;
+  y_var_n =  y2_sum - (y_sum * y_sum) * num_r;
+  z_var_n =  z2_sum - (z_sum * z_sum) * num_r;
+  xy_var_n = xy_sum - (x_sum * y_sum) * num_r;
+  xz_var_n = xz_sum - (x_sum * z_sum) * num_r;
+  if (x_var_n > 0 && y_var_n > 0) {
+    *hcorr = xy_var_n / sqrt(x_var_n * y_var_n);
+    *hcorr = *hcorr < 0 ? 0 : *hcorr;
+  }
+  if (x_var_n > 0 && z_var_n > 0) {
+    *vcorr = xz_var_n / sqrt(x_var_n * z_var_n);
+    *vcorr = *vcorr < 0 ? 0 : *vcorr;
   }
 }
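The loop in get_horver_correlation() collects the sufficient statistics for two Pearson correlations over the residual: each pixel x against its left neighbour y (giving *hcorr) and its top neighbour z (giving *vcorr), clamped to [0, 1]. The statistic it evaluates, in isolation (pearson_from_sums is a hypothetical helper; assumes <math.h>):

    /* Hypothetical reference form: r = cov(x, y) / sqrt(var(x) * var(y)),
     * computed from running sums over num samples and clamped as above. */
    #include <math.h>
    static double pearson_from_sums(double num, double x_sum, double y_sum,
                                    double xy_sum, double x2_sum,
                                    double y2_sum) {
      const double cov_n = xy_sum - x_sum * y_sum / num;
      const double vx_n = x2_sum - x_sum * x_sum / num;
      const double vy_n = y2_sum - y_sum * y_sum / num;
      if (vx_n <= 0 || vy_n <= 0) return 1.0;  /* degenerate: keep default */
      return cov_n < 0 ? 0.0 : cov_n / sqrt(vx_n * vy_n);
    }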
 
-static void model_rd_for_sb(VP10_COMP *cpi, BLOCK_SIZE bsize,
-                            MACROBLOCK *x, MACROBLOCKD *xd,
-                            int *out_rate_sum, int64_t *out_dist_sum,
-                            int *skip_txfm_sb, int64_t *skip_sse_sb) {
-  // Note our transform coeffs are 8 times an orthogonal transform.
-  // Hence quantizer step is also 8 times. To get effective quantizer
-  // we need to divide by 8 before sending to modeling function.
-  int i;
-  int64_t rate_sum = 0;
-  int64_t dist_sum = 0;
-  const int ref = xd->mi[0]->mbmi.ref_frame[0];
-  unsigned int sse;
-  unsigned int var = 0;
-  unsigned int sum_sse = 0;
-  int64_t total_sse = 0;
-  int skip_flag = 1;
-  const int shift = 6;
-  int rate;
-  int64_t dist;
+int dct_vs_idtx(int16_t *diff, int stride, int w, int h,
+                double *hcorr, double *vcorr) {
+  int prune_bitmask = 0;
+  get_horver_correlation(diff, stride, w, h, hcorr, vcorr);
+
+  if (*vcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
+    prune_bitmask |= 1 << IDTX_1D;
+  else if (*vcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
+    prune_bitmask |= 1 << DCT_1D;
+
+  if (*hcorr > FAST_EXT_TX_CORR_MID + FAST_EXT_TX_CORR_MARGIN)
+    prune_bitmask |= 1 << (IDTX_1D + 8);
+  else if (*hcorr < FAST_EXT_TX_CORR_MID - FAST_EXT_TX_CORR_MARGIN)
+    prune_bitmask |= 1 << (DCT_1D + 8);
+  return prune_bitmask;
+}
+
+// Performance drop: 0.5%, Speed improvement: 24%
+static int prune_two_for_sby(const VP10_COMP *cpi,
+                             BLOCK_SIZE bsize,
+                             MACROBLOCK *x,
+                             MACROBLOCKD *xd, int adst_flipadst,
+                             int dct_idtx) {
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+  const int bw = 4 << (b_width_log2_lookup[bs]);
+  const int bh = 4 << (b_height_log2_lookup[bs]);
+  double hdist[3] = {0, 0, 0}, vdist[3] = {0, 0, 0};
+  double hcorr, vcorr;
+  int prune = 0;
+  vp10_subtract_plane(x, bsize, 0);
+
+  if (adst_flipadst)
+    prune |= adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride,
+                              pd->dst.buf, pd->dst.stride, hdist, vdist);
+  if (dct_idtx)
+    prune |= dct_vs_idtx(p->src_diff, bw, bw, bh, &hcorr, &vcorr);
+
+  return prune;
+}
+#endif  // CONFIG_EXT_TX
+
+// Performance drop: 0.3%, Speed improvement: 5%
+static int prune_one_for_sby(const VP10_COMP *cpi,
+                             BLOCK_SIZE bsize,
+                             MACROBLOCK *x,
+                             MACROBLOCKD *xd) {
+  struct macroblock_plane *const p = &x->plane[0];
+  struct macroblockd_plane *const pd = &xd->plane[0];
+  double hdist[3] = {0, 0, 0}, vdist[3] = {0, 0, 0};
+  vp10_subtract_plane(x, bsize, 0);
+  return adst_vs_flipadst(cpi, bsize, p->src.buf, p->src.stride, pd->dst.buf,
+                          pd->dst.stride, hdist, vdist);
+}
+
+static int prune_tx_types(const VP10_COMP *cpi,
+                          BLOCK_SIZE bsize,
+                          MACROBLOCK *x,
+                          MACROBLOCKD *xd, int tx_set) {
+#if CONFIG_EXT_TX
+  const int *tx_set_1D = ext_tx_used_inter_1D[tx_set];
+#else
+  const int tx_set_1D[TX_TYPES_1D] = {0};
+#endif
+
+  switch (cpi->sf.tx_type_search.prune_mode) {
+    case NO_PRUNE:
+      return 0;
+    case PRUNE_ONE:
+      if (tx_set >= 0 && !(tx_set_1D[FLIPADST_1D] && tx_set_1D[ADST_1D]))
+        return 0;
+      return prune_one_for_sby(cpi, bsize, x, xd);
+#if CONFIG_EXT_TX
+    case PRUNE_TWO:
+      if (tx_set >= 0 && !(tx_set_1D[FLIPADST_1D] && tx_set_1D[ADST_1D])) {
+        if (!(tx_set_1D[DCT_1D] && tx_set_1D[IDTX_1D]))
+          return 0;
+        return prune_two_for_sby(cpi, bsize, x, xd, 0, 1);
+      }
+      if (tx_set >= 0 && !(tx_set_1D[DCT_1D] && tx_set_1D[IDTX_1D]))
+        return prune_two_for_sby(cpi, bsize, x, xd, 1, 0);
+      return prune_two_for_sby(cpi, bsize, x, xd, 1, 1);
+#endif  // CONFIG_EXT_TX
+  }
+  assert(0);
+  return 0;
+}
+
+static int do_tx_type_search(TX_TYPE tx_type,
+                             int prune) {
+// TODO(sarahparker): implement for the non-ext-tx case.
+#if CONFIG_EXT_TX
+  return !(((prune >> vtx_tab[tx_type]) & 1) |
+         ((prune >> (htx_tab[tx_type] + 8)) & 1));
+#else
+  // temporary to avoid compiler warnings
+  (void) vtx_tab;
+  (void) htx_tab;
+  (void) tx_type;
+  (void) prune;
+  return 1;
+#endif
+}
+
+static void model_rd_from_sse(const VP10_COMP *const cpi,
+                              const MACROBLOCKD *const xd,
+                              BLOCK_SIZE bsize,
+                              int plane,
+                              uint64_t sse,
+                              int *rate,
+                              int64_t *dist) {
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
   const int dequant_shift =
 #if CONFIG_VP9_HIGHBITDEPTH
       (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ?
@@ -183,92 +802,72 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
           3;
 
-  x->pred_sse[ref] = 0;
+  // Fast approximation of the modelling function.
+  if (cpi->sf.simple_model_rd_from_var) {
+    const int64_t square_error = sse;
+    int quantizer = (pd->dequant[1] >> dequant_shift);
 
-  for (i = 0; i < MAX_MB_PLANE; ++i) {
-    struct macroblock_plane *const p = &x->plane[i];
-    struct macroblockd_plane *const pd = &xd->plane[i];
-    const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
-    const TX_SIZE max_tx_size = max_txsize_lookup[bs];
-    const BLOCK_SIZE unit_size = txsize_to_bsize[max_tx_size];
-    const int64_t dc_thr = p->quant_thred[0] >> shift;
-    const int64_t ac_thr = p->quant_thred[1] >> shift;
-    // The low thresholds are used to measure if the prediction errors are
-    // low enough so that we can skip the mode search.
-    const int64_t low_dc_thr = VPXMIN(50, dc_thr >> 2);
-    const int64_t low_ac_thr = VPXMIN(80, ac_thr >> 2);
-    int bw = 1 << (b_width_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
-    int bh = 1 << (b_height_log2_lookup[bs] - b_width_log2_lookup[unit_size]);
-    int idx, idy;
-    int lw = b_width_log2_lookup[unit_size] + 2;
-    int lh = b_height_log2_lookup[unit_size] + 2;
-
-    sum_sse = 0;
-
-    for (idy = 0; idy < bh; ++idy) {
-      for (idx = 0; idx < bw; ++idx) {
-        uint8_t *src = p->src.buf + (idy * p->src.stride << lh) + (idx << lw);
-        uint8_t *dst = pd->dst.buf + (idy * pd->dst.stride << lh) + (idx << lh);
-        int block_idx = (idy << 1) + idx;
-        int low_err_skip = 0;
-
-        var = cpi->fn_ptr[unit_size].vf(src, p->src.stride,
-                                        dst, pd->dst.stride, &sse);
-        x->bsse[(i << 2) + block_idx] = sse;
-        sum_sse += sse;
-
-        x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_NONE;
-        if (!x->select_tx_size) {
-          // Check if all ac coefficients can be quantized to zero.
-          if (var < ac_thr || var == 0) {
-            x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_AC_ONLY;
-
-            // Check if dc coefficient can be quantized to zero.
-            if (sse - var < dc_thr || sse == var) {
-              x->skip_txfm[(i << 2) + block_idx] = SKIP_TXFM_AC_DC;
-
-              if (!sse || (var < low_ac_thr && sse - var < low_dc_thr))
-                low_err_skip = 1;
-            }
-          }
-        }
-
-        if (skip_flag && !low_err_skip)
-          skip_flag = 0;
-
-        if (i == 0)
-          x->pred_sse[ref] += sse;
-      }
-    }
-
-    total_sse += sum_sse;
-
-    // Fast approximate the modelling function.
-    if (cpi->sf.simple_model_rd_from_var) {
-      int64_t rate;
-      const int64_t square_error = sum_sse;
-      int quantizer = (pd->dequant[1] >> dequant_shift);
-
-      if (quantizer < 120)
-        rate = (square_error * (280 - quantizer)) >> 8;
-      else
-        rate = 0;
-      dist = (square_error * quantizer) >> 8;
-      rate_sum += rate;
-      dist_sum += dist;
-    } else {
-      vp10_model_rd_from_var_lapndz(sum_sse, num_pels_log2_lookup[bs],
-                                   pd->dequant[1] >> dequant_shift,
-                                   &rate, &dist);
-      rate_sum += rate;
-      dist_sum += dist;
-    }
+    if (quantizer < 120)
+      *rate = (square_error * (280 - quantizer)) >> (16 - VP9_PROB_COST_SHIFT);
+    else
+      *rate = 0;
+    *dist = (square_error * quantizer) >> 8;
+  } else {
+    vp10_model_rd_from_var_lapndz(sse, num_pels_log2_lookup[bsize],
+                                  pd->dequant[1] >> dequant_shift,
+                                  rate, dist);
   }
 
-  *skip_txfm_sb = skip_flag;
+  *dist <<= 4;
+}
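In the fast path of model_rd_from_sse(), rate is modelled as linear in SSE with slope (280 - q) and distortion as q-proportional; the (16 - VP9_PROB_COST_SHIFT) shift re-expresses the legacy >> 8 on the higher-precision cost scale, and the trailing <<= 4 puts the distortion of both branches into the 1/16-unit fixed point used throughout the RD code. A condensed standalone sketch (fast_rd_model is hypothetical; VP9_PROB_COST_SHIFT is assumed to be its usual value of 9, making the rate shift 7):

    /* Hypothetical condensed form of the simple_model_rd_from_var branch. */
    static void fast_rd_model(int64_t sse, int quantizer,
                              int *rate, int64_t *dist) {
      *rate = quantizer < 120
                  ? (int)((sse * (280 - quantizer)) >> 7)  /* assumed shift */
                  : 0;
      *dist = ((sse * quantizer) >> 8) << 4;  /* 1/16-unit distortion */
    }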
+
+static void model_rd_for_sb(const VP10_COMP *const cpi, BLOCK_SIZE bsize,
+                            MACROBLOCK *x, MACROBLOCKD *xd,
+                            int plane_from, int plane_to,
+                            int *out_rate_sum, int64_t *out_dist_sum,
+                            int *skip_txfm_sb, int64_t *skip_sse_sb) {
+  // Note that our transform coefficients are 8 times those of an orthogonal
+  // transform, so the quantizer step is also 8 times larger. To get the
+  // effective quantizer we need to divide by 8 before calling the modeling
+  // function.
+  int plane;
+  const int ref = xd->mi[0]->mbmi.ref_frame[0];
+
+  int64_t rate_sum = 0;
+  int64_t dist_sum = 0;
+  int64_t total_sse = 0;
+
+  x->pred_sse[ref] = 0;
+
+  for (plane = plane_from; plane <= plane_to; ++plane) {
+    struct macroblock_plane *const p = &x->plane[plane];
+    struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE bs = get_plane_block_size(bsize, pd);
+
+    unsigned int sse;
+    int rate;
+    int64_t dist;
+
+    // TODO(geza): Write direct sse functions that do not compute
+    // variance as well.
+    cpi->fn_ptr[bs].vf(p->src.buf, p->src.stride,
+                       pd->dst.buf, pd->dst.stride, &sse);
+
+    if (plane == 0)
+      x->pred_sse[ref] = sse;
+
+    total_sse += sse;
+
+    model_rd_from_sse(cpi, xd, bs, plane, sse, &rate, &dist);
+
+    rate_sum += rate;
+    dist_sum += dist;
+  }
+
+  *skip_txfm_sb = total_sse == 0;
   *skip_sse_sb = total_sse << 4;
   *out_rate_sum = (int)rate_sum;
-  *out_dist_sum = dist_sum << 4;
+  *out_dist_sum = dist_sum;
 }
 
 int64_t vp10_block_error_c(const tran_low_t *coeff, const tran_low_t *dqcoeff,
@@ -336,7 +935,11 @@
 };
 static int cost_coeffs(MACROBLOCK *x,
                        int plane, int block,
+#if CONFIG_VAR_TX
+                       int coeff_ctx,
+#else
                        ENTROPY_CONTEXT *A, ENTROPY_CONTEXT *L,
+#endif
                        TX_SIZE tx_size,
                        const int16_t *scan, const int16_t *nb,
                        int use_fast_coef_costing) {
@@ -350,104 +953,210 @@
   const tran_low_t *const qcoeff = BLOCK_OFFSET(p->qcoeff, block);
   unsigned int (*token_costs)[2][COEFF_CONTEXTS][ENTROPY_TOKENS] =
                    x->token_costs[tx_size][type][is_inter_block(mbmi)];
-  uint8_t token_cache[32 * 32];
+  uint8_t token_cache[MAX_TX_SQUARE];
+#if CONFIG_VAR_TX
+  int pt = coeff_ctx;
+#else
   int pt = combine_entropy_contexts(*A, *L);
+#endif
   int c, cost;
 #if CONFIG_VP9_HIGHBITDEPTH
-  const int16_t *cat6_high_cost = vp10_get_high_cost_table(xd->bd);
+  const int *cat6_high_cost = vp10_get_high_cost_table(xd->bd);
 #else
-  const int16_t *cat6_high_cost = vp10_get_high_cost_table(8);
+  const int *cat6_high_cost = vp10_get_high_cost_table(8);
 #endif
 
+#if !CONFIG_VAR_TX && !CONFIG_SUPERTX
   // Check for consistency of tx_size with mode info
   assert(type == PLANE_TYPE_Y ? mbmi->tx_size == tx_size
                               : get_uv_tx_size(mbmi, pd) == tx_size);
+#endif  // !CONFIG_VAR_TX && !CONFIG_SUPERTX
 
   if (eob == 0) {
     // single eob token
     cost = token_costs[0][0][pt][EOB_TOKEN];
     c = 0;
   } else {
-    int band_left = *band_count++;
+    if (use_fast_coef_costing) {
+      int band_left = *band_count++;
 
-    // dc token
-    int v = qcoeff[0];
-    int16_t prev_t;
-    EXTRABIT e;
-    vp10_get_token_extra(v, &prev_t, &e);
-    cost = (*token_costs)[0][pt][prev_t] +
-        vp10_get_cost(prev_t, e, cat6_high_cost);
+      // dc token
+      int v = qcoeff[0];
+      int16_t prev_t;
+      cost = vp10_get_token_cost(v, &prev_t, cat6_high_cost);
+      cost += (*token_costs)[0][pt][prev_t];
 
-    token_cache[0] = vp10_pt_energy_class[prev_t];
-    ++token_costs;
+      token_cache[0] = vp10_pt_energy_class[prev_t];
+      ++token_costs;
 
-    // ac tokens
-    for (c = 1; c < eob; c++) {
-      const int rc = scan[c];
-      int16_t t;
+      // ac tokens
+      for (c = 1; c < eob; c++) {
+        const int rc = scan[c];
+        int16_t t;
 
-      v = qcoeff[rc];
-      vp10_get_token_extra(v, &t, &e);
-      if (use_fast_coef_costing) {
-        cost += (*token_costs)[!prev_t][!prev_t][t] +
-            vp10_get_cost(t, e, cat6_high_cost);
-      } else {
-        pt = get_coef_context(nb, token_cache, c);
-        cost += (*token_costs)[!prev_t][pt][t] +
-            vp10_get_cost(t, e, cat6_high_cost);
-        token_cache[rc] = vp10_pt_energy_class[t];
+        v = qcoeff[rc];
+        cost += vp10_get_token_cost(v, &t, cat6_high_cost);
+        cost += (*token_costs)[!prev_t][!prev_t][t];
+        prev_t = t;
+        if (!--band_left) {
+          band_left = *band_count++;
+          ++token_costs;
+        }
       }
-      prev_t = t;
-      if (!--band_left) {
-        band_left = *band_count++;
-        ++token_costs;
-      }
-    }
 
-    // eob token
-    if (band_left) {
-      if (use_fast_coef_costing) {
+      // eob token
+      if (band_left)
         cost += (*token_costs)[0][!prev_t][EOB_TOKEN];
-      } else {
+
+    } else {  // !use_fast_coef_costing
+      int band_left = *band_count++;
+
+      // dc token
+      int v = qcoeff[0];
+      int16_t tok;
+      unsigned int (*tok_cost_ptr)[COEFF_CONTEXTS][ENTROPY_TOKENS];
+      cost = vp10_get_token_cost(v, &tok, cat6_high_cost);
+      cost += (*token_costs)[0][pt][tok];
+
+      token_cache[0] = vp10_pt_energy_class[tok];
+      ++token_costs;
+
+      tok_cost_ptr = &((*token_costs)[!tok]);
+
+      // ac tokens
+      for (c = 1; c < eob; c++) {
+        const int rc = scan[c];
+
+        v = qcoeff[rc];
+        cost += vp10_get_token_cost(v, &tok, cat6_high_cost);
+        pt = get_coef_context(nb, token_cache, c);
+        cost += (*tok_cost_ptr)[pt][tok];
+        token_cache[rc] = vp10_pt_energy_class[tok];
+        if (!--band_left) {
+          band_left = *band_count++;
+          ++token_costs;
+        }
+        tok_cost_ptr = &((*token_costs)[!tok]);
+      }
+
+      // eob token
+      if (band_left) {
         pt = get_coef_context(nb, token_cache, c);
         cost += (*token_costs)[0][pt][EOB_TOKEN];
       }
     }
   }
 
+#if !CONFIG_VAR_TX
   // is eob first coefficient;
   *A = *L = (c > 0);
+#endif
 
   return cost;
 }
 
-static void dist_block(MACROBLOCK *x, int plane, int block, TX_SIZE tx_size,
+static void dist_block(const VP10_COMP *cpi, MACROBLOCK *x, int plane,
+                       int block, int blk_row, int blk_col, TX_SIZE tx_size,
                        int64_t *out_dist, int64_t *out_sse) {
-  const int ss_txfrm_size = tx_size << 1;
   MACROBLOCKD* const xd = &x->e_mbd;
   const struct macroblock_plane *const p = &x->plane[plane];
   const struct macroblockd_plane *const pd = &xd->plane[plane];
-  int64_t this_sse;
-  int shift = tx_size == TX_32X32 ? 0 : 2;
-  tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
-  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  if (cpi->sf.use_transform_domain_distortion) {
+    // Transform domain distortion computation is more efficient as it does
+    // not involve an inverse transform, but it is less accurate.
+    const int ss_txfrm_size = tx_size << 1;
+    int64_t this_sse;
+    int tx_type = get_tx_type(pd->plane_type, xd, block, tx_size);
+    int shift = (MAX_TX_SCALE - get_tx_scale(xd, tx_type, tx_size)) * 2;
+    tran_low_t *const coeff = BLOCK_OFFSET(p->coeff, block);
+    tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
 #if CONFIG_VP9_HIGHBITDEPTH
-  const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
-  *out_dist = vp10_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
-                                     &this_sse, bd) >> shift;
+    const int bd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? xd->bd : 8;
+    *out_dist = vp10_highbd_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+                                        &this_sse, bd) >> shift;
 #else
-  *out_dist = vp10_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
-                              &this_sse) >> shift;
+    *out_dist = vp10_block_error(coeff, dqcoeff, 16 << ss_txfrm_size,
+                                 &this_sse) >> shift;
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-  *out_sse = this_sse >> shift;
+    *out_sse = this_sse >> shift;
+  } else {
+    const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+    const int bs = 4 * num_4x4_blocks_wide_lookup[tx_bsize];
+    const int src_stride = x->plane[plane].src.stride;
+    const int dst_stride = xd->plane[plane].dst.stride;
+    const int src_idx = 4 * (blk_row * src_stride + blk_col);
+    const int dst_idx = 4 * (blk_row * dst_stride + blk_col);
+    const uint8_t *src = &x->plane[plane].src.buf[src_idx];
+    const uint8_t *dst = &xd->plane[plane].dst.buf[dst_idx];
+    const tran_low_t *dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+    const uint16_t eob = p->eobs[block];
+
+    unsigned int tmp;
+
+    assert(cpi != NULL);
+
+    cpi->fn_ptr[tx_bsize].vf(src, src_stride, dst, dst_stride, &tmp);
+    *out_sse = (int64_t)tmp * 16;
+
+    if (eob) {
+      const MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_VP9_HIGHBITDEPTH
+      DECLARE_ALIGNED(16, uint16_t, recon16[MAX_TX_SQUARE]);
+      uint8_t *recon = (uint8_t*)recon16;
+#else
+      DECLARE_ALIGNED(16, uint8_t, recon[MAX_TX_SQUARE]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+      const PLANE_TYPE plane_type = plane == 0 ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+
+      INV_TXFM_PARAM inv_txfm_param;
+
+      inv_txfm_param.tx_type = get_tx_type(plane_type, xd, block, tx_size);
+      inv_txfm_param.tx_size = tx_size;
+      inv_txfm_param.eob = eob;
+      inv_txfm_param.lossless = xd->lossless[mbmi->segment_id];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        recon = CONVERT_TO_BYTEPTR(recon);
+        inv_txfm_param.bd = xd->bd;
+        vpx_highbd_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE,
+                                 NULL, 0, NULL, 0, bs, bs, xd->bd);
+        highbd_inv_txfm_add(dqcoeff, recon, MAX_TX_SIZE, &inv_txfm_param);
+      } else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      {
+        vpx_convolve_copy(dst, dst_stride, recon, MAX_TX_SIZE,
+                          NULL, 0, NULL, 0, bs, bs);
+        inv_txfm_add(dqcoeff, recon, MAX_TX_SIZE, &inv_txfm_param);
+      }
+
+      cpi->fn_ptr[tx_bsize].vf(src, src_stride, recon, MAX_TX_SIZE, &tmp);
+    }
+
+    *out_dist = (int64_t)tmp * 16;
+  }
 }
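The pixel-domain branch of dist_block() rebuilds the block (predictor copy plus inverse transform into a scratch buffer) and measures SSE against the source; the * 16 on both outputs keeps the result commensurate with the transform-domain branch, whose coefficients carry a built-in scale. The distortion it produces is plain SSE in those units, as in this hypothetical low-bitdepth sketch (pixel_sse_x16 is not part of this change):

    /* Hypothetical reference: SSE between source and reconstruction in the
     * x16 fixed-point units the RD code expects. */
    #include <stdint.h>
    static int64_t pixel_sse_x16(const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride,
                                 int w, int h) {
      int64_t sse = 0;
      int i, j;
      for (i = 0; i < h; ++i)
        for (j = 0; j < w; ++j) {
          const int d = a[i * a_stride + j] - b[i * b_stride + j];
          sse += (int64_t)d * d;
        }
      return sse * 16;
    }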
 
 static int rate_block(int plane, int block, int blk_row, int blk_col,
                       TX_SIZE tx_size, struct rdcost_block_args* args) {
-  return cost_coeffs(args->x, plane, block, args->t_above + blk_col,
-                     args->t_left + blk_row, tx_size,
-                     args->so->scan, args->so->neighbors,
+#if CONFIG_VAR_TX
+  int coeff_ctx = combine_entropy_contexts(*(args->t_above + blk_col),
+                                           *(args->t_left + blk_row));
+  int coeff_cost = cost_coeffs(args->x, plane, block, coeff_ctx,
+                               tx_size, args->so->scan, args->so->neighbors,
+                               args->use_fast_coef_costing);
+  const struct macroblock_plane *p = &args->x->plane[plane];
+  *(args->t_above + blk_col) = !(p->eobs[block] == 0);
+  *(args->t_left  + blk_row) = !(p->eobs[block] == 0);
+  return coeff_cost;
+#else
+  return cost_coeffs(args->x, plane, block,
+                     args->t_above + blk_col,
+                     args->t_left + blk_row,
+                     tx_size, args->so->scan, args->so->neighbors,
                      args->use_fast_coef_costing);
+#endif  // CONFIG_VAR_TX
 }
 
 static void block_rd_txfm(int plane, int block, int blk_row, int blk_col,
@@ -457,58 +1166,77 @@
   MACROBLOCK *const x = args->x;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+#if CONFIG_NEW_QUANT
+  int ctx;
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+#endif  //  CONFIG_NEW_QUANT
   int64_t rd1, rd2, rd;
   int rate;
   int64_t dist;
   int64_t sse;
+  ENTROPY_CONTEXT coeff_ctx = combine_entropy_contexts(
+      *(args->t_above + blk_col), *(args->t_left + blk_row));
+
+#if CONFIG_NEW_QUANT
+  ctx = get_entropy_context(tx_size, pd->above_context + blk_col,
+                            pd->left_context + blk_row);
+#endif  //  CONFIG_NEW_QUANT
 
   if (args->exit_early)
     return;
 
   if (!is_inter_block(mbmi)) {
-    struct encode_b_args arg = {x, NULL, &mbmi->skip};
+    struct encode_b_args intra_arg = {
+        x, NULL, &mbmi->skip, args->t_above, args->t_left, 1};
     vp10_encode_block_intra(plane, block, blk_row, blk_col,
-                            plane_bsize, tx_size, &arg);
-    dist_block(x, plane, block, tx_size, &dist, &sse);
-  } else if (max_txsize_lookup[plane_bsize] == tx_size) {
-    if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
-        SKIP_TXFM_NONE) {
-      // full forward transform and quantization
-      vp10_xform_quant(x, plane, block, blk_row, blk_col,
-                       plane_bsize, tx_size);
-      dist_block(x, plane, block, tx_size, &dist, &sse);
-    } else if (x->skip_txfm[(plane << 2) + (block >> (tx_size << 1))] ==
-               SKIP_TXFM_AC_ONLY) {
-      // compute DC coefficient
-      tran_low_t *const coeff   = BLOCK_OFFSET(x->plane[plane].coeff, block);
-      tran_low_t *const dqcoeff = BLOCK_OFFSET(xd->plane[plane].dqcoeff, block);
-      vp10_xform_quant_dc(x, plane, block, blk_row, blk_col,
-                          plane_bsize, tx_size);
-      sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
-      dist = sse;
-      if (x->plane[plane].eobs[block]) {
-        const int64_t orig_sse = (int64_t)coeff[0] * coeff[0];
-        const int64_t resd_sse = coeff[0] - dqcoeff[0];
-        int64_t dc_correct = orig_sse - resd_sse * resd_sse;
-#if CONFIG_VP9_HIGHBITDEPTH
-        dc_correct >>= ((xd->bd - 8) * 2);
-#endif
-        if (tx_size != TX_32X32)
-          dc_correct >>= 2;
+                            plane_bsize, tx_size, &intra_arg);
 
-        dist = VPXMAX(0, sse - dc_correct);
-      }
+    if (args->cpi->sf.use_transform_domain_distortion) {
+      dist_block(args->cpi, x, plane, block, blk_row, blk_col,
+                 tx_size, &dist, &sse);
     } else {
-      // SKIP_TXFM_AC_DC
-      // skip forward transform
-      x->plane[plane].eobs[block] = 0;
-      sse  = x->bsse[(plane << 2) + (block >> (tx_size << 1))] << 4;
-      dist = sse;
+      // Note that the vp10_encode_block_intra() call above already calls
+      // inv_txfm_add, so we can't just call dist_block here.
+      const int bs = 4 << tx_size;
+      const BLOCK_SIZE tx_bsize = txsize_to_bsize[tx_size];
+      const vpx_variance_fn_t variance = args->cpi->fn_ptr[tx_bsize].vf;
+
+      const struct macroblock_plane *const p = &x->plane[plane];
+      const struct macroblockd_plane *const pd = &xd->plane[plane];
+
+      const int src_stride = p->src.stride;
+      const int dst_stride = pd->dst.stride;
+      const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+
+      const uint8_t *src = &p->src.buf[4 * (blk_row * src_stride + blk_col)];
+      const uint8_t *dst = &pd->dst.buf[4 * (blk_row * dst_stride + blk_col)];
+      const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+      unsigned int tmp;
+
+      sse = vpx_sum_squares_2d_i16(diff, diff_stride, bs);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+        sse = ROUND_POWER_OF_TWO(sse, (xd->bd - 8) * 2);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      sse = (int64_t)sse * 16;
+
+      variance(src, src_stride, dst, dst_stride, &tmp);
+      dist = (int64_t)tmp * 16;
     }
   } else {
     // full forward transform and quantization
-    vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size);
-    dist_block(x, plane, block, tx_size, &dist, &sse);
+#if CONFIG_NEW_QUANT
+    vp10_xform_quant_fp_nuq(x, plane, block, blk_row, blk_col, plane_bsize,
+                            tx_size, ctx);
+#else
+    vp10_xform_quant(x, plane, block, blk_row, blk_col, plane_bsize, tx_size,
+                     VP10_XFORM_QUANT_FP);
+#endif  // CONFIG_NEW_QUANT
+    if (x->plane[plane].eobs[block])
+      vp10_optimize_b(x, plane, block, tx_size, coeff_ctx);
+    dist_block(args->cpi, x, plane, block, blk_row, blk_col,
+               tx_size, &dist, &sse);
   }
 
   rd = RDCOST(x->rdmult, x->rddiv, 0, dist);
@@ -541,6 +1269,7 @@
 }
 
 static void txfm_rd_in_plane(MACROBLOCK *x,
+                             const VP10_COMP *cpi,
                              int *rate, int64_t *distortion,
                              int *skippable, int64_t *sse,
                              int64_t ref_best_rd, int plane,
@@ -552,6 +1281,7 @@
   struct rdcost_block_args args;
   vp10_zero(args);
   args.x = x;
+  args.cpi = cpi;
   args.best_rd = ref_best_rd;
   args.use_fast_coef_costing = use_fast_coef_casting;
   args.skippable = 1;
@@ -561,11 +1291,11 @@
 
   vp10_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
 
-  tx_type = get_tx_type(pd->plane_type, xd, 0);
-  args.so = get_scan(tx_size, tx_type);
+  tx_type = get_tx_type(pd->plane_type, xd, 0, tx_size);
+  args.so = get_scan(tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
 
   vp10_foreach_transformed_block_in_plane(xd, bsize, plane,
-                                         block_rd_txfm, &args);
+                                          block_rd_txfm, &args);
   if (args.exit_early) {
     *rate       = INT_MAX;
     *distortion = INT64_MAX;
@@ -579,113 +1309,157 @@
   }
 }
 
-static void choose_largest_tx_size(VP10_COMP *cpi, MACROBLOCK *x,
+#if CONFIG_SUPERTX
+void vp10_txfm_rd_in_plane_supertx(MACROBLOCK *x,
+                                   const VP10_COMP *cpi,
                                    int *rate, int64_t *distortion,
-                                   int *skip, int64_t *sse,
-                                   int64_t ref_best_rd,
-                                   BLOCK_SIZE bs) {
-  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
-  VP10_COMMON *const cm = &cpi->common;
-  const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
+                                   int *skippable, int64_t *sse,
+                                   int64_t ref_best_rd, int plane,
+                                   BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                   int use_fast_coef_casting) {
   MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  struct rdcost_block_args args;
+  TX_TYPE tx_type;
 
-  TX_TYPE tx_type, best_tx_type = DCT_DCT;
-  int r, s;
-  int64_t d, psse, this_rd, best_rd = INT64_MAX;
-  vpx_prob skip_prob = vp10_get_skip_prob(cm, xd);
-  int  s0 = vp10_cost_bit(skip_prob, 0);
-  int  s1 = vp10_cost_bit(skip_prob, 1);
-  const int is_inter = is_inter_block(mbmi);
+  vp10_zero(args);
+  args.cpi = cpi;
+  args.x = x;
+  args.best_rd = ref_best_rd;
+  args.use_fast_coef_costing = use_fast_coef_casting;
 
-  mbmi->tx_size = VPXMIN(max_tx_size, largest_tx_size);
-  if (mbmi->tx_size < TX_32X32 &&
-      !xd->lossless[mbmi->segment_id]) {
-    for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
-      mbmi->tx_type = tx_type;
-      txfm_rd_in_plane(x, &r, &d, &s,
-                       &psse, ref_best_rd, 0, bs, mbmi->tx_size,
-                       cpi->sf.use_fast_coef_costing);
-      if (r == INT_MAX)
-        continue;
-      if (is_inter)
-        r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
-      else
-        r += cpi->intra_tx_type_costs[mbmi->tx_size]
-                                     [intra_mode_to_tx_type_context[mbmi->mode]]
-                                     [mbmi->tx_type];
-      if (s)
-        this_rd = RDCOST(x->rdmult, x->rddiv, s1, psse);
-      else
-        this_rd = RDCOST(x->rdmult, x->rddiv, r + s0, d);
-      if (is_inter && !xd->lossless[mbmi->segment_id] && !s)
-        this_rd = VPXMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, psse));
+  if (plane == 0)
+    xd->mi[0]->mbmi.tx_size = tx_size;
 
-      if (this_rd < ((best_tx_type == DCT_DCT) ? ext_tx_th : 1) * best_rd) {
-        best_rd = this_rd;
-        best_tx_type = mbmi->tx_type;
-      }
-    }
-  }
-  mbmi->tx_type = best_tx_type;
-  txfm_rd_in_plane(x, rate, distortion, skip,
-                   sse, ref_best_rd, 0, bs,
-                   mbmi->tx_size, cpi->sf.use_fast_coef_costing);
-  if (mbmi->tx_size < TX_32X32 && !xd->lossless[mbmi->segment_id] &&
-      *rate != INT_MAX) {
-    if (is_inter)
-      *rate += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
-    else
-      *rate += cpi->intra_tx_type_costs[mbmi->tx_size]
-          [intra_mode_to_tx_type_context[mbmi->mode]]
-          [mbmi->tx_type];
+  vp10_get_entropy_contexts(bsize, tx_size, pd, args.t_above, args.t_left);
+
+  tx_type = get_tx_type(pd->plane_type, xd, 0, tx_size);
+  args.so = get_scan(tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+
+  block_rd_txfm(plane, 0, 0, 0, get_plane_block_size(bsize, pd),
+                tx_size, &args);
+
+  if (args.exit_early) {
+    *rate       = INT_MAX;
+    *distortion = INT64_MAX;
+    *sse        = INT64_MAX;
+    *skippable  = 0;
+  } else {
+    *distortion = args.this_dist;
+    *rate       = args.this_rate;
+    *sse        = args.this_sse;
+    *skippable  = !x->plane[plane].eobs[0];
   }
 }
+#endif  // CONFIG_SUPERTX
 
-static void choose_smallest_tx_size(VP10_COMP *cpi, MACROBLOCK *x,
-                                    int *rate, int64_t *distortion,
-                                    int *skip, int64_t *sse,
-                                    int64_t ref_best_rd,
-                                    BLOCK_SIZE bs) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-
-  mbmi->tx_size = TX_4X4;
-
-  txfm_rd_in_plane(x, rate, distortion, skip,
-                   sse, ref_best_rd, 0, bs,
-                   mbmi->tx_size, cpi->sf.use_fast_coef_costing);
-}
-
-static void choose_tx_size_from_rd(VP10_COMP *cpi, MACROBLOCK *x,
-                                   int *rate,
-                                   int64_t *distortion,
-                                   int *skip,
-                                   int64_t *psse,
-                                   int64_t ref_best_rd,
-                                   BLOCK_SIZE bs) {
-  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+static int64_t txfm_yrd(VP10_COMP *cpi, MACROBLOCK *x,
+                        int *r, int64_t *d, int *s, int64_t *sse,
+                        int64_t ref_best_rd,
+                        BLOCK_SIZE bs, TX_TYPE tx_type, int tx_size) {
   VP10_COMMON *const cm = &cpi->common;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
-  vpx_prob skip_prob = vp10_get_skip_prob(cm, xd);
-  int r, s;
-  int64_t d, sse;
   int64_t rd = INT64_MAX;
-  int n, m;
+  vpx_prob skip_prob = vp10_get_skip_prob(cm, xd);
   int s0, s1;
-  int64_t best_rd = INT64_MAX, last_rd = INT64_MAX;
-  TX_SIZE best_tx = max_tx_size;
-  int start_tx, end_tx;
+  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
   const int tx_select = cm->tx_mode == TX_MODE_SELECT;
-  TX_TYPE tx_type, best_tx_type = DCT_DCT;
   const int is_inter = is_inter_block(mbmi);
+  const int r_tx_size =
+      cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)][tx_size];
+#if CONFIG_EXT_TX
+  int ext_tx_set;
+#endif  // CONFIG_EXT_TX
 
-  const vpx_prob *tx_probs = get_tx_probs2(max_tx_size, xd, &cm->fc->tx_probs);
   assert(skip_prob > 0);
   s0 = vp10_cost_bit(skip_prob, 0);
   s1 = vp10_cost_bit(skip_prob, 1);
 
+  mbmi->tx_type = tx_type;
+  mbmi->tx_size = tx_size;
+  txfm_rd_in_plane(x,
+                   cpi,
+                   r, d, s,
+                   sse, ref_best_rd, 0, bs, tx_size,
+                   cpi->sf.use_fast_coef_costing);
+  if (*r == INT_MAX)
+    return INT64_MAX;
+#if CONFIG_EXT_TX
+  ext_tx_set = get_ext_tx_set(tx_size, bs, is_inter);
+  if (get_ext_tx_types(tx_size, bs, is_inter) > 1 &&
+      !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+    if (is_inter) {
+      if (ext_tx_set > 0)
+        *r += cpi->inter_tx_type_costs[ext_tx_set]
+                                      [mbmi->tx_size][mbmi->tx_type];
+    } else {
+      if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
+        *r += cpi->intra_tx_type_costs[ext_tx_set][mbmi->tx_size]
+                                      [mbmi->mode][mbmi->tx_type];
+    }
+  }
+
+#else
+  if (tx_size < TX_32X32 &&
+      !xd->lossless[xd->mi[0]->mbmi.segment_id] && !FIXED_TX_TYPE) {
+    if (is_inter) {
+      *r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
+    } else {
+      *r += cpi->intra_tx_type_costs[mbmi->tx_size]
+           [intra_mode_to_tx_type_context[mbmi->mode]]
+           [mbmi->tx_type];
+    }
+  }
+#endif  // CONFIG_EXT_TX
+
+  if (*s) {
+    if (is_inter) {
+      rd = RDCOST(x->rdmult, x->rddiv, s1, *sse);
+    } else {
+      rd = RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size * tx_select, *sse);
+    }
+  } else {
+    rd = RDCOST(x->rdmult, x->rddiv, *r + s0 + r_tx_size * tx_select, *d);
+  }
+
+  if (tx_select)
+    *r += r_tx_size;
+
+  if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !(*s))
+    rd = VPXMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, *sse));
+
+  return rd;
+}
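txfm_yrd() folds the skip decision into the returned cost: a coded block pays its coefficient rate plus the not-skipped bit s0, a fully skippable block pays only the skipped bit s1 against the full SSE, and an inter block may additionally fall back to the cheaper skip signalling. A hypothetical distillation (rd_with_skip is not part of this change; RDCOST and VPXMIN are the macros used above, and the tx-size signalling cost and lossless check are omitted):

    /* Hypothetical distillation of the skip-vs-code trade-off above. */
    static int64_t rd_with_skip(int rdmult, int rddiv, int rate,
                                int s0, int s1, int skippable, int is_inter,
                                int64_t dist, int64_t sse) {
      const int64_t rd_skip = RDCOST(rdmult, rddiv, s1, sse);
      const int64_t rd_coded = RDCOST(rdmult, rddiv, rate + s0, dist);
      if (skippable) return rd_skip;    /* all coefficients quantize to 0 */
      return is_inter ? VPXMIN(rd_coded, rd_skip) : rd_coded;
    }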
+
+static int64_t choose_tx_size_fix_type(VP10_COMP *cpi,
+                                       BLOCK_SIZE bs,
+                                       MACROBLOCK *x,
+                                       int *rate,
+                                       int64_t *distortion,
+                                       int *skip,
+                                       int64_t *psse,
+                                       int64_t ref_best_rd,
+                                       TX_TYPE tx_type,
+                                       int prune) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  int r, s;
+  int64_t d, sse;
+  int64_t rd = INT64_MAX;
+  int n;
+  int start_tx, end_tx;
+  int64_t best_rd = INT64_MAX, last_rd = INT64_MAX;
+  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+  TX_SIZE best_tx = max_tx_size;
+  uint8_t zcoeff_blk[TX_SIZES][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
+  const int tx_select = cm->tx_mode == TX_MODE_SELECT;
+  const int is_inter = is_inter_block(mbmi);
+#if CONFIG_EXT_TX
+  int ext_tx_set;
+#endif  // CONFIG_EXT_TX
+
   if (tx_select) {
     start_tx = max_tx_size;
     end_tx = 0;
@@ -701,83 +1475,313 @@
   *skip       = 0;
   *psse       = INT64_MAX;
 
-  for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
-    last_rd = INT64_MAX;
-    for (n = start_tx; n >= end_tx; --n) {
-      int r_tx_size = 0;
-      for (m = 0; m <= n - (n == (int) max_tx_size); ++m) {
-        if (m == n)
-          r_tx_size += vp10_cost_zero(tx_probs[m]);
-        else
-          r_tx_size += vp10_cost_one(tx_probs[m]);
+  mbmi->tx_type = tx_type;
+  last_rd = INT64_MAX;
+  for (n = start_tx; n >= end_tx; --n) {
+    if (FIXED_TX_TYPE && tx_type != get_default_tx_type(0, xd, 0, n))
+      continue;
+    if (!is_inter && x->use_default_intra_tx_type &&
+        tx_type != get_default_tx_type(0, xd, 0, n))
+      continue;
+    if (is_inter && x->use_default_inter_tx_type &&
+        tx_type != get_default_tx_type(0, xd, 0, n))
+      continue;
+    if (max_tx_size == TX_32X32 && n == TX_4X4)
+      continue;
+#if CONFIG_EXT_TX
+    ext_tx_set = get_ext_tx_set(n, bs, is_inter);
+    if (is_inter) {
+      if (!ext_tx_used_inter[ext_tx_set][tx_type])
+        continue;
+      if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
+        if (!do_tx_type_search(tx_type, prune))
+          continue;
+      }
+    } else {
+      if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
+        if (tx_type != intra_mode_to_tx_type_context[mbmi->mode])
+          continue;
+      }
+      if (!ext_tx_used_intra[ext_tx_set][tx_type])
+        continue;
+    }
+#else  // CONFIG_EXT_TX
+    if (n >= TX_32X32 && tx_type != DCT_DCT)
+      continue;
+    if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
+        !do_tx_type_search(tx_type, prune))
+      continue;
+#endif  // CONFIG_EXT_TX
+
+    rd = txfm_yrd(cpi, x, &r, &d, &s, &sse, ref_best_rd, bs, tx_type, n);
+
+    // Early termination in transform size search.
+    if (cpi->sf.tx_size_search_breakout &&
+        (rd == INT64_MAX ||
+         (s == 1 && tx_type != DCT_DCT && n < start_tx) ||
+         (n < (int) max_tx_size && rd > last_rd)))
+      break;
+
+    last_rd = rd;
+    if (rd < best_rd) {
+      best_tx = n;
+      best_rd = rd;
+      *distortion = d;
+      *rate       = r;
+      *skip       = s;
+      *psse       = sse;
+      memcpy(zcoeff_blk[mbmi->tx_size], x->zcoeff_blk[mbmi->tx_size],
+             sizeof(zcoeff_blk[mbmi->tx_size][0]) *
+             MAX_MIB_SIZE * MAX_MIB_SIZE * 4);
+    }
+  }
+  mbmi->tx_size = best_tx;
+
+  memcpy(x->zcoeff_blk[mbmi->tx_size], zcoeff_blk[mbmi->tx_size],
+         sizeof(zcoeff_blk[mbmi->tx_size][0]) *
+         MAX_MIB_SIZE * MAX_MIB_SIZE * 4);
+
+  return best_rd;
+}
+
+#if CONFIG_EXT_INTER
+static int64_t estimate_yrd_for_sb(VP10_COMP *cpi,
+                                   BLOCK_SIZE bs,
+                                   MACROBLOCK *x,
+                                   int *r, int64_t *d,
+                                   int *s, int64_t *sse,
+                                   int64_t ref_best_rd) {
+  return txfm_yrd(cpi, x, r, d, s, sse, ref_best_rd, bs,
+                  DCT_DCT, max_txsize_lookup[bs]);
+}
+#endif  // CONFIG_EXT_INTER
+
+static void choose_largest_tx_size(VP10_COMP *cpi, MACROBLOCK *x,
+                                   int *rate, int64_t *distortion,
+                                   int *skip, int64_t *sse,
+                                   int64_t ref_best_rd,
+                                   BLOCK_SIZE bs) {
+  const TX_SIZE max_tx_size = max_txsize_lookup[bs];
+  VP10_COMMON *const cm = &cpi->common;
+  const TX_SIZE largest_tx_size = tx_mode_to_biggest_tx_size[cm->tx_mode];
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  TX_TYPE tx_type, best_tx_type = DCT_DCT;
+  int r, s;
+  int64_t d, psse, this_rd, best_rd = INT64_MAX;
+  vpx_prob skip_prob = vp10_get_skip_prob(cm, xd);
+  int  s0 = vp10_cost_bit(skip_prob, 0);
+  int  s1 = vp10_cost_bit(skip_prob, 1);
+  const int is_inter = is_inter_block(mbmi);
+  int prune = 0;
+#if CONFIG_EXT_TX
+  int ext_tx_set;
+#endif  // CONFIG_EXT_TX
+
+  mbmi->tx_size = VPXMIN(max_tx_size, largest_tx_size);
+
+#if CONFIG_EXT_TX
+  ext_tx_set = get_ext_tx_set(mbmi->tx_size, bs, is_inter);
+#endif  // CONFIG_EXT_TX
+
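+  // Speed feature: for inter blocks, optionally prune transform types
+  // before the exhaustive tx_type loop below.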
+  if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
+#if CONFIG_EXT_TX
+    prune = prune_tx_types(cpi, bs, x, xd, ext_tx_set);
+#else
+    prune = prune_tx_types(cpi, bs, x, xd, 0);
+#endif
+#if CONFIG_EXT_TX
+  if (get_ext_tx_types(mbmi->tx_size, bs, is_inter) > 1 &&
+      !xd->lossless[mbmi->segment_id]) {
+    for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+      if (is_inter) {
+        if (x->use_default_inter_tx_type &&
+            tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+          continue;
+        if (!ext_tx_used_inter[ext_tx_set][tx_type])
+          continue;
+        if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
+          if (!do_tx_type_search(tx_type, prune))
+            continue;
+        }
+      } else {
+        if (x->use_default_intra_tx_type &&
+            tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+          continue;
+        if (!ALLOW_INTRA_EXT_TX && bs >= BLOCK_8X8) {
+          if (tx_type != intra_mode_to_tx_type_context[mbmi->mode])
+            continue;
+        }
+        if (!ext_tx_used_intra[ext_tx_set][tx_type])
+          continue;
       }
 
-      if (n >= TX_32X32 && tx_type != DCT_DCT) {
-        continue;
-      }
       mbmi->tx_type = tx_type;
-      txfm_rd_in_plane(x, &r, &d, &s,
-                       &sse, ref_best_rd, 0, bs, n,
+
+      txfm_rd_in_plane(x,
+                       cpi,
+                       &r, &d, &s,
+                       &psse, ref_best_rd, 0, bs, mbmi->tx_size,
                        cpi->sf.use_fast_coef_costing);
-      if (n < TX_32X32 &&
-          !xd->lossless[xd->mi[0]->mbmi.segment_id] &&
-          r != INT_MAX) {
-        if (is_inter)
-          r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
-        else
-          r += cpi->intra_tx_type_costs[mbmi->tx_size]
-              [intra_mode_to_tx_type_context[mbmi->mode]]
-              [mbmi->tx_type];
-      }
 
       if (r == INT_MAX)
         continue;
-
-      if (s) {
+      if (get_ext_tx_types(mbmi->tx_size, bs, is_inter) > 1) {
         if (is_inter) {
-          rd = RDCOST(x->rdmult, x->rddiv, s1, sse);
+          if (ext_tx_set > 0)
+            r += cpi->inter_tx_type_costs[ext_tx_set]
+                                         [mbmi->tx_size][mbmi->tx_type];
         } else {
-          rd =  RDCOST(x->rdmult, x->rddiv, s1 + r_tx_size * tx_select, sse);
+          if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
+            r += cpi->intra_tx_type_costs[ext_tx_set][mbmi->tx_size]
+                                         [mbmi->mode][mbmi->tx_type];
         }
-      } else {
-        rd = RDCOST(x->rdmult, x->rddiv, r + s0 + r_tx_size * tx_select, d);
       }
 
-      if (tx_select && !(s && is_inter))
-        r += r_tx_size;
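+      // RD cost: a skipped block pays only the skip flag (s1) against the
+      // source SSE; otherwise the token rate plus the not-skip flag (s0)
+      // is weighed against the reconstruction distortion.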
+      if (s)
+        this_rd = RDCOST(x->rdmult, x->rddiv, s1, psse);
+      else
+        this_rd = RDCOST(x->rdmult, x->rddiv, r + s0, d);
+      if (is_inter_block(mbmi) && !xd->lossless[mbmi->segment_id] && !s)
+        this_rd = VPXMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, psse));
 
-      if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !s)
-        rd = VPXMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, sse));
-
-      // Early termination in transform size search.
-      if (cpi->sf.tx_size_search_breakout &&
-          (rd == INT64_MAX ||
-           (s == 1 && tx_type != DCT_DCT && n < start_tx) ||
-           (n < (int) max_tx_size && rd > last_rd)))
-        break;
-
-      last_rd = rd;
-      if (rd <
-          (is_inter && best_tx_type == DCT_DCT ? ext_tx_th : 1) *
-          best_rd) {
-        best_tx = n;
-        best_rd = rd;
-        *distortion = d;
-        *rate       = r;
-        *skip       = s;
-        *psse       = sse;
+      if (this_rd < best_rd) {
+        best_rd = this_rd;
         best_tx_type = mbmi->tx_type;
       }
     }
   }
 
+#else  // CONFIG_EXT_TX
+  if (mbmi->tx_size < TX_32X32 &&
+      !xd->lossless[mbmi->segment_id]) {
+    for (tx_type = 0; tx_type < TX_TYPES; ++tx_type) {
+      if (!is_inter && x->use_default_intra_tx_type &&
+          tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+        continue;
+      if (is_inter && x->use_default_inter_tx_type &&
+          tx_type != get_default_tx_type(0, xd, 0, mbmi->tx_size))
+        continue;
+      mbmi->tx_type = tx_type;
+      txfm_rd_in_plane(x,
+                       cpi,
+                       &r, &d, &s,
+                       &psse, ref_best_rd, 0, bs, mbmi->tx_size,
+                       cpi->sf.use_fast_coef_costing);
+      if (r == INT_MAX)
+        continue;
+      if (is_inter) {
+        r += cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
+        if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
+            !do_tx_type_search(tx_type, prune))
+          continue;
+      } else {
+        r += cpi->intra_tx_type_costs[mbmi->tx_size]
+                                     [intra_mode_to_tx_type_context[mbmi->mode]]
+                                     [mbmi->tx_type];
+      }
+      if (s)
+        this_rd = RDCOST(x->rdmult, x->rddiv, s1, psse);
+      else
+        this_rd = RDCOST(x->rdmult, x->rddiv, r + s0, d);
+      if (is_inter && !xd->lossless[mbmi->segment_id] && !s)
+        this_rd = VPXMIN(this_rd, RDCOST(x->rdmult, x->rddiv, s1, psse));
+
+      if (this_rd < best_rd) {
+        best_rd = this_rd;
+        best_tx_type = mbmi->tx_type;
+      }
+    }
+  }
+#endif  // CONFIG_EXT_TX
+  mbmi->tx_type = best_tx_type;
+
+  txfm_rd_in_plane(x,
+                   cpi,
+                   rate, distortion, skip,
+                   sse, ref_best_rd, 0, bs,
+                   mbmi->tx_size, cpi->sf.use_fast_coef_costing);
+}
+
+static void choose_smallest_tx_size(VP10_COMP *cpi, MACROBLOCK *x,
+                                    int *rate, int64_t *distortion,
+                                    int *skip, int64_t *sse,
+                                    int64_t ref_best_rd,
+                                    BLOCK_SIZE bs) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+
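+  // Used for lossless blocks, where only the 4x4 transform is available;
+  // the transform size and type are fixed rather than searched.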
+  mbmi->tx_size = TX_4X4;
+  mbmi->tx_type = DCT_DCT;
+
+  txfm_rd_in_plane(x,
+                   cpi,
+                   rate, distortion, skip,
+                   sse, ref_best_rd, 0, bs,
+                   mbmi->tx_size, cpi->sf.use_fast_coef_costing);
+}
+
+static void choose_tx_size_type_from_rd(VP10_COMP *cpi, MACROBLOCK *x,
+                                        int *rate,
+                                        int64_t *distortion,
+                                        int *skip,
+                                        int64_t *psse,
+                                        int64_t ref_best_rd,
+                                        BLOCK_SIZE bs) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  uint8_t zcoeff_blk[TX_SIZES][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
+  int r, s;
+  int64_t d, sse;
+  int64_t rd = INT64_MAX;
+  int64_t best_rd = INT64_MAX;
+  TX_SIZE best_tx = max_txsize_lookup[bs];
+  const int is_inter = is_inter_block(mbmi);
+  TX_TYPE tx_type, best_tx_type = DCT_DCT;
+  int prune = 0;
+
+  if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
+    // Passing -1 for tx_type indicates that all 1D transforms should be
+    // considered for pruning.
+    prune = prune_tx_types(cpi, bs, x, xd, -1);
+
+  *distortion = INT64_MAX;
+  *rate       = INT_MAX;
+  *skip       = 0;
+  *psse       = INT64_MAX;
+
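+  // Outer loop over transform types; choose_tx_size_fix_type() then searches
+  // transform sizes for each candidate type.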
+  for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+#if CONFIG_REF_MV
+    if (tx_type != DCT_DCT && is_inter && mbmi->ref_mv_idx > 0)
+      continue;
+#endif
+    rd = choose_tx_size_fix_type(cpi, bs, x, &r, &d, &s, &sse, ref_best_rd,
+                                 tx_type, prune);
+    if (rd < best_rd) {
+      best_rd = rd;
+      *distortion = d;
+      *rate       = r;
+      *skip       = s;
+      *psse       = sse;
+      best_tx_type = tx_type;
+      best_tx = mbmi->tx_size;
+      memcpy(zcoeff_blk[mbmi->tx_size], x->zcoeff_blk[mbmi->tx_size],
+             sizeof(zcoeff_blk[mbmi->tx_size][0]) *
+             MAX_MIB_SIZE * MAX_MIB_SIZE * 4);
+    }
+  }
+
   mbmi->tx_size = best_tx;
   mbmi->tx_type = best_tx_type;
+
+#if !CONFIG_EXT_TX
   if (mbmi->tx_size >= TX_32X32)
     assert(mbmi->tx_type == DCT_DCT);
-  txfm_rd_in_plane(x, &r, &d, &s,
-                   &sse, ref_best_rd, 0, bs, best_tx,
-                   cpi->sf.use_fast_coef_costing);
+#endif
+
+  memcpy(x->zcoeff_blk[mbmi->tx_size], zcoeff_blk[mbmi->tx_size],
+         sizeof(zcoeff_blk[mbmi->tx_size][0]) *
+         MAX_MIB_SIZE * MAX_MIB_SIZE * 4);
 }
 
 static void super_block_yrd(VP10_COMP *cpi, MACROBLOCK *x, int *rate,
@@ -790,16 +1794,15 @@
 
   assert(bs == xd->mi[0]->mbmi.sb_type);
 
-  if (CONFIG_MISC_FIXES && xd->lossless[0]) {
+  if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
     choose_smallest_tx_size(cpi, x, rate, distortion, skip, ret_sse,
                             ref_best_rd, bs);
-  } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL ||
-             xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+  } else if (cpi->sf.tx_size_search_method == USE_LARGESTALL) {
     choose_largest_tx_size(cpi, x, rate, distortion, skip, ret_sse, ref_best_rd,
                            bs);
   } else {
-    choose_tx_size_from_rd(cpi, x, rate, distortion, skip, ret_sse,
-                           ref_best_rd, bs);
+    choose_tx_size_type_from_rd(cpi, x, rate, distortion, skip, ret_sse,
+                                ref_best_rd, bs);
   }
 }
 
@@ -824,6 +1827,179 @@
   return 0;
 }
 
+static int rd_pick_palette_intra_sby(VP10_COMP *cpi, MACROBLOCK *x,
+                                     BLOCK_SIZE bsize,
+                                     int palette_ctx, int dc_mode_cost,
+                                     PALETTE_MODE_INFO *palette_mode_info,
+                                     uint8_t *best_palette_color_map,
+                                     TX_SIZE *best_tx, TX_TYPE *best_tx_type,
+                                     PREDICTION_MODE *mode_selected,
+                                     int64_t *best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mic = xd->mi[0];
+  const int rows = 4 * num_4x4_blocks_high_lookup[bsize];
+  const int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+  int this_rate, this_rate_tokenonly, s, colors, n;
+  int rate_overhead = 0;
+  int64_t this_distortion, this_rd;
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *const src = x->plane[0].src.buf;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cpi->common.use_highbitdepth)
+    colors = vp10_count_colors_highbd(src, src_stride, rows, cols,
+                                      cpi->common.bit_depth);
+  else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    colors = vp10_count_colors(src, src_stride, rows, cols);
+  palette_mode_info->palette_size[0] = 0;
+#if CONFIG_EXT_INTRA
+  mic->mbmi.ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+#endif  // CONFIG_EXT_INTRA
+
+  if (colors > 1 && colors <= 64 && cpi->common.allow_screen_content_tools) {
+    int r, c, i, j, k;
+    const int max_itr = 50;
+    int color_ctx, color_idx = 0;
+    int color_order[PALETTE_MAX_SIZE];
+    float *const data = x->palette_buffer->kmeans_data_buf;
+    uint8_t *const indices = x->palette_buffer->kmeans_indices_buf;
+    uint8_t *const pre_indices = x->palette_buffer->kmeans_pre_indices_buf;
+    float centroids[PALETTE_MAX_SIZE];
+    uint8_t *const color_map = xd->plane[0].color_index_map;
+    float lb, ub, val;
+    MB_MODE_INFO *const mbmi = &mic->mbmi;
+    PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint16_t *src16 = CONVERT_TO_SHORTPTR(src);
+    if (cpi->common.use_highbitdepth)
+      lb = ub = src16[0];
+    else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      lb = ub = src[0];
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (cpi->common.use_highbitdepth) {
+      for (r = 0; r < rows; ++r) {
+        for (c = 0; c < cols; ++c) {
+          val = src16[r * src_stride + c];
+          data[r * cols + c] = val;
+          if (val < lb)
+            lb = val;
+          else if (val > ub)
+            ub = val;
+        }
+      }
+    } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      for (r = 0; r < rows; ++r) {
+        for (c = 0; c < cols; ++c) {
+          val = src[r * src_stride + c];
+          data[r * cols + c] = val;
+          if (val < lb)
+            lb = val;
+          else if (val > ub)
+            ub = val;
+        }
+      }
+#if CONFIG_VP9_HIGHBITDEPTH
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    mbmi->mode = DC_PRED;
+#if CONFIG_EXT_INTRA
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+#endif  // CONFIG_EXT_INTRA
+
+    if (rows * cols > PALETTE_MAX_BLOCK_SIZE)
+      return 0;
+
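+    // Search palette sizes from largest to smallest: seed the centroids
+    // evenly over [lb, ub], run k-means, then sort, round, and de-duplicate
+    // the resulting colors.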
+    for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors;
+        n >= 2; --n) {
+      for (i = 0; i < n; ++i)
+        centroids[i] = lb + (2 * i + 1) * (ub - lb) / n / 2;
+      vp10_k_means(data, centroids, indices, pre_indices, rows * cols,
+                   n, 1, max_itr);
+      vp10_insertion_sort(centroids, n);
+      for (i = 0; i < n; ++i)
+        centroids[i] = roundf(centroids[i]);
+      // Remove duplicate centroid values after rounding.
+      i = 1;
+      k = n;
+      while (i < k) {
+        if (centroids[i] == centroids[i - 1]) {
+          j = i;
+          while (j < k - 1) {
+            assert((j + 1) < PALETTE_MAX_SIZE);
+            assert(j > 0);
+            centroids[j] = centroids[j + 1];
+            ++j;
+          }
+          --k;
+        } else {
+          ++i;
+        }
+      }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cpi->common.use_highbitdepth)
+        for (i = 0; i < k; ++i)
+          pmi->palette_colors[i] = clip_pixel_highbd((int)lroundf(centroids[i]),
+                                                     cpi->common.bit_depth);
+      else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        for (i = 0; i < k; ++i)
+          pmi->palette_colors[i] = clip_pixel((int)lroundf(centroids[i]));
+      pmi->palette_size[0] = k;
+
+      vp10_calc_indices(data, centroids, indices, rows * cols, k, 1);
+      for (r = 0; r < rows; ++r)
+        for (c = 0; c < cols; ++c)
+          color_map[r * cols + c] = indices[r * cols + c];
+
+      super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                      &s, NULL, bsize, *best_rd);
+      if (this_rate_tokenonly == INT_MAX)
+        continue;
+
+      this_rate = this_rate_tokenonly + dc_mode_cost +
+          cpi->common.bit_depth * k * vp10_cost_bit(128, 0) +
+          cpi->palette_y_size_cost[bsize - BLOCK_8X8][k - 2] +
+          write_uniform_cost(k, color_map[0]) +
+          vp10_cost_bit(vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
+                                                        [palette_ctx], 1);
+      for (i = 0; i < rows; ++i) {
+        for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
+          color_ctx = vp10_get_palette_color_context(color_map, cols, i, j,
+                                                     k, color_order);
+          for (r = 0; r < k; ++r)
+            if (color_map[i * cols + j] == color_order[r]) {
+              color_idx = r;
+              break;
+            }
+          assert(color_idx >= 0 && color_idx < k);
+          this_rate +=
+              cpi->palette_y_color_cost[k - 2][color_ctx][color_idx];
+        }
+      }
+      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+      if (this_rd < *best_rd) {
+        *best_rd = this_rd;
+        *palette_mode_info = *pmi;
+        memcpy(best_palette_color_map, color_map,
+               rows * cols * sizeof(color_map[0]));
+        *mode_selected = DC_PRED;
+        *best_tx = mbmi->tx_size;
+        *best_tx_type = mbmi->tx_type;
+        rate_overhead = this_rate - this_rate_tokenonly;
+      }
+    }
+  }
+
+  return rate_overhead;
+}
+
 static int64_t rd_pick_intra4x4block(VP10_COMP *cpi, MACROBLOCK *x,
                                      int row, int col,
                                      PREDICTION_MODE *best_mode,
@@ -851,9 +2027,10 @@
   uint16_t best_dst16[8 * 8];
 #endif
 
-  memcpy(ta, a, sizeof(ta));
-  memcpy(tl, l, sizeof(tl));
+  memcpy(ta, a, num_4x4_blocks_wide * sizeof(a[0]));
+  memcpy(tl, l, num_4x4_blocks_high * sizeof(l[0]));
   xd->mi[0]->mbmi.tx_size = TX_4X4;
+  xd->mi[0]->mbmi.palette_mode_info.palette_size[0] = 0;
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -870,11 +2047,11 @@
       // one of the neighboring directional modes
       if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
         if (conditional_skipintra(mode, *best_mode))
-            continue;
+          continue;
       }
 
-      memcpy(tempa, ta, sizeof(ta));
-      memcpy(templ, tl, sizeof(tl));
+      memcpy(tempa, ta, num_4x4_blocks_wide * sizeof(ta[0]));
+      memcpy(templ, tl, num_4x4_blocks_high * sizeof(tl[0]));
 
       for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
         for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
@@ -882,9 +2059,8 @@
           const uint8_t *const src = &src_init[idx * 4 + idy * 4 * src_stride];
           uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
           int16_t *const src_diff = vp10_raster_block_offset_int16(BLOCK_8X8,
-                                                                  block,
-                                                                  p->src_diff);
-          tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
+                                                                   block,
+                                                                   p->src_diff);
           xd->mi[0]->bmi[block].as_mode = mode;
           vp10_predict_intra_block(xd, 1, 1, TX_4X4, mode, dst, dst_stride,
                                   dst, dst_stride,
@@ -892,35 +2068,58 @@
           vpx_highbd_subtract_block(4, 4, src_diff, 8, src, src_stride,
                                     dst, dst_stride, xd->bd);
           if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-            TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
-            const scan_order *so = get_scan(TX_4X4, tx_type);
-            vp10_highbd_fwd_txfm_4x4(src_diff, coeff, 8, DCT_DCT, 1);
-            vp10_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
-            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+            TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, TX_4X4);
+            const scan_order *so = get_scan(TX_4X4, tx_type, 0);
+#if CONFIG_VAR_TX
+            const int coeff_ctx = combine_entropy_contexts(*(tempa + idx),
+                                                           *(templ + idy));
+#endif  // CONFIG_VAR_TX
+            vp10_xform_quant(x, 0, block, row + idy, col + idx, BLOCK_8X8,
+                             TX_4X4, VP10_XFORM_QUANT_FP);
+#if CONFIG_VAR_TX
+            ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+                                 so->neighbors, cpi->sf.use_fast_coef_costing);
+            *(tempa + idx) = !(p->eobs[block] == 0);
+            *(templ + idy) = !(p->eobs[block] == 0);
+#else
+            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy,
+                                 TX_4X4,
                                  so->scan, so->neighbors,
                                  cpi->sf.use_fast_coef_costing);
+#endif  // CONFIG_VAR_TX
             if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
               goto next_highbd;
             vp10_highbd_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block),
                                          dst, dst_stride, p->eobs[block],
                                          xd->bd, DCT_DCT, 1);
           } else {
-            int64_t unused;
-            TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
-            const scan_order *so = get_scan(TX_4X4, tx_type);
-            vp10_highbd_fwd_txfm_4x4(src_diff, coeff, 8, tx_type, 0);
-            vp10_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
-            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
-                                 so->scan, so->neighbors,
+            int64_t dist;
+            unsigned int tmp;
+            TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, TX_4X4);
+            const scan_order *so = get_scan(TX_4X4, tx_type, 0);
+            const int coeff_ctx = combine_entropy_contexts(*(tempa + idx),
+                                                           *(templ + idy));
+            vp10_xform_quant(x, 0, block, row + idy, col + idx, BLOCK_8X8,
+                             TX_4X4, VP10_XFORM_QUANT_FP);
+            vp10_optimize_b(x, 0, block, TX_4X4, coeff_ctx);
+#if CONFIG_VAR_TX
+            ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+                                 so->neighbors, cpi->sf.use_fast_coef_costing);
+            *(tempa + idx) = !(p->eobs[block] == 0);
+            *(templ + idy) = !(p->eobs[block] == 0);
+#else
+            ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy,
+                                 TX_4X4, so->scan, so->neighbors,
                                  cpi->sf.use_fast_coef_costing);
-            distortion += vp10_highbd_block_error(
-                coeff, BLOCK_OFFSET(pd->dqcoeff, block),
-                16, &unused, xd->bd) >> 2;
-            if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
-              goto next_highbd;
+#endif  // CONFIG_VAR_TX
             vp10_highbd_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block),
                                          dst, dst_stride, p->eobs[block],
                                          xd->bd, tx_type, 0);
+            cpi->fn_ptr[BLOCK_4X4].vf(src, src_stride, dst, dst_stride, &tmp);
+            dist = (int64_t)tmp << 4;
+            distortion += dist;
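+            // As in the 8-bit path below, the early-exit check follows the
+            // inverse transform so pixel-domain distortion can be used.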
+            if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+              goto next_highbd;
           }
         }
       }
@@ -934,17 +2133,18 @@
         *bestdistortion = distortion;
         best_rd = this_rd;
         *best_mode = mode;
-        memcpy(a, tempa, sizeof(tempa));
-        memcpy(l, templ, sizeof(templ));
+        memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0]));
+        memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0]));
         for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy) {
           memcpy(best_dst16 + idy * 8,
                  CONVERT_TO_SHORTPTR(dst_init + idy * dst_stride),
                  num_4x4_blocks_wide * 4 * sizeof(uint16_t));
         }
       }
-    next_highbd:
+next_highbd:
       {}
     }
+
     if (best_rd >= rd_thresh)
       return best_rd;
 
@@ -971,11 +2171,11 @@
     // one of the neighboring directional modes
     if (cpi->sf.mode_search_skip_flags & FLAG_SKIP_INTRA_DIRMISMATCH) {
       if (conditional_skipintra(mode, *best_mode))
-          continue;
+        continue;
     }
 
-    memcpy(tempa, ta, sizeof(ta));
-    memcpy(templ, tl, sizeof(tl));
+    memcpy(tempa, ta, num_4x4_blocks_wide * sizeof(ta[0]));
+    memcpy(templ, tl, num_4x4_blocks_high * sizeof(tl[0]));
 
     for (idy = 0; idy < num_4x4_blocks_high; ++idy) {
       for (idx = 0; idx < num_4x4_blocks_wide; ++idx) {
@@ -984,39 +2184,65 @@
         uint8_t *const dst = &dst_init[idx * 4 + idy * 4 * dst_stride];
         int16_t *const src_diff =
             vp10_raster_block_offset_int16(BLOCK_8X8, block, p->src_diff);
-        tran_low_t *const coeff = BLOCK_OFFSET(x->plane[0].coeff, block);
         xd->mi[0]->bmi[block].as_mode = mode;
         vp10_predict_intra_block(xd, 1, 1, TX_4X4, mode, dst, dst_stride,
                                 dst, dst_stride, col + idx, row + idy, 0);
         vpx_subtract_block(4, 4, src_diff, 8, src, src_stride, dst, dst_stride);
 
         if (xd->lossless[xd->mi[0]->mbmi.segment_id]) {
-          TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
-          const scan_order *so = get_scan(TX_4X4, tx_type);
-          vp10_fwd_txfm_4x4(src_diff, coeff, 8, DCT_DCT, 1);
-          vp10_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
-          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
+          TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, TX_4X4);
+          const scan_order *so = get_scan(TX_4X4, tx_type, 0);
+#if CONFIG_VAR_TX
+          const int coeff_ctx = combine_entropy_contexts(*(tempa + idx),
+                                                         *(templ + idy));
+#endif
+          vp10_xform_quant(x, 0, block, row + idy, col + idx, BLOCK_8X8,
+                           TX_4X4, VP10_XFORM_QUANT_B);
+#if CONFIG_VAR_TX
+          ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+                               so->neighbors, cpi->sf.use_fast_coef_costing);
+          *(tempa + idx) = !(p->eobs[block] == 0);
+          *(templ + idy) = !(p->eobs[block] == 0);
+#else
+          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy,
+                               TX_4X4,
                                so->scan, so->neighbors,
                                cpi->sf.use_fast_coef_costing);
+#endif
           if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
             goto next;
           vp10_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block),
                                 dst, dst_stride, p->eobs[block], DCT_DCT, 1);
         } else {
-          int64_t unused;
-          TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block);
-          const scan_order *so = get_scan(TX_4X4, tx_type);
-          vp10_fwd_txfm_4x4(src_diff, coeff, 8, tx_type, 0);
-          vp10_regular_quantize_b_4x4(x, 0, block, so->scan, so->iscan);
-          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy, TX_4X4,
-                             so->scan, so->neighbors,
-                             cpi->sf.use_fast_coef_costing);
-          distortion += vp10_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, block),
-                                        16, &unused) >> 2;
-          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
-            goto next;
+          int64_t dist;
+          unsigned int tmp;
+          TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, block, TX_4X4);
+          const scan_order *so = get_scan(TX_4X4, tx_type, 0);
+          const int coeff_ctx = combine_entropy_contexts(*(tempa + idx),
+                                                         *(templ + idy));
+          vp10_xform_quant(x, 0, block, row + idy, col + idx, BLOCK_8X8,
+                           TX_4X4, VP10_XFORM_QUANT_FP);
+          vp10_optimize_b(x, 0, block, TX_4X4, coeff_ctx);
+#if CONFIG_VAR_TX
+          ratey += cost_coeffs(x, 0, block, coeff_ctx, TX_4X4, so->scan,
+                               so->neighbors, cpi->sf.use_fast_coef_costing);
+          *(tempa + idx) = !(p->eobs[block] == 0);
+          *(templ + idy) = !(p->eobs[block] == 0);
+#else
+          ratey += cost_coeffs(x, 0, block, tempa + idx, templ + idy,
+                               TX_4X4, so->scan, so->neighbors,
+                               cpi->sf.use_fast_coef_costing);
+#endif
           vp10_inv_txfm_add_4x4(BLOCK_OFFSET(pd->dqcoeff, block),
                                 dst, dst_stride, p->eobs[block], tx_type, 0);
+          cpi->fn_ptr[BLOCK_4X4].vf(src, src_stride, dst, dst_stride, &tmp);
+          dist = (int64_t)tmp << 4;
+          distortion += dist;
+          // To use pixel-domain distortion, this early-exit check must come
+          // after the inverse transform. Compared with computing distortion
+          // in the frequency domain, the extra encoding overhead is small.
+          if (RDCOST(x->rdmult, x->rddiv, ratey, distortion) >= best_rd)
+            goto next;
         }
       }
     }
@@ -1030,8 +2256,8 @@
       *bestdistortion = distortion;
       best_rd = this_rd;
       *best_mode = mode;
-      memcpy(a, tempa, sizeof(tempa));
-      memcpy(l, templ, sizeof(templ));
+      memcpy(a, tempa, num_4x4_blocks_wide * sizeof(tempa[0]));
+      memcpy(l, templ, num_4x4_blocks_high * sizeof(templ[0]));
       for (idy = 0; idy < num_4x4_blocks_high * 4; ++idy)
         memcpy(best_dst + idy * 8, dst_init + idy * dst_stride,
                num_4x4_blocks_wide * 4);
@@ -1067,18 +2293,18 @@
   int64_t total_distortion = 0;
   int tot_rate_y = 0;
   int64_t total_rd = 0;
-  ENTROPY_CONTEXT t_above[4], t_left[4];
-  const int *bmode_costs = cpi->mbmode_cost;
+  const int *bmode_costs = cpi->mbmode_cost[0];
 
-  memcpy(t_above, xd->plane[0].above_context, sizeof(t_above));
-  memcpy(t_left, xd->plane[0].left_context, sizeof(t_left));
+#if CONFIG_EXT_INTRA
+  mic->mbmi.ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+  mic->mbmi.intra_filter = INTRA_FILTER_LINEAR;
+#endif  // CONFIG_EXT_INTRA
 
   // TODO(any): Add search of the tx_type to improve rd performance at the
   // expense of speed.
   mic->mbmi.tx_type = DCT_DCT;
+  mic->mbmi.tx_size = TX_4X4;
 
-  // Later we can add search of the tx_type to improve results.
-  // For now just set it to DCT_DCT
   // Pick modes for each sub-block (of size 4x4, 4x8, or 8x4) in an 8x8 block.
   for (idy = 0; idy < 2; idy += num_4x4_blocks_high) {
     for (idx = 0; idx < 2; idx += num_4x4_blocks_wide) {
@@ -1094,7 +2320,9 @@
       }
 
       this_rd = rd_pick_intra4x4block(cpi, mb, idy, idx, &best_mode,
-                                      bmode_costs, t_above + idx, t_left + idy,
+                                      bmode_costs,
+                                      xd->plane[0].above_context + idx,
+                                      xd->plane[0].left_context + idy,
                                       &r, &ry, &d, bsize, best_rd - total_rd);
       if (this_rd >= best_rd - total_rd)
         return INT64_MAX;
@@ -1114,55 +2342,557 @@
         return INT64_MAX;
     }
   }
+  mic->mbmi.mode = mic->bmi[3].as_mode;
+
+  // Add in the cost of the transform type
+  if (!xd->lossless[mic->mbmi.segment_id]) {
+    int rate_tx_type = 0;
+#if CONFIG_EXT_TX
+    if (get_ext_tx_types(TX_4X4, bsize, 0) > 1) {
+      const int eset = get_ext_tx_set(TX_4X4, bsize, 0);
+      rate_tx_type =
+          cpi->intra_tx_type_costs[eset][TX_4X4]
+                                  [mic->mbmi.mode][mic->mbmi.tx_type];
+    }
+#else
+    rate_tx_type =
+        cpi->intra_tx_type_costs[TX_4X4]
+                                [intra_mode_to_tx_type_context[mic->mbmi.mode]]
+                                [mic->mbmi.tx_type];
+#endif
+    assert(mic->mbmi.tx_size == TX_4X4);
+    cost += rate_tx_type;
+    tot_rate_y += rate_tx_type;
+  }
 
   *rate = cost;
   *rate_y = tot_rate_y;
   *distortion = total_distortion;
-  mic->mbmi.mode = mic->bmi[3].as_mode;
 
   return RDCOST(mb->rdmult, mb->rddiv, cost, total_distortion);
 }
 
+#if CONFIG_EXT_INTRA
+// Return 1 if an ext intra mode is selected; return 0 otherwise.
+static int rd_pick_ext_intra_sby(VP10_COMP *cpi, MACROBLOCK *x,
+                                 int *rate, int *rate_tokenonly,
+                                 int64_t *distortion, int *skippable,
+                                 BLOCK_SIZE bsize, int mode_cost,
+                                 int64_t *best_rd, uint16_t skip_mask) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mic = xd->mi[0];
+  MB_MODE_INFO *mbmi = &mic->mbmi;
+  int this_rate, this_rate_tokenonly, s;
+  int ext_intra_selected_flag = 0;
+  int64_t this_distortion, this_rd;
+  EXT_INTRA_MODE mode;
+  TX_SIZE best_tx_size = TX_4X4;
+  EXT_INTRA_MODE_INFO ext_intra_mode_info;
+  TX_TYPE best_tx_type;
+
+  vp10_zero(ext_intra_mode_info);
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 1;
+  mbmi->mode = DC_PRED;
+  mbmi->palette_mode_info.palette_size[0] = 0;
+
+  for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+    if (skip_mask & (1 << mode))
+      continue;
+    mbmi->ext_intra_mode_info.ext_intra_mode[0] = mode;
+    super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                    &s, NULL, bsize, *best_rd);
+    if (this_rate_tokenonly == INT_MAX)
+      continue;
+
+    this_rate = this_rate_tokenonly +
+        vp10_cost_bit(cpi->common.fc->ext_intra_probs[0], 1) +
+        write_uniform_cost(FILTER_INTRA_MODES, mode) + mode_cost;
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+    if (this_rd < *best_rd) {
+      *best_rd            = this_rd;
+      best_tx_size        = mic->mbmi.tx_size;
+      ext_intra_mode_info = mbmi->ext_intra_mode_info;
+      best_tx_type        = mic->mbmi.tx_type;
+      *rate               = this_rate;
+      *rate_tokenonly     = this_rate_tokenonly;
+      *distortion         = this_distortion;
+      *skippable          = s;
+      ext_intra_selected_flag = 1;
+    }
+  }
+
+  if (ext_intra_selected_flag) {
+    mbmi->mode = DC_PRED;
+    mbmi->tx_size = best_tx_size;
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[0] =
+        ext_intra_mode_info.use_ext_intra_mode[0];
+    mbmi->ext_intra_mode_info.ext_intra_mode[0] =
+        ext_intra_mode_info.ext_intra_mode[0];
+    mbmi->tx_type = best_tx_type;
+    return 1;
+  } else {
+    return 0;
+  }
+}
+
+static void pick_intra_angle_routine_sby(VP10_COMP *cpi, MACROBLOCK *x,
+                                         int *rate, int *rate_tokenonly,
+                                         int64_t *distortion, int *skippable,
+                                         int *best_angle_delta,
+                                         TX_SIZE *best_tx_size,
+                                         TX_TYPE *best_tx_type,
+                                         INTRA_FILTER *best_filter,
+                                         BLOCK_SIZE bsize, int rate_overhead,
+                                         int64_t *best_rd,
+                                         uint8_t zcoeff_blk[][MAX_MIB_SIZE *
+                                                            MAX_MIB_SIZE * 4]) {
+  int this_rate, this_rate_tokenonly, s;
+  int64_t this_distortion, this_rd;
+  MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
+  super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                  &s, NULL, bsize, *best_rd);
+  if (this_rate_tokenonly == INT_MAX)
+    return;
+
+  this_rate = this_rate_tokenonly + rate_overhead;
+  this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+
+  if (this_rd < *best_rd) {
+    *best_rd            = this_rd;
+    *best_angle_delta   = mbmi->angle_delta[0];
+    *best_tx_size       = mbmi->tx_size;
+    *best_filter        = mbmi->intra_filter;
+    *best_tx_type       = mbmi->tx_type;
+    *rate               = this_rate;
+    *rate_tokenonly     = this_rate_tokenonly;
+    *distortion         = this_distortion;
+    *skippable          = s;
+    memcpy(zcoeff_blk[mbmi->tx_size], x->zcoeff_blk[mbmi->tx_size],
+           sizeof(zcoeff_blk[mbmi->tx_size][0]) *
+           MAX_MIB_SIZE * MAX_MIB_SIZE * 4);
+  }
+}
+
+static int64_t rd_pick_intra_angle_sby(VP10_COMP *cpi, MACROBLOCK *x,
+                                       int *rate, int *rate_tokenonly,
+                                       int64_t *distortion, int *skippable,
+                                       BLOCK_SIZE bsize, int rate_overhead,
+                                       int64_t best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MODE_INFO *const mic = xd->mi[0];
+  MB_MODE_INFO *mbmi = &mic->mbmi;
+  int this_rate, this_rate_tokenonly, s;
+  int angle_delta, best_angle_delta = 0, p_angle;
+  const int intra_filter_ctx = vp10_get_pred_context_intra_interp(xd);
+  INTRA_FILTER filter, best_filter = INTRA_FILTER_LINEAR;
+  const double rd_adjust = 1.2;
+  int64_t this_distortion, this_rd;
+  TX_SIZE best_tx_size = mic->mbmi.tx_size;
+  TX_TYPE best_tx_type = mbmi->tx_type;
+  uint8_t zcoeff_blk[TX_SIZES][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
+
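+  // Two-level fast search: first try deltas {0, -2, 2}, then refine around
+  // the best first-level delta; otherwise search every angle delta.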
+  if (ANGLE_FAST_SEARCH) {
+    int deltas_level1[3] = {0, -2, 2};
+    int deltas_level2[3][2] = {
+        {-1, 1}, {-3, -1}, {1, 3},
+    };
+    const int level1 = 3, level2 = 2;
+    int i, j, best_i = -1;
+
+    for (i = 0; i < level1; ++i) {
+      mic->mbmi.angle_delta[0] = deltas_level1[i];
+      p_angle = mode_to_angle_map[mbmi->mode] +
+          mbmi->angle_delta[0] * ANGLE_STEP;
+      for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
+        int64_t tmp_best_rd;
+        if ((FILTER_FAST_SEARCH || !vp10_is_intra_filter_switchable(p_angle)) &&
+            filter != INTRA_FILTER_LINEAR)
+          continue;
+        mic->mbmi.intra_filter = filter;
+        tmp_best_rd = (i == 0 && filter == INTRA_FILTER_LINEAR &&
+            best_rd < INT64_MAX) ? (int64_t)(best_rd * rd_adjust) : best_rd;
+        super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                        &s, NULL, bsize, tmp_best_rd);
+        if (this_rate_tokenonly == INT_MAX) {
+          if (i == 0 && filter == INTRA_FILTER_LINEAR)
+            return best_rd;
+          else
+            continue;
+        }
+        this_rate = this_rate_tokenonly + rate_overhead +
+            cpi->intra_filter_cost[intra_filter_ctx][filter];
+        this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+        if (i == 0 && filter == INTRA_FILTER_LINEAR &&
+            best_rd < INT64_MAX && this_rd > best_rd * rd_adjust)
+          return best_rd;
+        if (this_rd < best_rd) {
+          best_i              = i;
+          best_rd             = this_rd;
+          best_angle_delta    = mbmi->angle_delta[0];
+          best_tx_size        = mbmi->tx_size;
+          best_filter         = mbmi->intra_filter;
+          best_tx_type        = mbmi->tx_type;
+          *rate               = this_rate;
+          *rate_tokenonly     = this_rate_tokenonly;
+          *distortion         = this_distortion;
+          *skippable          = s;
+          memcpy(zcoeff_blk[mbmi->tx_size], x->zcoeff_blk[mbmi->tx_size],
+                 sizeof(zcoeff_blk[mbmi->tx_size][0]) *
+                 MAX_MIB_SIZE * MAX_MIB_SIZE * 4);
+        }
+      }
+    }
+
+    if (best_i >= 0) {
+      for (j = 0; j < level2; ++j) {
+        mic->mbmi.angle_delta[0] = deltas_level2[best_i][j];
+        p_angle = mode_to_angle_map[mbmi->mode] +
+            mbmi->angle_delta[0] * ANGLE_STEP;
+        for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
+          mic->mbmi.intra_filter = filter;
+          if ((FILTER_FAST_SEARCH || !vp10_is_intra_filter_switchable(p_angle))
+              && filter != INTRA_FILTER_LINEAR)
+            continue;
+          pick_intra_angle_routine_sby(cpi, x, rate, rate_tokenonly,
+                                       distortion, skippable,
+                                       &best_angle_delta, &best_tx_size,
+                                       &best_tx_type, &best_filter, bsize,
+                                       rate_overhead +
+                                       cpi->intra_filter_cost
+                                       [intra_filter_ctx][filter],
+                                       &best_rd, zcoeff_blk);
+        }
+      }
+    }
+  } else {
+    for (angle_delta = -MAX_ANGLE_DELTAS; angle_delta <= MAX_ANGLE_DELTAS;
+        ++angle_delta) {
+      mbmi->angle_delta[0] = angle_delta;
+      p_angle = mode_to_angle_map[mbmi->mode] +
+          mbmi->angle_delta[0] * ANGLE_STEP;
+      for (filter = INTRA_FILTER_LINEAR; filter < INTRA_FILTERS; ++filter) {
+        mic->mbmi.intra_filter = filter;
+        if ((FILTER_FAST_SEARCH || !vp10_is_intra_filter_switchable(p_angle)) &&
+            filter != INTRA_FILTER_LINEAR)
+          continue;
+        pick_intra_angle_routine_sby(cpi, x, rate, rate_tokenonly,
+                                     distortion, skippable,
+                                     &best_angle_delta, &best_tx_size,
+                                     &best_tx_type, &best_filter, bsize,
+                                     rate_overhead +
+                                     cpi->intra_filter_cost
+                                     [intra_filter_ctx][filter],
+                                     &best_rd, zcoeff_blk);
+      }
+    }
+  }
+
+  if (FILTER_FAST_SEARCH && *rate_tokenonly < INT_MAX) {
+    mbmi->angle_delta[0] = best_angle_delta;
+    p_angle = mode_to_angle_map[mbmi->mode] +
+        mbmi->angle_delta[0] * ANGLE_STEP;
+    if (vp10_is_intra_filter_switchable(p_angle)) {
+      for (filter = INTRA_FILTER_LINEAR + 1; filter < INTRA_FILTERS; ++filter) {
+        mic->mbmi.intra_filter = filter;
+        pick_intra_angle_routine_sby(cpi, x, rate, rate_tokenonly,
+                                     distortion, skippable,
+                                     &best_angle_delta, &best_tx_size,
+                                     &best_tx_type, &best_filter, bsize,
+                                     rate_overhead + cpi->intra_filter_cost
+                                     [intra_filter_ctx][filter], &best_rd,
+                                     zcoeff_blk);
+      }
+    }
+  }
+
+  mbmi->tx_size = best_tx_size;
+  mbmi->angle_delta[0] = best_angle_delta;
+  mic->mbmi.intra_filter = best_filter;
+  mbmi->tx_type = best_tx_type;
+  if (*rate_tokenonly < INT_MAX)
+    memcpy(x->zcoeff_blk[mbmi->tx_size], zcoeff_blk[mbmi->tx_size],
+           sizeof(zcoeff_blk[mbmi->tx_size][0]) *
+           MAX_MIB_SIZE * MAX_MIB_SIZE * 4);
+
+  return best_rd;
+}
+
+// Indices are sign, integer, and fractional part of the gradient value
+static const uint8_t gradient_to_angle_bin[2][7][16] = {
+    {
+        {6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 0, 0, 0, 0, },
+        {0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, },
+        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
+        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
+        {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, },
+        {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, },
+        {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, },
+    },
+    {
+        {6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, },
+        {4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, },
+        {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, },
+        {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, },
+        {3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, },
+        {3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, },
+        {2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, },
+    },
+};
+
+static const uint8_t mode_to_angle_bin[INTRA_MODES] = {
+    0, 2, 6, 0, 4, 3, 5, 7, 1, 0,
+};
+
+static void angle_estimation(const uint8_t *src, int src_stride,
+                             int rows, int cols,
+                             uint8_t *directional_mode_skip_mask) {
+  int i, r, c, index, dx, dy, temp, sn, remd, quot;
+  uint64_t hist[DIRECTIONAL_MODES];
+  uint64_t hist_sum = 0;
+
+  memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0]));
+  src += src_stride;
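+  // Accumulate gradient energy (dx^2 + dy^2) into direction-angle bins.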
+  for (r = 1; r < rows; ++r) {
+    for (c = 1; c < cols; ++c) {
+      dx = src[c] - src[c - 1];
+      dy = src[c] - src[c - src_stride];
+      temp = dx * dx + dy * dy;
+      if (dy == 0) {
+        index = 2;
+      } else {
+        sn = (dx > 0) ^ (dy > 0);
+        dx = abs(dx);
+        dy = abs(dy);
+        remd = dx % dy;
+        quot = dx / dy;
+        remd = remd * 16 / dy;
+        index = gradient_to_angle_bin[sn][VPXMIN(quot, 6)][VPXMIN(remd, 15)];
+      }
+      hist[index] += temp;
+    }
+    src += src_stride;
+  }
+
+  for (i = 0; i < DIRECTIONAL_MODES; ++i)
+    hist_sum += hist[i];
+  for (i = 0; i < INTRA_MODES; ++i) {
+    if (i != DC_PRED && i != TM_PRED) {
+      int index = mode_to_angle_bin[i];
+      uint64_t score = 2 * hist[index];
+      int weight = 2;
+      if (index > 0) {
+        score += hist[index - 1];
+        weight += 1;
+      }
+      if (index < DIRECTIONAL_MODES - 1) {
+        score += hist[index + 1];
+        weight += 1;
+      }
+      if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
+        directional_mode_skip_mask[i] = 1;
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void highbd_angle_estimation(const uint8_t *src8, int src_stride,
+                                    int rows, int cols,
+                                    uint8_t *directional_mode_skip_mask) {
+  int i, r, c, index, dx, dy, temp, sn, remd, quot;
+  uint64_t hist[DIRECTIONAL_MODES];
+  uint64_t hist_sum = 0;
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+
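+  // High-bitdepth variant of angle_estimation(); the logic mirrors the
+  // 8-bit path above.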
+  memset(hist, 0, DIRECTIONAL_MODES * sizeof(hist[0]));
+  src += src_stride;
+  for (r = 1; r < rows; ++r) {
+    for (c = 1; c < cols; ++c) {
+      dx = src[c] - src[c - 1];
+      dy = src[c] - src[c - src_stride];
+      temp = dx * dx + dy * dy;
+      if (dy == 0) {
+        index = 2;
+      } else {
+        sn = (dx > 0) ^ (dy > 0);
+        dx = abs(dx);
+        dy = abs(dy);
+        remd = dx % dy;
+        quot = dx / dy;
+        remd = remd * 16 / dy;
+        index = gradient_to_angle_bin[sn][VPXMIN(quot, 6)][VPXMIN(remd, 15)];
+      }
+      hist[index] += temp;
+    }
+    src += src_stride;
+  }
+
+  for (i = 0; i < DIRECTIONAL_MODES; ++i)
+    hist_sum += hist[i];
+  for (i = 0; i < INTRA_MODES; ++i) {
+    if (i != DC_PRED && i != TM_PRED) {
+      int index = mode_to_angle_bin[i];
+      uint64_t score = 2 * hist[index];
+      int weight = 2;
+      if (index > 0) {
+        score += hist[index - 1];
+        weight += 1;
+      }
+      if (index < DIRECTIONAL_MODES - 1) {
+        score += hist[index + 1];
+        weight += 1;
+      }
+      if (score * ANGLE_SKIP_THRESH < hist_sum * weight)
+        directional_mode_skip_mask[i] = 1;
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_EXT_INTRA
+
 // This function is used only for intra_only frames
 static int64_t rd_pick_intra_sby_mode(VP10_COMP *cpi, MACROBLOCK *x,
                                       int *rate, int *rate_tokenonly,
                                       int64_t *distortion, int *skippable,
                                       BLOCK_SIZE bsize,
                                       int64_t best_rd) {
-  PREDICTION_MODE mode;
+  uint8_t mode_idx;
   PREDICTION_MODE mode_selected = DC_PRED;
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mic = xd->mi[0];
   int this_rate, this_rate_tokenonly, s;
   int64_t this_distortion, this_rd;
   TX_SIZE best_tx = TX_4X4;
+#if CONFIG_EXT_INTRA
+  const int intra_filter_ctx = vp10_get_pred_context_intra_interp(xd);
+  EXT_INTRA_MODE_INFO ext_intra_mode_info;
+  int is_directional_mode, rate_overhead, best_angle_delta = 0;
+  INTRA_FILTER best_filter = INTRA_FILTER_LINEAR;
+  uint8_t directional_mode_skip_mask[INTRA_MODES];
+  uint16_t filter_intra_mode_skip_mask = (1 << FILTER_INTRA_MODES) - 1;
+  const int src_stride = x->plane[0].src.stride;
+  const uint8_t *src = x->plane[0].src.buf;
+#endif  // CONFIG_EXT_INTRA
   TX_TYPE best_tx_type = DCT_DCT;
   int *bmode_costs;
+  PALETTE_MODE_INFO palette_mode_info;
+  PALETTE_MODE_INFO *const pmi = &mic->mbmi.palette_mode_info;
+  uint8_t *best_palette_color_map = cpi->common.allow_screen_content_tools ?
+      x->palette_buffer->best_palette_color_map : NULL;
+  const int rows = 4 * num_4x4_blocks_high_lookup[bsize];
+  const int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+  int palette_ctx = 0;
   const MODE_INFO *above_mi = xd->above_mi;
   const MODE_INFO *left_mi = xd->left_mi;
   const PREDICTION_MODE A = vp10_above_block_mode(mic, above_mi, 0);
   const PREDICTION_MODE L = vp10_left_block_mode(mic, left_mi, 0);
+  const PREDICTION_MODE FINAL_MODE_SEARCH = TM_PRED + 1;
+  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
   bmode_costs = cpi->y_mode_costs[A][L];
 
-  memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+#if CONFIG_EXT_INTRA
+  ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+  mic->mbmi.ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+  mic->mbmi.angle_delta[0] = 0;
+  memset(directional_mode_skip_mask, 0,
+         sizeof(directional_mode_skip_mask[0]) * INTRA_MODES);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    highbd_angle_estimation(src, src_stride, rows, cols,
+                            directional_mode_skip_mask);
+  else
+#endif
+    angle_estimation(src, src_stride, rows, cols, directional_mode_skip_mask);
+#endif  // CONFIG_EXT_INTRA
+  palette_mode_info.palette_size[0] = 0;
+  pmi->palette_size[0] = 0;
+  if (above_mi)
+    palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+  if (left_mi)
+    palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+
+  if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
+    x->use_default_intra_tx_type = 1;
+  else
+    x->use_default_intra_tx_type = 0;
 
   /* Y Search for intra prediction mode */
-  for (mode = DC_PRED; mode <= TM_PRED; mode++) {
-    mic->mbmi.mode = mode;
-
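+  // The extra FINAL_MODE_SEARCH pass re-evaluates the selected mode with the
+  // full tx_type search when only default tx types were tried above.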
+  for (mode_idx = DC_PRED; mode_idx <= FINAL_MODE_SEARCH; ++mode_idx) {
+    if (mode_idx == FINAL_MODE_SEARCH) {
+      if (x->use_default_intra_tx_type == 0)
+        break;
+      mic->mbmi.mode = mode_selected;
+      x->use_default_intra_tx_type = 0;
+    } else {
+      mic->mbmi.mode = mode_idx;
+    }
+#if CONFIG_EXT_INTRA
+    is_directional_mode =
+        (mic->mbmi.mode != DC_PRED && mic->mbmi.mode != TM_PRED);
+    if (is_directional_mode && directional_mode_skip_mask[mic->mbmi.mode])
+      continue;
+    if (is_directional_mode) {
+      rate_overhead = bmode_costs[mic->mbmi.mode] +
+          write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1, 0);
+      this_rate_tokenonly = INT_MAX;
+      this_rd =
+          rd_pick_intra_angle_sby(cpi, x, &this_rate, &this_rate_tokenonly,
+                                  &this_distortion, &s, bsize, rate_overhead,
+                                  best_rd);
+    } else {
+      mic->mbmi.angle_delta[0] = 0;
+      super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                      &s, NULL, bsize, best_rd);
+    }
+#else
     super_block_yrd(cpi, x, &this_rate_tokenonly, &this_distortion,
-        &s, NULL, bsize, best_rd);
+                    &s, NULL, bsize, best_rd);
+#endif  // CONFIG_EXT_INTRA
 
     if (this_rate_tokenonly == INT_MAX)
       continue;
 
-    this_rate = this_rate_tokenonly + bmode_costs[mode];
+    this_rate = this_rate_tokenonly + bmode_costs[mic->mbmi.mode];
+
+    if (!xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+      // super_block_yrd above includes the cost of the tx_size in the
+      // tokenonly rate, but for intra blocks, tx_size is always coded
+      // (prediction granularity), so we account for it in the full rate,
+      // not the tokenonly rate.
+      this_rate_tokenonly -=
+          cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)]
+                                                 [mic->mbmi.tx_size];
+    }
+    if (cpi->common.allow_screen_content_tools && mic->mbmi.mode == DC_PRED)
+      this_rate +=
+          vp10_cost_bit(vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
+                                                         [palette_ctx], 0);
+#if CONFIG_EXT_INTRA
+    if (mic->mbmi.mode == DC_PRED && ALLOW_FILTER_INTRA_MODES)
+      this_rate += vp10_cost_bit(cpi->common.fc->ext_intra_probs[0], 0);
+    if (is_directional_mode) {
+      int p_angle;
+      this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+                                      MAX_ANGLE_DELTAS +
+                                      mic->mbmi.angle_delta[0]);
+      p_angle = mode_to_angle_map[mic->mbmi.mode] +
+          mic->mbmi.angle_delta[0] * ANGLE_STEP;
+      if (vp10_is_intra_filter_switchable(p_angle))
+        this_rate +=
+            cpi->intra_filter_cost[intra_filter_ctx][mic->mbmi.intra_filter];
+    }
+    filter_intra_mode_skip_mask ^= (1 << mic->mbmi.mode);
+#endif  // CONFIG_EXT_INTRA
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
     if (this_rd < best_rd) {
-      mode_selected   = mode;
+      mode_selected   = mic->mbmi.mode;
       best_rd         = this_rd;
       best_tx         = mic->mbmi.tx_size;
+#if CONFIG_EXT_INTRA
+      best_angle_delta = mic->mbmi.angle_delta[0];
+      best_filter     = mic->mbmi.intra_filter;
+#endif  // CONFIG_EXT_INTRA
       best_tx_type    = mic->mbmi.tx_type;
       *rate           = this_rate;
       *rate_tokenonly = this_rate_tokenonly;
@@ -1171,13 +2901,720 @@
     }
   }
 
+  if (cpi->common.allow_screen_content_tools)
+    rd_pick_palette_intra_sby(cpi, x, bsize, palette_ctx, bmode_costs[DC_PRED],
+                              &palette_mode_info, best_palette_color_map,
+                              &best_tx, &best_tx_type, &mode_selected,
+                              &best_rd);
+
+#if CONFIG_EXT_INTRA
+  if (ALLOW_FILTER_INTRA_MODES) {
+    if (rd_pick_ext_intra_sby(cpi, x, rate, rate_tokenonly, distortion,
+                              skippable, bsize, bmode_costs[DC_PRED],
+                              &best_rd, filter_intra_mode_skip_mask)) {
+      mode_selected       = mic->mbmi.mode;
+      best_tx             = mic->mbmi.tx_size;
+      ext_intra_mode_info = mic->mbmi.ext_intra_mode_info;
+      best_tx_type        = mic->mbmi.tx_type;
+    }
+  }
+
+  mic->mbmi.ext_intra_mode_info.use_ext_intra_mode[0] =
+      ext_intra_mode_info.use_ext_intra_mode[0];
+  if (ext_intra_mode_info.use_ext_intra_mode[0]) {
+    mic->mbmi.ext_intra_mode_info.ext_intra_mode[0] =
+        ext_intra_mode_info.ext_intra_mode[0];
+    palette_mode_info.palette_size[0] = 0;
+  }
+#endif  // CONFIG_EXT_INTRA
+
   mic->mbmi.mode = mode_selected;
   mic->mbmi.tx_size = best_tx;
+#if CONFIG_EXT_INTRA
+  mic->mbmi.angle_delta[0] = best_angle_delta;
+  mic->mbmi.intra_filter = best_filter;
+#endif  // CONFIG_EXT_INTRA
   mic->mbmi.tx_type = best_tx_type;
+  pmi->palette_size[0] = palette_mode_info.palette_size[0];
+  if (palette_mode_info.palette_size[0] > 0) {
+    memcpy(pmi->palette_colors, palette_mode_info.palette_colors,
+           PALETTE_MAX_SIZE * sizeof(palette_mode_info.palette_colors[0]));
+    memcpy(xd->plane[0].color_index_map, best_palette_color_map,
+           rows * cols * sizeof(best_palette_color_map[0]));
+  }
 
   return best_rd;
 }
 
+#if CONFIG_VAR_TX
+void vp10_tx_block_rd_b(const VP10_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
+                        int blk_row, int blk_col, int plane, int block,
+                        int plane_bsize, int coeff_ctx,
+                        int *rate, int64_t *dist, int64_t *bsse, int *skip) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  int64_t tmp;
+  tran_low_t *const dqcoeff = BLOCK_OFFSET(pd->dqcoeff, block);
+  PLANE_TYPE plane_type = (plane == 0) ? PLANE_TYPE_Y : PLANE_TYPE_UV;
+  TX_TYPE tx_type = get_tx_type(plane_type, xd, block, tx_size);
+  const scan_order *const scan_order =
+      get_scan(tx_size, tx_type, is_inter_block(&xd->mi[0]->mbmi));
+
+  BLOCK_SIZE txm_bsize = txsize_to_bsize[tx_size];
+  int bh = 4 * num_4x4_blocks_wide_lookup[txm_bsize];
+  int src_stride = p->src.stride;
+  uint8_t *src = &p->src.buf[4 * blk_row * src_stride + 4 * blk_col];
+  uint8_t *dst = &pd->dst.buf[4 * blk_row * pd->dst.stride + 4 * blk_col];
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint16_t, rec_buffer16[MAX_TX_SQUARE]);
+  uint8_t *rec_buffer;
+#else
+  DECLARE_ALIGNED(16, uint8_t, rec_buffer[MAX_TX_SQUARE]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  const int diff_stride = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
+  const int16_t *diff = &p->src_diff[4 * (blk_row * diff_stride + blk_col)];
+
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+#if CONFIG_NEW_QUANT
+  vp10_xform_quant_fp_nuq(x, plane, block, blk_row, blk_col,
+                          plane_bsize, tx_size, coeff_ctx);
+#else
+  vp10_xform_quant(x, plane, block, blk_row, blk_col,
+                   plane_bsize, tx_size, VP10_XFORM_QUANT_FP);
+#endif  // CONFIG_NEW_QUANT
+
+  vp10_optimize_b(x, plane, block, tx_size, coeff_ctx);
+
+  // TODO(any): Use dist_block to compute distortion
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    rec_buffer = CONVERT_TO_BYTEPTR(rec_buffer16);
+    vpx_highbd_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE,
+                             NULL, 0, NULL, 0, bh, bh, xd->bd);
+  } else {
+    rec_buffer = (uint8_t *)rec_buffer16;
+    vpx_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE,
+                      NULL, 0, NULL, 0, bh, bh);
+  }
+#else
+  vpx_convolve_copy(dst, pd->dst.stride, rec_buffer, MAX_TX_SIZE,
+                    NULL, 0, NULL, 0, bh, bh);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  if (blk_row + (bh >> 2) > max_blocks_high ||
+      blk_col + (bh >> 2) > max_blocks_wide) {
+    int idx, idy;
+    int blocks_height = VPXMIN(bh >> 2, max_blocks_high - blk_row);
+    int blocks_width  = VPXMIN(bh >> 2, max_blocks_wide - blk_col);
+    tmp = 0;
+    for (idy = 0; idy < blocks_height; idy += 2) {
+      for (idx = 0; idx < blocks_width; idx += 2) {
+        const int16_t *d = diff + 4 * idy * diff_stride + 4 * idx;
+        tmp += vpx_sum_squares_2d_i16(d, diff_stride, 8);
+      }
+    }
+  } else {
+    tmp = vpx_sum_squares_2d_i16(diff, diff_stride, bh);
+  }
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    tmp = ROUND_POWER_OF_TWO(tmp, (xd->bd - 8) * 2);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  *bsse += tmp * 16;
+
+  if (p->eobs[block] > 0) {
+    INV_TXFM_PARAM inv_txfm_param;
+    inv_txfm_param.tx_type = tx_type;
+    inv_txfm_param.tx_size = tx_size;
+    inv_txfm_param.eob = p->eobs[block];
+    inv_txfm_param.lossless = xd->lossless[xd->mi[0]->mbmi.segment_id];
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+      inv_txfm_param.bd = xd->bd;
+      highbd_inv_txfm_add(dqcoeff, rec_buffer, MAX_TX_SIZE, &inv_txfm_param);
+    } else {
+      inv_txfm_add(dqcoeff, rec_buffer, MAX_TX_SIZE, &inv_txfm_param);
+    }
+#else  // CONFIG_VP9_HIGHBITDEPTH
+    inv_txfm_add(dqcoeff, rec_buffer, MAX_TX_SIZE, &inv_txfm_param);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    if ((bh >> 2) + blk_col > max_blocks_wide ||
+        (bh >> 2) + blk_row > max_blocks_high) {
+      int idx, idy;
+      unsigned int this_dist;
+      int blocks_height = VPXMIN(bh >> 2, max_blocks_high - blk_row);
+      int blocks_width  = VPXMIN(bh >> 2, max_blocks_wide - blk_col);
+      tmp = 0;
+      for (idy = 0; idy < blocks_height; idy += 2) {
+        for (idx = 0; idx < blocks_width; idx += 2) {
+          uint8_t *const s = src + 4 * idy * src_stride + 4 * idx;
+          uint8_t *const r = rec_buffer + 4 * idy * MAX_TX_SIZE + 4 * idx;
+          cpi->fn_ptr[BLOCK_8X8].vf(s, src_stride, r, MAX_TX_SIZE, &this_dist);
+          tmp += this_dist;
+        }
+      }
+    } else {
+      uint32_t this_dist;
+      cpi->fn_ptr[txm_bsize].vf(src, src_stride, rec_buffer, MAX_TX_SIZE,
+                                &this_dist);
+      tmp = this_dist;
+    }
+  }
+  *dist += tmp * 16;
+  *rate += cost_coeffs(x, plane, block, coeff_ctx, tx_size,
+                       scan_order->scan, scan_order->neighbors, 0);
+  *skip &= (p->eobs[block] == 0);
+}
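+
+// Illustrative sketch (assumed typical usage, not taken from a real call
+// site): callers fold the per-block outputs into a Lagrangian RD cost, e.g.
+//
+//   int rate = 0, skip = 1;
+//   int64_t dist = 0, bsse = 0;
+//   vp10_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, /*plane=*/0,
+//                      block, plane_bsize, coeff_ctx,
+//                      &rate, &dist, &bsse, &skip);
+//   const int64_t rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+//
+// Note that *dist and *bsse are pre-scaled by 16 above so they match the
+// units RDCOST() expects elsewhere in this file.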
+
+static void select_tx_block(const VP10_COMP *cpi, MACROBLOCK *x,
+                            int blk_row, int blk_col, int plane, int block,
+                            TX_SIZE tx_size, BLOCK_SIZE plane_bsize,
+                            ENTROPY_CONTEXT *ta, ENTROPY_CONTEXT *tl,
+                            TXFM_CONTEXT *tx_above, TXFM_CONTEXT *tx_left,
+                            int *rate, int64_t *dist,
+                            int64_t *bsse, int *skip,
+                            int64_t ref_best_rd, int *is_cost_valid) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  TX_SIZE (*const inter_tx_size)[MAX_MIB_SIZE] =
+    (TX_SIZE (*)[MAX_MIB_SIZE])&mbmi->inter_tx_size[tx_row][tx_col];
+  const int bw = num_4x4_blocks_wide_lookup[plane_bsize];
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = bw;
+  int64_t this_rd = INT64_MAX;
+  ENTROPY_CONTEXT *pta = ta + blk_col;
+  ENTROPY_CONTEXT *ptl = tl + blk_row;
+  ENTROPY_CONTEXT stxa = 0, stxl = 0;
+  int coeff_ctx, i;
+  int ctx = txfm_partition_context(tx_above + (blk_col >> 1),
+                                   tx_left + (blk_row >> 1), tx_size);
+
+  int64_t sum_dist = 0, sum_bsse = 0;
+  int64_t sum_rd = INT64_MAX;
+  int sum_rate = vp10_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 1);
+  int all_skip = 1;
+  int tmp_eob = 0;
+  int zero_blk_rate;
+
+  if (ref_best_rd < 0) {
+    *is_cost_valid = 0;
+    return;
+  }
+
+  switch (tx_size) {
+    case TX_4X4:
+      stxa = pta[0];
+      stxl = ptl[0];
+      break;
+    case TX_8X8:
+      stxa = !!*(const uint16_t *)&pta[0];
+      stxl = !!*(const uint16_t *)&ptl[0];
+      break;
+    case TX_16X16:
+      stxa = !!*(const uint32_t *)&pta[0];
+      stxl = !!*(const uint32_t *)&ptl[0];
+      break;
+    case TX_32X32:
+      stxa = !!*(const uint64_t *)&pta[0];
+      stxl = !!*(const uint64_t *)&ptl[0];
+      break;
+    default:
+      assert(0 && "Invalid transform size.");
+      break;
+  }
+  coeff_ctx = combine_entropy_contexts(stxa, stxl);
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  *rate = 0;
+  *dist = 0;
+  *bsse = 0;
+  *skip = 1;
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  zero_blk_rate =
+      x->token_costs[tx_size][pd->plane_type][1][0][0][coeff_ctx][EOB_TOKEN];
+
+  if (cpi->common.tx_mode == TX_MODE_SELECT || tx_size == TX_4X4) {
+    inter_tx_size[0][0] = tx_size;
+    vp10_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
+                       plane_bsize, coeff_ctx, rate, dist, bsse, skip);
+
+    if ((RDCOST(x->rdmult, x->rddiv, *rate, *dist) >=
+         RDCOST(x->rdmult, x->rddiv, zero_blk_rate, *bsse) || *skip == 1) &&
+        !xd->lossless[mbmi->segment_id]) {
+      *rate = zero_blk_rate;
+      *dist = *bsse;
+      *skip = 1;
+      x->blk_skip[plane][blk_row * bw + blk_col] = 1;
+      p->eobs[block] = 0;
+    } else {
+      x->blk_skip[plane][blk_row * bw + blk_col] = 0;
+      *skip = 0;
+    }
+
+    if (tx_size > TX_4X4)
+      *rate += vp10_cost_bit(cpi->common.fc->txfm_partition_prob[ctx], 0);
+    this_rd = RDCOST(x->rdmult, x->rddiv, *rate, *dist);
+    tmp_eob = p->eobs[block];
+  }
+
+  if (tx_size > TX_4X4) {
+    BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+    int bsl = b_height_log2_lookup[bsize];
+    int sub_step = 1 << (2 * (tx_size - 1));
+    int i;
+    int this_rate;
+    int64_t this_dist;
+    int64_t this_bsse;
+    int this_skip;
+    int this_cost_valid = 1;
+    int64_t tmp_rd = 0;
+
+    --bsl;
+    for (i = 0; i < 4 && this_cost_valid; ++i) {
+      int offsetr = (i >> 1) << bsl;
+      int offsetc = (i & 0x01) << bsl;
+      select_tx_block(cpi, x, blk_row + offsetr, blk_col + offsetc,
+                      plane, block + i * sub_step, tx_size - 1,
+                      plane_bsize, ta, tl, tx_above, tx_left,
+                      &this_rate, &this_dist,
+                      &this_bsse, &this_skip,
+                      ref_best_rd - tmp_rd, &this_cost_valid);
+      sum_rate += this_rate;
+      sum_dist += this_dist;
+      sum_bsse += this_bsse;
+      all_skip &= this_skip;
+      tmp_rd = RDCOST(x->rdmult, x->rddiv, sum_rate, sum_dist);
+      if (this_rd < tmp_rd)
+        break;
+    }
+    if (this_cost_valid)
+      sum_rd = tmp_rd;
+  }
+
+  if (this_rd < sum_rd) {
+    int idx, idy;
+    for (i = 0; i < (1 << tx_size); ++i)
+      pta[i] = ptl[i] = !(tmp_eob == 0);
+    txfm_partition_update(tx_above + (blk_col >> 1),
+                          tx_left + (blk_row >> 1), tx_size);
+    inter_tx_size[0][0] = tx_size;
+    for (idy = 0; idy < (1 << tx_size) / 2; ++idy)
+      for (idx = 0; idx < (1 << tx_size) / 2; ++idx)
+        inter_tx_size[idy][idx] = tx_size;
+    mbmi->tx_size = tx_size;
+    if (this_rd == INT64_MAX)
+      *is_cost_valid = 0;
+    x->blk_skip[plane][blk_row * bw + blk_col] = *skip;
+  } else {
+    *rate = sum_rate;
+    *dist = sum_dist;
+    *bsse = sum_bsse;
+    *skip = all_skip;
+    if (sum_rd == INT64_MAX)
+      *is_cost_valid = 0;
+  }
+}
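+
+// Note on the recursion in select_tx_block(): each level compares coding the
+// block at the current tx_size against splitting it into four children of
+// size (tx_size - 1). Child i is located at
+//   offsetr = (i >> 1) << bsl;  // row offset, in 4x4 units
+//   offsetc = (i & 1) << bsl;   // column offset, in 4x4 units
+// and the split is kept only when the summed child RD cost (sum_rd) beats
+// the whole-block cost (this_rd).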
+
+static void inter_block_yrd(const VP10_COMP *cpi, MACROBLOCK *x,
+                            int *rate, int64_t *distortion, int *skippable,
+                            int64_t *sse, BLOCK_SIZE bsize,
+                            int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  int is_cost_valid = 1;
+  int64_t this_rd = 0;
+
+  if (ref_best_rd < 0)
+    is_cost_valid = 0;
+
+  *rate = 0;
+  *distortion = 0;
+  *sse = 0;
+  *skippable = 1;
+
+  if (is_cost_valid) {
+    const struct macroblockd_plane *const pd = &xd->plane[0];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+    BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize_lookup[plane_bsize]];
+    int bh = num_4x4_blocks_wide_lookup[txb_size];
+    int idx, idy;
+    int block = 0;
+    int step = 1 << (max_txsize_lookup[plane_bsize] * 2);
+    ENTROPY_CONTEXT ctxa[2 * MAX_MIB_SIZE];
+    ENTROPY_CONTEXT ctxl[2 * MAX_MIB_SIZE];
+    TXFM_CONTEXT tx_above[MAX_MIB_SIZE];
+    TXFM_CONTEXT tx_left[MAX_MIB_SIZE];
+
+    int pnrate = 0, pnskip = 1;
+    int64_t pndist = 0, pnsse = 0;
+
+    vp10_get_entropy_contexts(bsize, TX_4X4, pd, ctxa, ctxl);
+    memcpy(tx_above, xd->above_txfm_context,
+           sizeof(TXFM_CONTEXT) * (mi_width >> 1));
+    memcpy(tx_left, xd->left_txfm_context,
+           sizeof(TXFM_CONTEXT) * (mi_height >> 1));
+
+    for (idy = 0; idy < mi_height; idy += bh) {
+      for (idx = 0; idx < mi_width; idx += bh) {
+        select_tx_block(cpi, x, idy, idx, 0, block,
+                        max_txsize_lookup[plane_bsize], plane_bsize,
+                        ctxa, ctxl, tx_above, tx_left,
+                        &pnrate, &pndist, &pnsse, &pnskip,
+                        ref_best_rd - this_rd, &is_cost_valid);
+        *rate += pnrate;
+        *distortion += pndist;
+        *sse += pnsse;
+        *skippable &= pnskip;
+        this_rd += VPXMIN(RDCOST(x->rdmult, x->rddiv, pnrate, pndist),
+                          RDCOST(x->rdmult, x->rddiv, 0, pnsse));
+        block += step;
+      }
+    }
+  }
+
+  this_rd = VPXMIN(RDCOST(x->rdmult, x->rddiv, *rate, *distortion),
+                   RDCOST(x->rdmult, x->rddiv, 0, *sse));
+  if (this_rd > ref_best_rd)
+    is_cost_valid = 0;
+
+  if (!is_cost_valid) {
+    // Reset the outputs to invalid sentinel values.
+    *rate = INT_MAX;
+    *distortion = INT64_MAX;
+    *sse = INT64_MAX;
+    *skippable = 0;
+  }
+}
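+
+// As in the rest of this file, {INT_MAX, INT64_MAX} act as "no valid cost"
+// sentinels, so a caller can simply test
+//   if (*rate == INT_MAX) { /* search exceeded ref_best_rd; discard */ }
+// instead of threading a separate validity flag through the call chain.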
+
+static int64_t select_tx_size_fix_type(const VP10_COMP *cpi, MACROBLOCK *x,
+                                       int *rate, int64_t *dist,
+                                       int *skippable,
+                                       int64_t *sse, BLOCK_SIZE bsize,
+                                       int64_t ref_best_rd, TX_TYPE tx_type) {
+  const VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+  const int is_inter = is_inter_block(mbmi);
+#if CONFIG_EXT_TX
+  int ext_tx_set = get_ext_tx_set(max_tx_size, bsize, is_inter);
+#endif  // CONFIG_EXT_TX
+  vpx_prob skip_prob = vp10_get_skip_prob(cm, xd);
+  int s0 = vp10_cost_bit(skip_prob, 0);
+  int s1 = vp10_cost_bit(skip_prob, 1);
+  int64_t rd;
+
+  mbmi->tx_type = tx_type;
+  inter_block_yrd(cpi, x, rate, dist, skippable, sse, bsize, ref_best_rd);
+
+  if (*rate == INT_MAX)
+    return INT64_MAX;
+
+#if CONFIG_EXT_TX
+  if (get_ext_tx_types(max_tx_size, bsize, is_inter) > 1 &&
+      !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+    if (is_inter) {
+      if (ext_tx_set > 0)
+        *rate += cpi->inter_tx_type_costs[ext_tx_set]
+                                         [max_tx_size][mbmi->tx_type];
+    } else {
+      if (ext_tx_set > 0 && ALLOW_INTRA_EXT_TX)
+        *rate += cpi->intra_tx_type_costs[ext_tx_set][max_tx_size]
+                                         [mbmi->mode][mbmi->tx_type];
+    }
+  }
+#else  // CONFIG_EXT_TX
+  if (max_tx_size < TX_32X32 && !xd->lossless[xd->mi[0]->mbmi.segment_id]) {
+    if (is_inter)
+      *rate += cpi->inter_tx_type_costs[max_tx_size][mbmi->tx_type];
+    else
+      *rate += cpi->intra_tx_type_costs[max_tx_size]
+                 [intra_mode_to_tx_type_context[mbmi->mode]][mbmi->tx_type];
+  }
+#endif  // CONFIG_EXT_TX
+
+  if (*skippable)
+    rd = RDCOST(x->rdmult, x->rddiv, s1, *sse);
+  else
+    rd = RDCOST(x->rdmult, x->rddiv, *rate + s0, *dist);
+
+  if (is_inter && !xd->lossless[xd->mi[0]->mbmi.segment_id] && !(*skippable))
+    rd = VPXMIN(rd, RDCOST(x->rdmult, x->rddiv, s1, *sse));
+
+  return rd;
+}
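+
+// Skip accounting in select_tx_size_fix_type() follows the usual pattern:
+// with s0/s1 the costs of signalling not-skipped/skipped,
+//   rd_skip    = RDCOST(x->rdmult, x->rddiv, s1, *sse);
+//   rd_no_skip = RDCOST(x->rdmult, x->rddiv, *rate + s0, *dist);
+// and for non-lossless inter blocks that are not already skipped, the
+// cheaper of the two is returned (the final VPXMIN above).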
+
+static void select_tx_type_yrd(const VP10_COMP *cpi, MACROBLOCK *x,
+                               int *rate, int64_t *distortion, int *skippable,
+                               int64_t *sse, BLOCK_SIZE bsize,
+                               int64_t ref_best_rd) {
+  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  int64_t rd = INT64_MAX;
+  int64_t best_rd = INT64_MAX;
+  TX_TYPE tx_type, best_tx_type = DCT_DCT;
+  const int is_inter = is_inter_block(mbmi);
+  TX_SIZE best_tx_size[MAX_MIB_SIZE][MAX_MIB_SIZE];
+  TX_SIZE best_tx = TX_SIZES;
+  uint8_t best_blk_skip[MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
+  const int n4 = 1 << (num_pels_log2_lookup[bsize] - 4);
+  int idx, idy;
+  int prune = 0;
+#if CONFIG_EXT_TX
+  int ext_tx_set = get_ext_tx_set(max_tx_size, bsize, is_inter);
+#endif  // CONFIG_EXT_TX
+
+  if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE)
+#if CONFIG_EXT_TX
+    prune = prune_tx_types(cpi, bsize, x, xd, ext_tx_set);
+#else
+    prune = prune_tx_types(cpi, bsize, x, xd, 0);
+#endif  // CONFIG_EXT_TX
+
+  *distortion = INT64_MAX;
+  *rate       = INT_MAX;
+  *skippable  = 0;
+  *sse        = INT64_MAX;
+
+  for (tx_type = DCT_DCT; tx_type < TX_TYPES; ++tx_type) {
+    int this_rate = 0;
+    int this_skip = 1;
+    int64_t this_dist = 0;
+    int64_t this_sse  = 0;
+#if CONFIG_EXT_TX
+    if (is_inter) {
+      if (!ext_tx_used_inter[ext_tx_set][tx_type])
+        continue;
+      if (cpi->sf.tx_type_search.prune_mode > NO_PRUNE) {
+        if (!do_tx_type_search(tx_type, prune))
+          continue;
+      }
+    } else {
+      if (!ALLOW_INTRA_EXT_TX && bsize >= BLOCK_8X8) {
+        if (tx_type != intra_mode_to_tx_type_context[mbmi->mode])
+          continue;
+      }
+      if (!ext_tx_used_intra[ext_tx_set][tx_type])
+        continue;
+    }
+#else  // CONFIG_EXT_TX
+    if (max_tx_size >= TX_32X32 && tx_type != DCT_DCT)
+      continue;
+    if (is_inter && cpi->sf.tx_type_search.prune_mode > NO_PRUNE &&
+        !do_tx_type_search(tx_type, prune))
+      continue;
+#endif  // CONFIG_EXT_TX
+    if (is_inter && x->use_default_inter_tx_type &&
+        tx_type != get_default_tx_type(0, xd, 0, max_tx_size))
+      continue;
+
+    rd = select_tx_size_fix_type(cpi, x, &this_rate, &this_dist, &this_skip,
+                                 &this_sse, bsize, ref_best_rd, tx_type);
+
+    if (rd < best_rd) {
+      best_rd = rd;
+      *distortion = this_dist;
+      *rate       = this_rate;
+      *skippable  = this_skip;
+      *sse        = this_sse;
+      best_tx_type = mbmi->tx_type;
+      best_tx = mbmi->tx_size;
+      memcpy(best_blk_skip, x->blk_skip[0], sizeof(best_blk_skip[0]) * n4);
+      for (idy = 0; idy < xd->n8_h; ++idy)
+        for (idx = 0; idx < xd->n8_w; ++idx)
+          best_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
+    }
+  }
+
+  mbmi->tx_type = best_tx_type;
+  for (idy = 0; idy < xd->n8_h; ++idy)
+    for (idx = 0; idx < xd->n8_w; ++idx)
+      mbmi->inter_tx_size[idy][idx] = best_tx_size[idy][idx];
+  mbmi->tx_size = best_tx;
+  memcpy(x->blk_skip[0], best_blk_skip, sizeof(best_blk_skip[0]) * n4);
+}
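+
+// select_tx_type_yrd() is a plain argmin over tx_type: each surviving
+// candidate re-runs the recursive size search, a snapshot of {tx_type,
+// tx_size, inter_tx_size[][], blk_skip[]} is taken whenever the RD cost
+// improves, and the best snapshot is restored into mbmi before returning.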
+
+static void tx_block_rd(const VP10_COMP *cpi, MACROBLOCK *x,
+                        int blk_row, int blk_col, int plane, int block,
+                        TX_SIZE tx_size, BLOCK_SIZE plane_bsize,
+                        ENTROPY_CONTEXT *above_ctx, ENTROPY_CONTEXT *left_ctx,
+                        int *rate, int64_t *dist, int64_t *bsse, int *skip) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  struct macroblock_plane *const p = &x->plane[plane];
+  struct macroblockd_plane *const pd = &xd->plane[plane];
+  BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  const TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_row][tx_col], bsize, 0, 0) :
+      mbmi->inter_tx_size[tx_row][tx_col];
+
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  if (tx_size == plane_tx_size) {
+    int coeff_ctx, i;
+    ENTROPY_CONTEXT *ta = above_ctx + blk_col;
+    ENTROPY_CONTEXT *tl = left_ctx  + blk_row;
+    switch (tx_size) {
+      case TX_4X4:
+        break;
+      case TX_8X8:
+        ta[0] = !!*(const uint16_t *)&ta[0];
+        tl[0] = !!*(const uint16_t *)&tl[0];
+        break;
+      case TX_16X16:
+        ta[0] = !!*(const uint32_t *)&ta[0];
+        tl[0] = !!*(const uint32_t *)&tl[0];
+        break;
+      case TX_32X32:
+        ta[0] = !!*(const uint64_t *)&ta[0];
+        tl[0] = !!*(const uint64_t *)&tl[0];
+        break;
+      default:
+        assert(0 && "Invalid transform size.");
+        break;
+    }
+    coeff_ctx = combine_entropy_contexts(ta[0], tl[0]);
+    vp10_tx_block_rd_b(cpi, x, tx_size, blk_row, blk_col, plane, block,
+                       plane_bsize, coeff_ctx, rate, dist, bsse, skip);
+    for (i = 0; i < (1 << tx_size); ++i) {
+      ta[i] = !(p->eobs[block] == 0);
+      tl[i] = !(p->eobs[block] == 0);
+    }
+  } else {
+    int bsl = b_width_log2_lookup[bsize];
+    int step = 1 << (2 * (tx_size - 1));
+    int i;
+
+    assert(bsl > 0);
+    --bsl;
+
+    for (i = 0; i < 4; ++i) {
+      int offsetr = (i >> 1) << bsl;
+      int offsetc = (i & 0x01) << bsl;
+      tx_block_rd(cpi, x, blk_row + offsetr, blk_col + offsetc, plane,
+                  block + i * step, tx_size - 1, plane_bsize,
+                  above_ctx, left_ctx, rate, dist, bsse, skip);
+    }
+  }
+}
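+
+// Unlike select_tx_block(), tx_block_rd() does no searching: it recurses
+// down to the transform size already recorded in mbmi->inter_tx_size[][]
+// and only accumulates the rate/distortion of that fixed partitioning.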
+
+// Return value 0: early termination triggered, no valid rd cost available;
+//              1: rd cost values are valid.
+static int inter_block_uvrd(const VP10_COMP *cpi, MACROBLOCK *x,
+                            int *rate, int64_t *distortion, int *skippable,
+                            int64_t *sse, BLOCK_SIZE bsize,
+                            int64_t ref_best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  int plane;
+  int is_cost_valid = 1;
+  int64_t this_rd;
+
+  if (ref_best_rd < 0)
+    is_cost_valid = 0;
+
+  if (is_inter_block(mbmi) && is_cost_valid) {
+    int plane;
+    for (plane = 1; plane < MAX_MB_PLANE; ++plane)
+      vp10_subtract_plane(x, bsize, plane);
+  }
+
+  *rate = 0;
+  *distortion = 0;
+  *sse = 0;
+  *skippable = 1;
+
+  for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+    BLOCK_SIZE txb_size = txsize_to_bsize[max_txsize_lookup[plane_bsize]];
+    int bh = num_4x4_blocks_wide_lookup[txb_size];
+    int idx, idy;
+    int block = 0;
+    int step = 1 << (max_txsize_lookup[plane_bsize] * 2);
+    int pnrate = 0, pnskip = 1;
+    int64_t pndist = 0, pnsse = 0;
+    ENTROPY_CONTEXT ta[2 * MAX_MIB_SIZE];
+    ENTROPY_CONTEXT tl[2 * MAX_MIB_SIZE];
+
+    vp10_get_entropy_contexts(bsize, TX_4X4, pd, ta, tl);
+
+    for (idy = 0; idy < mi_height; idy += bh) {
+      for (idx = 0; idx < mi_width; idx += bh) {
+        tx_block_rd(cpi, x, idy, idx, plane, block,
+                    max_txsize_lookup[plane_bsize], plane_bsize, ta, tl,
+                    &pnrate, &pndist, &pnsse, &pnskip);
+        block += step;
+      }
+    }
+
+    if (pnrate == INT_MAX) {
+      is_cost_valid = 0;
+      break;
+    }
+
+    *rate += pnrate;
+    *distortion += pndist;
+    *sse += pnsse;
+    *skippable &= pnskip;
+
+    this_rd = VPXMIN(RDCOST(x->rdmult, x->rddiv, *rate, *distortion),
+                     RDCOST(x->rdmult, x->rddiv, 0, *sse));
+
+    if (this_rd > ref_best_rd) {
+      is_cost_valid = 0;
+      break;
+    }
+  }
+
+  if (!is_cost_valid) {
+    // Reset the outputs to invalid sentinel values.
+    *rate = INT_MAX;
+    *distortion = INT64_MAX;
+    *sse = INT64_MAX;
+    *skippable = 0;
+  }
+
+  return is_cost_valid;
+}
+#endif  // CONFIG_VAR_TX
+
 // Return value 0: early termination triggered, no valid rd cost available;
 //              1: rd cost values are valid.
 static int super_block_uvrd(const VP10_COMP *cpi, MACROBLOCK *x,
@@ -1207,7 +3644,9 @@
   *skippable = 1;
 
   for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
-    txfm_rd_in_plane(x, &pnrate, &pndist, &pnskip, &pnsse,
+    txfm_rd_in_plane(x,
+                     cpi,
+                     &pnrate, &pndist, &pnskip, &pnsse,
                      ref_best_rd, plane, bsize, uv_tx_size,
                      cpi->sf.use_fast_coef_costing);
     if (pnrate == INT_MAX) {
@@ -1218,6 +3657,11 @@
     *distortion += pndist;
     *sse += pnsse;
     *skippable &= pnskip;
+    if (RDCOST(x->rdmult, x->rddiv, *rate, *distortion) > ref_best_rd &&
+        RDCOST(x->rdmult, x->rddiv, 0, *sse) > ref_best_rd) {
+      is_cost_valid = 0;
+      break;
+    }
   }
 
   if (!is_cost_valid) {
@@ -1231,45 +3675,456 @@
   return is_cost_valid;
 }
 
+static void rd_pick_palette_intra_sbuv(VP10_COMP *cpi, MACROBLOCK *x,
+                                       int dc_mode_cost,
+                                       PALETTE_MODE_INFO *palette_mode_info,
+                                       uint8_t *best_palette_color_map,
+                                       PREDICTION_MODE *mode_selected,
+                                       int64_t *best_rd, int *rate,
+                                       int *rate_tokenonly,
+                                       int64_t *distortion, int *skippable) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
+      (xd->plane[1].subsampling_y);
+  const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
+      (xd->plane[1].subsampling_x);
+  int this_rate, this_rate_tokenonly, s;
+  int64_t this_distortion, this_rd;
+  int colors_u, colors_v, colors;
+  const int src_stride = x->plane[1].src.stride;
+  const uint8_t *const src_u = x->plane[1].src.buf;
+  const uint8_t *const src_v = x->plane[2].src.buf;
+
+  if (rows * cols > PALETTE_MAX_BLOCK_SIZE)
+    return;
+
+#if CONFIG_EXT_INTRA
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+#endif  // CONFIG_EXT_INTRA
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (cpi->common.use_highbitdepth) {
+    colors_u = vp10_count_colors_highbd(src_u, src_stride, rows, cols,
+                                        cpi->common.bit_depth);
+    colors_v = vp10_count_colors_highbd(src_v, src_stride, rows, cols,
+                                        cpi->common.bit_depth);
+  } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    colors_u = vp10_count_colors(src_u, src_stride, rows, cols);
+    colors_v = vp10_count_colors(src_v, src_stride, rows, cols);
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  colors = colors_u > colors_v ? colors_u : colors_v;
+  if (colors > 1 && colors <= 64) {
+    int r, c, n, i, j;
+    const int max_itr = 50;
+    int color_ctx, color_idx = 0;
+    int color_order[PALETTE_MAX_SIZE];
+    int64_t this_sse;
+    float lb_u, ub_u, val_u;
+    float lb_v, ub_v, val_v;
+    float *const data = x->palette_buffer->kmeans_data_buf;
+    uint8_t *const indices = x->palette_buffer->kmeans_indices_buf;
+    uint8_t *const pre_indices = x->palette_buffer->kmeans_pre_indices_buf;
+    float centroids[2 * PALETTE_MAX_SIZE];
+    uint8_t *const color_map = xd->plane[1].color_index_map;
+    PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    uint16_t *src_u16 = CONVERT_TO_SHORTPTR(src_u);
+    uint16_t *src_v16 = CONVERT_TO_SHORTPTR(src_v);
+    if (cpi->common.use_highbitdepth) {
+      lb_u = src_u16[0];
+      ub_u = src_u16[0];
+      lb_v = src_v16[0];
+      ub_v = src_v16[0];
+    } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      lb_u = src_u[0];
+      ub_u = src_u[0];
+      lb_v = src_v[0];
+      ub_v = src_v[0];
+#if CONFIG_VP9_HIGHBITDEPTH
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+    mbmi->uv_mode = DC_PRED;
+#if CONFIG_EXT_INTRA
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+#endif  // CONFIG_EXT_INTRA
+    for (r = 0; r < rows; ++r) {
+      for (c = 0; c < cols; ++c) {
+#if CONFIG_VP9_HIGHBITDEPTH
+        if (cpi->common.use_highbitdepth) {
+          val_u = src_u16[r * src_stride + c];
+          val_v = src_v16[r * src_stride + c];
+          data[(r * cols + c) * 2] = val_u;
+          data[(r * cols + c) * 2 + 1] = val_v;
+        } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+          val_u = src_u[r * src_stride + c];
+          val_v = src_v[r * src_stride + c];
+          data[(r * cols + c) * 2] = val_u;
+          data[(r * cols + c) * 2 + 1] = val_v;
+#if CONFIG_VP9_HIGHBITDEPTH
+        }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        if (val_u < lb_u)
+          lb_u = val_u;
+        else if (val_u > ub_u)
+          ub_u = val_u;
+        if (val_v < lb_v)
+          lb_v = val_v;
+        else if (val_v > ub_v)
+          ub_v = val_v;
+      }
+    }
+
+    for (n = colors > PALETTE_MAX_SIZE ? PALETTE_MAX_SIZE : colors;
+        n >= 2; --n) {
+      for (i = 0; i < n; ++i) {
+        centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
+        centroids[i * 2 + 1] =
+            lb_v + (2 * i + 1) * (ub_v - lb_v) / n / 2;
+      }
+      r = vp10_k_means(data, centroids, indices, pre_indices, rows * cols, n,
+                       2, max_itr);
+      pmi->palette_size[1] = n;
+      for (i = 1; i < 3; ++i) {
+        for (j = 0; j < n; ++j) {
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (cpi->common.use_highbitdepth)
+            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
+                clip_pixel_highbd(roundf(centroids[j * 2 + i - 1]),
+                                  cpi->common.bit_depth);
+          else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+            pmi->palette_colors[i * PALETTE_MAX_SIZE + j] =
+                clip_pixel(roundf(centroids[j * 2 + i - 1]));
+        }
+      }
+      for (r = 0; r < rows; ++r)
+        for (c = 0; c < cols; ++c)
+          color_map[r * cols + c] = indices[r * cols + c];
+
+      super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                       &this_distortion, &s, &this_sse, bsize, *best_rd);
+      if (this_rate_tokenonly == INT_MAX)
+        continue;
+      this_rate = this_rate_tokenonly + dc_mode_cost +
+          2 * cpi->common.bit_depth * n * vp10_cost_bit(128, 0) +
+          cpi->palette_uv_size_cost[bsize - BLOCK_8X8][n - 2] +
+          write_uniform_cost(n, color_map[0]) +
+          vp10_cost_bit(vp10_default_palette_uv_mode_prob
+                        [pmi->palette_size[0] > 0], 1);
+
+      for (i = 0; i < rows; ++i) {
+        for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
+          color_ctx = vp10_get_palette_color_context(color_map, cols, i, j, n,
+                                                     color_order);
+          for (r = 0; r < n; ++r)
+            if (color_map[i * cols + j] == color_order[r]) {
+              color_idx = r;
+              break;
+            }
+          assert(color_idx >= 0 && color_idx < n);
+          this_rate +=
+              cpi->palette_uv_color_cost[n - 2][color_ctx][color_idx];
+        }
+      }
+
+      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+      if (this_rd < *best_rd) {
+        *best_rd = this_rd;
+        *palette_mode_info = *pmi;
+        memcpy(best_palette_color_map, xd->plane[1].color_index_map,
+               rows * cols * sizeof(best_palette_color_map[0]));
+        *mode_selected = DC_PRED;
+        *rate = this_rate;
+        *distortion = this_distortion;
+        *rate_tokenonly = this_rate_tokenonly;
+        *skippable = s;
+      }
+    }
+  }
+}
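+
+// The chroma palette search above clusters samples as 2-D (U, V) points:
+// sample c stores data[2 * c] = U and data[2 * c + 1] = V, the n centroids
+// are seeded uniformly between the observed per-channel min/max, e.g.
+//   centroids[i * 2] = lb_u + (2 * i + 1) * (ub_u - lb_u) / n / 2;
+// and vp10_k_means(..., 2, max_itr) refines them for every candidate
+// palette size n = min(colors, PALETTE_MAX_SIZE) .. 2.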
+
+#if CONFIG_EXT_INTRA
+// Return 1 if an ext intra mode is selected; return 0 otherwise.
+static int rd_pick_ext_intra_sbuv(VP10_COMP *cpi, MACROBLOCK *x,
+                                  int *rate, int *rate_tokenonly,
+                                  int64_t *distortion, int *skippable,
+                                  BLOCK_SIZE bsize, int64_t *best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  int ext_intra_selected_flag = 0;
+  int this_rate_tokenonly, this_rate, s;
+  int64_t this_distortion, this_sse, this_rd;
+  EXT_INTRA_MODE mode;
+  EXT_INTRA_MODE_INFO ext_intra_mode_info;
+
+  vp10_zero(ext_intra_mode_info);
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 1;
+  mbmi->uv_mode = DC_PRED;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+
+  for (mode = 0; mode < FILTER_INTRA_MODES; ++mode) {
+    mbmi->ext_intra_mode_info.ext_intra_mode[1] = mode;
+    if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                          &this_distortion, &s, &this_sse, bsize, *best_rd))
+      continue;
+
+    this_rate = this_rate_tokenonly +
+        vp10_cost_bit(cpi->common.fc->ext_intra_probs[1], 1) +
+        cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode] +
+        write_uniform_cost(FILTER_INTRA_MODES, mode);
+    this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+    if (this_rd < *best_rd) {
+      *best_rd        = this_rd;
+      *rate           = this_rate;
+      *rate_tokenonly = this_rate_tokenonly;
+      *distortion     = this_distortion;
+      *skippable      = s;
+      ext_intra_mode_info = mbmi->ext_intra_mode_info;
+      ext_intra_selected_flag = 1;
+    }
+  }
+
+  if (ext_intra_selected_flag) {
+    mbmi->uv_mode = DC_PRED;
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
+        ext_intra_mode_info.use_ext_intra_mode[1];
+    mbmi->ext_intra_mode_info.ext_intra_mode[1] =
+        ext_intra_mode_info.ext_intra_mode[1];
+    return 1;
+  } else {
+    return 0;
+  }
+}
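+
+// rd_pick_ext_intra_sbuv() mirrors the luma variant: every filter intra
+// mode is tried on top of DC_PRED, charged its extra signalling cost (the
+// ext_intra_probs[1] bit, the uv mode cost, and a uniform code over
+// FILTER_INTRA_MODES), and the return value reports whether any mode
+// improved *best_rd.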
+
+static void pick_intra_angle_routine_sbuv(VP10_COMP *cpi, MACROBLOCK *x,
+                                          int *rate, int *rate_tokenonly,
+                                          int64_t *distortion, int *skippable,
+                                          int *best_angle_delta,
+                                          BLOCK_SIZE bsize, int rate_overhead,
+                                          int64_t *best_rd) {
+  MB_MODE_INFO *mbmi = &x->e_mbd.mi[0]->mbmi;
+  int this_rate_tokenonly, this_rate, s;
+  int64_t this_distortion, this_sse, this_rd;
+
+  if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                        &this_distortion, &s, &this_sse, bsize, *best_rd))
+    return;
+
+  this_rate = this_rate_tokenonly + rate_overhead;
+  this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+  if (this_rd < *best_rd) {
+    *best_rd          = this_rd;
+    *best_angle_delta = mbmi->angle_delta[1];
+    *rate             = this_rate;
+    *rate_tokenonly   = this_rate_tokenonly;
+    *distortion       = this_distortion;
+    *skippable        = s;
+  }
+}
+
+static int rd_pick_intra_angle_sbuv(VP10_COMP *cpi, MACROBLOCK *x,
+                                    int *rate, int *rate_tokenonly,
+                                    int64_t *distortion, int *skippable,
+                                    BLOCK_SIZE bsize, int rate_overhead,
+                                    int64_t best_rd) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  int this_rate_tokenonly, this_rate, s;
+  int64_t this_distortion, this_sse, this_rd;
+  int angle_delta, best_angle_delta = 0;
+  const double rd_adjust = 1.2;
+
+  *rate_tokenonly = INT_MAX;
+  if (ANGLE_FAST_SEARCH) {
+    int deltas_level1[3] = {0, -2, 2};
+    int deltas_level2[3][2] = {
+        {-1, 1}, {-3, -1}, {1, 3},
+    };
+    const int level1 = 3, level2 = 2;
+    int i, j, best_i = -1;
+
+    for (i = 0; i < level1; ++i) {
+      int64_t tmp_best_rd;
+      mbmi->angle_delta[1] = deltas_level1[i];
+      tmp_best_rd = (i == 0 && best_rd < INT64_MAX) ?
+          (int64_t)(best_rd * rd_adjust) : best_rd;
+      if (!super_block_uvrd(cpi, x, &this_rate_tokenonly, &this_distortion,
+                            &s, &this_sse, bsize, tmp_best_rd)) {
+        if (i == 0)
+          break;
+        else
+          continue;
+      }
+      this_rate = this_rate_tokenonly + rate_overhead;
+      this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
+      if (i == 0 && best_rd < INT64_MAX && this_rd > best_rd * rd_adjust)
+        break;
+      if (this_rd < best_rd) {
+        best_i           = i;
+        best_rd          = this_rd;
+        best_angle_delta = mbmi->angle_delta[1];
+        *rate            = this_rate;
+        *rate_tokenonly  = this_rate_tokenonly;
+        *distortion      = this_distortion;
+        *skippable       = s;
+      }
+    }
+
+    if (best_i >= 0) {
+      for (j = 0; j < level2; ++j) {
+        mbmi->angle_delta[1] = deltas_level2[best_i][j];
+        pick_intra_angle_routine_sbuv(cpi, x, rate, rate_tokenonly,
+                                      distortion, skippable,
+                                      &best_angle_delta, bsize,
+                                      rate_overhead, &best_rd);
+      }
+    }
+  } else {
+    for (angle_delta = -MAX_ANGLE_DELTAS; angle_delta <= MAX_ANGLE_DELTAS;
+        ++angle_delta) {
+      mbmi->angle_delta[1] = angle_delta;
+      pick_intra_angle_routine_sbuv(cpi, x, rate, rate_tokenonly,
+                                    distortion, skippable,
+                                    &best_angle_delta, bsize,
+                                    rate_overhead, &best_rd);
+    }
+  }
+
+  mbmi->angle_delta[1] = best_angle_delta;
+  return *rate_tokenonly != INT_MAX;
+}
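+
+// The fast path above is a two-stage coarse-to-fine search over angle
+// deltas: level 1 probes {0, -2, +2}; the winner is refined with the
+// matching deltas_level2[] pair (e.g. a level-1 winner of -2 is refined
+// with {-3, -1}). The rd_adjust factor (1.2) deliberately lets the first
+// probe through even when it is slightly worse than the running best, so
+// the refinement stage is not cut off prematurely.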
+#endif  // CONFIG_EXT_INTRA
+
 static int64_t rd_pick_intra_sbuv_mode(VP10_COMP *cpi, MACROBLOCK *x,
-                                       PICK_MODE_CONTEXT *ctx,
                                        int *rate, int *rate_tokenonly,
                                        int64_t *distortion, int *skippable,
                                        BLOCK_SIZE bsize, TX_SIZE max_tx_size) {
   MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
   PREDICTION_MODE mode;
   PREDICTION_MODE mode_selected = DC_PRED;
   int64_t best_rd = INT64_MAX, this_rd;
   int this_rate_tokenonly, this_rate, s;
   int64_t this_distortion, this_sse;
+  const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
+      (xd->plane[1].subsampling_y);
+  const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
+      (xd->plane[1].subsampling_x);
+  PALETTE_MODE_INFO palette_mode_info;
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  uint8_t *best_palette_color_map = NULL;
+#if CONFIG_EXT_INTRA
+  int is_directional_mode, rate_overhead, best_angle_delta = 0;
+  EXT_INTRA_MODE_INFO ext_intra_mode_info;
 
-  memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+  ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+#endif  // CONFIG_EXT_INTRA
+  palette_mode_info.palette_size[1] = 0;
+  pmi->palette_size[1] = 0;
   for (mode = DC_PRED; mode <= TM_PRED; ++mode) {
     if (!(cpi->sf.intra_uv_mode_mask[max_tx_size] & (1 << mode)))
       continue;
 
-    xd->mi[0]->mbmi.uv_mode = mode;
-
+    mbmi->uv_mode = mode;
+#if CONFIG_EXT_INTRA
+    is_directional_mode = (mode != DC_PRED && mode != TM_PRED);
+    rate_overhead = cpi->intra_uv_mode_cost[mbmi->mode][mode] +
+        write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1, 0);
+    mbmi->angle_delta[1] = 0;
+    if (mbmi->sb_type >= BLOCK_8X8 && is_directional_mode) {
+      if (!rd_pick_intra_angle_sbuv(cpi, x, &this_rate,
+                                    &this_rate_tokenonly, &this_distortion, &s,
+                                    bsize, rate_overhead, best_rd))
+        continue;
+    } else {
+      if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
+                            &this_distortion, &s, &this_sse, bsize, best_rd))
+        continue;
+    }
+    this_rate = this_rate_tokenonly +
+        cpi->intra_uv_mode_cost[mbmi->mode][mode];
+    if (mbmi->sb_type >= BLOCK_8X8 && is_directional_mode)
+      this_rate += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+                                      MAX_ANGLE_DELTAS +
+                                      mbmi->angle_delta[1]);
+    if (mode == DC_PRED)
+      this_rate += vp10_cost_bit(cpi->common.fc->ext_intra_probs[1], 0);
+#else
     if (!super_block_uvrd(cpi, x, &this_rate_tokenonly,
                           &this_distortion, &s, &this_sse, bsize, best_rd))
       continue;
     this_rate = this_rate_tokenonly +
-        cpi->intra_uv_mode_cost[xd->mi[0]->mbmi.mode][mode];
+        cpi->intra_uv_mode_cost[mbmi->mode][mode];
+#endif  // CONFIG_EXT_INTRA
+    if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8 &&
+        mode == DC_PRED)
+      this_rate += vp10_cost_bit(vp10_default_palette_uv_mode_prob
+                                 [pmi->palette_size[0] > 0], 0);
+
     this_rd = RDCOST(x->rdmult, x->rddiv, this_rate, this_distortion);
 
     if (this_rd < best_rd) {
       mode_selected   = mode;
+#if CONFIG_EXT_INTRA
+      best_angle_delta = mbmi->angle_delta[1];
+#endif  // CONFIG_EXT_INTRA
       best_rd         = this_rd;
       *rate           = this_rate;
       *rate_tokenonly = this_rate_tokenonly;
       *distortion     = this_distortion;
       *skippable      = s;
-      if (!x->select_tx_size)
-        swap_block_ptr(x, ctx, 2, 0, 1, MAX_MB_PLANE);
     }
   }
 
-  xd->mi[0]->mbmi.uv_mode = mode_selected;
+  if (cpi->common.allow_screen_content_tools && mbmi->sb_type >= BLOCK_8X8) {
+    best_palette_color_map = x->palette_buffer->best_palette_color_map;
+    rd_pick_palette_intra_sbuv(cpi, x,
+                               cpi->intra_uv_mode_cost[mbmi->mode][DC_PRED],
+                               &palette_mode_info, best_palette_color_map,
+                               &mode_selected, &best_rd, rate, rate_tokenonly,
+                               distortion, skippable);
+  }
+
+#if CONFIG_EXT_INTRA
+  if (mbmi->sb_type >= BLOCK_8X8 && ALLOW_FILTER_INTRA_MODES) {
+    if (rd_pick_ext_intra_sbuv(cpi, x, rate, rate_tokenonly, distortion,
+                               skippable, bsize, &best_rd)) {
+      mode_selected   = mbmi->uv_mode;
+      ext_intra_mode_info = mbmi->ext_intra_mode_info;
+    }
+  }
+
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
+      ext_intra_mode_info.use_ext_intra_mode[1];
+  if (ext_intra_mode_info.use_ext_intra_mode[1]) {
+    mbmi->ext_intra_mode_info.ext_intra_mode[1] =
+        ext_intra_mode_info.ext_intra_mode[1];
+    palette_mode_info.palette_size[1] = 0;
+  }
+  mbmi->angle_delta[1] = best_angle_delta;
+#endif  // CONFIG_EXT_INTRA
+  mbmi->uv_mode = mode_selected;
+  pmi->palette_size[1] = palette_mode_info.palette_size[1];
+  if (palette_mode_info.palette_size[1] > 0) {
+    memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+           palette_mode_info.palette_colors + PALETTE_MAX_SIZE,
+           2 * PALETTE_MAX_SIZE * sizeof(palette_mode_info.palette_colors[0]));
+    memcpy(xd->plane[1].color_index_map, best_palette_color_map,
+           rows * cols * sizeof(best_palette_color_map[0]));
+  }
+
   return best_rd;
 }
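+
+// Ordering matters in the selector above: the palette search may displace
+// the best regular mode, and a winning ext intra mode then zeroes
+// palette_size[1], since the two tools are mutually exclusive for chroma.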
 
@@ -1280,7 +4135,6 @@
   int64_t unused;
 
   x->e_mbd.mi[0]->mbmi.uv_mode = DC_PRED;
-  memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
   super_block_uvrd(cpi, x, rate_tokenonly, distortion,
                    skippable, &unused, bsize, INT64_MAX);
   *rate = *rate_tokenonly +
@@ -1302,7 +4156,8 @@
   // Else do a proper rd search for each possible transform size that may
   // be considered in the main rd loop.
   } else {
-    rd_pick_intra_sbuv_mode(cpi, x, ctx,
+    (void)ctx;
+    rd_pick_intra_sbuv_mode(cpi, x,
                             rate_uv, rate_uv_tokenonly, dist_uv, skip_uv,
                             bsize < BLOCK_8X8 ? BLOCK_8X8 : bsize, max_tx_size);
   }
@@ -1310,9 +4165,78 @@
 }
 
 static int cost_mv_ref(const VP10_COMP *cpi, PREDICTION_MODE mode,
-                       int mode_context) {
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+                       int is_compound,
+#endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
+                       int16_t mode_context) {
+#if CONFIG_REF_MV
+  int mode_cost = 0;
+#if CONFIG_EXT_INTER
+  int16_t mode_ctx = is_compound ? mode_context :
+                                   (mode_context & NEWMV_CTX_MASK);
+#else
+  int16_t mode_ctx = mode_context & NEWMV_CTX_MASK;
+#endif  // CONFIG_EXT_INTER
+  int16_t is_all_zero_mv = mode_context & (1 << ALL_ZERO_FLAG_OFFSET);
+
   assert(is_inter_mode(mode));
+
+#if CONFIG_EXT_INTER
+  if (is_compound) {
+    return cpi->inter_compound_mode_cost[mode_context]
+                                        [INTER_COMPOUND_OFFSET(mode)];
+  } else {
+    if (mode == NEWMV || mode == NEWFROMNEARMV) {
+#else
+  if (mode == NEWMV) {
+#endif  // CONFIG_EXT_INTER
+    mode_cost = cpi->newmv_mode_cost[mode_ctx][0];
+#if CONFIG_EXT_INTER
+    if (!is_compound)
+      mode_cost += cpi->new2mv_mode_cost[mode == NEWFROMNEARMV];
+#endif  // CONFIG_EXT_INTER
+    return mode_cost;
+  } else {
+    mode_cost = cpi->newmv_mode_cost[mode_ctx][1];
+    mode_ctx = (mode_context >> ZEROMV_OFFSET) & ZEROMV_CTX_MASK;
+
+    if (is_all_zero_mv)
+      return mode_cost;
+
+    if (mode == ZEROMV) {
+      mode_cost += cpi->zeromv_mode_cost[mode_ctx][0];
+      return mode_cost;
+    } else {
+      mode_cost += cpi->zeromv_mode_cost[mode_ctx][1];
+      mode_ctx = (mode_context >> REFMV_OFFSET) & REFMV_CTX_MASK;
+
+      if (mode_context & (1 << SKIP_NEARESTMV_OFFSET))
+        mode_ctx = 6;
+      if (mode_context & (1 << SKIP_NEARMV_OFFSET))
+        mode_ctx = 7;
+      if (mode_context & (1 << SKIP_NEARESTMV_SUB8X8_OFFSET))
+        mode_ctx = 8;
+
+      mode_cost += cpi->refmv_mode_cost[mode_ctx][mode != NEARESTMV];
+      return mode_cost;
+    }
+  }
+#if CONFIG_EXT_INTER
+  }
+#endif  // CONFIG_EXT_INTER
+#else
+  assert(is_inter_mode(mode));
+#if CONFIG_EXT_INTER
+  if (is_inter_compound_mode(mode)) {
+    return cpi->inter_compound_mode_cost[mode_context]
+                                        [INTER_COMPOUND_OFFSET(mode)];
+  } else {
+#endif  // CONFIG_EXT_INTER
   return cpi->inter_mode_cost[mode_context][INTER_OFFSET(mode)];
+#if CONFIG_EXT_INTER
+  }
+#endif  // CONFIG_EXT_INTER
+#endif  // CONFIG_REF_MV
 }
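+
+// Under CONFIG_REF_MV the cost above is a walk down a small decision tree,
+// with one sub-context unpacked from the packed int16_t mode_context per
+// branch:
+//   NEWMV?         newmv_mode_cost[mode_context & NEWMV_CTX_MASK]
+//   else ZEROMV?   zeromv_mode_cost[(mode_context >> ZEROMV_OFFSET)
+//                                   & ZEROMV_CTX_MASK]
+//   else NEAR(EST) refmv_mode_cost[(mode_context >> REFMV_OFFSET)
+//                                  & REFMV_CTX_MASK]
+// so a single context value carries all three decisions.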
 
 static int set_and_cost_bmi_mvs(VP10_COMP *cpi, MACROBLOCK *x, MACROBLOCKD *xd,
@@ -1320,6 +4244,9 @@
                                 PREDICTION_MODE mode, int_mv this_mv[2],
                                 int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
                                 int_mv seg_mvs[MAX_REF_FRAMES],
+#if CONFIG_EXT_INTER
+                                int_mv compound_seg_newmvs[2],
+#endif  // CONFIG_EXT_INTER
                                 int_mv *best_ref_mv[2], const int *mvjcost,
                                 int *mvcost[2]) {
   MODE_INFO *const mic = xd->mi[0];
@@ -1330,17 +4257,43 @@
   const int num_4x4_blocks_wide = num_4x4_blocks_wide_lookup[mbmi->sb_type];
   const int num_4x4_blocks_high = num_4x4_blocks_high_lookup[mbmi->sb_type];
   const int is_compound = has_second_ref(mbmi);
+  int mode_ctx = mbmi_ext->mode_context[mbmi->ref_frame[0]];
 
   switch (mode) {
     case NEWMV:
+#if CONFIG_EXT_INTER
+    case NEWFROMNEARMV:
+#endif  // CONFIG_EXT_INTER
       this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
+#if CONFIG_EXT_INTER
+      if (!cpi->common.allow_high_precision_mv ||
+          !vp10_use_mv_hp(&best_ref_mv[0]->as_mv))
+        lower_mv_precision(&this_mv[0].as_mv, 0);
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_REF_MV
+      for (idx = 0; idx < 1 + is_compound; ++idx) {
+        this_mv[idx] = seg_mvs[mbmi->ref_frame[idx]];
+        vp10_set_mvcost(x, mbmi->ref_frame[idx]);
+        thismvcost += vp10_mv_bit_cost(&this_mv[idx].as_mv,
+                                       &best_ref_mv[idx]->as_mv,
+                                       x->nmvjointcost, x->mvcost,
+                                       MV_COST_WEIGHT_SUB);
+      }
+      (void)mvjcost;
+      (void)mvcost;
+#else
       thismvcost += vp10_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
-                                    mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+#if !CONFIG_EXT_INTER
       if (is_compound) {
         this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
-        thismvcost += vp10_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
-                                      mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+        thismvcost += vp10_mv_bit_cost(&this_mv[1].as_mv,
+                                       &best_ref_mv[1]->as_mv,
+                                       mvjcost, mvcost, MV_COST_WEIGHT_SUB);
       }
+#endif  // !CONFIG_EXT_INTER
+#endif  // CONFIG_REF_MV
       break;
     case NEARMV:
     case NEARESTMV:
@@ -1353,6 +4306,61 @@
       if (is_compound)
         this_mv[1].as_int = 0;
       break;
+#if CONFIG_EXT_INTER
+    case NEW_NEWMV:
+      if (compound_seg_newmvs[0].as_int == INVALID_MV ||
+          compound_seg_newmvs[1].as_int == INVALID_MV) {
+        this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
+        this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
+      } else {
+        this_mv[0].as_int = compound_seg_newmvs[0].as_int;
+        this_mv[1].as_int = compound_seg_newmvs[1].as_int;
+      }
+      if (!cpi->common.allow_high_precision_mv ||
+          !vp10_use_mv_hp(&best_ref_mv[0]->as_mv))
+        lower_mv_precision(&this_mv[0].as_mv, 0);
+      if (!cpi->common.allow_high_precision_mv ||
+          !vp10_use_mv_hp(&best_ref_mv[1]->as_mv))
+        lower_mv_precision(&this_mv[1].as_mv, 0);
+      thismvcost += vp10_mv_bit_cost(&this_mv[0].as_mv,
+                                     &best_ref_mv[0]->as_mv,
+                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+      thismvcost += vp10_mv_bit_cost(&this_mv[1].as_mv,
+                                     &best_ref_mv[1]->as_mv,
+                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+      break;
+    case NEW_NEARMV:
+    case NEW_NEARESTMV:
+      this_mv[0].as_int = seg_mvs[mbmi->ref_frame[0]].as_int;
+      if (!cpi->common.allow_high_precision_mv ||
+          !vp10_use_mv_hp(&best_ref_mv[0]->as_mv))
+        lower_mv_precision(&this_mv[0].as_mv, 0);
+      thismvcost += vp10_mv_bit_cost(&this_mv[0].as_mv, &best_ref_mv[0]->as_mv,
+                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+      this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
+      break;
+    case NEAR_NEWMV:
+    case NEAREST_NEWMV:
+      this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
+      this_mv[1].as_int = seg_mvs[mbmi->ref_frame[1]].as_int;
+      if (!cpi->common.allow_high_precision_mv ||
+          !vp10_use_mv_hp(&best_ref_mv[1]->as_mv))
+        lower_mv_precision(&this_mv[1].as_mv, 0);
+      thismvcost += vp10_mv_bit_cost(&this_mv[1].as_mv, &best_ref_mv[1]->as_mv,
+                                     mvjcost, mvcost, MV_COST_WEIGHT_SUB);
+      break;
+    case NEAREST_NEARMV:
+    case NEAR_NEARESTMV:
+    case NEAREST_NEARESTMV:
+    case NEAR_NEARMV:
+      this_mv[0].as_int = frame_mv[mode][mbmi->ref_frame[0]].as_int;
+      this_mv[1].as_int = frame_mv[mode][mbmi->ref_frame[1]].as_int;
+      break;
+    case ZERO_ZEROMV:
+      this_mv[0].as_int = 0;
+      this_mv[1].as_int = 0;
+      break;
+#endif  // CONFIG_EXT_INTER
     default:
       break;
   }
@@ -1363,12 +4371,36 @@
 
   mic->bmi[i].as_mode = mode;
 
+#if CONFIG_REF_MV
+  if (mode == NEWMV) {
+    mic->bmi[i].pred_mv_s8[0].as_int = best_ref_mv[0]->as_int;
+    if (is_compound)
+      mic->bmi[i].pred_mv_s8[1].as_int = best_ref_mv[1]->as_int;
+  } else {
+    mic->bmi[i].pred_mv_s8[0].as_int = this_mv[0].as_int;
+    if (is_compound)
+      mic->bmi[i].pred_mv_s8[1].as_int = this_mv[1].as_int;
+  }
+#endif  // CONFIG_REF_MV
+
   for (idy = 0; idy < num_4x4_blocks_high; ++idy)
     for (idx = 0; idx < num_4x4_blocks_wide; ++idx)
       memmove(&mic->bmi[i + idy * 2 + idx], &mic->bmi[i], sizeof(mic->bmi[i]));
 
-  return cost_mv_ref(cpi, mode, mbmi_ext->mode_context[mbmi->ref_frame[0]]) +
-            thismvcost;
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+  if (is_compound)
+    mode_ctx = mbmi_ext->compound_mode_context[mbmi->ref_frame[0]];
+  else
+#endif  // CONFIG_EXT_INTER
+  mode_ctx = vp10_mode_context_analyzer(mbmi_ext->mode_context,
+                                        mbmi->ref_frame, mbmi->sb_type, i);
+#endif  // CONFIG_REF_MV
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+  return cost_mv_ref(cpi, mode, is_compound, mode_ctx) + thismvcost;
+#else
+  return cost_mv_ref(cpi, mode, mode_ctx) + thismvcost;
+#endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
 }
 
 static int64_t encode_inter_mb_segment(VP10_COMP *cpi,
@@ -1390,32 +4422,19 @@
   const int width = 4 * num_4x4_blocks_wide_lookup[plane_bsize];
   const int height = 4 * num_4x4_blocks_high_lookup[plane_bsize];
   int idx, idy;
-  void (*fwd_txm4x4)(const int16_t *input, tran_low_t *output, int stride);
-
   const uint8_t *const src =
       &p->src.buf[vp10_raster_block_offset(BLOCK_8X8, i, p->src.stride)];
   uint8_t *const dst = &pd->dst.buf[vp10_raster_block_offset(BLOCK_8X8, i,
                                                             pd->dst.stride)];
   int64_t thisdistortion = 0, thissse = 0;
   int thisrate = 0;
-  TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i);
-  const scan_order *so = get_scan(TX_4X4, tx_type);
+  TX_TYPE tx_type = get_tx_type(PLANE_TYPE_Y, xd, i, TX_4X4);
+  const scan_order *so = get_scan(TX_4X4, tx_type, 1);
 
   vp10_build_inter_predictor_sub8x8(xd, 0, i, ir, ic, mi_row, mi_col);
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    fwd_txm4x4 = xd->lossless[mi->mbmi.segment_id] ? vp10_highbd_fwht4x4
-                                                   : vpx_highbd_fdct4x4;
-  } else {
-    fwd_txm4x4 = xd->lossless[mi->mbmi.segment_id] ? vp10_fwht4x4 : vpx_fdct4x4;
-  }
-#else
-  fwd_txm4x4 = xd->lossless[mi->mbmi.segment_id] ? vp10_fwht4x4 : vpx_fdct4x4;
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
     vpx_highbd_subtract_block(
         height, width, vp10_raster_block_offset_int16(BLOCK_8X8, i, p->src_diff),
         8, src, p->src.stride, dst, pd->dst.stride, xd->bd);
@@ -1433,42 +4452,43 @@
   k = i;
   for (idy = 0; idy < height / 4; ++idy) {
     for (idx = 0; idx < width / 4; ++idx) {
-      int64_t ssz, rd, rd1, rd2;
-      tran_low_t* coeff;
-
+      int64_t dist, ssz, rd, rd1, rd2;
+      int coeff_ctx;
       k += (idy * 2 + idx);
-      coeff = BLOCK_OFFSET(p->coeff, k);
-      fwd_txm4x4(vp10_raster_block_offset_int16(BLOCK_8X8, k, p->src_diff),
-                 coeff, 8);
-      vp10_regular_quantize_b_4x4(x, 0, k, so->scan, so->iscan);
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-        thisdistortion += vp10_highbd_block_error(coeff,
-                                                 BLOCK_OFFSET(pd->dqcoeff, k),
-                                                 16, &ssz, xd->bd);
-      } else {
-        thisdistortion += vp10_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
-                                          16, &ssz);
-      }
-#else
-      thisdistortion += vp10_block_error(coeff, BLOCK_OFFSET(pd->dqcoeff, k),
-                                        16, &ssz);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
+      coeff_ctx = combine_entropy_contexts(*(ta + (k & 1)),
+                                           *(tl + (k >> 1)));
+      vp10_xform_quant(x, 0, k, idy + (i >> 1), idx + (i & 0x01), BLOCK_8X8,
+                       TX_4X4, VP10_XFORM_QUANT_FP);
+      if (xd->lossless[xd->mi[0]->mbmi.segment_id] == 0)
+        vp10_optimize_b(x, 0, k, TX_4X4, coeff_ctx);
+      dist_block(cpi, x, 0, k, idy + (i >> 1), idx + (i & 0x1), TX_4X4,
+                 &dist, &ssz);
+      thisdistortion += dist;
       thissse += ssz;
-      thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1), TX_4X4,
+#if CONFIG_VAR_TX
+      thisrate += cost_coeffs(x, 0, k, coeff_ctx,
+                              TX_4X4,
                               so->scan, so->neighbors,
                               cpi->sf.use_fast_coef_costing);
-      rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion >> 2);
-      rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse >> 2);
+      *(ta + (k & 1)) = !(p->eobs[k] == 0);
+      *(tl + (k >> 1)) = !(p->eobs[k] == 0);
+#else
+      thisrate += cost_coeffs(x, 0, k, ta + (k & 1), tl + (k >> 1),
+                              TX_4X4,
+                              so->scan, so->neighbors,
+                              cpi->sf.use_fast_coef_costing);
+#endif  // CONFIG_VAR_TX
+      rd1 = RDCOST(x->rdmult, x->rddiv, thisrate, thisdistortion);
+      rd2 = RDCOST(x->rdmult, x->rddiv, 0, thissse);
       rd = VPXMIN(rd1, rd2);
       if (rd >= best_yrd)
         return INT64_MAX;
     }
   }
 
-  *distortion = thisdistortion >> 2;
+  *distortion = thisdistortion;
   *labelyrate = thisrate;
-  *sse = thissse >> 2;
+  *sse = thissse;
 
   return RDCOST(x->rdmult, x->rddiv, *labelyrate, *distortion);
 }
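+
+// Note: the old >> 2 scaling of thisdistortion/thissse is gone in this
+// version; dist_block() is expected to return distortion already in the
+// final RD units, so rd1/rd2 and the reported *distortion / *sse use the
+// accumulated values directly.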
@@ -1481,6 +4501,12 @@
   int64_t bsse;
   int64_t brdcost;
   int_mv mvs[2];
+#if CONFIG_REF_MV
+  int_mv pred_mv[2];
+#endif  // CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+  int_mv ref_mv[2];
+#endif  // CONFIG_EXT_INTER
   ENTROPY_CONTEXT ta[2];
   ENTROPY_CONTEXT tl[2];
 } SEG_RDSTAT;
@@ -1495,7 +4521,11 @@
   int64_t sse;
   int segment_yrate;
   PREDICTION_MODE modes[4];
+#if CONFIG_EXT_INTER
+  SEG_RDSTAT rdstat[4][INTER_MODES + INTER_COMPOUND_MODES];
+#else
   SEG_RDSTAT rdstat[4][INTER_MODES];
+#endif  // CONFIG_EXT_INTER
   int mvthresh;
 } BEST_SEG_INFO;
 
@@ -1530,24 +4560,45 @@
     x->e_mbd.plane[0].pre[1] = orig_pre[1];
 }
 
-static INLINE int mv_has_subpel(const MV *mv) {
-  return (mv->row & 0x0F) || (mv->col & 0x0F);
-}
-
 // Check if NEARESTMV/NEARMV/ZEROMV is the cheapest way to encode zero motion.
 // TODO(aconverse): Find out if this is still productive, then clean up or
 // remove.
 static int check_best_zero_mv(
-    const VP10_COMP *cpi, const uint8_t mode_context[MAX_REF_FRAMES],
+    const VP10_COMP *cpi, const int16_t mode_context[MAX_REF_FRAMES],
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+    const int16_t compound_mode_context[MAX_REF_FRAMES],
+#endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
     int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES], int this_mode,
-    const MV_REFERENCE_FRAME ref_frames[2]) {
+    const MV_REFERENCE_FRAME ref_frames[2],
+    const BLOCK_SIZE bsize, int block) {
+
+#if !CONFIG_EXT_INTER
+  assert(ref_frames[1] != INTRA_FRAME);  // Just a sanity check
+#endif
+
   if ((this_mode == NEARMV || this_mode == NEARESTMV || this_mode == ZEROMV) &&
       frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
-      (ref_frames[1] == NONE ||
+      (ref_frames[1] <= INTRA_FRAME ||
        frame_mv[this_mode][ref_frames[1]].as_int == 0)) {
-    int rfc = mode_context[ref_frames[0]];
+#if CONFIG_REF_MV
+    int16_t rfc = vp10_mode_context_analyzer(mode_context,
+                                             ref_frames, bsize, block);
+#else
+    int16_t rfc = mode_context[ref_frames[0]];
+#endif
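+    // c1/c2/c3: rate cost of signalling NEARMV / NEARESTMV / ZEROMV in this
+    // mode context.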
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+    int c1 = cost_mv_ref(cpi, NEARMV, ref_frames[1] > INTRA_FRAME, rfc);
+    int c2 = cost_mv_ref(cpi, NEARESTMV, ref_frames[1] > INTRA_FRAME, rfc);
+    int c3 = cost_mv_ref(cpi, ZEROMV, ref_frames[1] > INTRA_FRAME, rfc);
+#else
     int c1 = cost_mv_ref(cpi, NEARMV, rfc);
     int c2 = cost_mv_ref(cpi, NEARESTMV, rfc);
     int c3 = cost_mv_ref(cpi, ZEROMV, rfc);
+#endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
+
+#if !CONFIG_REF_MV
+    (void)bsize;
+    (void)block;
+#endif
 
     if (this_mode == NEARMV) {
       if (c1 > c3) return 0;
@@ -1555,7 +4606,7 @@
       if (c2 > c3) return 0;
     } else {
       assert(this_mode == ZEROMV);
-      if (ref_frames[1] == NONE) {
+      if (ref_frames[1] <= INTRA_FRAME) {
         if ((c3 >= c2 && frame_mv[NEARESTMV][ref_frames[0]].as_int == 0) ||
             (c3 >= c1 && frame_mv[NEARMV][ref_frames[0]].as_int == 0))
           return 0;
@@ -1568,6 +4619,54 @@
       }
     }
   }
+#if CONFIG_EXT_INTER
+  else if ((this_mode == NEAREST_NEARESTMV || this_mode == NEAREST_NEARMV ||
+            this_mode == NEAR_NEARESTMV || this_mode == NEAR_NEARMV ||
+            this_mode == ZERO_ZEROMV) &&
+            frame_mv[this_mode][ref_frames[0]].as_int == 0 &&
+            frame_mv[this_mode][ref_frames[1]].as_int == 0) {
+#if CONFIG_REF_MV
+    int16_t rfc = compound_mode_context[ref_frames[0]];
+    int c1 = cost_mv_ref(cpi, NEAREST_NEARMV, 1, rfc);
+    int c2 = cost_mv_ref(cpi, NEAREST_NEARESTMV, 1, rfc);
+    int c3 = cost_mv_ref(cpi, ZERO_ZEROMV, 1, rfc);
+    int c4 = cost_mv_ref(cpi, NEAR_NEARESTMV, 1, rfc);
+    int c5 = cost_mv_ref(cpi, NEAR_NEARMV, 1, rfc);
+#else
+    int16_t rfc = mode_context[ref_frames[0]];
+    int c1 = cost_mv_ref(cpi, NEAREST_NEARMV, rfc);
+    int c2 = cost_mv_ref(cpi, NEAREST_NEARESTMV, rfc);
+    int c3 = cost_mv_ref(cpi, ZERO_ZEROMV, rfc);
+    int c4 = cost_mv_ref(cpi, NEAR_NEARESTMV, rfc);
+    int c5 = cost_mv_ref(cpi, NEAR_NEARMV, rfc);
+#endif
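+    // Prune this compound mode if another compound mode can code the same
+    // all-zero motion at a lower rate.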
+
+    if (this_mode == NEAREST_NEARMV) {
+      if (c1 > c3) return 0;
+    } else if (this_mode == NEAREST_NEARESTMV) {
+      if (c2 > c3) return 0;
+    } else if (this_mode == NEAR_NEARESTMV) {
+      if (c4 > c3) return 0;
+    } else if (this_mode == NEAR_NEARMV) {
+      if (c5 > c3) return 0;
+    } else {
+      assert(this_mode == ZERO_ZEROMV);
+      if ((c3 >= c2 &&
+           frame_mv[NEAREST_NEARESTMV][ref_frames[0]].as_int == 0 &&
+           frame_mv[NEAREST_NEARESTMV][ref_frames[1]].as_int == 0) ||
+          (c3 >= c1 &&
+           frame_mv[NEAREST_NEARMV][ref_frames[0]].as_int == 0 &&
+           frame_mv[NEAREST_NEARMV][ref_frames[1]].as_int == 0) ||
+          (c3 >= c5 &&
+           frame_mv[NEAR_NEARMV][ref_frames[0]].as_int == 0 &&
+           frame_mv[NEAR_NEARMV][ref_frames[1]].as_int == 0) ||
+          (c3 >= c4 &&
+           frame_mv[NEAR_NEARESTMV][ref_frames[0]].as_int == 0 &&
+           frame_mv[NEAR_NEARESTMV][ref_frames[1]].as_int == 0))
+        return 0;
+    }
+  }
+#endif  // CONFIG_EXT_INTER
   return 1;
 }
 
@@ -1575,8 +4674,12 @@
                                 BLOCK_SIZE bsize,
                                 int_mv *frame_mv,
                                 int mi_row, int mi_col,
+#if CONFIG_EXT_INTER
+                                int_mv* ref_mv_sub8x8[2],
+#endif
                                 int_mv single_newmv[MAX_REF_FRAMES],
-                                int *rate_mv) {
+                                int *rate_mv,
+                                const int block) {
   const VP10_COMMON *const cm = &cpi->common;
   const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
   const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
@@ -1586,7 +4689,14 @@
                        mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]};
   int_mv ref_mv[2];
   int ite, ref;
-  const InterpKernel *kernel = vp10_filter_kernels[mbmi->interp_filter];
+#if CONFIG_DUAL_FILTER
+  INTERP_FILTER interp_filter[4] = {
+      mbmi->interp_filter[0], mbmi->interp_filter[1],
+      mbmi->interp_filter[2], mbmi->interp_filter[3],
+  };
+#else
+  const INTERP_FILTER interp_filter = mbmi->interp_filter;
+#endif
   struct scale_factors sf;
 
   // Do joint motion search in compound mode to get more accurate mv.
@@ -1599,13 +4709,18 @@
 
   // Prediction buffer from second frame.
 #if CONFIG_VP9_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[64 * 64]);
+  DECLARE_ALIGNED(16, uint16_t, second_pred_alloc_16[MAX_SB_SQUARE]);
   uint8_t *second_pred;
 #else
-  DECLARE_ALIGNED(16, uint8_t, second_pred[64 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, second_pred[MAX_SB_SQUARE]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   for (ref = 0; ref < 2; ++ref) {
+#if CONFIG_EXT_INTER
+    if (bsize < BLOCK_8X8 && ref_mv_sub8x8 != NULL)
+      ref_mv[ref].as_int = ref_mv_sub8x8[ref]->as_int;
+    else
+#endif  // CONFIG_EXT_INTER
     ref_mv[ref] = x->mbmi_ext->ref_mvs[refs[ref]][0];
 
     if (scaled_ref_frame[ref]) {
@@ -1626,11 +4741,11 @@
   // frame we must use a unit scaling factor during mode selection.
 #if CONFIG_VP9_HIGHBITDEPTH
   vp10_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
-                                    cm->width, cm->height,
-                                    cm->use_highbitdepth);
+                                     cm->width, cm->height,
+                                     cm->use_highbitdepth);
 #else
   vp10_setup_scale_factors_for_frame(&sf, cm->width, cm->height,
-                                    cm->width, cm->height);
+                                     cm->width, cm->height);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Allow joint search multiple times iteratively for each reference frame
@@ -1639,7 +4754,7 @@
     struct buf_2d ref_yv12[2];
     int bestsme = INT_MAX;
     int sadpb = x->sadperbit16;
-    MV tmp_mv;
+    MV *const best_mv = &x->best_mv.as_mv;
     int search_range = 3;
 
     int tmp_col_min = x->mv_col_min;
@@ -1654,6 +4769,14 @@
     ref_yv12[0] = xd->plane[0].pre[0];
     ref_yv12[1] = xd->plane[0].pre[1];
 
+#if CONFIG_DUAL_FILTER
+    // reload the filter types
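+    // When searching the mv of reference 'id', the prediction block is built
+    // from the other reference, so use that reference's filter pair.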
+    interp_filter[0] = (id == 0) ?
+        mbmi->interp_filter[2] : mbmi->interp_filter[0];
+    interp_filter[1] = (id == 0) ?
+        mbmi->interp_filter[3] : mbmi->interp_filter[1];
+#endif
+
     // Get the prediction block from the 'other' reference frame.
 #if CONFIG_VP9_HIGHBITDEPTH
     if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
@@ -1663,7 +4786,7 @@
                                        second_pred, pw,
                                        &frame_mv[refs[!id]].as_mv,
                                        &sf, pw, ph, 0,
-                                       kernel, MV_PRECISION_Q3,
+                                       interp_filter, MV_PRECISION_Q3,
                                        mi_col * MI_SIZE, mi_row * MI_SIZE,
                                        xd->bd);
     } else {
@@ -1673,7 +4796,7 @@
                                 second_pred, pw,
                                 &frame_mv[refs[!id]].as_mv,
                                 &sf, pw, ph, 0,
-                                kernel, MV_PRECISION_Q3,
+                                interp_filter, MV_PRECISION_Q3,
                                 mi_col * MI_SIZE, mi_row * MI_SIZE);
     }
 #else
@@ -1682,7 +4805,7 @@
                               second_pred, pw,
                               &frame_mv[refs[!id]].as_mv,
                               &sf, pw, ph, 0,
-                              kernel, MV_PRECISION_Q3,
+                              interp_filter, MV_PRECISION_Q3,
                               mi_col * MI_SIZE, mi_row * MI_SIZE);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -1692,19 +4815,22 @@
     vp10_set_mv_search_range(x, &ref_mv[id].as_mv);
 
     // Use the mv result from the single mode as mv predictor.
-    tmp_mv = frame_mv[refs[id]].as_mv;
+    *best_mv = frame_mv[refs[id]].as_mv;
 
-    tmp_mv.col >>= 3;
-    tmp_mv.row >>= 3;
+    best_mv->col >>= 3;
+    best_mv->row >>= 3;
+
+#if CONFIG_REF_MV
+    vp10_set_mvcost(x, refs[id]);
+#endif
 
     // Small-range full-pixel motion search.
-    bestsme = vp10_refining_search_8p_c(x, &tmp_mv, sadpb,
-                                       search_range,
-                                       &cpi->fn_ptr[bsize],
-                                       &ref_mv[id].as_mv, second_pred);
+    bestsme = vp10_refining_search_8p_c(x, sadpb, search_range,
+                                        &cpi->fn_ptr[bsize],
+                                        &ref_mv[id].as_mv, second_pred);
     if (bestsme < INT_MAX)
-      bestsme = vp10_get_mvpred_av_var(x, &tmp_mv, &ref_mv[id].as_mv,
-                                      second_pred, &cpi->fn_ptr[bsize], 1);
+      bestsme = vp10_get_mvpred_av_var(x, best_mv, &ref_mv[id].as_mv,
+                                       second_pred, &cpi->fn_ptr[bsize], 1);
 
     x->mv_col_min = tmp_col_min;
     x->mv_col_max = tmp_col_max;
@@ -1714,17 +4840,50 @@
     if (bestsme < INT_MAX) {
       int dis; /* TODO: use dis in distortion calculation later. */
       unsigned int sse;
-      bestsme = cpi->find_fractional_mv_step(
-          x, &tmp_mv,
-          &ref_mv[id].as_mv,
-          cpi->common.allow_high_precision_mv,
-          x->errorperbit,
-          &cpi->fn_ptr[bsize],
-          0, cpi->sf.mv.subpel_iters_per_step,
-          NULL,
-          x->nmvjointcost, x->mvcost,
-          &dis, &sse, second_pred,
-          pw, ph);
+      if (cpi->sf.use_upsampled_references) {
+        // Use up-sampled reference frames.
+        struct macroblockd_plane *const pd = &xd->plane[0];
+        struct buf_2d backup_pred = pd->pre[0];
+        const YV12_BUFFER_CONFIG *upsampled_ref =
+            get_upsampled_ref(cpi, refs[id]);
+
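+        // The upsampled reference is stored at 8x the native resolution,
+        // hence the '<< 3' scaling of the offsets below.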
+        // Set pred for Y plane
+        setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
+                         upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
+                         NULL, pd->subsampling_x, pd->subsampling_y);
+
+        // If bsize < BLOCK_8X8, adjust pred pointer for this block
+        if (bsize < BLOCK_8X8)
+          pd->pre[0].buf =
+              &pd->pre[0].buf[(vp10_raster_block_offset(BLOCK_8X8, block,
+              pd->pre[0].stride)) << 3];
+
+        bestsme = cpi->find_fractional_mv_step(
+            x, &ref_mv[id].as_mv,
+            cpi->common.allow_high_precision_mv,
+            x->errorperbit,
+            &cpi->fn_ptr[bsize],
+            0, cpi->sf.mv.subpel_iters_per_step,
+            NULL,
+            x->nmvjointcost, x->mvcost,
+            &dis, &sse, second_pred,
+            pw, ph, 1);
+
+        // Restore the reference frames.
+        pd->pre[0] = backup_pred;
+      } else {
+        (void) block;
+        bestsme = cpi->find_fractional_mv_step(
+            x, &ref_mv[id].as_mv,
+            cpi->common.allow_high_precision_mv,
+            x->errorperbit,
+            &cpi->fn_ptr[bsize],
+            0, cpi->sf.mv.subpel_iters_per_step,
+            NULL,
+            x->nmvjointcost, x->mvcost,
+            &dis, &sse, second_pred,
+            pw, ph, 0);
+      }
     }
 
     // Restore the pointer to the first (possibly scaled) prediction buffer.
@@ -1732,7 +4891,7 @@
       xd->plane[0].pre[0] = ref_yv12[0];
 
     if (bestsme < last_besterr[id]) {
-      frame_mv[refs[id]].as_mv = tmp_mv;
+      frame_mv[refs[id]].as_mv = *best_mv;
       last_besterr[id] = bestsme;
     } else {
       break;
@@ -1748,10 +4907,21 @@
       for (i = 0; i < MAX_MB_PLANE; i++)
         xd->plane[i].pre[ref] = backup_yv12[ref][i];
     }
-
+#if CONFIG_REF_MV
+    vp10_set_mvcost(x, refs[ref]);
+#endif
+#if CONFIG_EXT_INTER
+    if (bsize >= BLOCK_8X8)
+#endif  // CONFIG_EXT_INTER
     *rate_mv += vp10_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
-                                &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv,
-                                x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+                                 &x->mbmi_ext->ref_mvs[refs[ref]][0].as_mv,
+                                 x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+#if CONFIG_EXT_INTER
+    else
+      *rate_mv += vp10_mv_bit_cost(&frame_mv[refs[ref]].as_mv,
+                                   &ref_mv_sub8x8[ref]->as_mv,
+                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+#endif  // CONFIG_EXT_INTER
   }
 }
 
@@ -1763,11 +4933,18 @@
                                         int64_t *returndistortion,
                                         int *skippable, int64_t *psse,
                                         int mvthresh,
+#if CONFIG_EXT_INTER
+                                        int_mv seg_mvs[4][2][MAX_REF_FRAMES],
+                                        int_mv compound_seg_newmvs[4][2],
+#else
                                         int_mv seg_mvs[4][MAX_REF_FRAMES],
+#endif  // CONFIG_EXT_INTER
                                         BEST_SEG_INFO *bsi_buf, int filter_idx,
                                         int mi_row, int mi_col) {
-  int i;
   BEST_SEG_INFO *bsi = bsi_buf + filter_idx;
+#if CONFIG_REF_MV
+  int_mv tmp_ref_mv[2];
+#endif
   MACROBLOCKD *xd = &x->e_mbd;
   MODE_INFO *mi = xd->mi[0];
   MB_MODE_INFO *mbmi = &mi->mbmi;
@@ -1799,8 +4976,20 @@
   bsi->mvp.as_int = best_ref_mv->as_int;
   bsi->mvthresh = mvthresh;
 
-  for (i = 0; i < 4; i++)
-    bsi->modes[i] = ZEROMV;
+  for (idx = 0; idx < 4; ++idx)
+    bsi->modes[idx] = ZEROMV;
+
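+  // Mark all per-block mode stats as unset so stale motion vectors from an
+  // earlier pass are never mistaken for valid search results.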
+#if CONFIG_REF_MV
+  for (idx = 0; idx < 4; ++idx) {
+    for (k = NEARESTMV; k <= NEWMV; ++k) {
+      bsi->rdstat[idx][INTER_OFFSET(k)].pred_mv[0].as_int = INVALID_MV;
+      bsi->rdstat[idx][INTER_OFFSET(k)].pred_mv[1].as_int = INVALID_MV;
+
+      bsi->rdstat[idx][INTER_OFFSET(k)].mvs[0].as_int = INVALID_MV;
+      bsi->rdstat[idx][INTER_OFFSET(k)].mvs[1].as_int = INVALID_MV;
+    }
+  }
+#endif
 
   memcpy(t_above, pd->above_context, sizeof(t_above));
   memcpy(t_left, pd->left_context, sizeof(t_left));
@@ -1822,28 +5011,169 @@
       int64_t best_rd = INT64_MAX;
       const int i = idy * 2 + idx;
       int ref;
+#if CONFIG_REF_MV
+      CANDIDATE_MV ref_mv_stack[2][MAX_REF_MV_STACK_SIZE];
+      uint8_t ref_mv_count[2];
+#endif
+#if CONFIG_EXT_INTER
+      int mv_idx;
+      int_mv ref_mvs_sub8x8[2][2];
+#endif  // CONFIG_EXT_INTER
 
       for (ref = 0; ref < 1 + has_second_rf; ++ref) {
         const MV_REFERENCE_FRAME frame = mbmi->ref_frame[ref];
+#if CONFIG_EXT_INTER
+        int_mv mv_ref_list[MAX_MV_REF_CANDIDATES];
+        vp10_update_mv_context(xd, mi, frame, mv_ref_list, i,
+                               mi_row, mi_col, NULL);
+#endif  // CONFIG_EXT_INTER
         frame_mv[ZEROMV][frame].as_int = 0;
         vp10_append_sub8x8_mvs_for_idx(cm, xd, i, ref, mi_row, mi_col,
+#if CONFIG_REF_MV
+                                       ref_mv_stack[ref],
+                                       &ref_mv_count[ref],
+#endif
+#if CONFIG_EXT_INTER
+                                       mv_ref_list,
+#endif  // CONFIG_EXT_INTER
                                       &frame_mv[NEARESTMV][frame],
-                                      &frame_mv[NEARMV][frame],
-                                      mbmi_ext->mode_context);
+                                      &frame_mv[NEARMV][frame]);
+
+#if CONFIG_REF_MV
+        tmp_ref_mv[ref] = frame_mv[NEARESTMV][mbmi->ref_frame[ref]];
+        lower_mv_precision(&tmp_ref_mv[ref].as_mv, cm->allow_high_precision_mv);
+        bsi->ref_mv[ref] = &tmp_ref_mv[ref];
+        mbmi_ext->ref_mvs[frame][0] = tmp_ref_mv[ref];
+#endif
+
+#if CONFIG_EXT_INTER
+        mv_ref_list[0].as_int = frame_mv[NEARESTMV][frame].as_int;
+        mv_ref_list[1].as_int = frame_mv[NEARMV][frame].as_int;
+        vp10_find_best_ref_mvs(cm->allow_high_precision_mv, mv_ref_list,
+                             &ref_mvs_sub8x8[0][ref], &ref_mvs_sub8x8[1][ref]);
+
+        if (has_second_rf) {
+          frame_mv[ZERO_ZEROMV][frame].as_int = 0;
+          frame_mv[NEAREST_NEARESTMV][frame].as_int =
+            frame_mv[NEARESTMV][frame].as_int;
+
+          if (ref == 0) {
+            frame_mv[NEAREST_NEARMV][frame].as_int =
+              frame_mv[NEARESTMV][frame].as_int;
+            frame_mv[NEAR_NEARESTMV][frame].as_int =
+              frame_mv[NEARMV][frame].as_int;
+            frame_mv[NEAREST_NEWMV][frame].as_int =
+              frame_mv[NEARESTMV][frame].as_int;
+            frame_mv[NEAR_NEWMV][frame].as_int =
+              frame_mv[NEARMV][frame].as_int;
+            frame_mv[NEAR_NEARMV][frame].as_int =
+              frame_mv[NEARMV][frame].as_int;
+          } else if (ref == 1) {
+            frame_mv[NEAREST_NEARMV][frame].as_int =
+              frame_mv[NEARMV][frame].as_int;
+            frame_mv[NEAR_NEARESTMV][frame].as_int =
+              frame_mv[NEARESTMV][frame].as_int;
+            frame_mv[NEW_NEARESTMV][frame].as_int =
+              frame_mv[NEARESTMV][frame].as_int;
+            frame_mv[NEW_NEARMV][frame].as_int =
+              frame_mv[NEARMV][frame].as_int;
+            frame_mv[NEAR_NEARMV][frame].as_int =
+              frame_mv[NEARMV][frame].as_int;
+          }
+        }
+#endif  // CONFIG_EXT_INTER
       }
 
       // search for the best motion vector on this segment
+#if CONFIG_EXT_INTER
+      for (this_mode = (has_second_rf ? NEAREST_NEARESTMV : NEARESTMV);
+           this_mode <= (has_second_rf ? NEW_NEWMV : NEWFROMNEARMV);
+           ++this_mode) {
+#else
       for (this_mode = NEARESTMV; this_mode <= NEWMV; ++this_mode) {
+#endif  // CONFIG_EXT_INTER
         const struct buf_2d orig_src = x->plane[0].src;
         struct buf_2d orig_pre[2];
+        // This flag controls whether motion estimation kicks off: when it is
+        // non-zero, the encoder forces a motion search for this mode.
+        int run_mv_search = 0;
 
         mode_idx = INTER_OFFSET(this_mode);
+#if CONFIG_EXT_INTER
+        mv_idx = (this_mode == NEWFROMNEARMV) ? 1 : 0;
+
+        for (ref = 0; ref < 1 + has_second_rf; ++ref)
+          bsi->ref_mv[ref]->as_int = ref_mvs_sub8x8[mv_idx][ref].as_int;
+#endif  // CONFIG_EXT_INTER
         bsi->rdstat[i][mode_idx].brdcost = INT64_MAX;
         if (!(inter_mode_mask & (1 << this_mode)))
           continue;
 
-        if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv,
-                                this_mode, mbmi->ref_frame))
+#if CONFIG_REF_MV
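+        // Two searches pending, one per reference; each is cancelled when an
+        // earlier filter pass already produced the same mv and predictor, so
+        // that pass's result can be reused instead of searching again.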
+        run_mv_search = 2;
+#if !CONFIG_EXT_INTER
+        if (filter_idx > 0 && this_mode == NEWMV) {
+          BEST_SEG_INFO *ref_bsi = bsi_buf;
+          SEG_RDSTAT *ref_rdstat = &ref_bsi->rdstat[i][mode_idx];
+
+          if (has_second_rf) {
+            if (seg_mvs[i][mbmi->ref_frame[0]].as_int ==
+                    ref_rdstat->mvs[0].as_int &&
+                ref_rdstat->mvs[0].as_int != INVALID_MV)
+              if (bsi->ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int)
+                --run_mv_search;
+
+            if (seg_mvs[i][mbmi->ref_frame[1]].as_int ==
+                    ref_rdstat->mvs[1].as_int &&
+                ref_rdstat->mvs[1].as_int != INVALID_MV)
+              if (bsi->ref_mv[1]->as_int == ref_rdstat->pred_mv[1].as_int)
+                --run_mv_search;
+          } else {
+            if (bsi->ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int &&
+                ref_rdstat->mvs[0].as_int != INVALID_MV) {
+              run_mv_search = 0;
+              seg_mvs[i][mbmi->ref_frame[0]].as_int =
+                  ref_rdstat->mvs[0].as_int;
+            }
+          }
+
+          if (run_mv_search != 0 && filter_idx > 1) {
+            ref_bsi = bsi_buf + 1;
+            ref_rdstat = &ref_bsi->rdstat[i][mode_idx];
+            run_mv_search = 2;
+
+            if (has_second_rf) {
+              if (seg_mvs[i][mbmi->ref_frame[0]].as_int ==
+                      ref_rdstat->mvs[0].as_int &&
+                  ref_rdstat->mvs[0].as_int != INVALID_MV)
+                if (bsi->ref_mv[0]->as_int == ref_rdstat->pred_mv[0].as_int)
+                  --run_mv_search;
+
+              if (seg_mvs[i][mbmi->ref_frame[1]].as_int ==
+                      ref_rdstat->mvs[1].as_int &&
+                  ref_rdstat->mvs[1].as_int != INVALID_MV)
+                if (bsi->ref_mv[1]->as_int == ref_rdstat->pred_mv[1].as_int)
+                  --run_mv_search;
+            } else {
+              if (bsi->ref_mv[0]->as_int ==
+                      ref_rdstat->pred_mv[0].as_int &&
+                  ref_rdstat->mvs[0].as_int != INVALID_MV) {
+                run_mv_search = 0;
+                seg_mvs[i][mbmi->ref_frame[0]].as_int =
+                    ref_rdstat->mvs[0].as_int;
+              }
+            }
+          }
+        }
+#endif  // !CONFIG_EXT_INTER
+#endif  // CONFIG_REF_MV
+
+        if (!check_best_zero_mv(cpi, mbmi_ext->mode_context,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+                                mbmi_ext->compound_mode_context,
+#endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
+                                frame_mv,
+                                this_mode, mbmi->ref_frame, bsize, i))
           continue;
 
         memcpy(orig_pre, pd->pre, sizeof(orig_pre));
@@ -1853,9 +5183,17 @@
                sizeof(bsi->rdstat[i][mode_idx].tl));
 
         // motion search for newmv (single predictor case only)
-        if (!has_second_rf && this_mode == NEWMV &&
-            seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV) {
-          MV *const new_mv = &mode_mv[NEWMV][0].as_mv;
+        if (!has_second_rf &&
+#if CONFIG_EXT_INTER
+            have_newmv_in_inter_mode(this_mode) &&
+            (seg_mvs[i][mv_idx][mbmi->ref_frame[0]].as_int == INVALID_MV ||
+             vp10_use_mv_hp(&bsi->ref_mv[0]->as_mv) == 0)
+#else
+            this_mode == NEWMV &&
+            (seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV ||
+                run_mv_search)
+#endif  // CONFIG_EXT_INTER
+            ) {
           int step_param = 0;
           int bestsme = INT_MAX;
           int sadpb = x->sadperbit4;
@@ -1869,12 +5207,18 @@
             break;
 
           if (cpi->oxcf.mode != BEST) {
+#if CONFIG_EXT_INTER
+            bsi->mvp.as_int = bsi->ref_mv[0]->as_int;
+#else
             // use previous block's result as next block's MV predictor.
+#if !CONFIG_REF_MV
             if (i > 0) {
               bsi->mvp.as_int = mi->bmi[i - 1].as_mv[0].as_int;
               if (i == 2)
                 bsi->mvp.as_int = mi->bmi[i - 2].as_mv[0].as_int;
             }
+#endif
+#endif  // CONFIG_EXT_INTER
           }
           if (i == 0)
             max_mv = x->max_mv_context[mbmi->ref_frame[0]];
@@ -1892,8 +5236,13 @@
             step_param = cpi->mv_step_param;
           }
 
+#if CONFIG_REF_MV
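+          // Ref mvs are stored in 1/8-pel units; convert to full-pel for the
+          // integer-pel search.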
+          mvp_full.row = bsi->ref_mv[0]->as_mv.row >> 3;
+          mvp_full.col = bsi->ref_mv[0]->as_mv.col >> 3;
+#else
           mvp_full.row = bsi->mvp.as_mv.row >> 3;
           mvp_full.col = bsi->mvp.as_mv.col >> 3;
+#endif
 
           if (cpi->sf.adaptive_motion_search) {
             mvp_full.row = x->pred_mv[mbmi->ref_frame[0]].row >> 3;
@@ -1906,58 +5255,135 @@
 
           vp10_set_mv_search_range(x, &bsi->ref_mv[0]->as_mv);
 
+#if CONFIG_REF_MV
+          vp10_set_mvcost(x, mbmi->ref_frame[0]);
+#endif
           bestsme = vp10_full_pixel_search(
               cpi, x, bsize, &mvp_full, step_param, sadpb,
               cpi->sf.mv.subpel_search_method != SUBPEL_TREE ? cost_list : NULL,
-              &bsi->ref_mv[0]->as_mv, new_mv,
-              INT_MAX, 1);
+              &bsi->ref_mv[0]->as_mv, INT_MAX, 1);
 
           if (bestsme < INT_MAX) {
             int distortion;
-            cpi->find_fractional_mv_step(
-                x,
-                new_mv,
-                &bsi->ref_mv[0]->as_mv,
-                cm->allow_high_precision_mv,
-                x->errorperbit, &cpi->fn_ptr[bsize],
-                cpi->sf.mv.subpel_force_stop,
-                cpi->sf.mv.subpel_iters_per_step,
-                cond_cost_list(cpi, cost_list),
-                x->nmvjointcost, x->mvcost,
-                &distortion,
-                &x->pred_sse[mbmi->ref_frame[0]],
-                NULL, 0, 0);
+            if (cpi->sf.use_upsampled_references) {
+              const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
+              const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
+              // Use up-sampled reference frames.
+              struct macroblockd_plane *const pd = &xd->plane[0];
+              struct buf_2d backup_pred = pd->pre[0];
+              const YV12_BUFFER_CONFIG *upsampled_ref =
+                  get_upsampled_ref(cpi, mbmi->ref_frame[0]);
+
+              // Set pred for Y plane
+              setup_pred_plane(&pd->pre[0], upsampled_ref->y_buffer,
+                               upsampled_ref->y_stride,
+                               (mi_row << 3), (mi_col << 3),
+                               NULL, pd->subsampling_x, pd->subsampling_y);
+
+              // adjust pred pointer for this block
+              pd->pre[0].buf =
+                  &pd->pre[0].buf[(vp10_raster_block_offset(BLOCK_8X8, i,
+                  pd->pre[0].stride)) << 3];
+
+              cpi->find_fractional_mv_step(
+                  x, &bsi->ref_mv[0]->as_mv,
+                  cm->allow_high_precision_mv,
+                  x->errorperbit, &cpi->fn_ptr[bsize],
+                  cpi->sf.mv.subpel_force_stop,
+                  cpi->sf.mv.subpel_iters_per_step,
+                  cond_cost_list(cpi, cost_list),
+                  x->nmvjointcost, x->mvcost,
+                  &distortion,
+                  &x->pred_sse[mbmi->ref_frame[0]],
+                  NULL, pw, ph, 1);
+
+              // Restore the reference frames.
+              pd->pre[0] = backup_pred;
+            } else {
+              cpi->find_fractional_mv_step(
+                  x, &bsi->ref_mv[0]->as_mv,
+                  cm->allow_high_precision_mv,
+                  x->errorperbit, &cpi->fn_ptr[bsize],
+                  cpi->sf.mv.subpel_force_stop,
+                  cpi->sf.mv.subpel_iters_per_step,
+                  cond_cost_list(cpi, cost_list),
+                  x->nmvjointcost, x->mvcost,
+                  &distortion,
+                  &x->pred_sse[mbmi->ref_frame[0]],
+                  NULL, 0, 0, 0);
+            }
 
             // save motion search result for use in compound prediction
-            seg_mvs[i][mbmi->ref_frame[0]].as_mv = *new_mv;
+#if CONFIG_EXT_INTER
+            seg_mvs[i][mv_idx][mbmi->ref_frame[0]].as_mv = x->best_mv.as_mv;
+#else
+            seg_mvs[i][mbmi->ref_frame[0]].as_mv = x->best_mv.as_mv;
+#endif  // CONFIG_EXT_INTER
           }
 
           if (cpi->sf.adaptive_motion_search)
-            x->pred_mv[mbmi->ref_frame[0]] = *new_mv;
+            x->pred_mv[mbmi->ref_frame[0]] = x->best_mv.as_mv;
+
+#if CONFIG_EXT_INTER
+          mode_mv[this_mode][0] = x->best_mv;
+#else
+          mode_mv[NEWMV][0] = x->best_mv;
+#endif  // CONFIG_EXT_INTER
 
           // restore src pointers
           mi_buf_restore(x, orig_src, orig_pre);
         }
 
         if (has_second_rf) {
+#if CONFIG_EXT_INTER
+          if (seg_mvs[i][mv_idx][mbmi->ref_frame[1]].as_int == INVALID_MV ||
+              seg_mvs[i][mv_idx][mbmi->ref_frame[0]].as_int == INVALID_MV)
+#else
           if (seg_mvs[i][mbmi->ref_frame[1]].as_int == INVALID_MV ||
               seg_mvs[i][mbmi->ref_frame[0]].as_int == INVALID_MV)
+#endif  // CONFIG_EXT_INTER
             continue;
         }
 
-        if (has_second_rf && this_mode == NEWMV &&
-            mbmi->interp_filter == EIGHTTAP) {
+#if CONFIG_DUAL_FILTER
+        (void)run_mv_search;
+#endif
+
+        if (has_second_rf &&
+#if CONFIG_EXT_INTER
+            this_mode == NEW_NEWMV &&
+#else
+            this_mode == NEWMV &&
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_DUAL_FILTER
+            (mbmi->interp_filter[0] == EIGHTTAP_REGULAR || run_mv_search)) {
+#else
+            (mbmi->interp_filter == EIGHTTAP_REGULAR || run_mv_search)) {
+#endif
           // adjust src pointers
           mi_buf_shift(x, i);
           if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
             int rate_mv;
             joint_motion_search(cpi, x, bsize, frame_mv[this_mode],
-                                mi_row, mi_col, seg_mvs[i],
-                                &rate_mv);
+                                mi_row, mi_col,
+#if CONFIG_EXT_INTER
+                                bsi->ref_mv,
+                                seg_mvs[i][mv_idx],
+#else
+                                seg_mvs[i],
+#endif  // CONFIG_EXT_INTER
+                                &rate_mv, i);
+#if CONFIG_EXT_INTER
+            compound_seg_newmvs[i][0].as_int =
+                frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
+            compound_seg_newmvs[i][1].as_int =
+                frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
+#else
             seg_mvs[i][mbmi->ref_frame[0]].as_int =
                 frame_mv[this_mode][mbmi->ref_frame[0]].as_int;
             seg_mvs[i][mbmi->ref_frame[1]].as_int =
                 frame_mv[this_mode][mbmi->ref_frame[1]].as_int;
+#endif  // CONFIG_EXT_INTER
           }
           // restore src pointers
           mi_buf_restore(x, orig_src, orig_pre);
@@ -1965,7 +5391,14 @@
 
         bsi->rdstat[i][mode_idx].brate =
             set_and_cost_bmi_mvs(cpi, x, xd, i, this_mode, mode_mv[this_mode],
-                                 frame_mv, seg_mvs[i], bsi->ref_mv,
+                                 frame_mv,
+#if CONFIG_EXT_INTER
+                                 seg_mvs[i][mv_idx],
+                                 compound_seg_newmvs[i],
+#else
+                                 seg_mvs[i],
+#endif  // CONFIG_EXT_INTER
+                                 bsi->ref_mv,
                                  x->nmvjointcost, x->mvcost);
 
         for (ref = 0; ref < 1 + has_second_rf; ++ref) {
@@ -1977,6 +5410,26 @@
           if (num_4x4_blocks_high > 1)
             bsi->rdstat[i + 2][mode_idx].mvs[ref].as_int =
                 mode_mv[this_mode][ref].as_int;
+#if CONFIG_REF_MV
+          bsi->rdstat[i][mode_idx].pred_mv[ref].as_int =
+              mi->bmi[i].pred_mv_s8[ref].as_int;
+          if (num_4x4_blocks_wide > 1)
+            bsi->rdstat[i + 1][mode_idx].pred_mv[ref].as_int =
+                mi->bmi[i].pred_mv_s8[ref].as_int;
+          if (num_4x4_blocks_high > 1)
+            bsi->rdstat[i + 2][mode_idx].pred_mv[ref].as_int =
+                mi->bmi[i].pred_mv_s8[ref].as_int;
+#endif
+#if CONFIG_EXT_INTER
+          bsi->rdstat[i][mode_idx].ref_mv[ref].as_int =
+            bsi->ref_mv[ref]->as_int;
+          if (num_4x4_blocks_wide > 1)
+            bsi->rdstat[i + 1][mode_idx].ref_mv[ref].as_int =
+              bsi->ref_mv[ref]->as_int;
+          if (num_4x4_blocks_high > 1)
+            bsi->rdstat[i + 2][mode_idx].ref_mv[ref].as_int =
+              bsi->ref_mv[ref]->as_int;
+#endif  // CONFIG_EXT_INTER
         }
 
         // Trap vectors that reach beyond the UMV borders
@@ -1992,22 +5445,68 @@
 
           for (ref = 0; ref < 1 + has_second_rf; ++ref) {
             subpelmv |= mv_has_subpel(&mode_mv[this_mode][ref].as_mv);
+#if CONFIG_EXT_INTER
+            if (have_newmv_in_inter_mode(this_mode))
+              have_ref &= (
+                  (mode_mv[this_mode][ref].as_int ==
+                   ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int) &&
+                  (bsi->ref_mv[ref]->as_int ==
+                   ref_bsi->rdstat[i][mode_idx].ref_mv[ref].as_int));
+            else
+#endif  // CONFIG_EXT_INTER
             have_ref &= mode_mv[this_mode][ref].as_int ==
                 ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
           }
 
+          have_ref &= ref_bsi->rdstat[i][mode_idx].brate > 0;
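+          // (brate == 0 means the reference pass never evaluated this mode,
+          // so its stats cannot be reused.)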
+
           if (filter_idx > 1 && !subpelmv && !have_ref) {
             ref_bsi = bsi_buf + 1;
             have_ref = 1;
             for (ref = 0; ref < 1 + has_second_rf; ++ref)
+#if CONFIG_EXT_INTER
+              if (have_newmv_in_inter_mode(this_mode))
+                have_ref &= (
+                    (mode_mv[this_mode][ref].as_int ==
+                     ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int) &&
+                    (bsi->ref_mv[ref]->as_int ==
+                     ref_bsi->rdstat[i][mode_idx].ref_mv[ref].as_int));
+              else
+#endif  // CONFIG_EXT_INTER
               have_ref &= mode_mv[this_mode][ref].as_int ==
                   ref_bsi->rdstat[i][mode_idx].mvs[ref].as_int;
+
+            have_ref &= ref_bsi->rdstat[i][mode_idx].brate > 0;
           }
 
           if (!subpelmv && have_ref &&
               ref_bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
+#if CONFIG_REF_MV
+            bsi->rdstat[i][mode_idx].byrate =
+                ref_bsi->rdstat[i][mode_idx].byrate;
+            bsi->rdstat[i][mode_idx].bdist =
+                ref_bsi->rdstat[i][mode_idx].bdist;
+            bsi->rdstat[i][mode_idx].bsse =
+                ref_bsi->rdstat[i][mode_idx].bsse;
+            bsi->rdstat[i][mode_idx].brate +=
+                ref_bsi->rdstat[i][mode_idx].byrate;
+            bsi->rdstat[i][mode_idx].eobs =
+                ref_bsi->rdstat[i][mode_idx].eobs;
+
+            bsi->rdstat[i][mode_idx].brdcost =
+                RDCOST(x->rdmult, x->rddiv, bsi->rdstat[i][mode_idx].brate,
+                       bsi->rdstat[i][mode_idx].bdist);
+
+            memcpy(bsi->rdstat[i][mode_idx].ta,
+                   ref_bsi->rdstat[i][mode_idx].ta,
+                   sizeof(bsi->rdstat[i][mode_idx].ta));
+            memcpy(bsi->rdstat[i][mode_idx].tl,
+                   ref_bsi->rdstat[i][mode_idx].tl,
+                   sizeof(bsi->rdstat[i][mode_idx].tl));
+#else
             memcpy(&bsi->rdstat[i][mode_idx], &ref_bsi->rdstat[i][mode_idx],
                    sizeof(SEG_RDSTAT));
+#endif
             if (num_4x4_blocks_wide > 1)
               bsi->rdstat[i + 1][mode_idx].eobs =
                   ref_bsi->rdstat[i + 1][mode_idx].eobs;
@@ -2016,6 +5515,24 @@
                   ref_bsi->rdstat[i + 2][mode_idx].eobs;
 
             if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
+#if CONFIG_REF_MV
+              // If the NEWMV mode is using the same motion vector as the
+              // NEARESTMV mode, skip the remaining rate-distortion
+              // calculations and use the inferred motion vector modes.
+              if (this_mode == NEWMV) {
+                if (has_second_rf) {
+                  if (bsi->rdstat[i][mode_idx].mvs[0].as_int ==
+                          bsi->ref_mv[0]->as_int &&
+                      bsi->rdstat[i][mode_idx].mvs[1].as_int ==
+                          bsi->ref_mv[1]->as_int)
+                    continue;
+                } else {
+                  if (bsi->rdstat[i][mode_idx].mvs[0].as_int ==
+                      bsi->ref_mv[0]->as_int)
+                    continue;
+                }
+              }
+#endif
               mode_selected = this_mode;
               best_rd = bsi->rdstat[i][mode_idx].brdcost;
             }
@@ -2033,6 +5550,7 @@
                                     bsi->rdstat[i][mode_idx].tl,
                                     idy, idx,
                                     mi_row, mi_col);
+
         if (bsi->rdstat[i][mode_idx].brdcost < INT64_MAX) {
           bsi->rdstat[i][mode_idx].brdcost += RDCOST(x->rdmult, x->rddiv,
                                             bsi->rdstat[i][mode_idx].brate, 0);
@@ -2045,6 +5563,24 @@
         }
 
         if (bsi->rdstat[i][mode_idx].brdcost < best_rd) {
+#if CONFIG_REF_MV
+          // If the NEWMV mode is using the same motion vector as the
+          // NEARESTMV mode, skip the remaining rate-distortion calculations
+          // and use the inferred motion vector modes.
+          if (this_mode == NEWMV) {
+            if (has_second_rf) {
+              if (bsi->rdstat[i][mode_idx].mvs[0].as_int ==
+                      bsi->ref_mv[0]->as_int &&
+                  bsi->rdstat[i][mode_idx].mvs[1].as_int ==
+                      bsi->ref_mv[1]->as_int)
+                continue;
+            } else {
+              if (bsi->rdstat[i][mode_idx].mvs[0].as_int ==
+                  bsi->ref_mv[0]->as_int)
+                continue;
+            }
+          }
+#endif
           mode_selected = this_mode;
           best_rd = bsi->rdstat[i][mode_idx].brdcost;
         }
@@ -2053,7 +5589,11 @@
       if (best_rd == INT64_MAX) {
         int iy, midx;
         for (iy = i + 1; iy < 4; ++iy)
+#if CONFIG_EXT_INTER
+          for (midx = 0; midx < INTER_MODES + INTER_COMPOUND_MODES; ++midx)
+#else
           for (midx = 0; midx < INTER_MODES; ++midx)
+#endif  // CONFIG_EXT_INTER
             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
         bsi->segment_rd = INT64_MAX;
         return INT64_MAX;
@@ -2063,9 +5603,21 @@
       memcpy(t_above, bsi->rdstat[i][mode_idx].ta, sizeof(t_above));
       memcpy(t_left, bsi->rdstat[i][mode_idx].tl, sizeof(t_left));
 
+#if CONFIG_EXT_INTER
+      mv_idx = (mode_selected == NEWFROMNEARMV) ? 1 : 0;
+      bsi->ref_mv[0]->as_int = bsi->rdstat[i][mode_idx].ref_mv[0].as_int;
+      if (has_second_rf)
+        bsi->ref_mv[1]->as_int = bsi->rdstat[i][mode_idx].ref_mv[1].as_int;
+#endif  // CONFIG_EXT_INTER
       set_and_cost_bmi_mvs(cpi, x, xd, i, mode_selected, mode_mv[mode_selected],
-                           frame_mv, seg_mvs[i], bsi->ref_mv, x->nmvjointcost,
-                           x->mvcost);
+                           frame_mv,
+#if CONFIG_EXT_INTER
+                           seg_mvs[i][mv_idx],
+                           compound_seg_newmvs[i],
+#else
+                           seg_mvs[i],
+#endif  // CONFIG_EXT_INTER
+                           bsi->ref_mv, x->nmvjointcost, x->mvcost);
 
       br += bsi->rdstat[i][mode_idx].brate;
       bd += bsi->rdstat[i][mode_idx].bdist;
@@ -2076,7 +5628,11 @@
       if (this_segment_rd > bsi->segment_rd) {
         int iy, midx;
         for (iy = i + 1; iy < 4; ++iy)
+#if CONFIG_EXT_INTER
+          for (midx = 0; midx < INTER_MODES + INTER_COMPOUND_MODES; ++midx)
+#else
           for (midx = 0; midx < INTER_MODES; ++midx)
+#endif  // CONFIG_EXT_INTER
             bsi->rdstat[iy][midx].brdcost = INT64_MAX;
         bsi->segment_rd = INT64_MAX;
         return INT64_MAX;
@@ -2097,13 +5653,24 @@
   if (bsi->segment_rd > best_rd)
     return INT64_MAX;
   /* set it to the best */
-  for (i = 0; i < 4; i++) {
-    mode_idx = INTER_OFFSET(bsi->modes[i]);
-    mi->bmi[i].as_mv[0].as_int = bsi->rdstat[i][mode_idx].mvs[0].as_int;
+  for (idx = 0; idx < 4; idx++) {
+    mode_idx = INTER_OFFSET(bsi->modes[idx]);
+    mi->bmi[idx].as_mv[0].as_int = bsi->rdstat[idx][mode_idx].mvs[0].as_int;
     if (has_second_ref(mbmi))
-      mi->bmi[i].as_mv[1].as_int = bsi->rdstat[i][mode_idx].mvs[1].as_int;
-    x->plane[0].eobs[i] = bsi->rdstat[i][mode_idx].eobs;
-    mi->bmi[i].as_mode = bsi->modes[i];
+      mi->bmi[idx].as_mv[1].as_int = bsi->rdstat[idx][mode_idx].mvs[1].as_int;
+#if CONFIG_REF_MV
+    mi->bmi[idx].pred_mv_s8[0] = bsi->rdstat[idx][mode_idx].pred_mv[0];
+    if (has_second_ref(mbmi))
+      mi->bmi[idx].pred_mv_s8[1] = bsi->rdstat[idx][mode_idx].pred_mv[1];
+#endif
+#if CONFIG_EXT_INTER
+    mi->bmi[idx].ref_mv[0].as_int = bsi->rdstat[idx][mode_idx].ref_mv[0].as_int;
+    if (has_second_rf)
+      mi->bmi[idx].ref_mv[1].as_int =
+          bsi->rdstat[idx][mode_idx].ref_mv[1].as_int;
+#endif  // CONFIG_EXT_INTER
+    x->plane[0].eobs[idx] = bsi->rdstat[idx][mode_idx].eobs;
+    mi->bmi[idx].as_mode = bsi->modes[idx];
   }
 
   /*
@@ -2147,34 +5714,112 @@
     if (cm->reference_mode != COMPOUND_REFERENCE) {
       vpx_prob ref_single_p1 = vp10_get_pred_prob_single_ref_p1(cm, xd);
       vpx_prob ref_single_p2 = vp10_get_pred_prob_single_ref_p2(cm, xd);
+#if CONFIG_EXT_REFS
+      vpx_prob ref_single_p3 = vp10_get_pred_prob_single_ref_p3(cm, xd);
+      vpx_prob ref_single_p4 = vp10_get_pred_prob_single_ref_p4(cm, xd);
+      vpx_prob ref_single_p5 = vp10_get_pred_prob_single_ref_p5(cm, xd);
+#endif  // CONFIG_EXT_REFS
+
       unsigned int base_cost = vp10_cost_bit(intra_inter_p, 1);
 
-      if (cm->reference_mode == REFERENCE_MODE_SELECT)
-        base_cost += vp10_cost_bit(comp_inter_p, 0);
-
-      ref_costs_single[LAST_FRAME] = ref_costs_single[GOLDEN_FRAME] =
+      ref_costs_single[LAST_FRAME] =
+#if CONFIG_EXT_REFS
+          ref_costs_single[LAST2_FRAME] =
+          ref_costs_single[LAST3_FRAME] =
+          ref_costs_single[BWDREF_FRAME] =
+#endif  // CONFIG_EXT_REFS
+          ref_costs_single[GOLDEN_FRAME] =
           ref_costs_single[ALTREF_FRAME] = base_cost;
+
+#if CONFIG_EXT_REFS
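+      // With extended refs the single-reference choice is coded as a bit
+      // tree: p1 separates {LAST, LAST2, LAST3, GOLDEN} from
+      // {BWDREF, ALTREF}; p3 then splits off {LAST3, GOLDEN}, p2 picks
+      // BWDREF vs ALTREF, and p4/p5 resolve the remaining pairs.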
+      ref_costs_single[LAST_FRAME]   += vp10_cost_bit(ref_single_p1, 0);
+      ref_costs_single[LAST2_FRAME]  += vp10_cost_bit(ref_single_p1, 0);
+      ref_costs_single[LAST3_FRAME]  += vp10_cost_bit(ref_single_p1, 0);
+      ref_costs_single[GOLDEN_FRAME] += vp10_cost_bit(ref_single_p1, 0);
+      ref_costs_single[BWDREF_FRAME] += vp10_cost_bit(ref_single_p1, 1);
+      ref_costs_single[ALTREF_FRAME] += vp10_cost_bit(ref_single_p1, 1);
+
+      ref_costs_single[LAST_FRAME]   += vp10_cost_bit(ref_single_p3, 0);
+      ref_costs_single[LAST2_FRAME]  += vp10_cost_bit(ref_single_p3, 0);
+      ref_costs_single[LAST3_FRAME]  += vp10_cost_bit(ref_single_p3, 1);
+      ref_costs_single[GOLDEN_FRAME] += vp10_cost_bit(ref_single_p3, 1);
+
+      ref_costs_single[BWDREF_FRAME] += vp10_cost_bit(ref_single_p2, 0);
+      ref_costs_single[ALTREF_FRAME] += vp10_cost_bit(ref_single_p2, 1);
+
+      ref_costs_single[LAST_FRAME]   += vp10_cost_bit(ref_single_p4, 0);
+      ref_costs_single[LAST2_FRAME]  += vp10_cost_bit(ref_single_p4, 1);
+
+      ref_costs_single[LAST3_FRAME]  += vp10_cost_bit(ref_single_p5, 0);
+      ref_costs_single[GOLDEN_FRAME] += vp10_cost_bit(ref_single_p5, 1);
+#else
       ref_costs_single[LAST_FRAME]   += vp10_cost_bit(ref_single_p1, 0);
       ref_costs_single[GOLDEN_FRAME] += vp10_cost_bit(ref_single_p1, 1);
       ref_costs_single[ALTREF_FRAME] += vp10_cost_bit(ref_single_p1, 1);
+
       ref_costs_single[GOLDEN_FRAME] += vp10_cost_bit(ref_single_p2, 0);
       ref_costs_single[ALTREF_FRAME] += vp10_cost_bit(ref_single_p2, 1);
+#endif  // CONFIG_EXT_REFS
     } else {
       ref_costs_single[LAST_FRAME]   = 512;
+#if CONFIG_EXT_REFS
+      ref_costs_single[LAST2_FRAME]  = 512;
+      ref_costs_single[LAST3_FRAME]  = 512;
+      ref_costs_single[BWDREF_FRAME] = 512;
+#endif  // CONFIG_EXT_REFS
       ref_costs_single[GOLDEN_FRAME] = 512;
       ref_costs_single[ALTREF_FRAME] = 512;
     }
+
     if (cm->reference_mode != SINGLE_REFERENCE) {
       vpx_prob ref_comp_p = vp10_get_pred_prob_comp_ref_p(cm, xd);
+#if CONFIG_EXT_REFS
+      vpx_prob ref_comp_p1 = vp10_get_pred_prob_comp_ref_p1(cm, xd);
+      vpx_prob ref_comp_p2 = vp10_get_pred_prob_comp_ref_p2(cm, xd);
+      vpx_prob bwdref_comp_p = vp10_get_pred_prob_comp_bwdref_p(cm, xd);
+#endif  // CONFIG_EXT_REFS
+
       unsigned int base_cost = vp10_cost_bit(intra_inter_p, 1);
 
-      if (cm->reference_mode == REFERENCE_MODE_SELECT)
-        base_cost += vp10_cost_bit(comp_inter_p, 1);
+      ref_costs_comp[LAST_FRAME] =
+#if CONFIG_EXT_REFS
+          ref_costs_comp[LAST2_FRAME] =
+          ref_costs_comp[LAST3_FRAME] =
+#endif  // CONFIG_EXT_REFS
+          ref_costs_comp[GOLDEN_FRAME] = base_cost;
 
-      ref_costs_comp[LAST_FRAME]   = base_cost + vp10_cost_bit(ref_comp_p, 0);
-      ref_costs_comp[GOLDEN_FRAME] = base_cost + vp10_cost_bit(ref_comp_p, 1);
+#if CONFIG_EXT_REFS
+      ref_costs_comp[BWDREF_FRAME] = ref_costs_comp[ALTREF_FRAME] = 0;
+#endif  // CONFIG_EXT_REFS
+
+#if CONFIG_EXT_REFS
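+      // Forward compound reference coded as a tree: ref_comp_p separates
+      // {LAST, LAST2} from {LAST3, GOLDEN}; p1 and p2 resolve each pair, and
+      // bwdref_comp_p selects BWDREF vs ALTREF for the backward side.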
+      ref_costs_comp[LAST_FRAME]   += vp10_cost_bit(ref_comp_p, 0);
+      ref_costs_comp[LAST2_FRAME]  += vp10_cost_bit(ref_comp_p, 0);
+      ref_costs_comp[LAST3_FRAME]  += vp10_cost_bit(ref_comp_p, 1);
+      ref_costs_comp[GOLDEN_FRAME] += vp10_cost_bit(ref_comp_p, 1);
+
+      ref_costs_comp[LAST_FRAME]   += vp10_cost_bit(ref_comp_p1, 1);
+      ref_costs_comp[LAST2_FRAME]  += vp10_cost_bit(ref_comp_p1, 0);
+
+      ref_costs_comp[LAST3_FRAME]  += vp10_cost_bit(ref_comp_p2, 0);
+      ref_costs_comp[GOLDEN_FRAME] += vp10_cost_bit(ref_comp_p2, 1);
+
+      // NOTE(zoeliu): BWDREF and ALTREF each add an extra cost by coding 1
+      //               more bit.
+      ref_costs_comp[BWDREF_FRAME] += vp10_cost_bit(bwdref_comp_p, 0);
+      ref_costs_comp[ALTREF_FRAME] += vp10_cost_bit(bwdref_comp_p, 1);
+#else
+      ref_costs_comp[LAST_FRAME]   += vp10_cost_bit(ref_comp_p, 0);
+      ref_costs_comp[GOLDEN_FRAME] += vp10_cost_bit(ref_comp_p, 1);
+#endif  // CONFIG_EXT_REFS
     } else {
       ref_costs_comp[LAST_FRAME]   = 512;
+#if CONFIG_EXT_REFS
+      ref_costs_comp[LAST2_FRAME]  = 512;
+      ref_costs_comp[LAST3_FRAME]  = 512;
+      ref_costs_comp[BWDREF_FRAME] = 512;
+      ref_costs_comp[ALTREF_FRAME] = 512;
+#endif  // CONFIG_EXT_REFS
       ref_costs_comp[GOLDEN_FRAME] = 512;
     }
   }
@@ -2183,7 +5828,6 @@
 static void store_coding_context(MACROBLOCK *x, PICK_MODE_CONTEXT *ctx,
                          int mode_index,
                          int64_t comp_pred_diff[REFERENCE_MODES],
-                         int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS],
                          int skippable) {
   MACROBLOCKD *const xd = &x->e_mbd;
 
@@ -2197,18 +5841,16 @@
   ctx->single_pred_diff = (int)comp_pred_diff[SINGLE_REFERENCE];
   ctx->comp_pred_diff   = (int)comp_pred_diff[COMPOUND_REFERENCE];
   ctx->hybrid_pred_diff = (int)comp_pred_diff[REFERENCE_MODE_SELECT];
-
-  memcpy(ctx->best_filter_diff, best_filter_diff,
-         sizeof(*best_filter_diff) * SWITCHABLE_FILTER_CONTEXTS);
 }
 
-static void setup_buffer_inter(VP10_COMP *cpi, MACROBLOCK *x,
-                               MV_REFERENCE_FRAME ref_frame,
-                               BLOCK_SIZE block_size,
-                               int mi_row, int mi_col,
-                               int_mv frame_nearest_mv[MAX_REF_FRAMES],
-                               int_mv frame_near_mv[MAX_REF_FRAMES],
-                               struct buf_2d yv12_mb[4][MAX_MB_PLANE]) {
+static void setup_buffer_inter(
+    VP10_COMP *cpi, MACROBLOCK *x,
+    MV_REFERENCE_FRAME ref_frame,
+    BLOCK_SIZE block_size,
+    int mi_row, int mi_col,
+    int_mv frame_nearest_mv[MAX_REF_FRAMES],
+    int_mv frame_near_mv[MAX_REF_FRAMES],
+    struct buf_2d yv12_mb[MAX_REF_FRAMES][MAX_MB_PLANE]) {
   const VP10_COMMON *cm = &cpi->common;
   const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
   MACROBLOCKD *const xd = &x->e_mbd;
@@ -2224,8 +5866,16 @@
   vp10_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
 
   // Gets an initial list of candidate vectors from neighbours and orders them
-  vp10_find_mv_refs(cm, xd, mi, ref_frame, candidates, mi_row, mi_col,
-                   NULL, NULL, mbmi_ext->mode_context);
+  vp10_find_mv_refs(cm, xd, mi, ref_frame,
+#if CONFIG_REF_MV
+                    &mbmi_ext->ref_mv_count[ref_frame],
+                    mbmi_ext->ref_mv_stack[ref_frame],
+#if CONFIG_EXT_INTER
+                    mbmi_ext->compound_mode_context,
+#endif  // CONFIG_EXT_INTER
+#endif
+                    candidates, mi_row, mi_col,
+                    NULL, NULL, mbmi_ext->mode_context);
 
   // Candidate refinement carried out at encoder and decoder
   vp10_find_best_ref_mvs(cm->allow_high_precision_mv, candidates,
@@ -2243,7 +5893,11 @@
 static void single_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE bsize,
                                  int mi_row, int mi_col,
-                                 int_mv *tmp_mv, int *rate_mv) {
+#if CONFIG_EXT_INTER
+                                 int ref_idx,
+                                 int mv_idx,
+#endif  // CONFIG_EXT_INTER
+                                 int *rate_mv) {
   MACROBLOCKD *xd = &x->e_mbd;
   const VP10_COMMON *cm = &cpi->common;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
@@ -2252,8 +5906,14 @@
   int step_param;
   int sadpb = x->sadperbit16;
   MV mvp_full;
+#if CONFIG_EXT_INTER
+  int ref = mbmi->ref_frame[ref_idx];
+  MV ref_mv = x->mbmi_ext->ref_mvs[ref][mv_idx].as_mv;
+#else
   int ref = mbmi->ref_frame[0];
   MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+  int ref_idx = 0;
+#endif  // CONFIG_EXT_INTER
 
   int tmp_col_min = x->mv_col_min;
   int tmp_col_max = x->mv_col_max;
@@ -2269,15 +5929,19 @@
   pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv;
   pred_mv[2] = x->pred_mv[ref];
 
+#if CONFIG_REF_MV
+  vp10_set_mvcost(x, ref);
+#endif
+
   if (scaled_ref_frame) {
     int i;
     // Swap out the reference frame for a version that's been scaled to
     // match the resolution of the current frame, allowing the existing
     // motion search code to be used without additional modifications.
     for (i = 0; i < MAX_MB_PLANE; i++)
-      backup_yv12[i] = xd->plane[i].pre[0];
+      backup_yv12[i] = xd->plane[i].pre[ref_idx];
 
-    vp10_setup_pre_planes(xd, 0, scaled_ref_frame, mi_row, mi_col, NULL);
+    vp10_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
   }
 
   vp10_set_mv_search_range(x, &ref_mv);
@@ -2294,10 +5958,184 @@
     step_param = cpi->mv_step_param;
   }
 
-  if (cpi->sf.adaptive_motion_search && bsize < BLOCK_64X64) {
-    int boffset =
-        2 * (b_width_log2_lookup[BLOCK_64X64] -
-             VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
+  if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size) {
+    int boffset = 2 * (b_width_log2_lookup[cm->sb_size] -
+        VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
+    step_param = VPXMAX(step_param, boffset);
+  }
+
+  if (cpi->sf.adaptive_motion_search) {
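+    // Normalize the predictor SAD by the block size; a small residual means
+    // the mv predictor is reliable, so the initial search step can shrink.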
+    int bwl = b_width_log2_lookup[bsize];
+    int bhl = b_height_log2_lookup[bsize];
+    int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
+
+    if (tlevel < 5)
+      step_param += 2;
+
+    // prev_mv_sad is not set up for dynamically scaled frames.
+    if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
+      int i;
+      for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
+        if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+          x->pred_mv[ref].row = 0;
+          x->pred_mv[ref].col = 0;
+          x->best_mv.as_int = INVALID_MV;
+
+          if (scaled_ref_frame) {
+            int i;
+            for (i = 0; i < MAX_MB_PLANE; ++i)
+              xd->plane[i].pre[ref_idx] = backup_yv12[i];
+          }
+          return;
+        }
+      }
+    }
+  }
+
+  mvp_full = pred_mv[x->mv_best_ref_index[ref]];
+
+  mvp_full.col >>= 3;
+  mvp_full.row >>= 3;
+
+  bestsme = vp10_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
+                                   cond_cost_list(cpi, cost_list),
+                                   &ref_mv, INT_MAX, 1);
+
+  x->mv_col_min = tmp_col_min;
+  x->mv_col_max = tmp_col_max;
+  x->mv_row_min = tmp_row_min;
+  x->mv_row_max = tmp_row_max;
+
+  if (bestsme < INT_MAX) {
+    int dis;  /* TODO: use dis in distortion calculation later. */
+    if (cpi->sf.use_upsampled_references) {
+      const int pw = 4 * num_4x4_blocks_wide_lookup[bsize];
+      const int ph = 4 * num_4x4_blocks_high_lookup[bsize];
+      // Use up-sampled reference frames.
+      struct macroblockd_plane *const pd = &xd->plane[0];
+      struct buf_2d backup_pred = pd->pre[ref_idx];
+      const YV12_BUFFER_CONFIG *upsampled_ref = get_upsampled_ref(cpi, ref);
+
+      // Set pred for Y plane
+      setup_pred_plane(&pd->pre[ref_idx], upsampled_ref->y_buffer,
+                       upsampled_ref->y_stride, (mi_row << 3), (mi_col << 3),
+                       NULL, pd->subsampling_x, pd->subsampling_y);
+
+      bestsme = cpi->find_fractional_mv_step(x, &ref_mv,
+                                             cm->allow_high_precision_mv,
+                                             x->errorperbit,
+                                             &cpi->fn_ptr[bsize],
+                                             cpi->sf.mv.subpel_force_stop,
+                                             cpi->sf.mv.subpel_iters_per_step,
+                                             cond_cost_list(cpi, cost_list),
+                                             x->nmvjointcost, x->mvcost,
+                                             &dis, &x->pred_sse[ref], NULL,
+                                             pw, ph, 1);
+
+      // Restore the reference frames.
+      pd->pre[ref_idx] = backup_pred;
+    } else {
+      cpi->find_fractional_mv_step(x, &ref_mv,
+                                   cm->allow_high_precision_mv,
+                                   x->errorperbit,
+                                   &cpi->fn_ptr[bsize],
+                                   cpi->sf.mv.subpel_force_stop,
+                                   cpi->sf.mv.subpel_iters_per_step,
+                                   cond_cost_list(cpi, cost_list),
+                                   x->nmvjointcost, x->mvcost,
+                                   &dis, &x->pred_sse[ref], NULL, 0, 0, 0);
+    }
+  }
+  *rate_mv = vp10_mv_bit_cost(&x->best_mv.as_mv, &ref_mv,
+                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+
+  if (cpi->sf.adaptive_motion_search)
+    x->pred_mv[ref] = x->best_mv.as_mv;
+
+  if (scaled_ref_frame) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[ref_idx] = backup_yv12[i];
+  }
+}
+
+static INLINE void restore_dst_buf(MACROBLOCKD *xd,
+                                   uint8_t *orig_dst[MAX_MB_PLANE],
+                                   int orig_dst_stride[MAX_MB_PLANE]) {
+  int i;
+  for (i = 0; i < MAX_MB_PLANE; i++) {
+    xd->plane[i].dst.buf = orig_dst[i];
+    xd->plane[i].dst.stride = orig_dst_stride[i];
+  }
+}
+
+#if CONFIG_OBMC
+static void single_motion_search_obmc(VP10_COMP *cpi, MACROBLOCK *x,
+                                      BLOCK_SIZE bsize, int mi_row, int mi_col,
+                                      const int32_t* wsrc, const int32_t* mask,
+#if CONFIG_EXT_INTER
+                                      int ref_idx,
+                                      int mv_idx,
+#endif  // CONFIG_EXT_INTER
+                                      int_mv *tmp_mv, int_mv pred_mv,
+                                      int *rate_mv) {
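+  // Motion search against the overlap-weighted source: 'wsrc' holds the
+  // OBMC-weighted source samples and 'mask' the per-pixel overlap weights.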
+  MACROBLOCKD *xd = &x->e_mbd;
+  const VP10_COMMON *cm = &cpi->common;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
+  int bestsme = INT_MAX;
+  int step_param;
+  int sadpb = x->sadperbit16;
+  MV mvp_full;
+#if CONFIG_EXT_INTER
+  int ref = mbmi->ref_frame[ref_idx];
+  MV ref_mv = x->mbmi_ext->ref_mvs[ref][mv_idx].as_mv;
+#else
+  int ref = mbmi->ref_frame[0];
+  MV ref_mv = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+  int ref_idx = 0;
+#endif  // CONFIG_EXT_INTER
+
+  int tmp_col_min = x->mv_col_min;
+  int tmp_col_max = x->mv_col_max;
+  int tmp_row_min = x->mv_row_min;
+  int tmp_row_max = x->mv_row_max;
+
+  const YV12_BUFFER_CONFIG *scaled_ref_frame = vp10_get_scaled_ref_frame(cpi,
+                                                                         ref);
+
+#if CONFIG_REF_MV
+  vp10_set_mvcost(x, ref);
+#endif
+
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[ref_idx];
+
+    vp10_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+
+  vp10_set_mv_search_range(x, &ref_mv);
+
+  // Work out the size of the first step in the mv step search.
+  // 0 here is maximum length first step. 1 is MAX >> 1 etc.
+  if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+    // Take a weighted average of the step_params based on the last frame's
+    // max mv magnitude and that based on the best ref mvs of the current
+    // block for the given reference.
+    step_param = (vp10_init_search_range(x->max_mv_context[ref]) +
+                    cpi->mv_step_param) / 2;
+  } else {
+    step_param = cpi->mv_step_param;
+  }
+
+  if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size) {
+    int boffset = 2 * (b_width_log2_lookup[cm->sb_size] -
+        VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
     step_param = VPXMAX(step_param, boffset);
   }
 
@@ -2321,7 +6159,146 @@
           if (scaled_ref_frame) {
             int i;
             for (i = 0; i < MAX_MB_PLANE; ++i)
-              xd->plane[i].pre[0] = backup_yv12[i];
+              xd->plane[i].pre[ref_idx] = backup_yv12[i];
+          }
+          return;
+        }
+      }
+    }
+  }
+
+  mvp_full = pred_mv.as_mv;
+  mvp_full.col >>= 3;
+  mvp_full.row >>= 3;
+
+  bestsme = vp10_obmc_full_pixel_diamond(cpi, x, wsrc, mask,
+                                         &mvp_full, step_param, sadpb,
+                                         MAX_MVSEARCH_STEPS - 1 - step_param,
+                                         1, &cpi->fn_ptr[bsize],
+                                         &ref_mv, &tmp_mv->as_mv, ref_idx);
+
+  x->mv_col_min = tmp_col_min;
+  x->mv_col_max = tmp_col_max;
+  x->mv_row_min = tmp_row_min;
+  x->mv_row_max = tmp_row_max;
+
+  if (bestsme < INT_MAX) {
+    int dis;
+    vp10_find_best_obmc_sub_pixel_tree_up(cpi, x,
+                                          wsrc, mask,
+                                          mi_row, mi_col,
+                                          &tmp_mv->as_mv, &ref_mv,
+                                          cm->allow_high_precision_mv,
+                                          x->errorperbit,
+                                          &cpi->fn_ptr[bsize],
+                                          cpi->sf.mv.subpel_force_stop,
+                                          cpi->sf.mv.subpel_iters_per_step,
+                                          x->nmvjointcost, x->mvcost,
+                                          &dis, &x->pred_sse[ref],
+                                          ref_idx,
+                                          cpi->sf.use_upsampled_references);
+  }
+  *rate_mv = vp10_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
+                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+
+  if (scaled_ref_frame) {
+    int i;
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      xd->plane[i].pre[ref_idx] = backup_yv12[i];
+  }
+}
+#endif  // CONFIG_OBMC
+
+#if CONFIG_EXT_INTER
+static void do_masked_motion_search(VP10_COMP *cpi, MACROBLOCK *x,
+                                    const uint8_t *mask, int mask_stride,
+                                    BLOCK_SIZE bsize,
+                                    int mi_row, int mi_col,
+                                    int_mv *tmp_mv, int *rate_mv,
+                                    int ref_idx,
+                                    int mv_idx) {
+  MACROBLOCKD *xd = &x->e_mbd;
+  const VP10_COMMON *cm = &cpi->common;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  struct buf_2d backup_yv12[MAX_MB_PLANE] = {{0, 0}};
+  int bestsme = INT_MAX;
+  int step_param;
+  int sadpb = x->sadperbit16;
+  MV mvp_full;
+  int ref = mbmi->ref_frame[ref_idx];
+  MV ref_mv = x->mbmi_ext->ref_mvs[ref][mv_idx].as_mv;
+
+  int tmp_col_min = x->mv_col_min;
+  int tmp_col_max = x->mv_col_max;
+  int tmp_row_min = x->mv_row_min;
+  int tmp_row_max = x->mv_row_max;
+
+  const YV12_BUFFER_CONFIG *scaled_ref_frame =
+      vp10_get_scaled_ref_frame(cpi, ref);
+
+  MV pred_mv[3];
+  pred_mv[0] = x->mbmi_ext->ref_mvs[ref][0].as_mv;
+  pred_mv[1] = x->mbmi_ext->ref_mvs[ref][1].as_mv;
+  pred_mv[2] = x->pred_mv[ref];
+
+#if CONFIG_REF_MV
+  vp10_set_mvcost(x, ref);
+#endif
+
+  if (scaled_ref_frame) {
+    int i;
+    // Swap out the reference frame for a version that's been scaled to
+    // match the resolution of the current frame, allowing the existing
+    // motion search code to be used without additional modifications.
+    for (i = 0; i < MAX_MB_PLANE; i++)
+      backup_yv12[i] = xd->plane[i].pre[ref_idx];
+
+    vp10_setup_pre_planes(xd, ref_idx, scaled_ref_frame, mi_row, mi_col, NULL);
+  }
+
+  vp10_set_mv_search_range(x, &ref_mv);
+
+  // Work out the size of the first step in the mv step search.
+  // 0 here is maximum length first step. 1 is MAX >> 1 etc.
+  if (cpi->sf.mv.auto_mv_step_size && cm->show_frame) {
+    // Take a weighted average of the step_params based on the last frame's
+    // max mv magnitude and that based on the best ref mvs of the current
+    // block for the given reference.
+    step_param = (vp10_init_search_range(x->max_mv_context[ref]) +
+                  cpi->mv_step_param) / 2;
+  } else {
+    step_param = cpi->mv_step_param;
+  }
+
+  // TODO(debargha): is show_frame needed here?
+  if (cpi->sf.adaptive_motion_search && bsize < cm->sb_size &&
+      cm->show_frame) {
+    int boffset = 2 * (b_width_log2_lookup[cm->sb_size] -
+          VPXMIN(b_height_log2_lookup[bsize], b_width_log2_lookup[bsize]));
+    step_param = VPXMAX(step_param, boffset);
+  }
+
+  if (cpi->sf.adaptive_motion_search) {
+    int bwl = b_width_log2_lookup[bsize];
+    int bhl = b_height_log2_lookup[bsize];
+    int tlevel = x->pred_mv_sad[ref] >> (bwl + bhl + 4);
+
+    if (tlevel < 5)
+      step_param += 2;
+
+    // prev_mv_sad is not setup for dynamically scaled frames.
+    if (cpi->oxcf.resize_mode != RESIZE_DYNAMIC) {
+      int i;
+      for (i = LAST_FRAME; i <= ALTREF_FRAME && cm->show_frame; ++i) {
+        if ((x->pred_mv_sad[ref] >> 3) > x->pred_mv_sad[i]) {
+          x->pred_mv[ref].row = 0;
+          x->pred_mv[ref].col = 0;
+          tmp_mv->as_int = INVALID_MV;
+
+          if (scaled_ref_frame) {
+            int i;
+            for (i = 0; i < MAX_MB_PLANE; ++i)
+              xd->plane[i].pre[ref_idx] = backup_yv12[i];
           }
           return;
         }
@@ -2334,9 +6311,11 @@
   mvp_full.col >>= 3;
   mvp_full.row >>= 3;
 
-  bestsme = vp10_full_pixel_search(cpi, x, bsize, &mvp_full, step_param, sadpb,
-                                  cond_cost_list(cpi, cost_list),
-                                  &ref_mv, &tmp_mv->as_mv, INT_MAX, 1);
+  bestsme = vp10_masked_full_pixel_diamond(cpi, x, mask, mask_stride,
+                                           &mvp_full, step_param, sadpb,
+                                           MAX_MVSEARCH_STEPS - 1 - step_param,
+                                           1, &cpi->fn_ptr[bsize],
+                                           &ref_mv, &tmp_mv->as_mv, ref_idx);
 
   x->mv_col_min = tmp_col_min;
   x->mv_col_max = tmp_col_max;
@@ -2345,40 +6324,62 @@
 
   if (bestsme < INT_MAX) {
     int dis;  /* TODO: use dis in distortion calculation later. */
-    cpi->find_fractional_mv_step(x, &tmp_mv->as_mv, &ref_mv,
-                                 cm->allow_high_precision_mv,
-                                 x->errorperbit,
-                                 &cpi->fn_ptr[bsize],
-                                 cpi->sf.mv.subpel_force_stop,
-                                 cpi->sf.mv.subpel_iters_per_step,
-                                 cond_cost_list(cpi, cost_list),
-                                 x->nmvjointcost, x->mvcost,
-                                 &dis, &x->pred_sse[ref], NULL, 0, 0);
+    vp10_find_best_masked_sub_pixel_tree_up(cpi, x, mask, mask_stride,
+                                            mi_row, mi_col,
+                                            &tmp_mv->as_mv, &ref_mv,
+                                            cm->allow_high_precision_mv,
+                                            x->errorperbit,
+                                            &cpi->fn_ptr[bsize],
+                                            cpi->sf.mv.subpel_force_stop,
+                                            cpi->sf.mv.subpel_iters_per_step,
+                                            x->nmvjointcost, x->mvcost,
+                                            &dis, &x->pred_sse[ref],
+                                            ref_idx,
+                                            cpi->sf.use_upsampled_references);
   }
   *rate_mv = vp10_mv_bit_cost(&tmp_mv->as_mv, &ref_mv,
-                             x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+                              x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
 
-  if (cpi->sf.adaptive_motion_search)
+  if (cpi->sf.adaptive_motion_search && cm->show_frame)
     x->pred_mv[ref] = tmp_mv->as_mv;
 
   if (scaled_ref_frame) {
     int i;
     for (i = 0; i < MAX_MB_PLANE; i++)
-      xd->plane[i].pre[0] = backup_yv12[i];
+      xd->plane[i].pre[ref_idx] = backup_yv12[i];
   }
 }
 
+static void do_masked_motion_search_indexed(VP10_COMP *cpi, MACROBLOCK *x,
+                                            int wedge_index,
+                                            int wedge_sign,
+                                            BLOCK_SIZE bsize,
+                                            int mi_row, int mi_col,
+                                            int_mv *tmp_mv, int *rate_mv,
+                                            int mv_idx[2],
+                                            int which) {
+  // NOTE: 'which' selects the search: 0 - ref frame 0 only,
+  // 1 - ref frame 1 only, 2 - both.
+  MACROBLOCKD *xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  BLOCK_SIZE sb_type = mbmi->sb_type;
+  const uint8_t *mask;
+  const int mask_stride = 4 * num_4x4_blocks_wide_lookup[bsize];
+  mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign, sb_type);
 
+  if (which == 0 || which == 2)
+    do_masked_motion_search(cpi, x, mask, mask_stride, bsize,
+                            mi_row, mi_col, &tmp_mv[0], &rate_mv[0],
+                            0, mv_idx[0]);
 
-static INLINE void restore_dst_buf(MACROBLOCKD *xd,
-                                   uint8_t *orig_dst[MAX_MB_PLANE],
-                                   int orig_dst_stride[MAX_MB_PLANE]) {
-  int i;
-  for (i = 0; i < MAX_MB_PLANE; i++) {
-    xd->plane[i].dst.buf = orig_dst[i];
-    xd->plane[i].dst.stride = orig_dst_stride[i];
+  if (which == 1 || which == 2) {
+    // Get the complementary (sign-inverted) mask for the second ref frame.
+    mask = vp10_get_contiguous_soft_mask(wedge_index, !wedge_sign, sb_type);
+    do_masked_motion_search(cpi, x, mask, mask_stride, bsize,
+                            mi_row, mi_col, &tmp_mv[1], &rate_mv[1],
+                            1, mv_idx[1]);
   }
 }
+#endif  // CONFIG_EXT_INTER
 
 // In some situations we want to discount the apparent cost of a new motion
 // vector. Where there is a subtle motion field and especially where there is
@@ -2413,6 +6414,373 @@
                xd->mb_to_bottom_edge + RIGHT_BOTTOM_MARGIN);
 }
 
+#if CONFIG_EXT_INTER
+static int estimate_wedge_sign(const VP10_COMP *cpi,
+                               const MACROBLOCK *x,
+                               const BLOCK_SIZE bsize,
+                               const uint8_t *pred0, int stride0,
+                               const uint8_t *pred1, int stride1) {
+  const struct macroblock_plane *const p = &x->plane[0];
+  const uint8_t *src = p->src.buf;
+  int src_stride = p->src.stride;
+  const int f_index = bsize - BLOCK_8X8;
+  const int bw = 4 << (b_width_log2_lookup[bsize]);
+  const int bh = 4 << (b_height_log2_lookup[bsize]);
+  uint32_t esq[2][4], var;
+  int64_t tl, br;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (x->e_mbd.cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    pred0 = CONVERT_TO_BYTEPTR(pred0);
+    pred1 = CONVERT_TO_BYTEPTR(pred1);
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  var = cpi->fn_ptr[f_index].vf(
+      src, src_stride,
+      pred0, stride0, &esq[0][0]);
+  var = cpi->fn_ptr[f_index].vf(
+      src + bw / 2, src_stride,
+      pred0 + bw / 2, stride0, &esq[0][1]);
+  var = cpi->fn_ptr[f_index].vf(
+      src + bh / 2 * src_stride, src_stride,
+      pred0 + bh / 2 * stride0, stride0, &esq[0][2]);
+  var = cpi->fn_ptr[f_index].vf(
+      src + bh / 2 * src_stride + bw / 2, src_stride,
+      pred0 + bh / 2 * stride0 + bw / 2, stride0, &esq[0][3]);
+  var = cpi->fn_ptr[f_index].vf(
+      src, src_stride,
+      pred1, stride1, &esq[1][0]);
+  var = cpi->fn_ptr[f_index].vf(
+      src + bw / 2, src_stride,
+      pred1 + bw / 2, stride1, &esq[1][1]);
+  var = cpi->fn_ptr[f_index].vf(
+      src + bh / 2 * src_stride, src_stride,
+      pred1 + bh / 2 * stride1, stride1, &esq[1][2]);
+  var = cpi->fn_ptr[f_index].vf(
+      src + bh / 2 * src_stride + bw / 2, src_stride,
+      pred1 + bh / 2 * stride1 + bw / 2, stride1, &esq[1][3]);
+  (void) var;
+
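+  // esq[k][q] holds the SSE of prediction k in quadrant q (0: top-left,
+  // 1: top-right, 2: bottom-left, 3: bottom-right). tl > 0 means pred1
+  // beats pred0 on the top-left-leaning quadrants; br > 0 means pred0
+  // beats pred1 on the bottom-right-leaning ones. A positive sum returns
+  // 1, which the caller uses as the wedge sign.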
+  tl = (int64_t)(esq[0][0] + esq[0][1] + esq[0][2]) -
+       (int64_t)(esq[1][0] + esq[1][1] + esq[1][2]);
+  br = (int64_t)(esq[1][3] + esq[1][1] + esq[1][2]) -
+       (int64_t)(esq[0][3] + esq[0][1] + esq[0][2]);
+  return (tl + br > 0);
+}
+#endif  // CONFIG_EXT_INTER
+
+#if !CONFIG_DUAL_FILTER
+static INTERP_FILTER predict_interp_filter(const VP10_COMP *cpi,
+                                           const MACROBLOCK *x,
+                                           const BLOCK_SIZE bsize,
+                                           const int mi_row,
+                                           const int mi_col,
+                                           INTERP_FILTER
+                                           (*single_filter)[MAX_REF_FRAMES]
+                                           ) {
+  INTERP_FILTER best_filter = SWITCHABLE;
+  const VP10_COMMON *cm = &cpi->common;
+  const MACROBLOCKD *xd = &x->e_mbd;
+  int bsl = mi_width_log2_lookup[bsize];
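+  // Chessboard pattern: on alternating blocks (by position parity and
+  // frame index) the search is seeded with the above/left neighbors'
+  // filter.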
+  int pred_filter_search = cpi->sf.cb_pred_filter_search ?
+      (((mi_row + mi_col) >> bsl) +
+          get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  const int is_comp_pred = has_second_ref(mbmi);
+  const int this_mode = mbmi->mode;
+  int refs[2] = { mbmi->ref_frame[0],
+      (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
+#if CONFIG_DUAL_FILTER
+  (void)pred_filter_search;
+  return SWITCHABLE;
+#else
+  if (pred_filter_search) {
+    INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
+    if (xd->up_available)
+      af = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
+    if (xd->left_available)
+      lf = xd->mi[-1]->mbmi.interp_filter;
+
+#if CONFIG_EXT_INTER
+    if ((this_mode != NEWMV && this_mode != NEWFROMNEARMV &&
+        this_mode != NEW_NEWMV) || (af == lf))
+#else
+    if ((this_mode != NEWMV) || (af == lf))
+#endif  // CONFIG_EXT_INTER
+      best_filter = af;
+  }
+#endif
+  if (is_comp_pred) {
+    if (cpi->sf.adaptive_mode_search) {
+#if CONFIG_EXT_INTER
+      switch (this_mode) {
+        case NEAREST_NEARESTMV:
+          if (single_filter[NEARESTMV][refs[0]] ==
+              single_filter[NEARESTMV][refs[1]])
+            best_filter = single_filter[NEARESTMV][refs[0]];
+          break;
+        case NEAREST_NEARMV:
+          if (single_filter[NEARESTMV][refs[0]] ==
+              single_filter[NEARMV][refs[1]])
+            best_filter = single_filter[NEARESTMV][refs[0]];
+          break;
+        case NEAR_NEARESTMV:
+          if (single_filter[NEARMV][refs[0]] ==
+              single_filter[NEARESTMV][refs[1]])
+            best_filter = single_filter[NEARMV][refs[0]];
+          break;
+        case NEAR_NEARMV:
+          if (single_filter[NEARMV][refs[0]] ==
+              single_filter[NEARMV][refs[1]])
+            best_filter = single_filter[NEARMV][refs[0]];
+          break;
+        case ZERO_ZEROMV:
+          if (single_filter[ZEROMV][refs[0]] ==
+              single_filter[ZEROMV][refs[1]])
+            best_filter = single_filter[ZEROMV][refs[0]];
+          break;
+        case NEW_NEWMV:
+          if (single_filter[NEWMV][refs[0]] ==
+              single_filter[NEWMV][refs[1]])
+            best_filter = single_filter[NEWMV][refs[0]];
+          break;
+        case NEAREST_NEWMV:
+          if (single_filter[NEARESTMV][refs[0]] ==
+              single_filter[NEWMV][refs[1]])
+            best_filter = single_filter[NEARESTMV][refs[0]];
+          break;
+        case NEAR_NEWMV:
+          if (single_filter[NEARMV][refs[0]] ==
+              single_filter[NEWMV][refs[1]])
+            best_filter = single_filter[NEARMV][refs[0]];
+          break;
+        case NEW_NEARESTMV:
+          if (single_filter[NEWMV][refs[0]] ==
+              single_filter[NEARESTMV][refs[1]])
+            best_filter = single_filter[NEWMV][refs[0]];
+          break;
+        case NEW_NEARMV:
+          if (single_filter[NEWMV][refs[0]] ==
+              single_filter[NEARMV][refs[1]])
+            best_filter = single_filter[NEWMV][refs[0]];
+          break;
+        default:
+          if (single_filter[this_mode][refs[0]] ==
+              single_filter[this_mode][refs[1]])
+            best_filter = single_filter[this_mode][refs[0]];
+          break;
+      }
+#else
+      if (single_filter[this_mode][refs[0]] ==
+          single_filter[this_mode][refs[1]])
+        best_filter = single_filter[this_mode][refs[0]];
+#endif  // CONFIG_EXT_INTER
+    }
+  }
+  if (cm->interp_filter != BILINEAR) {
+    if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
+      best_filter = EIGHTTAP_REGULAR;
+    }
+#if CONFIG_EXT_INTERP
+    else if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE) {
+      best_filter = EIGHTTAP_REGULAR;
+    }
+#endif
+  }
+  return best_filter;
+}
+#endif
+
+#if CONFIG_EXT_INTER
+// Choose the best wedge index and sign
+static int64_t pick_wedge(const VP10_COMP *const cpi,
+                          const MACROBLOCK *const x,
+                          const BLOCK_SIZE bsize,
+                          const uint8_t *const p0,
+                          const uint8_t *const p1,
+                          int *const best_wedge_sign,
+                          int *const best_wedge_index) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const src = &x->plane[0].src;
+  const int bw = 4 * num_4x4_blocks_wide_lookup[bsize];
+  const int bh = 4 * num_4x4_blocks_high_lookup[bsize];
+  const int N = bw * bh;
+  int rate;
+  int64_t dist;
+  int64_t rd, best_rd = INT64_MAX;
+  int wedge_index;
+  int wedge_sign;
+  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  const uint8_t *mask;
+  uint64_t sse;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+#else
+  const int bd_round = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
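+  // bd_round rescales high-bitdepth SSE to an 8-bit-equivalent range
+  // before it is fed to the rate/distortion model.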
+
+  DECLARE_ALIGNED(32, int16_t, r0[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, ds[MAX_SB_SQUARE]);
+
+  int64_t sign_limit;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (hbd) {
+    vpx_highbd_subtract_block(bh, bw, r0, bw, src->buf, src->stride,
+                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+    vpx_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
+                              CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
+    vpx_highbd_subtract_block(bh, bw, d10, bw,
+                              CONVERT_TO_BYTEPTR(p1), bw,
+                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+  } else  // NOLINT
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  {
+    vpx_subtract_block(bh, bw, r0, bw, src->buf, src->stride, p0, bw);
+    vpx_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
+    vpx_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
+  }
+
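+  // Residual trick: with r0 = src - p0, r1 = src - p1 and d10 = p1 - p0,
+  // the residual of the masked blend of p0 and p1 is r1 plus the
+  // mask-weighted fraction of d10, so each wedge candidate's SSE can be
+  // evaluated from r1 and d10 without rebuilding the predictor.
+  // sign_limit feeds the fast wedge-sign decision inside the loop below.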
+  sign_limit = ((int64_t)vpx_sum_squares_i16(r0, N)
+                - (int64_t)vpx_sum_squares_i16(r1, N))
+               * (1 << WEDGE_WEIGHT_BITS) / 2;
+
+  vp10_wedge_compute_delta_squares(ds, r0, r1, N);
+
+  for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+    mask = vp10_get_contiguous_soft_mask(wedge_index, 0, bsize);
+    wedge_sign = vp10_wedge_sign_from_residuals(ds, mask, N, sign_limit);
+
+    mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+    sse = vp10_wedge_sse_from_residuals(r1, d10, mask, N);
+    sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+    model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+    rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+
+    if (rd < best_rd) {
+      *best_wedge_index = wedge_index;
+      *best_wedge_sign = wedge_sign;
+      best_rd = rd;
+    }
+  }
+
+  return best_rd;
+}
+
+// Choose the best wedge index for the specified sign
+static int64_t pick_wedge_fixed_sign(const VP10_COMP *const cpi,
+                                     const MACROBLOCK *const x,
+                                     const BLOCK_SIZE bsize,
+                                     const uint8_t *const p0,
+                                     const uint8_t *const p1,
+                                     const int wedge_sign,
+                                     int *const best_wedge_index) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  const struct buf_2d *const src = &x->plane[0].src;
+  const int bw = 4 * num_4x4_blocks_wide_lookup[bsize];
+  const int bh = 4 * num_4x4_blocks_high_lookup[bsize];
+  const int N = bw * bh;
+  int rate;
+  int64_t dist;
+  int64_t rd, best_rd = INT64_MAX;
+  int wedge_index;
+  int wedge_types = (1 << get_wedge_bits_lookup(bsize));
+  const uint8_t *mask;
+  uint64_t sse;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int hbd = xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH;
+  const int bd_round = hbd ? (xd->bd - 8) * 2 : 0;
+#else
+  const int bd_round = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  DECLARE_ALIGNED(32, int16_t, r1[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(32, int16_t, d10[MAX_SB_SQUARE]);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (hbd) {
+    vpx_highbd_subtract_block(bh, bw, r1, bw, src->buf, src->stride,
+                              CONVERT_TO_BYTEPTR(p1), bw, xd->bd);
+    vpx_highbd_subtract_block(bh, bw, d10, bw,
+                              CONVERT_TO_BYTEPTR(p1), bw,
+                              CONVERT_TO_BYTEPTR(p0), bw, xd->bd);
+  } else  // NOLINT
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  {
+    vpx_subtract_block(bh, bw, r1, bw, src->buf, src->stride, p1, bw);
+    vpx_subtract_block(bh, bw, d10, bw, p1, bw, p0, bw);
+  }
+
+  for (wedge_index = 0; wedge_index < wedge_types; ++wedge_index) {
+    mask = vp10_get_contiguous_soft_mask(wedge_index, wedge_sign, bsize);
+    sse = vp10_wedge_sse_from_residuals(r1, d10, mask, N);
+    sse = ROUND_POWER_OF_TWO(sse, bd_round);
+
+    model_rd_from_sse(cpi, xd, bsize, 0, sse, &rate, &dist);
+    rd = RDCOST(x->rdmult, x->rddiv, rate, dist);
+
+    if (rd < best_rd) {
+      *best_wedge_index = wedge_index;
+      best_rd = rd;
+    }
+  }
+
+  return best_rd;
+}
+
+static int64_t pick_interinter_wedge(const VP10_COMP *const cpi,
+                                     const MACROBLOCK *const x,
+                                     const BLOCK_SIZE bsize,
+                                     const uint8_t *const p0,
+                                     const uint8_t *const p1) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const int bw = 4 * num_4x4_blocks_wide_lookup[bsize];
+
+  int64_t rd;
+  int wedge_index = -1;
+  int wedge_sign = 0;
+
+  assert(is_interinter_wedge_used(bsize));
+
+  if (cpi->sf.fast_wedge_sign_estimate) {
+    wedge_sign = estimate_wedge_sign(cpi, x, bsize, p0, bw, p1, bw);
+    rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, wedge_sign, &wedge_index);
+  } else {
+    rd = pick_wedge(cpi, x, bsize, p0, p1, &wedge_sign, &wedge_index);
+  }
+
+  mbmi->interinter_wedge_sign = wedge_sign;
+  mbmi->interinter_wedge_index = wedge_index;
+  return rd;
+}
+
+static int64_t pick_interintra_wedge(const VP10_COMP *const cpi,
+                                     const MACROBLOCK *const x,
+                                     const BLOCK_SIZE bsize,
+                                     const uint8_t *const p0,
+                                     const uint8_t *const p1) {
+  const MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+
+  int64_t rd;
+  int wedge_index = -1;
+
+  assert(is_interintra_wedge_used(bsize));
+
+  rd = pick_wedge_fixed_sign(cpi, x, bsize, p0, p1, 0, &wedge_index);
+
+  mbmi->interintra_wedge_sign = 0;
+  mbmi->interintra_wedge_index = wedge_index;
+  return rd;
+}
+#endif  // CONFIG_EXT_INTER
+
 static int64_t handle_inter_mode(VP10_COMP *cpi, MACROBLOCK *x,
                                  BLOCK_SIZE bsize,
                                  int *rate2, int64_t *distortion,
@@ -2421,13 +6789,25 @@
                                  int *disable_skip,
                                  int_mv (*mode_mv)[MAX_REF_FRAMES],
                                  int mi_row, int mi_col,
+#if CONFIG_OBMC
+                                 uint8_t *dst_buf1[3], int dst_stride1[3],
+                                 uint8_t *dst_buf2[3], int dst_stride2[3],
+                                 const int32_t *const wsrc,
+                                 const int32_t *const mask2d,
+#endif  // CONFIG_OBMC
+#if CONFIG_EXT_INTER
+                                 int_mv single_newmvs[2][MAX_REF_FRAMES],
+                                 int single_newmvs_rate[2][MAX_REF_FRAMES],
+                                 int *compmode_interintra_cost,
+                                 int *compmode_wedge_cost,
+                                 int64_t (*const modelled_rd)[MAX_REF_FRAMES],
+#else
                                  int_mv single_newmv[MAX_REF_FRAMES],
+#endif  // CONFIG_EXT_INTER
                                  INTERP_FILTER (*single_filter)[MAX_REF_FRAMES],
                                  int (*single_skippable)[MAX_REF_FRAMES],
                                  int64_t *psse,
-                                 const int64_t ref_best_rd,
-                                 int64_t *mask_filter,
-                                 int64_t filter_cache[]) {
+                                 const int64_t ref_best_rd) {
   VP10_COMMON *cm = &cpi->common;
   MACROBLOCKD *xd = &x->e_mbd;
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
@@ -2439,12 +6819,46 @@
   int refs[2] = { mbmi->ref_frame[0],
     (mbmi->ref_frame[1] < 0 ? 0 : mbmi->ref_frame[1]) };
   int_mv cur_mv[2];
+  int rate_mv = 0;
+#if CONFIG_EXT_INTER
+  const int bw = 4 * num_4x4_blocks_wide_lookup[bsize];
+  int mv_idx = (this_mode == NEWFROMNEARMV) ? 1 : 0;
+  int_mv single_newmv[MAX_REF_FRAMES];
+  const unsigned int *const interintra_mode_cost =
+    cpi->interintra_mode_cost[size_group_lookup[bsize]];
+  const int is_comp_interintra_pred = (mbmi->ref_frame[1] == INTRA_FRAME);
+#if CONFIG_REF_MV
+  uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
+#endif
+#endif  // CONFIG_EXT_INTER
 #if CONFIG_VP9_HIGHBITDEPTH
-  DECLARE_ALIGNED(16, uint16_t, tmp_buf16[MAX_MB_PLANE * 64 * 64]);
-  uint8_t *tmp_buf;
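+  // Allocate twice the size in the high-bitdepth build: the buffer holds
+  // uint16_t samples accessed through CONVERT_TO_BYTEPTR.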
+  DECLARE_ALIGNED(16, uint8_t, tmp_buf_[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
 #else
-  DECLARE_ALIGNED(16, uint8_t, tmp_buf[MAX_MB_PLANE * 64 * 64]);
+  DECLARE_ALIGNED(16, uint8_t, tmp_buf_[MAX_MB_PLANE * MAX_SB_SQUARE]);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+  uint8_t *tmp_buf;
+
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+  int allow_motvar =
+#if CONFIG_EXT_INTER
+      !is_comp_interintra_pred &&
+#endif  // CONFIG_EXT_INTER
+      is_motvar_allowed(mbmi);
+  int rate2_nocoeff, best_rate2 = INT_MAX,
+      best_skippable, best_xskip, best_disable_skip = 0;
+  int best_rate_y, best_rate_uv;
+#if CONFIG_VAR_TX
+  uint8_t best_blk_skip[MAX_MB_PLANE][MAX_MIB_SIZE * MAX_MIB_SIZE * 4];
+#endif  // CONFIG_VAR_TX
+  int64_t best_distortion = INT64_MAX;
+  MB_MODE_INFO best_mbmi;
+#if CONFIG_EXT_INTER
+  int rate2_bmc_nocoeff;
+  int rate_mv_bmc;
+  MB_MODE_INFO best_bmc_mbmi;
+#endif  // CONFIG_EXT_INTER
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+
   int pred_exists = 0;
   int intpel_mv;
   int64_t rd, tmp_rd, best_rd = INT64_MAX;
@@ -2452,96 +6866,164 @@
   uint8_t *orig_dst[MAX_MB_PLANE];
   int orig_dst_stride[MAX_MB_PLANE];
   int rs = 0;
+#if CONFIG_DUAL_FILTER
+  // Index mapping:
+  // {0, 1} -> (vertical, horizontal) filter types for the first ref frame
+  // {2, 3} -> (vertical, horizontal) filter types for the second ref frame
+  INTERP_FILTER best_filter[4] = {
+      SWITCHABLE, SWITCHABLE, SWITCHABLE, SWITCHABLE,
+  };
+#else
   INTERP_FILTER best_filter = SWITCHABLE;
-  uint8_t skip_txfm[MAX_MB_PLANE << 2] = {0};
-  int64_t bsse[MAX_MB_PLANE << 2] = {0};
-
-  int bsl = mi_width_log2_lookup[bsize];
-  int pred_filter_search = cpi->sf.cb_pred_filter_search ?
-      (((mi_row + mi_col) >> bsl) +
-       get_chessboard_index(cm->current_video_frame)) & 0x1 : 0;
+#endif
 
   int skip_txfm_sb = 0;
   int64_t skip_sse_sb = INT64_MAX;
   int64_t distortion_y = 0, distortion_uv = 0;
+  int16_t mode_ctx = mbmi_ext->mode_context[refs[0]];
+
+#if CONFIG_EXT_INTER
+  *compmode_interintra_cost = 0;
+  mbmi->use_wedge_interintra = 0;
+  *compmode_wedge_cost = 0;
+  mbmi->use_wedge_interinter = 0;
+
+  // is_comp_interintra_pred implies !is_comp_pred
+  assert(!is_comp_interintra_pred || (!is_comp_pred));
+  // is_comp_interintra_pred implies is_interintra_allowed(mbmi->sb_type)
+  assert(!is_comp_interintra_pred || is_interintra_allowed(mbmi));
+#endif  // CONFIG_EXT_INTER
+
+#if CONFIG_REF_MV
+#if CONFIG_EXT_INTER
+  if (is_comp_pred)
+    mode_ctx = mbmi_ext->compound_mode_context[refs[0]];
+  else
+#endif  // CONFIG_EXT_INTER
+  mode_ctx = vp10_mode_context_analyzer(mbmi_ext->mode_context,
+                                        mbmi->ref_frame, bsize, -1);
+#endif
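+  // mode_ctx conditions the cost_mv_ref() rates further below on the
+  // reference frame's motion vector candidate list.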
 
 #if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf16);
-  } else {
-    tmp_buf = (uint8_t *)tmp_buf16;
-  }
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+    tmp_buf = CONVERT_TO_BYTEPTR(tmp_buf_);
+  else
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-
-  if (pred_filter_search) {
-    INTERP_FILTER af = SWITCHABLE, lf = SWITCHABLE;
-    if (xd->up_available)
-      af = xd->mi[-xd->mi_stride]->mbmi.interp_filter;
-    if (xd->left_available)
-      lf = xd->mi[-1]->mbmi.interp_filter;
-
-    if ((this_mode != NEWMV) || (af == lf))
-      best_filter = af;
-  }
+    tmp_buf = tmp_buf_;
 
   if (is_comp_pred) {
     if (frame_mv[refs[0]].as_int == INVALID_MV ||
         frame_mv[refs[1]].as_int == INVALID_MV)
       return INT64_MAX;
-
-    if (cpi->sf.adaptive_mode_search) {
-      if (single_filter[this_mode][refs[0]] ==
-          single_filter[this_mode][refs[1]])
-        best_filter = single_filter[this_mode][refs[0]];
-    }
   }
 
-  if (this_mode == NEWMV) {
-    int rate_mv;
+  if (have_newmv_in_inter_mode(this_mode)) {
     if (is_comp_pred) {
+#if CONFIG_EXT_INTER
+      for (i = 0; i < 2; ++i) {
+        single_newmv[refs[i]].as_int =
+          single_newmvs[mv_idx][refs[i]].as_int;
+      }
+
+      if (this_mode == NEW_NEWMV) {
+        frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+        frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+
+        if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
+          joint_motion_search(cpi, x, bsize, frame_mv,
+                              mi_row, mi_col, NULL, single_newmv, &rate_mv, 0);
+        } else {
+#if CONFIG_REF_MV
+          vp10_set_mvcost(x, mbmi->ref_frame[0]);
+#endif  // CONFIG_REF_MV
+          rate_mv  = vp10_mv_bit_cost(&frame_mv[refs[0]].as_mv,
+                                      &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+                                      x->nmvjointcost, x->mvcost,
+                                      MV_COST_WEIGHT);
+#if CONFIG_REF_MV
+          vp10_set_mvcost(x, mbmi->ref_frame[1]);
+#endif  // CONFIG_REF_MV
+          rate_mv += vp10_mv_bit_cost(&frame_mv[refs[1]].as_mv,
+                                      &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+                                      x->nmvjointcost, x->mvcost,
+                                      MV_COST_WEIGHT);
+        }
+      } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+        frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
+        rate_mv = vp10_mv_bit_cost(&frame_mv[refs[1]].as_mv,
+                                   &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
+                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+      } else {
+        frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
+        rate_mv = vp10_mv_bit_cost(&frame_mv[refs[0]].as_mv,
+                                   &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
+                                   x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+      }
+#else
       // Initialize mv using single prediction mode result.
       frame_mv[refs[0]].as_int = single_newmv[refs[0]].as_int;
       frame_mv[refs[1]].as_int = single_newmv[refs[1]].as_int;
 
       if (cpi->sf.comp_inter_joint_search_thresh <= bsize) {
         joint_motion_search(cpi, x, bsize, frame_mv,
-                            mi_row, mi_col, single_newmv, &rate_mv);
+                            mi_row, mi_col,
+                            single_newmv, &rate_mv, 0);
       } else {
+#if CONFIG_REF_MV
+        vp10_set_mvcost(x, mbmi->ref_frame[0]);
+#endif  // CONFIG_REF_MV
         rate_mv  = vp10_mv_bit_cost(&frame_mv[refs[0]].as_mv,
                                    &x->mbmi_ext->ref_mvs[refs[0]][0].as_mv,
                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
+#if CONFIG_REF_MV
+        vp10_set_mvcost(x, mbmi->ref_frame[1]);
+#endif  // CONFIG_REF_MV
         rate_mv += vp10_mv_bit_cost(&frame_mv[refs[1]].as_mv,
                                    &x->mbmi_ext->ref_mvs[refs[1]][0].as_mv,
                                    x->nmvjointcost, x->mvcost, MV_COST_WEIGHT);
       }
-      *rate2 += rate_mv;
+#endif  // CONFIG_EXT_INTER
     } else {
-      int_mv tmp_mv;
-      single_motion_search(cpi, x, bsize, mi_row, mi_col,
-                           &tmp_mv, &rate_mv);
-      if (tmp_mv.as_int == INVALID_MV)
+#if CONFIG_EXT_INTER
+      if (is_comp_interintra_pred) {
+        x->best_mv = single_newmvs[mv_idx][refs[0]];
+        rate_mv = single_newmvs_rate[mv_idx][refs[0]];
+      } else {
+        single_motion_search(cpi, x, bsize, mi_row, mi_col,
+                             0, mv_idx, &rate_mv);
+        single_newmvs[mv_idx][refs[0]] = x->best_mv;
+        single_newmvs_rate[mv_idx][refs[0]] = rate_mv;
+      }
+#else
+      single_motion_search(cpi, x, bsize, mi_row, mi_col, &rate_mv);
+      single_newmv[refs[0]] = x->best_mv;
+#endif  // CONFIG_EXT_INTER
+
+      if (x->best_mv.as_int == INVALID_MV)
         return INT64_MAX;
 
-      frame_mv[refs[0]].as_int =
-          xd->mi[0]->bmi[0].as_mv[0].as_int = tmp_mv.as_int;
-      single_newmv[refs[0]].as_int = tmp_mv.as_int;
+      frame_mv[refs[0]] = x->best_mv;
+      xd->mi[0]->bmi[0].as_mv[0] = x->best_mv;
 
       // Estimate the rate implications of a new mv but discount this
       // under certain circumstances where we want to help initiate a weak
       // motion field, where the distortion gain for a single block may not
       // be enough to overcome the cost of a new mv.
-      if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
-        *rate2 += VPXMAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
-      } else {
-        *rate2 += rate_mv;
+      if (discount_newmv_test(cpi, this_mode, x->best_mv, mode_mv, refs[0])) {
+        rate_mv = VPXMAX((rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
       }
     }
+    *rate2 += rate_mv;
   }
 
   for (i = 0; i < is_comp_pred + 1; ++i) {
     cur_mv[i] = frame_mv[refs[i]];
     // Clip "next_nearest" so that it does not extend to far out of image
+#if CONFIG_EXT_INTER
+    if (this_mode != NEWMV && this_mode != NEWFROMNEARMV)
+#else
     if (this_mode != NEWMV)
+#endif  // CONFIG_EXT_INTER
       clamp_mv2(&cur_mv[i].as_mv, xd);
 
     if (mv_check_bounds(x, &cur_mv[i].as_mv))
@@ -2549,6 +7031,93 @@
     mbmi->mv[i].as_int = cur_mv[i].as_int;
   }
 
+#if CONFIG_REF_MV
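+  // Under REF_MV, the dynamically built ref_mv_stack overrides frame_mv
+  // for the NEAREST*/NEAR* modes handled below.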
+#if CONFIG_EXT_INTER
+  if (this_mode == NEAREST_NEARESTMV) {
+#else
+  if (this_mode == NEARESTMV && is_comp_pred) {
+    uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
+#endif  // CONFIG_EXT_INTER
+    if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) {
+      cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
+      cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
+
+      for (i = 0; i < 2; ++i) {
+        clamp_mv2(&cur_mv[i].as_mv, xd);
+        if (mv_check_bounds(x, &cur_mv[i].as_mv))
+          return INT64_MAX;
+        mbmi->mv[i].as_int = cur_mv[i].as_int;
+      }
+    }
+  }
+
+#if CONFIG_EXT_INTER
+  if (mbmi_ext->ref_mv_count[ref_frame_type] > 0) {
+    if (this_mode == NEAREST_NEWMV || this_mode == NEAREST_NEARMV) {
+      cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv;
+
+      lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv);
+      clamp_mv2(&cur_mv[0].as_mv, xd);
+      if (mv_check_bounds(x, &cur_mv[0].as_mv))
+        return INT64_MAX;
+      mbmi->mv[0].as_int = cur_mv[0].as_int;
+    }
+
+    if (this_mode == NEW_NEARESTMV || this_mode == NEAR_NEARESTMV) {
+      cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
+
+      lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv);
+      clamp_mv2(&cur_mv[1].as_mv, xd);
+      if (mv_check_bounds(x, &cur_mv[1].as_mv))
+        return INT64_MAX;
+      mbmi->mv[1].as_int = cur_mv[1].as_int;
+    }
+  }
+
+  if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
+    if (this_mode == NEAR_NEWMV ||
+        this_mode == NEAR_NEARESTMV ||
+        this_mode == NEAR_NEARMV) {
+      cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][1].this_mv;
+
+      lower_mv_precision(&cur_mv[0].as_mv, cm->allow_high_precision_mv);
+      clamp_mv2(&cur_mv[0].as_mv, xd);
+      if (mv_check_bounds(x, &cur_mv[0].as_mv))
+        return INT64_MAX;
+      mbmi->mv[0].as_int = cur_mv[0].as_int;
+    }
+
+    if (this_mode == NEW_NEARMV ||
+        this_mode == NEAREST_NEARMV ||
+        this_mode == NEAR_NEARMV) {
+      cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][1].comp_mv;
+
+      lower_mv_precision(&cur_mv[1].as_mv, cm->allow_high_precision_mv);
+      clamp_mv2(&cur_mv[1].as_mv, xd);
+      if (mv_check_bounds(x, &cur_mv[1].as_mv))
+        return INT64_MAX;
+      mbmi->mv[1].as_int = cur_mv[1].as_int;
+    }
+  }
+#else
+  if (this_mode == NEARMV && is_comp_pred) {
+    uint8_t ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
+    if (mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
+      int ref_mv_idx = mbmi->ref_mv_idx + 1;
+      cur_mv[0] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].this_mv;
+      cur_mv[1] = mbmi_ext->ref_mv_stack[ref_frame_type][ref_mv_idx].comp_mv;
+
+      for (i = 0; i < 2; ++i) {
+        clamp_mv2(&cur_mv[i].as_mv, xd);
+        if (mv_check_bounds(x, &cur_mv[i].as_mv))
+          return INT64_MAX;
+        mbmi->mv[i].as_int = cur_mv[i].as_int;
+      }
+    }
+  }
+#endif  // CONFIG_EXT_INTER
+#endif  // CONFIG_REF_MV
+
   // do first prediction into the destination buffer. Do the next
   // prediction into a temporary buffer. Then keep track of which one
   // of these currently holds the best predictor, and use the other
@@ -2568,16 +7137,28 @@
   // initiation of a motion field.
   if (discount_newmv_test(cpi, this_mode, frame_mv[refs[0]],
                           mode_mv, refs[0])) {
-    *rate2 += VPXMIN(cost_mv_ref(cpi, this_mode,
-                                 mbmi_ext->mode_context[refs[0]]),
-                     cost_mv_ref(cpi, NEARESTMV,
-                                 mbmi_ext->mode_context[refs[0]]));
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+    *rate2 += VPXMIN(cost_mv_ref(cpi, this_mode, is_comp_pred, mode_ctx),
+                     cost_mv_ref(cpi, NEARESTMV, is_comp_pred, mode_ctx));
+#else
+    *rate2 += VPXMIN(cost_mv_ref(cpi, this_mode, mode_ctx),
+                     cost_mv_ref(cpi, NEARESTMV, mode_ctx));
+#endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
   } else {
-    *rate2 += cost_mv_ref(cpi, this_mode, mbmi_ext->mode_context[refs[0]]);
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+    *rate2 += cost_mv_ref(cpi, this_mode, is_comp_pred, mode_ctx);
+#else
+    *rate2 += cost_mv_ref(cpi, this_mode, mode_ctx);
+#endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
   }
 
   if (RDCOST(x->rdmult, x->rddiv, *rate2, 0) > ref_best_rd &&
-      mbmi->mode != NEARESTMV)
+#if CONFIG_EXT_INTER
+      mbmi->mode != NEARESTMV && mbmi->mode != NEAREST_NEARESTMV
+#else
+      mbmi->mode != NEARESTMV
+#endif  // CONFIG_EXT_INTER
+     )
     return INT64_MAX;
 
   pred_exists = 0;
@@ -2586,143 +7167,519 @@
   if (is_comp_pred)
     intpel_mv &= !mv_has_subpel(&mbmi->mv[1].as_mv);
 
-  // Search for best switchable filter by checking the variance of
-  // pred error irrespective of whether the filter will be used
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    filter_cache[i] = INT64_MAX;
+#if !CONFIG_DUAL_FILTER
+  best_filter = predict_interp_filter(cpi, x, bsize, mi_row, mi_col,
+                                      single_filter);
+#endif
 
   if (cm->interp_filter != BILINEAR) {
-    if (x->source_variance < cpi->sf.disable_filter_search_var_thresh) {
-      best_filter = EIGHTTAP;
-    } else if (best_filter == SWITCHABLE) {
-      int newbest;
-      int tmp_rate_sum = 0;
-      int64_t tmp_dist_sum = 0;
+    int newbest;
+    int tmp_rate_sum = 0;
+    int64_t tmp_dist_sum = 0;
 
-      for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
-        int j;
-        int64_t rs_rd;
-        int tmp_skip_sb = 0;
-        int64_t tmp_skip_sse = INT64_MAX;
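+    // With dual filters the loop enumerates (vertical, horizontal) filter
+    // pairs from filter_sets: presumably SWITCHABLE_FILTERS^2 pairs, i.e.
+    // 25 with EXT_INTERP and 9 without.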
+#if CONFIG_DUAL_FILTER
+#if CONFIG_EXT_INTERP
+    for (i = 0; i < 25; ++i) {
+#else
+    for (i = 0; i < 9; ++i) {
+#endif
+#else
+    for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+#endif
+      int j;
+      int64_t rs_rd;
+      int tmp_skip_sb = 0;
+      int64_t tmp_skip_sse = INT64_MAX;
 
-        mbmi->interp_filter = i;
-        rs = vp10_get_switchable_rate(cpi, xd);
-        rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
+#if CONFIG_DUAL_FILTER
+      mbmi->interp_filter[0] = filter_sets[i][0];
+      mbmi->interp_filter[1] = filter_sets[i][1];
+      mbmi->interp_filter[2] = filter_sets[i][0];
+      mbmi->interp_filter[3] = filter_sets[i][1];
+#else
+      mbmi->interp_filter = i;
+#endif
+      rs = vp10_get_switchable_rate(cpi, xd);
+      rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
 
-        if (i > 0 && intpel_mv) {
-          rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
-          filter_cache[i] = rd;
-          filter_cache[SWITCHABLE_FILTERS] =
-              VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
-          if (cm->interp_filter == SWITCHABLE)
-            rd += rs_rd;
-          *mask_filter = VPXMAX(*mask_filter, rd);
-        } else {
-          int rate_sum = 0;
-          int64_t dist_sum = 0;
-          if (i > 0 && cpi->sf.adaptive_interp_filter_search &&
-              (cpi->sf.interp_filter_search_mask & (1 << i))) {
-            rate_sum = INT_MAX;
-            dist_sum = INT64_MAX;
-            continue;
-          }
+      if (i > 0 && intpel_mv && IsInterpolatingFilter(i)) {
+        rd = RDCOST(x->rdmult, x->rddiv, tmp_rate_sum, tmp_dist_sum);
+        if (cm->interp_filter == SWITCHABLE)
+          rd += rs_rd;
+      } else {
+        int rate_sum = 0;
+        int64_t dist_sum = 0;
 
-          if ((cm->interp_filter == SWITCHABLE &&
-               (!i || best_needs_copy)) ||
-              (cm->interp_filter != SWITCHABLE &&
-               (cm->interp_filter == mbmi->interp_filter ||
-                (i == 0 && intpel_mv)))) {
-            restore_dst_buf(xd, orig_dst, orig_dst_stride);
-          } else {
-            for (j = 0; j < MAX_MB_PLANE; j++) {
-              xd->plane[j].dst.buf = tmp_buf + j * 64 * 64;
-              xd->plane[j].dst.stride = 64;
-            }
-          }
-          vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-          model_rd_for_sb(cpi, bsize, x, xd, &rate_sum, &dist_sum,
-                          &tmp_skip_sb, &tmp_skip_sse);
-
-          rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
-          filter_cache[i] = rd;
-          filter_cache[SWITCHABLE_FILTERS] =
-              VPXMIN(filter_cache[SWITCHABLE_FILTERS], rd + rs_rd);
-          if (cm->interp_filter == SWITCHABLE)
-            rd += rs_rd;
-          *mask_filter = VPXMAX(*mask_filter, rd);
-
-          if (i == 0 && intpel_mv) {
-            tmp_rate_sum = rate_sum;
-            tmp_dist_sum = dist_sum;
-          }
+        if (i > 0 && cpi->sf.adaptive_interp_filter_search &&
+            (cpi->sf.interp_filter_search_mask & (1 << i))) {
+          rate_sum = INT_MAX;
+          dist_sum = INT64_MAX;
+          continue;
         }
 
-        if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
-          if (rd / 2 > ref_best_rd) {
-            restore_dst_buf(xd, orig_dst, orig_dst_stride);
-            return INT64_MAX;
-          }
-        }
-        newbest = i == 0 || rd < best_rd;
-
-        if (newbest) {
-          best_rd = rd;
-          best_filter = mbmi->interp_filter;
-          if (cm->interp_filter == SWITCHABLE && i && !intpel_mv)
-            best_needs_copy = !best_needs_copy;
-        }
-
-        if ((cm->interp_filter == SWITCHABLE && newbest) ||
+        if ((cm->interp_filter == SWITCHABLE &&
+             (!i || best_needs_copy)) ||
+#if CONFIG_EXT_INTER
+            is_comp_interintra_pred ||
+#endif  // CONFIG_EXT_INTER
             (cm->interp_filter != SWITCHABLE &&
-             cm->interp_filter == mbmi->interp_filter)) {
-          pred_exists = 1;
-          tmp_rd = best_rd;
+             (
+#if CONFIG_DUAL_FILTER
+              cm->interp_filter == mbmi->interp_filter[0]
+#else
+              cm->interp_filter == mbmi->interp_filter
+#endif
+              ||
+              (i == 0 && intpel_mv && IsInterpolatingFilter(i))))) {
+          restore_dst_buf(xd, orig_dst, orig_dst_stride);
+        } else {
+          for (j = 0; j < MAX_MB_PLANE; j++) {
+            xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE;
+            xd->plane[j].dst.stride = MAX_SB_SIZE;
+          }
+        }
+        vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+        model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1,
+                        &rate_sum, &dist_sum, &tmp_skip_sb, &tmp_skip_sse);
 
-          skip_txfm_sb = tmp_skip_sb;
-          skip_sse_sb = tmp_skip_sse;
-          memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
-          memcpy(bsse, x->bsse, sizeof(bsse));
+        rd = RDCOST(x->rdmult, x->rddiv, rate_sum, dist_sum);
+        if (cm->interp_filter == SWITCHABLE)
+          rd += rs_rd;
+
+        if (i == 0 && intpel_mv && IsInterpolatingFilter(i)) {
+          tmp_rate_sum = rate_sum;
+          tmp_dist_sum = dist_sum;
         }
       }
-      restore_dst_buf(xd, orig_dst, orig_dst_stride);
+
+      if (i == 0 && cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
+        if (rd / 2 > ref_best_rd) {
+          restore_dst_buf(xd, orig_dst, orig_dst_stride);
+          return INT64_MAX;
+        }
+      }
+      newbest = i == 0 || rd < best_rd;
+
+      if (newbest) {
+        best_rd = rd;
+#if CONFIG_DUAL_FILTER
+        best_filter[0] = mbmi->interp_filter[0];
+        best_filter[1] = mbmi->interp_filter[1];
+        best_filter[2] = mbmi->interp_filter[2];
+        best_filter[3] = mbmi->interp_filter[3];
+#else
+        best_filter = mbmi->interp_filter;
+#endif
+        if (cm->interp_filter == SWITCHABLE && i &&
+            !(intpel_mv && IsInterpolatingFilter(i)))
+          best_needs_copy = !best_needs_copy;
+      }
+
+      if ((cm->interp_filter == SWITCHABLE && newbest) ||
+          (cm->interp_filter != SWITCHABLE &&
+#if CONFIG_DUAL_FILTER
+           cm->interp_filter == mbmi->interp_filter[0])) {
+#else
+           cm->interp_filter == mbmi->interp_filter)) {
+#endif
+        pred_exists = 1;
+        tmp_rd = best_rd;
+
+        skip_txfm_sb = tmp_skip_sb;
+        skip_sse_sb = tmp_skip_sse;
+      } else {
+        pred_exists = 0;
+      }
     }
+    restore_dst_buf(xd, orig_dst, orig_dst_stride);
   }
+
   // Set the appropriate filter
+#if CONFIG_DUAL_FILTER
+  mbmi->interp_filter[0] = cm->interp_filter != SWITCHABLE ?
+      cm->interp_filter : best_filter[0];
+  mbmi->interp_filter[1] = cm->interp_filter != SWITCHABLE ?
+      cm->interp_filter : best_filter[1];
+  if (mbmi->ref_frame[1] > INTRA_FRAME) {
+    mbmi->interp_filter[2] = cm->interp_filter != SWITCHABLE ?
+        cm->interp_filter : best_filter[2];
+    mbmi->interp_filter[3] = cm->interp_filter != SWITCHABLE ?
+        cm->interp_filter : best_filter[3];
+  }
+#else
   mbmi->interp_filter = cm->interp_filter != SWITCHABLE ?
       cm->interp_filter : best_filter;
+#endif
   rs = cm->interp_filter == SWITCHABLE ? vp10_get_switchable_rate(cpi, xd) : 0;
 
+#if CONFIG_EXT_INTER
+#if CONFIG_OBMC
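+  // Snapshot the plain (single) motion compensation result so the OBMC
+  // pass can restore and compare against it later.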
+  best_bmc_mbmi = *mbmi;
+  rate_mv_bmc = rate_mv;
+  rate2_bmc_nocoeff = *rate2;
+  if (cm->interp_filter == SWITCHABLE)
+    rate2_bmc_nocoeff += rs;
+#endif  // CONFIG_OBMC
+
+  if (is_comp_pred && is_interinter_wedge_used(bsize)) {
+    int rate_sum, rs;
+    int64_t dist_sum;
+    int64_t best_rd_nowedge = INT64_MAX;
+    int64_t best_rd_wedge = INT64_MAX;
+    int tmp_skip_txfm_sb;
+    int64_t tmp_skip_sse_sb;
+
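+    // Two-stage decision: first rate the plain compound prediction
+    // without a wedge, then, if it looks competitive, search wedge index
+    // and sign (optionally refining the MVs against the chosen mask) and
+    // keep whichever RD cost is lower.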
+    rs = vp10_cost_bit(cm->fc->wedge_interinter_prob[bsize], 0);
+    mbmi->use_wedge_interinter = 0;
+    vp10_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+    vp10_subtract_plane(x, bsize, 0);
+    rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+                             &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
+                             INT64_MAX);
+    if (rd != INT64_MAX)
+      rd = RDCOST(x->rdmult, x->rddiv, rs + rate_mv + rate_sum, dist_sum);
+    best_rd_nowedge = rd;
+
+    // Disable wedge search if source variance is small
+    if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh &&
+        best_rd_nowedge / 3 < ref_best_rd) {
+      uint8_t pred0[2 * MAX_SB_SQUARE];
+      uint8_t pred1[2 * MAX_SB_SQUARE];
+      uint8_t *preds0[1] = {pred0};
+      uint8_t *preds1[1] = {pred1};
+      int strides[1] = {bw};
+
+      mbmi->use_wedge_interinter = 1;
+      rs = vp10_cost_literal(get_interinter_wedge_bits(bsize)) +
+          vp10_cost_bit(cm->fc->wedge_interinter_prob[bsize], 1);
+
+      vp10_build_inter_predictors_for_planes_single_buf(
+          xd, bsize, 0, 0,  mi_row, mi_col, 0, preds0, strides);
+      vp10_build_inter_predictors_for_planes_single_buf(
+          xd, bsize, 0, 0, mi_row, mi_col, 1, preds1, strides);
+
+      // Choose the best wedge
+      best_rd_wedge = pick_interinter_wedge(cpi, x, bsize, pred0, pred1);
+      best_rd_wedge += RDCOST(x->rdmult, x->rddiv, rs + rate_mv, 0);
+
+      if (have_newmv_in_inter_mode(this_mode)) {
+        int_mv tmp_mv[2];
+        int rate_mvs[2], tmp_rate_mv = 0;
+        if (this_mode == NEW_NEWMV) {
+          int mv_idxs[2] = {0, 0};
+          do_masked_motion_search_indexed(cpi, x,
+                                          mbmi->interinter_wedge_index,
+                                          mbmi->interinter_wedge_sign,
+                                          bsize, mi_row, mi_col,
+                                          tmp_mv, rate_mvs, mv_idxs, 2);
+          tmp_rate_mv = rate_mvs[0] + rate_mvs[1];
+          mbmi->mv[0].as_int = tmp_mv[0].as_int;
+          mbmi->mv[1].as_int = tmp_mv[1].as_int;
+        } else if (this_mode == NEW_NEARESTMV || this_mode == NEW_NEARMV) {
+          int mv_idxs[2] = {0, 0};
+          do_masked_motion_search_indexed(cpi, x,
+                                          mbmi->interinter_wedge_index,
+                                          mbmi->interinter_wedge_sign,
+                                          bsize, mi_row, mi_col,
+                                          tmp_mv, rate_mvs, mv_idxs, 0);
+          tmp_rate_mv = rate_mvs[0];
+          mbmi->mv[0].as_int = tmp_mv[0].as_int;
+        } else if (this_mode == NEAREST_NEWMV || this_mode == NEAR_NEWMV) {
+          int mv_idxs[2] = {0, 0};
+          do_masked_motion_search_indexed(cpi, x,
+                                          mbmi->interinter_wedge_index,
+                                          mbmi->interinter_wedge_sign,
+                                          bsize, mi_row, mi_col,
+                                          tmp_mv, rate_mvs, mv_idxs, 1);
+          tmp_rate_mv = rate_mvs[1];
+          mbmi->mv[1].as_int = tmp_mv[1].as_int;
+        }
+        vp10_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+        model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+                        &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
+        rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_mv + rate_sum, dist_sum);
+        if (rd < best_rd_wedge) {
+          best_rd_wedge = rd;
+        } else {
+          mbmi->mv[0].as_int = cur_mv[0].as_int;
+          mbmi->mv[1].as_int = cur_mv[1].as_int;
+          tmp_rate_mv = rate_mv;
+          vp10_build_wedge_inter_predictor_from_buf(xd, bsize, 0, 0,
+                                                    preds0, strides,
+                                                    preds1, strides);
+        }
+        vp10_subtract_plane(x, bsize, 0);
+        rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+                                 &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
+                                 INT64_MAX);
+        if (rd != INT64_MAX)
+          rd = RDCOST(x->rdmult, x->rddiv,
+                      rs + tmp_rate_mv + rate_sum, dist_sum);
+        best_rd_wedge = rd;
+
+        if (best_rd_wedge < best_rd_nowedge) {
+          mbmi->use_wedge_interinter = 1;
+          xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+          xd->mi[0]->bmi[0].as_mv[1].as_int = mbmi->mv[1].as_int;
+          *rate2 += tmp_rate_mv - rate_mv;
+          rate_mv = tmp_rate_mv;
+        } else {
+          mbmi->use_wedge_interinter = 0;
+          mbmi->mv[0].as_int = cur_mv[0].as_int;
+          mbmi->mv[1].as_int = cur_mv[1].as_int;
+          xd->mi[0]->bmi[0].as_mv[0].as_int = mbmi->mv[0].as_int;
+          xd->mi[0]->bmi[0].as_mv[1].as_int = mbmi->mv[1].as_int;
+        }
+      } else {
+        vp10_build_wedge_inter_predictor_from_buf(xd, bsize,
+                                                  0, 0,
+                                                  preds0, strides,
+                                                  preds1, strides);
+        vp10_subtract_plane(x, bsize, 0);
+        rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+                                 &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
+                                 INT64_MAX);
+        if (rd != INT64_MAX)
+          rd = RDCOST(x->rdmult, x->rddiv, rs + rate_mv + rate_sum, dist_sum);
+        best_rd_wedge = rd;
+        if (best_rd_wedge < best_rd_nowedge) {
+          mbmi->use_wedge_interinter = 1;
+        } else {
+          mbmi->use_wedge_interinter = 0;
+        }
+      }
+    }
+    if (ref_best_rd < INT64_MAX &&
+        VPXMIN(best_rd_wedge, best_rd_nowedge) / 3 > ref_best_rd)
+      return INT64_MAX;
+
+    pred_exists = 0;
+    tmp_rd = VPXMIN(best_rd_wedge, best_rd_nowedge);
+
+    if (mbmi->use_wedge_interinter)
+      *compmode_wedge_cost =
+          vp10_cost_literal(get_interinter_wedge_bits(bsize)) +
+          vp10_cost_bit(cm->fc->wedge_interinter_prob[bsize], 1);
+    else
+      *compmode_wedge_cost =
+          vp10_cost_bit(cm->fc->wedge_interinter_prob[bsize], 0);
+  }
+
+  if (is_comp_interintra_pred) {
+    INTERINTRA_MODE best_interintra_mode = II_DC_PRED;
+    int64_t best_interintra_rd = INT64_MAX;
+    int rmode, rate_sum;
+    int64_t dist_sum;
+    int j;
+    int64_t best_interintra_rd_nowedge = INT64_MAX;
+    int64_t best_interintra_rd_wedge = INT64_MAX;
+    int rwedge;
+    int_mv tmp_mv;
+    int tmp_rate_mv = 0;
+    int tmp_skip_txfm_sb;
+    int64_t tmp_skip_sse_sb;
+    DECLARE_ALIGNED(16, uint8_t, intrapred_[2 * MAX_SB_SQUARE]);
+    uint8_t *intrapred;
+
+#if CONFIG_VP9_HIGHBITDEPTH
+    if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+      intrapred = CONVERT_TO_BYTEPTR(intrapred_);
+    else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      intrapred = intrapred_;
+
+    mbmi->ref_frame[1] = NONE;
+    for (j = 0; j < MAX_MB_PLANE; j++) {
+      xd->plane[j].dst.buf = tmp_buf + j * MAX_SB_SQUARE;
+      xd->plane[j].dst.stride = bw;
+    }
+    vp10_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+    restore_dst_buf(xd, orig_dst, orig_dst_stride);
+    mbmi->ref_frame[1] = INTRA_FRAME;
+    mbmi->use_wedge_interintra = 0;
+
+    for (j = 0; j < INTERINTRA_MODES; ++j) {
+      mbmi->interintra_mode = (INTERINTRA_MODE)j;
+      rmode = interintra_mode_cost[mbmi->interintra_mode];
+      vp10_build_intra_predictors_for_interintra(
+          xd, bsize, 0, intrapred, bw);
+      vp10_combine_interintra(xd, bsize, 0, tmp_buf, bw,
+                              intrapred, bw);
+      model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+                      &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
+      rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate_mv + rate_sum, dist_sum);
+      if (rd < best_interintra_rd) {
+        best_interintra_rd = rd;
+        best_interintra_mode = mbmi->interintra_mode;
+      }
+    }
+    mbmi->interintra_mode = best_interintra_mode;
+    rmode = interintra_mode_cost[mbmi->interintra_mode];
+    vp10_build_intra_predictors_for_interintra(
+        xd, bsize, 0, intrapred, bw);
+    vp10_combine_interintra(xd, bsize, 0, tmp_buf, bw,
+                            intrapred, bw);
+    vp10_subtract_plane(x, bsize, 0);
+    rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+                             &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
+                             INT64_MAX);
+    if (rd != INT64_MAX)
+      rd = RDCOST(x->rdmult, x->rddiv, rate_mv + rmode + rate_sum, dist_sum);
+    best_interintra_rd = rd;
+
+    if (ref_best_rd < INT64_MAX &&
+        best_interintra_rd > 2 * ref_best_rd) {
+      return INT64_MAX;
+    }
+    if (is_interintra_wedge_used(bsize)) {
+      rwedge = vp10_cost_bit(cm->fc->wedge_interintra_prob[bsize], 0);
+      if (rd != INT64_MAX)
+        rd = RDCOST(x->rdmult, x->rddiv,
+                    rmode + rate_mv + rwedge + rate_sum, dist_sum);
+      best_interintra_rd_nowedge = rd;
+
+      // Disable wedge search if the source variance is small.
+      if (x->source_variance > cpi->sf.disable_wedge_search_var_thresh) {
+        mbmi->use_wedge_interintra = 1;
+
+        rwedge = vp10_cost_literal(get_interintra_wedge_bits(bsize)) +
+            vp10_cost_bit(cm->fc->wedge_interintra_prob[bsize], 1);
+
+        best_interintra_rd_wedge = pick_interintra_wedge(cpi, x, bsize,
+                                                         intrapred_, tmp_buf_);
+
+        best_interintra_rd_wedge += RDCOST(x->rdmult, x->rddiv,
+                                           rmode + rate_mv + rwedge, 0);
+        // Refine motion vector.
+        if (have_newmv_in_inter_mode(this_mode)) {
+          // Get the inverted mask.
+          const uint8_t* mask = vp10_get_contiguous_soft_mask(
+              mbmi->interintra_wedge_index, 1, bsize);
+          do_masked_motion_search(cpi, x, mask, bw, bsize,
+                                  mi_row, mi_col, &tmp_mv, &tmp_rate_mv,
+                                  0, mv_idx);
+          mbmi->mv[0].as_int = tmp_mv.as_int;
+          vp10_build_inter_predictors_sby(xd, mi_row, mi_col, bsize);
+          model_rd_for_sb(cpi, bsize, x, xd, 0, 0, &rate_sum, &dist_sum,
+                          &tmp_skip_txfm_sb, &tmp_skip_sse_sb);
+          rd = RDCOST(x->rdmult, x->rddiv,
+                      rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum);
+          if (rd < best_interintra_rd_wedge) {
+            best_interintra_rd_wedge = rd;
+          } else {
+            tmp_mv.as_int = cur_mv[0].as_int;
+            tmp_rate_mv = rate_mv;
+          }
+        } else {
+          tmp_mv.as_int = cur_mv[0].as_int;
+          tmp_rate_mv = rate_mv;
+          vp10_combine_interintra(xd, bsize, 0,
+                                  tmp_buf, bw,
+                                  intrapred, bw);
+        }
+        // Evaluate at a cost closer to the true rd.
+        vp10_subtract_plane(x, bsize, 0);
+        rd = estimate_yrd_for_sb(cpi, bsize, x, &rate_sum, &dist_sum,
+                                 &tmp_skip_txfm_sb, &tmp_skip_sse_sb,
+                                 INT64_MAX);
+        if (rd != INT64_MAX)
+          rd = RDCOST(x->rdmult, x->rddiv,
+                      rmode + tmp_rate_mv + rwedge + rate_sum, dist_sum);
+        best_interintra_rd_wedge = rd;
+        if (best_interintra_rd_wedge < best_interintra_rd_nowedge) {
+          mbmi->use_wedge_interintra = 1;
+          best_interintra_rd = best_interintra_rd_wedge;
+          mbmi->mv[0].as_int = tmp_mv.as_int;
+          *rate2 += tmp_rate_mv - rate_mv;
+          rate_mv = tmp_rate_mv;
+        } else {
+          mbmi->use_wedge_interintra = 0;
+          best_interintra_rd = best_interintra_rd_nowedge;
+          mbmi->mv[0].as_int = cur_mv[0].as_int;
+        }
+      } else {
+        mbmi->use_wedge_interintra = 0;
+        best_interintra_rd = best_interintra_rd_nowedge;
+      }
+    }
+
+    pred_exists = 0;
+    tmp_rd = best_interintra_rd;
+    *compmode_interintra_cost =
+        vp10_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 1);
+    *compmode_interintra_cost += interintra_mode_cost[mbmi->interintra_mode];
+    if (is_interintra_wedge_used(bsize)) {
+      *compmode_interintra_cost += vp10_cost_bit(
+          cm->fc->wedge_interintra_prob[bsize], mbmi->use_wedge_interintra);
+      if (mbmi->use_wedge_interintra) {
+        *compmode_interintra_cost +=
+            vp10_cost_literal(get_interintra_wedge_bits(bsize));
+      }
+    }
+  } else if (is_interintra_allowed(mbmi)) {
+    *compmode_interintra_cost =
+        vp10_cost_bit(cm->fc->interintra_prob[size_group_lookup[bsize]], 0);
+  }
+
+#if CONFIG_EXT_INTERP
+  if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE) {
+#if CONFIG_DUAL_FILTER
+    for (i = 0; i < 4; ++i)
+      mbmi->interp_filter[i] = EIGHTTAP_REGULAR;
+#else
+    mbmi->interp_filter = EIGHTTAP_REGULAR;
+#endif  // CONFIG_DUAL_FILTER
+    pred_exists = 0;
+  }
+#endif  // CONFIG_EXT_INTERP
+#endif  // CONFIG_EXT_INTER
+
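The interintra search above builds an inter predictor into tmp_buf, an intra predictor into intrapred, and fuses them with vp10_combine_interintra() before costing the result. A hedged sketch of such a blend follows; the real routine uses mode- and position-dependent weights, so the fixed 50/50 average here is an illustrative assumption only.

#include <stdint.h>

/* Hedged sketch of inter/intra blending, NOT vp10_combine_interintra():
 * a plain rounded average stands in for the codec's weighted fusion. */
static void blend_inter_intra(uint8_t *dst, int dst_stride,
                              const uint8_t *inter, int inter_stride,
                              const uint8_t *intra, int intra_stride,
                              int bw, int bh) {
  int r, c;
  for (r = 0; r < bh; ++r)
    for (c = 0; c < bw; ++c)
      dst[r * dst_stride + c] =
          (uint8_t)((inter[r * inter_stride + c] +
                     intra[r * intra_stride + c] + 1) >> 1);
}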
   if (pred_exists) {
     if (best_needs_copy) {
       // again temporarily set the buffers to local memory to prevent a memcpy
       for (i = 0; i < MAX_MB_PLANE; i++) {
-        xd->plane[i].dst.buf = tmp_buf + i * 64 * 64;
-        xd->plane[i].dst.stride = 64;
+        xd->plane[i].dst.buf = tmp_buf + i * MAX_SB_SQUARE;
+        xd->plane[i].dst.stride = MAX_SB_SIZE;
       }
     }
-    rd = tmp_rd + RDCOST(x->rdmult, x->rddiv, rs, 0);
+    rd = tmp_rd;
   } else {
     int tmp_rate;
     int64_t tmp_dist;
+
     // Handles the special case when a filter that is not in the
     // switchable list (e.g. bilinear) is indicated at the frame level, or
     // the skip condition holds.
     vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
-    model_rd_for_sb(cpi, bsize, x, xd, &tmp_rate, &tmp_dist,
-                    &skip_txfm_sb, &skip_sse_sb);
+    model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1,
+                    &tmp_rate, &tmp_dist, &skip_txfm_sb, &skip_sse_sb);
     rd = RDCOST(x->rdmult, x->rddiv, rs + tmp_rate, tmp_dist);
-    memcpy(skip_txfm, x->skip_txfm, sizeof(skip_txfm));
-    memcpy(bsse, x->bsse, sizeof(bsse));
   }
 
+#if CONFIG_DUAL_FILTER
+  if (!is_comp_pred)
+    single_filter[this_mode][refs[0]] = mbmi->interp_filter[0];
+#else
   if (!is_comp_pred)
     single_filter[this_mode][refs[0]] = mbmi->interp_filter;
+#endif  // CONFIG_DUAL_FILTER
 
-  if (cpi->sf.adaptive_mode_search)
-    if (is_comp_pred)
-      if (single_skippable[this_mode][refs[0]] &&
-          single_skippable[this_mode][refs[1]])
-        memset(skip_txfm, SKIP_TXFM_AC_DC, sizeof(skip_txfm));
+#if CONFIG_EXT_INTER
+  if (modelled_rd != NULL) {
+    if (is_comp_pred) {
+      const int mode0 = compound_ref0_mode(this_mode);
+      const int mode1 = compound_ref1_mode(this_mode);
+      int64_t mrd = VPXMIN(modelled_rd[mode0][refs[0]],
+                           modelled_rd[mode1][refs[1]]);
+      if (rd / 4 * 3 > mrd && ref_best_rd < INT64_MAX) {
+        restore_dst_buf(xd, orig_dst, orig_dst_stride);
+        return INT64_MAX;
+      }
+    } else if (!is_comp_interintra_pred) {
+      modelled_rd[this_mode][refs[0]] = rd;
+    }
+  }
+#endif  // CONFIG_EXT_INTER
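Restating the modelled_rd pruning just above as a self-contained predicate (names are local to this sketch): a compound mode is dropped when its cost exceeds roughly 4/3 of the better of the two single-reference modelled costs, with the factor written as rd / 4 * 3 > mrd so it stays in integer arithmetic.

#include <stdint.h>

/* Self-contained restatement of the pruning test above. */
static int prune_compound(int64_t rd, int64_t mrd_ref0, int64_t mrd_ref1) {
  const int64_t mrd = (mrd_ref0 < mrd_ref1) ? mrd_ref0 : mrd_ref1;
  return rd / 4 * 3 > mrd;  /* i.e. rd > mrd * 4 / 3, up to truncation */
}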
 
   if (cpi->sf.use_rd_breakout && ref_best_rd < INT64_MAX) {
     // if current pred_error modeled rd is substantially more than the best
@@ -2735,58 +7692,300 @@
 
   if (cm->interp_filter == SWITCHABLE)
     *rate2 += rs;
+#if CONFIG_OBMC
+  rate2_nocoeff = *rate2;
+#endif  // CONFIG_OBMC
 
-  memcpy(x->skip_txfm, skip_txfm, sizeof(skip_txfm));
-  memcpy(x->bsse, bsse, sizeof(bsse));
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+  best_rd = INT64_MAX;
+  for (mbmi->motion_variation = SIMPLE_TRANSLATION;
+       mbmi->motion_variation < (allow_motvar ? MOTION_VARIATIONS : 1);
+       mbmi->motion_variation++) {
+    int64_t tmp_rd, tmp_dist;
+    int tmp_rate;
+#if CONFIG_EXT_INTER
+    int tmp_rate2 =
+        mbmi->motion_variation != SIMPLE_TRANSLATION ?
+        rate2_bmc_nocoeff : rate2_nocoeff;
+#else
+    int tmp_rate2 = rate2_nocoeff;
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+    INTERP_FILTER obmc_interp_filter[2][2] = {
+        {mbmi->interp_filter[0], mbmi->interp_filter[1]},   // obmc == 0
+        {mbmi->interp_filter[0], mbmi->interp_filter[1]}    // obmc == 1
+    };
+#else
+    INTERP_FILTER obmc_interp_filter[2] = {
+        mbmi->interp_filter,  // obmc == 0
+        mbmi->interp_filter   // obmc == 1
+    };
+#endif  // CONFIG_DUAL_FILTER
+#endif  // CONFIG_EXT_INTERP
 
-  if (!skip_txfm_sb) {
-    int skippable_y, skippable_uv;
-    int64_t sseuv = INT64_MAX;
-    int64_t rdcosty = INT64_MAX;
+#if CONFIG_OBMC
+    if (mbmi->motion_variation == OBMC_CAUSAL) {
+#if CONFIG_EXT_INTER
+      *mbmi = best_bmc_mbmi;
+      mbmi->motion_variation = OBMC_CAUSAL;
+#endif  // CONFIG_EXT_INTER
+      if (!is_comp_pred && have_newmv_in_inter_mode(this_mode)) {
+        int_mv tmp_mv;
+        int_mv pred_mv;
+        int tmp_rate_mv = 0;
 
-    // Y cost and distortion
-    vp10_subtract_plane(x, bsize, 0);
-    super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
-                    bsize, ref_best_rd);
+        pred_mv.as_int = mbmi->mv[0].as_int;
+        single_motion_search_obmc(cpi, x, bsize, mi_row, mi_col,
+                                  wsrc, mask2d,
+#if CONFIG_EXT_INTER
+                                  0, mv_idx,
+#endif  // CONFIG_EXT_INTER
+                                  &tmp_mv, pred_mv, &tmp_rate_mv);
+        mbmi->mv[0].as_int = tmp_mv.as_int;
+        if (discount_newmv_test(cpi, this_mode, tmp_mv, mode_mv, refs[0])) {
+          tmp_rate_mv = VPXMAX((tmp_rate_mv / NEW_MV_DISCOUNT_FACTOR), 1);
+        }
+#if CONFIG_EXT_INTER
+        tmp_rate2 = rate2_bmc_nocoeff - rate_mv_bmc + tmp_rate_mv;
+#else
+        tmp_rate2 = rate2_nocoeff - rate_mv + tmp_rate_mv;
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+        if (!has_subpel_mv_component(xd->mi[0], xd, 0))
+          obmc_interp_filter[1][0] = mbmi->interp_filter[0] = EIGHTTAP_REGULAR;
+        if (!has_subpel_mv_component(xd->mi[0], xd, 1))
+          obmc_interp_filter[1][1] = mbmi->interp_filter[1] = EIGHTTAP_REGULAR;
+#else
+        if (!vp10_is_interp_needed(xd))
+          obmc_interp_filter[1] = mbmi->interp_filter = EIGHTTAP_REGULAR;
+#endif  // CONFIG_DUAL_FILTER
+        // This is not quite correct with CONFIG_DUAL_FILTER when a filter
+        // is needed in only one direction
+        if (!vp10_is_interp_needed(xd))
+          tmp_rate2 -= rs;
+#endif  // CONFIG_EXT_INTERP
+        vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+#if CONFIG_EXT_INTER
+      } else {
+        vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+#endif  // CONFIG_EXT_INTER
+      }
+      vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col,
+                                       dst_buf1, dst_stride1,
+                                       dst_buf2, dst_stride2);
+      model_rd_for_sb(cpi, bsize, x, xd, 0, MAX_MB_PLANE - 1,
+                      &tmp_rate, &tmp_dist, &skip_txfm_sb, &skip_sse_sb);
+    }
+#endif  // CONFIG_OBMC
 
-    if (*rate_y == INT_MAX) {
-      *rate2 = INT_MAX;
-      *distortion = INT64_MAX;
-      restore_dst_buf(xd, orig_dst, orig_dst_stride);
-      return INT64_MAX;
+#if CONFIG_WARPED_MOTION
+    if (mbmi->motion_variation == WARPED_CAUSAL) {
+      // TODO(yuec): Add code
+    }
+#endif  // CONFIG_WARPED_MOTION
+    x->skip = 0;
+
+    *rate2 = tmp_rate2;
+    if (allow_motvar)
+      *rate2 += cpi->motvar_cost[bsize][mbmi->motion_variation];
+    *distortion = 0;
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+    if (!skip_txfm_sb) {
+      int skippable_y, skippable_uv;
+      int64_t sseuv = INT64_MAX;
+      int64_t rdcosty = INT64_MAX;
+
+      // Y cost and distortion
+      vp10_subtract_plane(x, bsize, 0);
+#if CONFIG_VAR_TX
+      if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) {
+        select_tx_type_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
+                           bsize, ref_best_rd);
+      } else {
+        int idx, idy;
+        super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
+                        bsize, ref_best_rd);
+        for (idy = 0; idy < xd->n8_h; ++idy)
+          for (idx = 0; idx < xd->n8_w; ++idx)
+            mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
+        memset(x->blk_skip[0], skippable_y,
+               sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+      }
+#else
+      super_block_yrd(cpi, x, rate_y, &distortion_y, &skippable_y, psse,
+                      bsize, ref_best_rd);
+#endif  // CONFIG_VAR_TX
+
+      if (*rate_y == INT_MAX) {
+        *rate2 = INT_MAX;
+        *distortion = INT64_MAX;
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+        if (mbmi->motion_variation != SIMPLE_TRANSLATION) {
+          continue;
+        } else {
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+          restore_dst_buf(xd, orig_dst, orig_dst_stride);
+          return INT64_MAX;
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+        }
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+      }
+
+      *rate2 += *rate_y;
+      *distortion += distortion_y;
+
+      rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
+      rdcosty = VPXMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
+
+#if CONFIG_VAR_TX
+      if (!inter_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
+                            &sseuv, bsize, ref_best_rd - rdcosty))
+#else
+      if (!super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
+                            &sseuv, bsize, ref_best_rd - rdcosty))
+#endif  // CONFIG_VAR_TX
+      {
+        *rate2 = INT_MAX;
+        *distortion = INT64_MAX;
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+        continue;
+#else
+        restore_dst_buf(xd, orig_dst, orig_dst_stride);
+        return INT64_MAX;
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+      }
+
+      *psse += sseuv;
+      *rate2 += *rate_uv;
+      *distortion += distortion_uv;
+      *skippable = skippable_y && skippable_uv;
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+      if (*skippable) {
+        *rate2 -= *rate_uv + *rate_y;
+        *rate_y = 0;
+        *rate_uv = 0;
+        *rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+        mbmi->skip = 0;
+        // Here mbmi->skip temporarily plays the role of this_skip2.
+      } else if (!xd->lossless[mbmi->segment_id] &&
+                 (RDCOST(x->rdmult, x->rddiv, *rate_y + *rate_uv +
+                         vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0),
+                         *distortion) >=
+                  RDCOST(x->rdmult, x->rddiv,
+                         vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1),
+                         *psse))) {
+        *rate2 -= *rate_uv + *rate_y;
+        *rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+        *distortion = *psse;
+        *rate_y = 0;
+        *rate_uv = 0;
+        mbmi->skip = 1;
+      } else {
+        *rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
+        mbmi->skip = 0;
+      }
+      *disable_skip = 0;
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+    } else {
+      x->skip = 1;
+      *disable_skip = 1;
+
+      // The cost of the skip bit needs to be added.
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+      mbmi->skip = 0;
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+      *rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+
+      *distortion = skip_sse_sb;
+      *psse = skip_sse_sb;
+      *rate_y = 0;
+      *rate_uv = 0;
+      *skippable = 1;
     }
 
-    *rate2 += *rate_y;
-    *distortion += distortion_y;
-
-    rdcosty = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
-    rdcosty = VPXMIN(rdcosty, RDCOST(x->rdmult, x->rddiv, 0, *psse));
-
-    if (!super_block_uvrd(cpi, x, rate_uv, &distortion_uv, &skippable_uv,
-                          &sseuv, bsize, ref_best_rd - rdcosty)) {
-      *rate2 = INT_MAX;
-      *distortion = INT64_MAX;
-      restore_dst_buf(xd, orig_dst, orig_dst_stride);
-      return INT64_MAX;
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+    tmp_rd = RDCOST(x->rdmult, x->rddiv, *rate2, *distortion);
+    if (mbmi->motion_variation == SIMPLE_TRANSLATION || (tmp_rd < best_rd)) {
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+      mbmi->interp_filter[0] = obmc_interp_filter[mbmi->motion_variation][0];
+      mbmi->interp_filter[1] = obmc_interp_filter[mbmi->motion_variation][1];
+#else
+      mbmi->interp_filter = obmc_interp_filter[mbmi->motion_variation];
+#endif  // CONFIG_DUAL_FILTER
+#endif  // CONFIG_EXT_INTERP
+      best_mbmi = *mbmi;
+      best_rd = tmp_rd;
+      best_rate2 = *rate2;
+      best_rate_y = *rate_y;
+      best_rate_uv = *rate_uv;
+#if CONFIG_VAR_TX
+      for (i = 0; i < MAX_MB_PLANE; ++i)
+        memcpy(best_blk_skip[i], x->blk_skip[i],
+               sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+#endif  // CONFIG_VAR_TX
+      best_distortion = *distortion;
+      best_skippable = *skippable;
+      best_xskip = x->skip;
+      best_disable_skip = *disable_skip;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        x->recon_variance =
+            vp10_high_get_sby_perpixel_variance(
+                cpi, &xd->plane[0].dst, bsize, xd->bd);
+      } else {
+        x->recon_variance =
+            vp10_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+      }
+#else
+      x->recon_variance =
+          vp10_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
     }
-
-    *psse += sseuv;
-    *rate2 += *rate_uv;
-    *distortion += distortion_uv;
-    *skippable = skippable_y && skippable_uv;
-  } else {
-    x->skip = 1;
-    *disable_skip = 1;
-
-    // The cost of skip bit needs to be added.
-    *rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
-
-    *distortion = skip_sse_sb;
   }
 
+  if (best_rd == INT64_MAX) {
+    *rate2 = INT_MAX;
+    *distortion = INT64_MAX;
+    restore_dst_buf(xd, orig_dst, orig_dst_stride);
+    return INT64_MAX;
+  }
+  *mbmi = best_mbmi;
+  *rate2 = best_rate2;
+  *rate_y = best_rate_y;
+  *rate_uv = best_rate_uv;
+#if CONFIG_VAR_TX
+  for (i = 0; i < MAX_MB_PLANE; ++i)
+    memcpy(x->blk_skip[i], best_blk_skip[i],
+           sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+#endif  // CONFIG_VAR_TX
+  *distortion = best_distortion;
+  *skippable = best_skippable;
+  x->skip = best_xskip;
+  *disable_skip = best_disable_skip;
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+
   if (!is_comp_pred)
     single_skippable[this_mode][refs[0]] = *skippable;
 
+#if !(CONFIG_OBMC || CONFIG_WARPED_MOTION)
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    x->recon_variance =
+      vp10_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst,
+                                          bsize, xd->bd);
+  } else {
+    x->recon_variance =
+      vp10_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+  }
+#else
+  x->recon_variance =
+    vp10_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // !(CONFIG_OBMC || CONFIG_WARPED_MOTION)
+
   restore_dst_buf(xd, orig_dst, orig_dst_stride);
   return 0;  // The rate-distortion cost will be re-calculated by caller.
 }
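A note on the skip decision inside the motion-variation loop above: the coefficient rate is dropped whenever signalling skip, and taking the full prediction SSE as the distortion, is no worse in RD terms than coding the Y and UV residue. A minimal sketch of that comparison; rd_units() stands in for the RDCOST macro, with an assumed >> 8 rate scaling kept only for illustration.

#include <stdint.h>

/* Assumed RDCOST-style scaling, for illustration only. */
static int64_t rd_units(int rdmult, int rate, int64_t dist) {
  return (((int64_t)rate * rdmult) >> 8) + dist;
}

/* Returns nonzero when sending a skip flag (rate_skip1) with SSE as
 * distortion beats coding the coefficients plus a no-skip flag. */
static int prefer_skip(int rdmult, int rate_coeffs,
                       int rate_skip0, int rate_skip1,
                       int64_t dist_coded, int64_t sse) {
  return rd_units(rdmult, rate_skip1, sse) <=
         rd_units(rdmult, rate_coeffs + rate_skip0, dist_coded);
}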
@@ -2823,7 +8022,7 @@
   max_uv_tx_size = get_uv_tx_size_impl(xd->mi[0]->mbmi.tx_size, bsize,
                                        pd[1].subsampling_x,
                                        pd[1].subsampling_y);
-  rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv, &rate_uv_tokenonly,
+  rd_pick_intra_sbuv_mode(cpi, x, &rate_uv, &rate_uv_tokenonly,
                           &dist_uv, &uv_skip, VPXMAX(BLOCK_8X8, bsize),
                           max_uv_tx_size);
 
@@ -2847,14 +8046,11 @@
 #define LOW_VAR_THRESH 16
 #define VLOW_ADJ_MAX 25
 #define VHIGH_ADJ_MAX 8
-static void rd_variance_adjustment(VP10_COMP *cpi,
-                                   MACROBLOCK *x,
-                                   BLOCK_SIZE bsize,
+static void rd_variance_adjustment(MACROBLOCK *x,
                                    int64_t *this_rd,
                                    MV_REFERENCE_FRAME ref_frame,
                                    unsigned int source_variance) {
-  MACROBLOCKD *const xd = &x->e_mbd;
-  unsigned int recon_variance;
+  unsigned int recon_variance = x->recon_variance;
   unsigned int absvar_diff = 0;
   int64_t var_error = 0;
   int64_t var_factor = 0;
@@ -2862,19 +8058,6 @@
   if (*this_rd == INT64_MAX)
     return;
 
-#if CONFIG_VP9_HIGHBITDEPTH
-  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    recon_variance =
-      vp10_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize, xd->bd);
-  } else {
-    recon_variance =
-      vp10_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
-  }
-#else
-  recon_variance =
-    vp10_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
   if ((source_variance + recon_variance) > LOW_VAR_THRESH) {
     absvar_diff = (source_variance > recon_variance)
       ? (source_variance - recon_variance)
@@ -2968,15 +8151,266 @@
 // bars embedded in the stream.
 int vp10_active_edge_sb(VP10_COMP *cpi,
                        int mi_row, int mi_col) {
-  return vp10_active_h_edge(cpi, mi_row, MI_BLOCK_SIZE) ||
-         vp10_active_v_edge(cpi, mi_col, MI_BLOCK_SIZE);
+  return vp10_active_h_edge(cpi, mi_row, cpi->common.mib_size) ||
+         vp10_active_v_edge(cpi, mi_col, cpi->common.mib_size);
 }
 
+static void restore_uv_color_map(VP10_COMP *cpi, MACROBLOCK *x) {
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const BLOCK_SIZE bsize = mbmi->sb_type;
+  const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
+      (xd->plane[1].subsampling_y);
+  const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
+      (xd->plane[1].subsampling_x);
+  int src_stride = x->plane[1].src.stride;
+  const uint8_t *const src_u = x->plane[1].src.buf;
+  const uint8_t *const src_v = x->plane[2].src.buf;
+  float *const data = x->palette_buffer->kmeans_data_buf;
+  uint8_t *const indices = x->palette_buffer->kmeans_indices_buf;
+  float centroids[2 * PALETTE_MAX_SIZE];
+  uint8_t *const color_map = xd->plane[1].color_index_map;
+  int r, c;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const uint16_t *const src_u16 = CONVERT_TO_SHORTPTR(src_u);
+  const uint16_t *const src_v16 = CONVERT_TO_SHORTPTR(src_v);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  (void)cpi;
+
+  for (r = 0; r < rows; ++r) {
+    for (c = 0; c < cols; ++c) {
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cpi->common.use_highbitdepth) {
+        data[(r * cols + c) * 2] = src_u16[r * src_stride + c];
+        data[(r * cols + c) * 2 + 1] = src_v16[r * src_stride + c];
+      } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        data[(r * cols + c) * 2] = src_u[r * src_stride + c];
+        data[(r * cols + c) * 2 + 1] = src_v[r * src_stride + c];
+#if CONFIG_VP9_HIGHBITDEPTH
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+    }
+  }
+
+  for (r = 1; r < 3; ++r) {
+    for (c = 0; c < pmi->palette_size[1]; ++c) {
+      centroids[c * 2 + r - 1] = pmi->palette_colors[r * PALETTE_MAX_SIZE + c];
+    }
+  }
+
+  vp10_calc_indices(data, centroids, indices, rows * cols,
+                    pmi->palette_size[1], 2);
+
+  for (r = 0; r < rows; ++r)
+    for (c = 0; c < cols; ++c)
+      color_map[r * cols + c] = indices[r * cols + c];
+}
+
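restore_uv_color_map() above hands the interleaved (U,V) samples and the stored palette centroids to vp10_calc_indices(), which assigns each sample to its nearest centroid. A hedged sketch of that assignment for the 2-D case; the signature is an assumption for illustration, not the library routine.

#include <float.h>
#include <stdint.h>

/* Nearest-centroid assignment over n interleaved 2-D samples and
 * k interleaved 2-D centroids, by squared Euclidean distance. */
static void calc_indices_2d(const float *data, const float *centroids,
                            uint8_t *indices, int n, int k) {
  int i, j;
  for (i = 0; i < n; ++i) {
    float best_dist = FLT_MAX;
    int best_j = 0;
    for (j = 0; j < k; ++j) {
      const float du = data[2 * i] - centroids[2 * j];
      const float dv = data[2 * i + 1] - centroids[2 * j + 1];
      const float d = du * du + dv * dv;
      if (d < best_dist) {
        best_dist = d;
        best_j = j;
      }
    }
    indices[i] = (uint8_t)best_j;
  }
}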
+#if CONFIG_EXT_INTRA
+static void pick_ext_intra_iframe(VP10_COMP *cpi, MACROBLOCK *x,
+                                  PICK_MODE_CONTEXT *ctx, BLOCK_SIZE bsize,
+                                  int *rate_uv_intra, int *rate_uv_tokenonly,
+                                  int64_t *dist_uv, int *skip_uv,
+                                  PREDICTION_MODE *mode_uv,
+                                  EXT_INTRA_MODE_INFO *ext_intra_mode_info_uv,
+                                  PALETTE_MODE_INFO *pmi_uv,
+                                  int8_t *uv_angle_delta,
+                                  int palette_ctx, int skip_mask,
+                                  unsigned int *ref_costs_single,
+                                  int64_t *best_rd, int64_t *best_intra_rd,
+                                  PREDICTION_MODE *best_intra_mode,
+                                  int *best_mode_index, int *best_skip2,
+                                  int *best_mode_skippable,
+#if CONFIG_SUPERTX
+                                  int *returnrate_nocoef,
+#endif  // CONFIG_SUPERTX
+                                  int64_t *best_pred_rd,
+                                  MB_MODE_INFO *best_mbmode, RD_COST *rd_cost) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
+  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+  int rate2 = 0, rate_y = INT_MAX, skippable = 0, rate_uv, rate_dummy, i;
+  int dc_mode_index;
+  const int *const intra_mode_cost =
+      cpi->mbmode_cost[size_group_lookup[bsize]];
+  int64_t distortion2 = 0, distortion_y = 0, this_rd = *best_rd, distortion_uv;
+  TX_SIZE uv_tx;
+
+  for (i = 0; i < MAX_MODES; ++i)
+    if (vp10_mode_order[i].mode == DC_PRED &&
+        vp10_mode_order[i].ref_frame[0] == INTRA_FRAME)
+      break;
+  dc_mode_index = i;
+  assert(i < MAX_MODES);
+
+  // TODO(huisu): use skip_mask for further speedup.
+  (void)skip_mask;
+  mbmi->mode = DC_PRED;
+  mbmi->uv_mode = DC_PRED;
+  mbmi->ref_frame[0] = INTRA_FRAME;
+  mbmi->ref_frame[1] = NONE;
+  if (!rd_pick_ext_intra_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y,
+                             &skippable, bsize,
+                             intra_mode_cost[mbmi->mode], &this_rd, 0))
+    return;
+  if (rate_y == INT_MAX)
+    return;
+
+  uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize,
+                              xd->plane[1].subsampling_x,
+                              xd->plane[1].subsampling_y);
+  if (rate_uv_intra[uv_tx] == INT_MAX) {
+    choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx,
+                         &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
+                         &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
+    if (cm->allow_screen_content_tools)
+      pmi_uv[uv_tx] = *pmi;
+    ext_intra_mode_info_uv[uv_tx] = mbmi->ext_intra_mode_info;
+    uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
+  }
+
+  rate_uv = rate_uv_tokenonly[uv_tx];
+  distortion_uv = dist_uv[uv_tx];
+  skippable = skippable && skip_uv[uv_tx];
+  mbmi->uv_mode = mode_uv[uv_tx];
+  if (cm->allow_screen_content_tools) {
+    pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
+    memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+           pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+           2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+  }
+  mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
+      ext_intra_mode_info_uv[uv_tx].use_ext_intra_mode[1];
+  if (ext_intra_mode_info_uv[uv_tx].use_ext_intra_mode[1]) {
+    mbmi->ext_intra_mode_info.ext_intra_mode[1] =
+        ext_intra_mode_info_uv[uv_tx].ext_intra_mode[1];
+  }
+
+  rate2 = rate_y + intra_mode_cost[mbmi->mode] + rate_uv +
+      cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
+  if (cpi->common.allow_screen_content_tools && mbmi->mode == DC_PRED)
+    rate2 +=
+        vp10_cost_bit(vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
+                                                       [palette_ctx], 0);
+
+  if (!xd->lossless[mbmi->segment_id]) {
+    // super_block_yrd above includes the cost of the tx_size in the
+    // tokenonly rate, but for intra blocks, tx_size is always coded
+    // (prediction granularity), so we account for it in the full rate,
+    // not the tokenonly rate.
+    rate_y -=
+        cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)]
+                                                [mbmi->tx_size];
+  }
+
+  rate2 += vp10_cost_bit(cm->fc->ext_intra_probs[0],
+                         mbmi->ext_intra_mode_info.use_ext_intra_mode[0]);
+  rate2 += write_uniform_cost(FILTER_INTRA_MODES,
+                              mbmi->ext_intra_mode_info.ext_intra_mode[0]);
+  if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED) {
+    rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+                                MAX_ANGLE_DELTAS +
+                                mbmi->angle_delta[1]);
+  }
+  if (ALLOW_FILTER_INTRA_MODES && mbmi->mode == DC_PRED) {
+    rate2 += vp10_cost_bit(cpi->common.fc->ext_intra_probs[1],
+                           mbmi->ext_intra_mode_info.use_ext_intra_mode[1]);
+    if (mbmi->ext_intra_mode_info.use_ext_intra_mode[1])
+      rate2 +=
+          write_uniform_cost(FILTER_INTRA_MODES,
+                             mbmi->ext_intra_mode_info.ext_intra_mode[1]);
+  }
+  distortion2 = distortion_y + distortion_uv;
+  vp10_encode_intra_block_plane(x, bsize, 0, 0);
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    x->recon_variance =
+        vp10_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst,
+                                            bsize, xd->bd);
+  } else {
+    x->recon_variance =
+        vp10_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+  }
+#else
+  x->recon_variance =
+      vp10_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  rate2 += ref_costs_single[INTRA_FRAME];
+
+  if (skippable) {
+    rate2 -= (rate_y + rate_uv);
+    rate_y = 0;
+    rate_uv = 0;
+    rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+  } else {
+    rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
+  }
+  this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+  rd_variance_adjustment(x, &this_rd, INTRA_FRAME, x->source_variance);
+
+  if (this_rd < *best_intra_rd) {
+    *best_intra_rd = this_rd;
+    *best_intra_mode = mbmi->mode;
+  }
+  for (i = 0; i < REFERENCE_MODES; ++i)
+    best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
+
+  if (this_rd < *best_rd) {
+    *best_mode_index = dc_mode_index;
+    mbmi->mv[0].as_int = 0;
+    rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+    if (x->skip)
+      *returnrate_nocoef = rate2;
+    else
+      *returnrate_nocoef = rate2 - rate_y - rate_uv;
+    *returnrate_nocoef -= vp10_cost_bit(vp10_get_skip_prob(cm, xd), skippable);
+    *returnrate_nocoef -= vp10_cost_bit(vp10_get_intra_inter_prob(cm, xd),
+                                        mbmi->ref_frame[0] != INTRA_FRAME);
+#endif  // CONFIG_SUPERTX
+    rd_cost->dist = distortion2;
+    rd_cost->rdcost = this_rd;
+    *best_rd = this_rd;
+    *best_mbmode = *mbmi;
+    *best_skip2 = 0;
+    *best_mode_skippable = skippable;
+    memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
+           sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+  }
+}
+#endif  // CONFIG_EXT_INTRA
+
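pick_ext_intra_iframe() above repeatedly charges write_uniform_cost(n, v) for symbols such as angle deltas and filter-intra modes. That cost corresponds to a near-uniform (truncated binary) code over [0, n): the first few values get l - 1 bits, the rest l bits. A sketch of the implied code length, leaving the bits-to-rate-units scaling abstract:

/* Truncated binary code length for v in [0, n), assuming n >= 1.
 * The first (1 << l) - n values use one bit fewer than the rest. */
static int uniform_code_len_bits(int n, int v) {
  int l = 1, m;
  while ((1 << l) < n) ++l;  /* l = ceil(log2(n)) */
  m = (1 << l) - n;          /* codewords that save one bit */
  return (v < m) ? l - 1 : l;
}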
+#if CONFIG_OBMC
+static void calc_target_weighted_pred(
+    const VP10_COMMON *cm,
+    const MACROBLOCK *x,
+    const MACROBLOCKD *xd,
+    int mi_row, int mi_col,
+    const uint8_t *above, int above_stride,
+    const uint8_t *left, int left_stride,
+    int32_t *mask_buf,
+    int32_t *wsrc_buf);
+#endif  // CONFIG_OBMC
+
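calc_target_weighted_pred(), declared above and defined later in this file, folds the above and left neighbours' inter predictors into a weighted source and mask for OBMC. As a rough illustration of overlapped blending along the top edge only: the linear weight ramp below is an assumption, whereas the codec draws its weights from fixed lookup tables.

#include <stdint.h>

/* Hedged sketch: blend the above neighbour's predictor into the top
 * 'overlap' rows, with the neighbour's weight decaying away from the
 * shared edge. Weights sum to 64, so the >> 6 renormalizes. */
static void obmc_blend_above(uint8_t *dst, int dst_stride,
                             const uint8_t *above_pred, int above_stride,
                             int bw, int overlap) {
  int r, c;
  for (r = 0; r < overlap; ++r) {
    const int w_above = ((overlap - r) * 32) / overlap;  /* 32 at the edge */
    const int w_cur = 64 - w_above;
    for (c = 0; c < bw; ++c)
      dst[r * dst_stride + c] =
          (uint8_t)((w_cur * dst[r * dst_stride + c] +
                     w_above * above_pred[r * above_stride + c] + 32) >> 6);
  }
}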
 void vp10_rd_pick_inter_mode_sb(VP10_COMP *cpi,
                                 TileDataEnc *tile_data,
                                 MACROBLOCK *x,
                                 int mi_row, int mi_col,
-                                RD_COST *rd_cost, BLOCK_SIZE bsize,
+                                RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+                                int *returnrate_nocoef,
+#endif  // CONFIG_SUPERTX
+                                BLOCK_SIZE bsize,
                                 PICK_MODE_CONTEXT *ctx,
                                 int64_t best_rd_so_far) {
   VP10_COMMON *const cm = &cpi->common;
@@ -2984,6 +8418,7 @@
   SPEED_FEATURES *const sf = &cpi->sf;
   MACROBLOCKD *const xd = &x->e_mbd;
   MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  PALETTE_MODE_INFO *const pmi = &mbmi->palette_mode_info;
   MB_MODE_INFO_EXT *const mbmi_ext = x->mbmi_ext;
   const struct segmentation *const seg = &cm->seg;
   PREDICTION_MODE this_mode;
@@ -2991,17 +8426,33 @@
   unsigned char segment_id = mbmi->segment_id;
   int comp_pred, i, k;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
+  struct buf_2d yv12_mb[MAX_REF_FRAMES][MAX_MB_PLANE];
+#if CONFIG_EXT_INTER
+  int_mv single_newmvs[2][MAX_REF_FRAMES] = { { { 0 } }, { { 0 } } };
+  int single_newmvs_rate[2][MAX_REF_FRAMES] = { { 0 }, { 0 } };
+  int64_t modelled_rd[MB_MODE_COUNT][MAX_REF_FRAMES];
+#else
   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
+#endif  // CONFIG_EXT_INTER
   INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
   int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
+  static const int flag_list[REFS_PER_FRAME + 1] = {
+    0,
+    VP9_LAST_FLAG,
+#if CONFIG_EXT_REFS
+    VP9_LAST2_FLAG,
+    VP9_LAST3_FLAG,
+#endif  // CONFIG_EXT_REFS
+    VP9_GOLD_FLAG,
+#if CONFIG_EXT_REFS
+    VP9_BWD_FLAG,
+#endif  // CONFIG_EXT_REFS
+    VP9_ALT_FLAG
+  };
   int64_t best_rd = best_rd_so_far;
+  int best_rate_y = INT_MAX, best_rate_uv = INT_MAX;
   int64_t best_pred_diff[REFERENCE_MODES];
   int64_t best_pred_rd[REFERENCE_MODES];
-  int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
-  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   MB_MODE_INFO best_mbmode;
   int best_mode_skippable = 0;
   int midx, best_mode_index = -1;
@@ -3014,32 +8465,95 @@
   int64_t dist_uv[TX_SIZES];
   int skip_uv[TX_SIZES];
   PREDICTION_MODE mode_uv[TX_SIZES];
+  PALETTE_MODE_INFO pmi_uv[TX_SIZES];
+#if CONFIG_EXT_INTRA
+  EXT_INTRA_MODE_INFO ext_intra_mode_info_uv[TX_SIZES];
+  int8_t uv_angle_delta[TX_SIZES], dc_skipped = 1;
+  int is_directional_mode, angle_stats_ready = 0;
+  int rate_overhead, rate_dummy;
+  uint8_t directional_mode_skip_mask[INTRA_MODES];
+#endif  // CONFIG_EXT_INTRA
   const int intra_cost_penalty = vp10_get_intra_cost_penalty(
       cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+  const int *const intra_mode_cost =
+      cpi->mbmode_cost[size_group_lookup[bsize]];
   int best_skip2 = 0;
   uint8_t ref_frame_skip_mask[2] = { 0 };
+#if CONFIG_EXT_INTER
+  uint32_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
+  MV_REFERENCE_FRAME best_single_inter_ref = LAST_FRAME;
+  int64_t best_single_inter_rd = INT64_MAX;
+#else
   uint16_t mode_skip_mask[MAX_REF_FRAMES] = { 0 };
+#endif  // CONFIG_EXT_INTER
   int mode_skip_start = sf->mode_skip_start + 1;
   const int *const rd_threshes = rd_opt->threshes[segment_id][bsize];
   const int *const rd_thresh_freq_fact = tile_data->thresh_freq_fact[bsize];
   int64_t mode_threshold[MAX_MODES];
   int *mode_map = tile_data->mode_map[bsize];
   const int mode_search_skip_flags = sf->mode_search_skip_flags;
-  int64_t mask_filter = 0;
-  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
+  const TX_SIZE max_tx_size = max_txsize_lookup[bsize];
+  int palette_ctx = 0;
+  const int rows = 4 * num_4x4_blocks_high_lookup[bsize];
+  const int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+  const MODE_INFO *above_mi = xd->above_mi;
+  const MODE_INFO *left_mi = xd->left_mi;
+#if CONFIG_OBMC
+#if CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[2 * MAX_MB_PLANE * MAX_SB_SQUARE]);
+#else
+  DECLARE_ALIGNED(16, uint8_t, tmp_buf1[MAX_MB_PLANE * MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, uint8_t, tmp_buf2[MAX_MB_PLANE * MAX_SB_SQUARE]);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  DECLARE_ALIGNED(16, int32_t, weighted_src_buf[MAX_SB_SQUARE]);
+  DECLARE_ALIGNED(16, int32_t, mask2d_buf[MAX_SB_SQUARE]);
+  uint8_t *dst_buf1[MAX_MB_PLANE], *dst_buf2[MAX_MB_PLANE];
+  int dst_stride1[MAX_MB_PLANE] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE};
+  int dst_stride2[MAX_MB_PLANE] = {MAX_SB_SIZE, MAX_SB_SIZE, MAX_SB_SIZE};
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    int len = sizeof(uint16_t);
+    dst_buf1[0] = CONVERT_TO_BYTEPTR(tmp_buf1);
+    dst_buf1[1] = CONVERT_TO_BYTEPTR(tmp_buf1 + MAX_SB_SQUARE * len);
+    dst_buf1[2] = CONVERT_TO_BYTEPTR(tmp_buf1 + 2 * MAX_SB_SQUARE * len);
+    dst_buf2[0] = CONVERT_TO_BYTEPTR(tmp_buf2);
+    dst_buf2[1] = CONVERT_TO_BYTEPTR(tmp_buf2 + MAX_SB_SQUARE * len);
+    dst_buf2[2] = CONVERT_TO_BYTEPTR(tmp_buf2 + 2 * MAX_SB_SQUARE * len);
+  } else {
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  dst_buf1[0] = tmp_buf1;
+  dst_buf1[1] = tmp_buf1 + MAX_SB_SQUARE;
+  dst_buf1[2] = tmp_buf1 + 2 * MAX_SB_SQUARE;
+  dst_buf2[0] = tmp_buf2;
+  dst_buf2[1] = tmp_buf2 + MAX_SB_SQUARE;
+  dst_buf2[2] = tmp_buf2 + 2 * MAX_SB_SQUARE;
+#if CONFIG_VP9_HIGHBITDEPTH
+  }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_OBMC
 
   vp10_zero(best_mbmode);
+  vp10_zero(pmi_uv);
 
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    filter_cache[i] = INT64_MAX;
+  if (cm->allow_screen_content_tools) {
+    if (above_mi)
+      palette_ctx += (above_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+    if (left_mi)
+      palette_ctx += (left_mi->mbmi.palette_mode_info.palette_size[0] > 0);
+  }
+
+#if CONFIG_EXT_INTRA
+  memset(directional_mode_skip_mask, 0,
+         sizeof(directional_mode_skip_mask[0]) * INTRA_MODES);
+#endif  // CONFIG_EXT_INTRA
 
   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
                            &comp_mode_p);
 
   for (i = 0; i < REFERENCE_MODES; ++i)
     best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-    best_filter_rd[i] = INT64_MAX;
   for (i = 0; i < TX_SIZES; i++)
     rate_uv_intra[i] = INT_MAX;
   for (i = 0; i < MAX_REF_FRAMES; ++i)
@@ -3052,9 +8566,16 @@
   }
 
   rd_cost->rate = INT_MAX;
+#if CONFIG_SUPERTX
+  *returnrate_nocoef = INT_MAX;
+#endif  // CONFIG_SUPERTX
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
+    x->mbmi_ext->mode_context[ref_frame] = 0;
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+    x->mbmi_ext->compound_mode_context[ref_frame] = 0;
+#endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
       setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
@@ -3062,15 +8583,57 @@
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
     frame_mv[ZEROMV][ref_frame].as_int = 0;
+#if CONFIG_EXT_INTER
+    frame_mv[NEWFROMNEARMV][ref_frame].as_int = INVALID_MV;
+    frame_mv[NEW_NEWMV][ref_frame].as_int = INVALID_MV;
+    frame_mv[ZERO_ZEROMV][ref_frame].as_int = 0;
+#endif  // CONFIG_EXT_INTER
   }
 
+#if CONFIG_REF_MV
+  for (; ref_frame < MODE_CTX_REF_FRAMES; ++ref_frame) {
+    MODE_INFO *const mi = xd->mi[0];
+    int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
+    x->mbmi_ext->mode_context[ref_frame] = 0;
+    vp10_find_mv_refs(cm, xd, mi, ref_frame,
+                      &mbmi_ext->ref_mv_count[ref_frame],
+                      mbmi_ext->ref_mv_stack[ref_frame],
+#if CONFIG_EXT_INTER
+                      mbmi_ext->compound_mode_context,
+#endif  // CONFIG_EXT_INTER
+                      candidates, mi_row, mi_col,
+                      NULL, NULL, mbmi_ext->mode_context);
+  }
+#endif  // CONFIG_REF_MV
+
+#if CONFIG_OBMC
+  vp10_build_prediction_by_above_preds(cm, xd, mi_row, mi_col, dst_buf1,
+                                       dst_stride1);
+  vp10_build_prediction_by_left_preds(cm, xd, mi_row, mi_col, dst_buf2,
+                                      dst_stride2);
+  vp10_setup_dst_planes(xd->plane, get_frame_new_buffer(cm), mi_row, mi_col);
+  calc_target_weighted_pred(cm, x, xd, mi_row, mi_col,
+                            dst_buf1[0], dst_stride1[0],
+                            dst_buf2[0], dst_stride2[0],
+                            mask2d_buf, weighted_src_buf);
+#endif  // CONFIG_OBMC
+
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
       // Skip checking missing references in both single and compound reference
       // modes. Note that a mode will be skipped iff both reference frames
       // are masked out.
-      ref_frame_skip_mask[0] |= (1 << ref_frame);
-      ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+#if CONFIG_EXT_REFS
+      if (ref_frame == BWDREF_FRAME || ref_frame == ALTREF_FRAME) {
+        ref_frame_skip_mask[0] |= (1 << ref_frame);
+        ref_frame_skip_mask[1] |= ((1 << ref_frame) | 0x01);
+      } else {
+#endif  // CONFIG_EXT_REFS
+        ref_frame_skip_mask[0] |= (1 << ref_frame);
+        ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+#if CONFIG_EXT_REFS
+      }
+#endif  // CONFIG_EXT_REFS
     } else {
       for (i = LAST_FRAME; i <= ALTREF_FRAME; ++i) {
         // Skip fixed mv modes for poor references
@@ -3098,18 +8661,38 @@
     // an unfiltered alternative. We allow near/nearest as well
     // because they may result in zero-zero MVs but be cheaper.
     if (cpi->rc.is_src_frame_alt_ref && (cpi->oxcf.arnr_max_frames == 0)) {
-      ref_frame_skip_mask[0] = (1 << LAST_FRAME) | (1 << GOLDEN_FRAME);
+      ref_frame_skip_mask[0] =
+          (1 << LAST_FRAME) |
+#if CONFIG_EXT_REFS
+          (1 << LAST2_FRAME) |
+          (1 << LAST3_FRAME) |
+          (1 << BWDREF_FRAME) |
+#endif  // CONFIG_EXT_REFS
+          (1 << GOLDEN_FRAME);
       ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
+      // TODO(zoeliu): To further explore whether following needs to be done for
+      //               BWDREF_FRAME as well.
       mode_skip_mask[ALTREF_FRAME] = ~INTER_NEAREST_NEAR_ZERO;
       if (frame_mv[NEARMV][ALTREF_FRAME].as_int != 0)
         mode_skip_mask[ALTREF_FRAME] |= (1 << NEARMV);
       if (frame_mv[NEARESTMV][ALTREF_FRAME].as_int != 0)
         mode_skip_mask[ALTREF_FRAME] |= (1 << NEARESTMV);
+#if CONFIG_EXT_INTER
+      if (frame_mv[NEAREST_NEARESTMV][ALTREF_FRAME].as_int != 0)
+        mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARESTMV);
+      if (frame_mv[NEAREST_NEARMV][ALTREF_FRAME].as_int != 0)
+        mode_skip_mask[ALTREF_FRAME] |= (1 << NEAREST_NEARMV);
+      if (frame_mv[NEAR_NEARESTMV][ALTREF_FRAME].as_int != 0)
+        mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARESTMV);
+      if (frame_mv[NEAR_NEARMV][ALTREF_FRAME].as_int != 0)
+        mode_skip_mask[ALTREF_FRAME] |= (1 << NEAR_NEARMV);
+#endif  // CONFIG_EXT_INTER
     }
   }
 
   if (cpi->rc.is_src_frame_alt_ref) {
     if (sf->alt_ref_search_fp) {
+      assert(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]);
       mode_skip_mask[ALTREF_FRAME] = 0;
       ref_frame_skip_mask[0] = ~(1 << ALTREF_FRAME);
       ref_frame_skip_mask[1] = SECOND_REF_FRAME_MASK;
@@ -3155,23 +8738,64 @@
     midx = end_pos;
   }
 
+  if (cpi->sf.tx_type_search.fast_intra_tx_type_search)
+    x->use_default_intra_tx_type = 1;
+  else
+    x->use_default_intra_tx_type = 0;
+
+  if (cpi->sf.tx_type_search.fast_inter_tx_type_search)
+    x->use_default_inter_tx_type = 1;
+  else
+    x->use_default_inter_tx_type = 0;
+
+#if CONFIG_EXT_INTER
+  for (i = 0; i < MB_MODE_COUNT; ++i)
+    for (ref_frame = 0; ref_frame < MAX_REF_FRAMES; ++ref_frame)
+      modelled_rd[i][ref_frame] = INT64_MAX;
+#endif  // CONFIG_EXT_INTER
+
   for (midx = 0; midx < MAX_MODES; ++midx) {
-    int mode_index = mode_map[midx];
+    int mode_index;
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
     int disable_skip = 0;
     int compmode_cost = 0;
+#if CONFIG_EXT_INTER
+    int compmode_interintra_cost = 0;
+    int compmode_wedge_cost = 0;
+#endif  // CONFIG_EXT_INTER
     int rate2 = 0, rate_y = 0, rate_uv = 0;
     int64_t distortion2 = 0, distortion_y = 0, distortion_uv = 0;
     int skippable = 0;
     int this_skip2 = 0;
     int64_t total_sse = INT64_MAX;
     int early_term = 0;
+#if CONFIG_REF_MV
+    uint8_t ref_frame_type;
+#endif  // CONFIG_REF_MV
 
+    mode_index = mode_map[midx];
     this_mode = vp10_mode_order[mode_index].mode;
     ref_frame = vp10_mode_order[mode_index].ref_frame[0];
     second_ref_frame = vp10_mode_order[mode_index].ref_frame[1];
 
+#if CONFIG_EXT_INTER
+    if (ref_frame > INTRA_FRAME && second_ref_frame == INTRA_FRAME) {
+      // Mode must be compatible.
+      assert(is_interintra_allowed_mode(this_mode));
+
+      if (!is_interintra_allowed_bsize(bsize))
+        continue;
+    }
+
+    if (is_inter_compound_mode(this_mode)) {
+      frame_mv[this_mode][ref_frame].as_int =
+          frame_mv[compound_ref0_mode(this_mode)][ref_frame].as_int;
+      frame_mv[this_mode][second_ref_frame].as_int =
+          frame_mv[compound_ref1_mode(this_mode)][second_ref_frame].as_int;
+    }
+#endif  // CONFIG_EXT_INTER
+
     // Look at the reference frame of the best mode so far and set the
     // skip mask to look at a subset of the remaining modes.
     if (midx == mode_skip_start && best_mode_index >= 0) {
@@ -3182,12 +8806,31 @@
           ref_frame_skip_mask[0] |= LAST_FRAME_MODE_MASK;
           ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
           break;
+#if CONFIG_EXT_REFS
+        case LAST2_FRAME:
+          ref_frame_skip_mask[0] |= LAST2_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+          break;
+        case LAST3_FRAME:
+          ref_frame_skip_mask[0] |= LAST3_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+          break;
+#endif  // CONFIG_EXT_REFS
         case GOLDEN_FRAME:
           ref_frame_skip_mask[0] |= GOLDEN_FRAME_MODE_MASK;
           ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
           break;
+#if CONFIG_EXT_REFS
+        case BWDREF_FRAME:
+          ref_frame_skip_mask[0] |= BWDREF_FRAME_MODE_MASK;
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+          break;
+#endif  // CONFIG_EXT_REFS
         case ALTREF_FRAME:
-          ref_frame_skip_mask[0] |= ALT_REF_MODE_MASK;
+          ref_frame_skip_mask[0] |= ALTREF_FRAME_MODE_MASK;
+#if CONFIG_EXT_REFS
+          ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+#endif  // CONFIG_EXT_REFS
           break;
         case NONE:
         case MAX_REF_FRAMES:
@@ -3262,8 +8905,12 @@
       }
     } else {
       const MV_REFERENCE_FRAME ref_frames[2] = {ref_frame, second_ref_frame};
-      if (!check_best_zero_mv(cpi, mbmi_ext->mode_context, frame_mv,
-                              this_mode, ref_frames))
+      if (!check_best_zero_mv(cpi, mbmi_ext->mode_context,
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+                              mbmi_ext->compound_mode_context,
+#endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
+                              frame_mv,
+                              this_mode, ref_frames, bsize, -1))
         continue;
     }
 
@@ -3271,11 +8918,25 @@
     mbmi->uv_mode = DC_PRED;
     mbmi->ref_frame[0] = ref_frame;
     mbmi->ref_frame[1] = second_ref_frame;
+    pmi->palette_size[0] = 0;
+    pmi->palette_size[1] = 0;
+#if CONFIG_EXT_INTRA
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+#endif  // CONFIG_EXT_INTRA
     // Evaluate all sub-pel filters irrespective of whether we can use
     // them for this frame.
-    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+#if CONFIG_DUAL_FILTER
+    for (i = 0; i < 4; ++i) {
+      mbmi->interp_filter[i] = cm->interp_filter == SWITCHABLE ?
+          EIGHTTAP_REGULAR : cm->interp_filter;
+    }
+#else
+    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR
                                                           : cm->interp_filter;
+#endif  // CONFIG_DUAL_FILTER
     mbmi->mv[0].as_int = mbmi->mv[1].as_int = 0;
+    mbmi->motion_variation = SIMPLE_TRANSLATION;
 
     x->skip = 0;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
@@ -3287,41 +8948,413 @@
         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
     }
 
+#if CONFIG_EXT_INTER
+    mbmi->interintra_mode = (PREDICTION_MODE)(DC_PRED - 1);
+#endif  // CONFIG_EXT_INTER
+
     if (ref_frame == INTRA_FRAME) {
       TX_SIZE uv_tx;
       struct macroblockd_plane *const pd = &xd->plane[1];
-      memset(x->skip_txfm, 0, sizeof(x->skip_txfm));
+#if CONFIG_EXT_INTRA
+      is_directional_mode = (mbmi->mode != DC_PRED && mbmi->mode != TM_PRED);
+      if (is_directional_mode) {
+        if (!angle_stats_ready) {
+          const int src_stride = x->plane[0].src.stride;
+          const uint8_t *src = x->plane[0].src.buf;
+          const int rows = 4 * num_4x4_blocks_high_lookup[bsize];
+          const int cols = 4 * num_4x4_blocks_wide_lookup[bsize];
+#if CONFIG_VP9_HIGHBITDEPTH
+          if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH)
+            highbd_angle_estimation(src, src_stride, rows, cols,
+                                    directional_mode_skip_mask);
+          else
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+            angle_estimation(src, src_stride, rows, cols,
+                             directional_mode_skip_mask);
+          angle_stats_ready = 1;
+        }
+        if (directional_mode_skip_mask[mbmi->mode])
+          continue;
+        rate_overhead = write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1, 0) +
+            intra_mode_cost[mbmi->mode];
+        rate_y = INT_MAX;
+        this_rd =
+            rd_pick_intra_angle_sby(cpi, x, &rate_dummy, &rate_y, &distortion_y,
+                                    &skippable, bsize, rate_overhead, best_rd);
+      } else {
+        mbmi->angle_delta[0] = 0;
+        super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+                        NULL, bsize, best_rd);
+      }
+#else
       super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
                       NULL, bsize, best_rd);
+#endif  // CONFIG_EXT_INTRA
+
       if (rate_y == INT_MAX)
         continue;
 
+#if CONFIG_EXT_INTRA
+      if (mbmi->mode == DC_PRED)
+        dc_skipped = 0;
+#endif  // CONFIG_EXT_INTRA
+
       uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize, pd->subsampling_x,
                                   pd->subsampling_y);
       if (rate_uv_intra[uv_tx] == INT_MAX) {
         choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx,
                              &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
                              &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
+        if (cm->allow_screen_content_tools)
+          pmi_uv[uv_tx] = *pmi;
+#if CONFIG_EXT_INTRA
+        ext_intra_mode_info_uv[uv_tx] = mbmi->ext_intra_mode_info;
+        uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
+#endif  // CONFIG_EXT_INTRA
       }
 
       rate_uv = rate_uv_tokenonly[uv_tx];
       distortion_uv = dist_uv[uv_tx];
       skippable = skippable && skip_uv[uv_tx];
       mbmi->uv_mode = mode_uv[uv_tx];
+      if (cm->allow_screen_content_tools) {
+        pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
+        memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+               pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+               2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+      }
+#if CONFIG_EXT_INTRA
+      mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
+      mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
+          ext_intra_mode_info_uv[uv_tx].use_ext_intra_mode[1];
+      if (ext_intra_mode_info_uv[uv_tx].use_ext_intra_mode[1]) {
+        mbmi->ext_intra_mode_info.ext_intra_mode[1] =
+            ext_intra_mode_info_uv[uv_tx].ext_intra_mode[1];
+      }
+#endif  // CONFIG_EXT_INTRA
 
-      rate2 = rate_y + cpi->mbmode_cost[mbmi->mode] + rate_uv_intra[uv_tx];
+      rate2 = rate_y + intra_mode_cost[mbmi->mode] +
+              rate_uv + cpi->intra_uv_mode_cost[mbmi->mode][mbmi->uv_mode];
+      if (cpi->common.allow_screen_content_tools && mbmi->mode == DC_PRED)
+        rate2 +=
+            vp10_cost_bit(vp10_default_palette_y_mode_prob[bsize - BLOCK_8X8]
+                                                          [palette_ctx], 0);
+
+      if (!xd->lossless[mbmi->segment_id]) {
+        // super_block_yrd above includes the cost of the tx_size in the
+        // tokenonly rate, but for intra blocks, tx_size is always coded
+        // (prediction granularity), so we account for it in the full rate,
+        // not the tokenonly rate.
+        rate_y -=
+            cpi->tx_size_cost[max_tx_size - TX_8X8][get_tx_size_context(xd)]
+                                                   [mbmi->tx_size];
+      }
+#if CONFIG_EXT_INTRA
+      if (is_directional_mode) {
+        int p_angle;
+        const int intra_filter_ctx = vp10_get_pred_context_intra_interp(xd);
+        rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+                                    MAX_ANGLE_DELTAS +
+                                    mbmi->angle_delta[0]);
+        p_angle = mode_to_angle_map[mbmi->mode] +
+            mbmi->angle_delta[0] * ANGLE_STEP;
+        if (vp10_is_intra_filter_switchable(p_angle))
+          rate2 += cpi->intra_filter_cost[intra_filter_ctx][mbmi->intra_filter];
+      }
+
+      if (mbmi->mode == DC_PRED && ALLOW_FILTER_INTRA_MODES) {
+        rate2 += vp10_cost_bit(cm->fc->ext_intra_probs[0],
+                               mbmi->ext_intra_mode_info.use_ext_intra_mode[0]);
+        if (mbmi->ext_intra_mode_info.use_ext_intra_mode[0]) {
+          rate2 +=
+              write_uniform_cost(FILTER_INTRA_MODES,
+                                 mbmi->ext_intra_mode_info.ext_intra_mode[0]);
+        }
+      }
+
+      if (mbmi->uv_mode != DC_PRED && mbmi->uv_mode != TM_PRED) {
+        rate2 += write_uniform_cost(2 * MAX_ANGLE_DELTAS + 1,
+                                    MAX_ANGLE_DELTAS +
+                                    mbmi->angle_delta[1]);
+      }
+
+      if (ALLOW_FILTER_INTRA_MODES && mbmi->mode == DC_PRED) {
+        rate2 += vp10_cost_bit(cpi->common.fc->ext_intra_probs[1],
+                               mbmi->ext_intra_mode_info.use_ext_intra_mode[1]);
+        if (mbmi->ext_intra_mode_info.use_ext_intra_mode[1])
+          rate2 +=
+              write_uniform_cost(FILTER_INTRA_MODES,
+                                 mbmi->ext_intra_mode_info.ext_intra_mode[1]);
+      }
+#endif  // CONFIG_EXT_INTRA
       if (this_mode != DC_PRED && this_mode != TM_PRED)
         rate2 += intra_cost_penalty;
       distortion2 = distortion_y + distortion_uv;
+      vp10_encode_intra_block_plane(x, bsize, 0, 1);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
+        x->recon_variance =
+          vp10_high_get_sby_perpixel_variance(cpi, &xd->plane[0].dst,
+                                              bsize, xd->bd);
+      } else {
+        x->recon_variance =
+          vp10_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+      }
+#else
+      x->recon_variance =
+        vp10_get_sby_perpixel_variance(cpi, &xd->plane[0].dst, bsize);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
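+      // Record the per-pixel variance of the reconstructed luma block; it
+      // feeds the rd adjustment below that compares source and
+      // reconstructed variance.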
     } else {
+#if CONFIG_REF_MV
+      int_mv backup_ref_mv[2];
+
+      backup_ref_mv[0] = mbmi_ext->ref_mvs[ref_frame][0];
+      if (comp_pred)
+        backup_ref_mv[1] = mbmi_ext->ref_mvs[second_ref_frame][0];
+#endif
+#if CONFIG_EXT_INTER
+      if (second_ref_frame == INTRA_FRAME) {
+        if (best_single_inter_ref != ref_frame)
+          continue;
+        mbmi->interintra_mode = best_intra_mode;
+#if CONFIG_EXT_INTRA
+        // TODO(debargha|geza.lore):
+        // Should we use ext_intra modes for interintra?
+        mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+        mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+        mbmi->angle_delta[0] = 0;
+        mbmi->angle_delta[1] = 0;
+        mbmi->intra_filter = INTRA_FILTER_LINEAR;
+#endif  // CONFIG_EXT_INTRA
+      }
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_REF_MV
+      mbmi->ref_mv_idx = 0;
+      ref_frame_type = vp10_ref_frame_type(mbmi->ref_frame);
+
+      if (this_mode == NEWMV &&
+          mbmi_ext->ref_mv_count[ref_frame_type] > 1) {
+        int ref;
+        for (ref = 0; ref < 1 + comp_pred; ++ref) {
+          int_mv this_mv = (ref == 0) ?
+              mbmi_ext->ref_mv_stack[ref_frame_type][0].this_mv :
+              mbmi_ext->ref_mv_stack[ref_frame_type][0].comp_mv;
+          clamp_mv_ref(&this_mv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
+          mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
+        }
+      }
+#endif
       this_rd = handle_inter_mode(cpi, x, bsize,
                                   &rate2, &distortion2, &skippable,
                                   &rate_y, &rate_uv,
                                   &disable_skip, frame_mv,
                                   mi_row, mi_col,
-                                  single_newmv, single_inter_filter,
-                                  single_skippable, &total_sse, best_rd,
-                                  &mask_filter, filter_cache);
+#if CONFIG_OBMC
+                                  dst_buf1, dst_stride1,
+                                  dst_buf2, dst_stride2,
+                                  weighted_src_buf,
+                                  mask2d_buf,
+#endif  // CONFIG_OBMC
+#if CONFIG_EXT_INTER
+                                  single_newmvs,
+                                  single_newmvs_rate,
+                                  &compmode_interintra_cost,
+                                  &compmode_wedge_cost,
+                                  modelled_rd,
+#else
+                                  single_newmv,
+#endif  // CONFIG_EXT_INTER
+                                  single_inter_filter,
+                                  single_skippable,
+                                  &total_sse, best_rd);
+
+#if CONFIG_REF_MV
+      // TODO(jingning): This needs some refactoring to improve code quality
+      // and reduce redundant steps.
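+      // When the ref_mv stack holds extra candidates, re-evaluate the mode
+      // for each additional ref_mv_idx, adding the DRL index signaling
+      // cost, and keep the candidate with the lowest rd cost.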
+      if ((mbmi->mode == NEARMV &&
+           mbmi_ext->ref_mv_count[ref_frame_type] > 2) ||
+          (mbmi->mode == NEWMV &&
+           mbmi_ext->ref_mv_count[ref_frame_type] > 1)) {
+        int_mv backup_mv = frame_mv[NEARMV][ref_frame];
+        MB_MODE_INFO backup_mbmi = *mbmi;
+        int backup_skip = x->skip;
+        int64_t tmp_ref_rd = this_rd;
+        int ref_idx;
+
+        // TODO(jingning): This should be deprecated shortly.
+        int idx_offset = (mbmi->mode == NEARMV) ? 1 : 0;
+        int ref_set =
+            VPXMIN(2, mbmi_ext->ref_mv_count[ref_frame_type] - 1 - idx_offset);
+
+        uint8_t drl_ctx = vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
+                                       idx_offset);
+        // Back up frame_mv[NEWMV] so it can be restored after the search.
+        int_mv backup_fmv[2];
+        backup_fmv[0] = frame_mv[NEWMV][ref_frame];
+        if (comp_pred)
+          backup_fmv[1] = frame_mv[NEWMV][second_ref_frame];
+
+        rate2 += cpi->drl_mode_cost0[drl_ctx][0];
+
+        if (this_rd < INT64_MAX) {
+          if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, distortion2) <
+              RDCOST(x->rdmult, x->rddiv, 0, total_sse))
+            tmp_ref_rd = RDCOST(x->rdmult, x->rddiv,
+                rate2 + vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0),
+                distortion2);
+          else
+            tmp_ref_rd = RDCOST(x->rdmult, x->rddiv,
+                rate2 + vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1) -
+                rate_y - rate_uv,
+                total_sse);
+        }
+#if CONFIG_VAR_TX
+        for (i = 0; i < MAX_MB_PLANE; ++i)
+          memcpy(x->blk_skip_drl[i], x->blk_skip[i],
+                 sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif
+
+        for (ref_idx = 0; ref_idx < ref_set; ++ref_idx) {
+          int64_t tmp_alt_rd = INT64_MAX;
+          int tmp_rate = 0, tmp_rate_y = 0, tmp_rate_uv = 0;
+          int tmp_skip = 1;
+          int64_t tmp_dist = 0, tmp_sse = 0;
+          int dummy_disable_skip = 0;
+          int ref;
+          int_mv cur_mv;
+
+          mbmi->ref_mv_idx = 1 + ref_idx;
+
+          for (ref = 0; ref < 1 + comp_pred; ++ref) {
+            int_mv this_mv = (ref == 0) ?
+                mbmi_ext->ref_mv_stack[ref_frame_type]
+                                      [mbmi->ref_mv_idx].this_mv :
+                mbmi_ext->ref_mv_stack[ref_frame_type]
+                                      [mbmi->ref_mv_idx].comp_mv;
+            clamp_mv_ref(&this_mv.as_mv, xd->n8_w << 3, xd->n8_h << 3, xd);
+            mbmi_ext->ref_mvs[mbmi->ref_frame[ref]][0] = this_mv;
+          }
+
+          cur_mv = mbmi_ext->ref_mv_stack[ref_frame_type]
+                                 [mbmi->ref_mv_idx + idx_offset].this_mv;
+          clamp_mv2(&cur_mv.as_mv, xd);
+
+          if (!mv_check_bounds(x, &cur_mv.as_mv)) {
+            INTERP_FILTER dummy_single_inter_filter[MB_MODE_COUNT]
+                                                   [MAX_REF_FRAMES] =
+                                          { { 0 } };
+            int dummy_single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES] =
+                                          { { 0 } };
+#if CONFIG_EXT_INTER
+            int_mv dummy_single_newmvs[2][MAX_REF_FRAMES] =
+                                          { { { 0 } },  { { 0 } } };
+            int dummy_single_newmvs_rate[2][MAX_REF_FRAMES] =
+                                          { { 0 }, { 0 } };
+            int dummy_compmode_interintra_cost = 0;
+            int dummy_compmode_wedge_cost = 0;
+#else
+            int_mv dummy_single_newmv[MAX_REF_FRAMES] = { { 0 } };
+#endif
+
+            frame_mv[NEARMV][ref_frame] = cur_mv;
+            tmp_alt_rd = handle_inter_mode(cpi, x, bsize,
+                                           &tmp_rate, &tmp_dist, &tmp_skip,
+                                           &tmp_rate_y, &tmp_rate_uv,
+                                           &dummy_disable_skip, frame_mv,
+                                           mi_row, mi_col,
+#if CONFIG_OBMC
+                                           dst_buf1, dst_stride1,
+                                           dst_buf2, dst_stride2,
+                                           weighted_src_buf,
+                                           mask2d_buf,
+#endif  // CONFIG_OBMC
+#if CONFIG_EXT_INTER
+                                           dummy_single_newmvs,
+                                           dummy_single_newmvs_rate,
+                                           &dummy_compmode_interintra_cost,
+                                           &dummy_compmode_wedge_cost,
+                                           NULL,
+#else
+                                           dummy_single_newmv,
+#endif
+                                           dummy_single_inter_filter,
+                                           dummy_single_skippable,
+                                           &tmp_sse, best_rd);
+          }
+
+          for (i = 0; i < mbmi->ref_mv_idx; ++i) {
+            uint8_t drl1_ctx = 0;
+            drl1_ctx = vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
+                                    i + idx_offset);
+            tmp_rate += cpi->drl_mode_cost0[drl1_ctx][1];
+          }
+
+          if (mbmi_ext->ref_mv_count[ref_frame_type] >
+              mbmi->ref_mv_idx + idx_offset + 1 &&
+              ref_idx < ref_set - 1) {
+            uint8_t drl1_ctx =
+                vp10_drl_ctx(mbmi_ext->ref_mv_stack[ref_frame_type],
+                             mbmi->ref_mv_idx + idx_offset);
+            tmp_rate += cpi->drl_mode_cost0[drl1_ctx][0];
+          }
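+          // DRL index signaling: one "1" bit per candidate skipped over,
+          // plus a terminating "0" bit while more candidates remain.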
+
+          if (tmp_alt_rd < INT64_MAX) {
+#if CONFIG_OBMC
+            tmp_alt_rd = RDCOST(x->rdmult, x->rddiv, tmp_rate, tmp_dist);
+#else
+            if (RDCOST(x->rdmult, x->rddiv,
+                       tmp_rate_y + tmp_rate_uv, tmp_dist) <
+                RDCOST(x->rdmult, x->rddiv, 0, tmp_sse))
+              tmp_alt_rd = RDCOST(x->rdmult, x->rddiv,
+                  tmp_rate + vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0),
+                  tmp_dist);
+            else
+              tmp_alt_rd = RDCOST(x->rdmult, x->rddiv,
+                  tmp_rate + vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1) -
+                  tmp_rate_y - tmp_rate_uv,
+                  tmp_sse);
+#endif  // CONFIG_OBMC
+          }
+
+          if (tmp_ref_rd > tmp_alt_rd) {
+            rate2 = tmp_rate;
+            disable_skip = dummy_disable_skip;
+            distortion2 = tmp_dist;
+            skippable = tmp_skip;
+            rate_y = tmp_rate_y;
+            rate_uv = tmp_rate_uv;
+            total_sse = tmp_sse;
+            this_rd = tmp_alt_rd;
+            tmp_ref_rd = tmp_alt_rd;
+            backup_mbmi = *mbmi;
+            backup_skip = x->skip;
+#if CONFIG_VAR_TX
+            for (i = 0; i < MAX_MB_PLANE; ++i)
+              memcpy(x->blk_skip_drl[i], x->blk_skip[i],
+                     sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif
+          } else {
+            *mbmi = backup_mbmi;
+            x->skip = backup_skip;
+          }
+        }
+
+        frame_mv[NEARMV][ref_frame] = backup_mv;
+        frame_mv[NEWMV][ref_frame] = backup_fmv[0];
+        if (comp_pred)
+          frame_mv[NEWMV][second_ref_frame] = backup_fmv[1];
+#if CONFIG_VAR_TX
+        for (i = 0; i < MAX_MB_PLANE; ++i)
+          memcpy(x->blk_skip[i], x->blk_skip_drl[i],
+                 sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif
+      }
+      mbmi_ext->ref_mvs[ref_frame][0] = backup_ref_mv[0];
+      if (comp_pred)
+        mbmi_ext->ref_mvs[second_ref_frame][0] = backup_ref_mv[1];
+#endif  // CONFIG_REF_MV
+
       if (this_rd == INT64_MAX)
         continue;
 
@@ -3331,6 +9364,15 @@
         rate2 += compmode_cost;
     }
 
+#if CONFIG_EXT_INTER
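+    // The inter-intra signaling cost is added unconditionally; the wedge
+    // cost applies only to compound, simple-translation predictions.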
+    rate2 += compmode_interintra_cost;
+    if (cm->reference_mode != SINGLE_REFERENCE && comp_pred)
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+      if (mbmi->motion_variation == SIMPLE_TRANSLATION)
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+        rate2 += compmode_wedge_cost;
+#endif  // CONFIG_EXT_INTER
+
     // Estimate the reference frame signaling cost and add it
     // to the rolling cost variable.
     if (comp_pred) {
@@ -3339,11 +9381,16 @@
       rate2 += ref_costs_single[ref_frame];
     }
 
+#if CONFIG_OBMC
+    if (ref_frame == INTRA_FRAME) {
+#else
     if (!disable_skip) {
+#endif  // CONFIG_OBMC
       if (skippable) {
         // Back out the coefficient coding costs
         rate2 -= (rate_y + rate_uv);
-
+        rate_y = 0;
+        rate_uv = 0;
         // Cost the skip mb case
         rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
       } else if (ref_frame != INTRA_FRAME && !xd->lossless[mbmi->segment_id]) {
@@ -3358,6 +9405,8 @@
           assert(total_sse >= 0);
           rate2 -= (rate_y + rate_uv);
           this_skip2 = 1;
+          rate_y = 0;
+          rate_uv = 0;
         }
       } else {
         // Add in the cost of the no skip flag.
@@ -3366,31 +9415,43 @@
 
       // Calculate the final RD estimate for this mode.
       this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+#if CONFIG_OBMC
+    } else {
+      this_skip2 = mbmi->skip;
+      this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+      if (this_skip2) {
+        rate_y = 0;
+        rate_uv = 0;
+      }
+#endif  // CONFIG_OBMC
     }
 
     // Apply an adjustment to the rd value based on the similarity of the
     // source variance and reconstructed variance.
-    rd_variance_adjustment(cpi, x, bsize, &this_rd,
-                           ref_frame, x->source_variance);
+    rd_variance_adjustment(x, &this_rd, ref_frame, x->source_variance);
 
     if (ref_frame == INTRA_FRAME) {
-    // Keep record of best intra rd
+      // Keep record of best intra rd
       if (this_rd < best_intra_rd) {
         best_intra_rd = this_rd;
         best_intra_mode = mbmi->mode;
       }
+#if CONFIG_EXT_INTER
+    } else if (second_ref_frame == NONE) {
+      if (this_rd < best_single_inter_rd) {
+        best_single_inter_rd = this_rd;
+        best_single_inter_ref = mbmi->ref_frame[0];
+      }
+#endif  // CONFIG_EXT_INTER
     }
 
     if (!disable_skip && ref_frame == INTRA_FRAME) {
       for (i = 0; i < REFERENCE_MODES; ++i)
         best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
-      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-        best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd);
     }
 
     // Did this mode help, i.e. is it the new best mode so far?
     if (this_rd < best_rd || x->skip) {
-      int max_plane = MAX_MB_PLANE;
       if (!mode_excluded) {
         // Note index of best mode so far
         best_mode_index = mode_index;
@@ -3398,23 +9459,43 @@
         if (ref_frame == INTRA_FRAME) {
           /* required for left and above block mv */
           mbmi->mv[0].as_int = 0;
-          max_plane = 1;
         } else {
           best_pred_sse = x->pred_sse[ref_frame];
         }
 
         rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+        if (x->skip)
+          *returnrate_nocoef = rate2;
+        else
+          *returnrate_nocoef = rate2 - rate_y - rate_uv;
+        *returnrate_nocoef -= vp10_cost_bit(vp10_get_skip_prob(cm, xd),
+            disable_skip || skippable || this_skip2);
+        *returnrate_nocoef -= vp10_cost_bit(vp10_get_intra_inter_prob(cm, xd),
+                                            mbmi->ref_frame[0] != INTRA_FRAME);
+#if CONFIG_OBMC || CONFIG_WARPED_MOTION
+        if (is_inter_block(mbmi) && is_motvar_allowed(mbmi))
+          *returnrate_nocoef -= cpi->motvar_cost[bsize][mbmi->motion_variation];
+#endif  // CONFIG_OBMC || CONFIG_WARPED_MOTION
+#endif  // CONFIG_SUPERTX
         rd_cost->dist = distortion2;
         rd_cost->rdcost = this_rd;
         best_rd = this_rd;
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
         best_mode_skippable = skippable;
+        best_rate_y = rate_y +
+            vp10_cost_bit(vp10_get_skip_prob(cm, xd), this_skip2 || skippable);
+        best_rate_uv = rate_uv;
 
-        if (!x->select_tx_size)
-          swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
+#if CONFIG_VAR_TX
+        for (i = 0; i < MAX_MB_PLANE; ++i)
+          memcpy(ctx->blk_skip[i], x->blk_skip[i],
+                 sizeof(uint8_t) * ctx->num_4x4_blk);
+#else
         memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
                sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+#endif
 
         // TODO(debargha): enhance this test with a better distortion prediction
         // based on qp, activity mask and history
@@ -3464,29 +9545,6 @@
       }
       if (hybrid_rd < best_pred_rd[REFERENCE_MODE_SELECT])
         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
-
-      /* keep record of best filter type */
-      if (!mode_excluded && cm->interp_filter != BILINEAR) {
-        int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
-                              SWITCHABLE_FILTERS : cm->interp_filter];
-
-        for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-          int64_t adj_rd;
-          if (ref == INT64_MAX)
-            adj_rd = 0;
-          else if (filter_cache[i] == INT64_MAX)
-            // when early termination is triggered, the encoder does not have
-            // access to the rate-distortion cost. it only knows that the cost
-            // should be above the maximum valid value. hence it takes the known
-            // maximum plus an arbitrary constant as the rate-distortion cost.
-            adj_rd = mask_filter - ref + 10;
-          else
-            adj_rd = filter_cache[i] - ref;
-
-          adj_rd += this_rd;
-          best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd);
-        }
-      }
     }
 
     if (early_term)
@@ -3496,15 +9554,327 @@
       break;
   }
 
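+  // When the fast transform-type search was used in the mode loop above,
+  // the winner was scored with a default tx_type; re-run the full search
+  // for the best mode only and keep the refinement if it lowers the rd
+  // cost.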
+  if (xd->lossless[mbmi->segment_id] == 0 && best_mode_index >= 0 &&
+      ((sf->tx_type_search.fast_inter_tx_type_search == 1 &&
+        is_inter_mode(best_mbmode.mode)) ||
+       (sf->tx_type_search.fast_intra_tx_type_search == 1 &&
+        !is_inter_mode(best_mbmode.mode)))) {
+    int rate_y = 0, rate_uv = 0;
+    int64_t dist_y = 0, dist_uv = 0;
+    int skip_y = 0, skip_uv = 0, skip_blk = 0;
+    int64_t sse_y = 0, sse_uv = 0;
+
+    x->use_default_inter_tx_type = 0;
+    x->use_default_intra_tx_type = 0;
+
+    *mbmi = best_mbmode;
+
+    set_ref_ptrs(cm, xd, mbmi->ref_frame[0], mbmi->ref_frame[1]);
+
+    // Select prediction reference frames.
+    for (i = 0; i < MAX_MB_PLANE; i++) {
+      xd->plane[i].pre[0] = yv12_mb[mbmi->ref_frame[0]][i];
+      if (has_second_ref(mbmi))
+        xd->plane[i].pre[1] = yv12_mb[mbmi->ref_frame[1]][i];
+    }
+
+    if (is_inter_mode(mbmi->mode)) {
+      vp10_build_inter_predictors_sb(xd, mi_row, mi_col, bsize);
+#if CONFIG_OBMC
+      if (mbmi->motion_variation == OBMC_CAUSAL)
+        vp10_build_obmc_inter_prediction(cm, xd, mi_row, mi_col,
+                                         dst_buf1, dst_stride1,
+                                         dst_buf2, dst_stride2);
+#endif  // CONFIG_OBMC
+      vp10_subtract_plane(x, bsize, 0);
+#if CONFIG_VAR_TX
+      if (cm->tx_mode == TX_MODE_SELECT || xd->lossless[mbmi->segment_id]) {
+        select_tx_type_yrd(cpi, x, &rate_y, &dist_y, &skip_y, &sse_y,
+                           bsize, INT64_MAX);
+      } else {
+        int idx, idy;
+        super_block_yrd(cpi, x, &rate_y, &dist_y, &skip_y, &sse_y,
+                        bsize, INT64_MAX);
+        for (idy = 0; idy < xd->n8_h; ++idy)
+          for (idx = 0; idx < xd->n8_w; ++idx)
+            mbmi->inter_tx_size[idy][idx] = mbmi->tx_size;
+        memset(x->blk_skip[0], skip_y,
+               sizeof(uint8_t) * xd->n8_h * xd->n8_w * 4);
+      }
+
+      inter_block_uvrd(cpi, x, &rate_uv, &dist_uv, &skip_uv,
+                       &sse_uv, bsize, INT64_MAX);
+#else
+      super_block_yrd(cpi, x, &rate_y, &dist_y, &skip_y, &sse_y,
+                      bsize, INT64_MAX);
+      super_block_uvrd(cpi, x, &rate_uv, &dist_uv, &skip_uv,
+                       &sse_uv, bsize, INT64_MAX);
+#endif  // CONFIG_VAR_TX
+    } else {
+      super_block_yrd(cpi, x, &rate_y, &dist_y, &skip_y, &sse_y,
+                      bsize, INT64_MAX);
+      super_block_uvrd(cpi, x, &rate_uv, &dist_uv, &skip_uv,
+                       &sse_uv, bsize, INT64_MAX);
+    }
+
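+    // Decide whether to code the refined block as skip: compare the cost
+    // of coding the coefficients against dropping them (distortion = sse,
+    // rate = the skip flag alone).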
+    if (RDCOST(x->rdmult, x->rddiv, rate_y + rate_uv, (dist_y + dist_uv)) >
+        RDCOST(x->rdmult, x->rddiv, 0, (sse_y + sse_uv))) {
+      skip_blk = 1;
+      rate_y = vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+      rate_uv = 0;
+      dist_y = sse_y;
+      dist_uv = sse_uv;
+    } else {
+      skip_blk = 0;
+      rate_y += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
+    }
+
+    if (RDCOST(x->rdmult, x->rddiv,
+               best_rate_y + best_rate_uv, rd_cost->dist) >
+        RDCOST(x->rdmult, x->rddiv,
+               rate_y + rate_uv, (dist_y + dist_uv))) {
+#if CONFIG_VAR_TX
+      int idx, idy;
+#endif
+      best_mbmode.tx_type = mbmi->tx_type;
+      best_mbmode.tx_size = mbmi->tx_size;
+#if CONFIG_VAR_TX
+      for (idy = 0; idy < xd->n8_h; ++idy)
+        for (idx = 0; idx < xd->n8_w; ++idx)
+          best_mbmode.inter_tx_size[idy][idx] = mbmi->inter_tx_size[idy][idx];
+
+      for (i = 0; i < MAX_MB_PLANE; ++i)
+        memcpy(ctx->blk_skip[i], x->blk_skip[i],
+               sizeof(uint8_t) * ctx->num_4x4_blk);
+#endif
+      rd_cost->rate += (rate_y + rate_uv - best_rate_y - best_rate_uv);
+      rd_cost->dist = dist_y + dist_uv;
+      rd_cost->rdcost = RDCOST(x->rdmult, x->rddiv,
+                               rd_cost->rate, rd_cost->dist);
+      best_skip2 = skip_blk;
+    }
+  }
+
+  // Only try palette mode when the best mode so far is an intra mode.
+  if (cm->allow_screen_content_tools && !is_inter_mode(best_mbmode.mode)) {
+    PREDICTION_MODE mode_selected;
+    int rate2 = 0, rate_y = 0;
+#if CONFIG_SUPERTX
+    int best_rate_nocoef;
+#endif
+    int64_t distortion2 = 0, distortion_y = 0, dummy_rd = best_rd, this_rd;
+    int skippable = 0, rate_overhead = 0;
+    TX_SIZE best_tx_size, uv_tx;
+    TX_TYPE best_tx_type;
+    PALETTE_MODE_INFO palette_mode_info;
+    uint8_t *const best_palette_color_map =
+        x->palette_buffer->best_palette_color_map;
+    uint8_t *const color_map = xd->plane[0].color_index_map;
+
+    mbmi->mode = DC_PRED;
+    mbmi->uv_mode = DC_PRED;
+    mbmi->ref_frame[0] = INTRA_FRAME;
+    mbmi->ref_frame[1] = NONE;
+    palette_mode_info.palette_size[0] = 0;
+    rate_overhead =
+        rd_pick_palette_intra_sby(cpi, x, bsize, palette_ctx,
+                                  intra_mode_cost[DC_PRED],
+                                  &palette_mode_info, best_palette_color_map,
+                                  &best_tx_size, &best_tx_type, &mode_selected,
+                                  &dummy_rd);
+    if (palette_mode_info.palette_size[0] == 0)
+      goto PALETTE_EXIT;
+
+    // The early exit above guarantees a non-empty palette here.
+    pmi->palette_size[0] = palette_mode_info.palette_size[0];
+    memcpy(pmi->palette_colors, palette_mode_info.palette_colors,
+           PALETTE_MAX_SIZE * sizeof(palette_mode_info.palette_colors[0]));
+    memcpy(color_map, best_palette_color_map,
+           rows * cols * sizeof(best_palette_color_map[0]));
+    super_block_yrd(cpi, x, &rate_y, &distortion_y, &skippable,
+                    NULL, bsize, best_rd);
+    if (rate_y == INT_MAX)
+      goto PALETTE_EXIT;
+    uv_tx = get_uv_tx_size_impl(mbmi->tx_size, bsize,
+                                xd->plane[1].subsampling_x,
+                                xd->plane[1].subsampling_y);
+    if (rate_uv_intra[uv_tx] == INT_MAX) {
+      choose_intra_uv_mode(cpi, x, ctx, bsize, uv_tx,
+                           &rate_uv_intra[uv_tx], &rate_uv_tokenonly[uv_tx],
+                           &dist_uv[uv_tx], &skip_uv[uv_tx], &mode_uv[uv_tx]);
+      pmi_uv[uv_tx] = *pmi;
+#if CONFIG_EXT_INTRA
+      ext_intra_mode_info_uv[uv_tx] = mbmi->ext_intra_mode_info;
+      uv_angle_delta[uv_tx] = mbmi->angle_delta[1];
+#endif  // CONFIG_EXT_INTRA
+    }
+    mbmi->uv_mode = mode_uv[uv_tx];
+    pmi->palette_size[1] = pmi_uv[uv_tx].palette_size[1];
+    if (pmi->palette_size[1] > 0)
+      memcpy(pmi->palette_colors + PALETTE_MAX_SIZE,
+             pmi_uv[uv_tx].palette_colors + PALETTE_MAX_SIZE,
+             2 * PALETTE_MAX_SIZE * sizeof(pmi->palette_colors[0]));
+#if CONFIG_EXT_INTRA
+    mbmi->angle_delta[1] = uv_angle_delta[uv_tx];
+    mbmi->ext_intra_mode_info.use_ext_intra_mode[1] =
+        ext_intra_mode_info_uv[uv_tx].use_ext_intra_mode[1];
+    if (ext_intra_mode_info_uv[uv_tx].use_ext_intra_mode[1]) {
+      mbmi->ext_intra_mode_info.ext_intra_mode[1] =
+          ext_intra_mode_info_uv[uv_tx].ext_intra_mode[1];
+    }
+#endif  // CONFIG_EXT_INTRA
+    skippable = skippable && skip_uv[uv_tx];
+    distortion2 = distortion_y + dist_uv[uv_tx];
+    rate2 = rate_y + rate_overhead + rate_uv_intra[uv_tx];
+    rate2 += ref_costs_single[INTRA_FRAME];
+
+    if (skippable) {
+      rate2 -= (rate_y + rate_uv_tokenonly[uv_tx]);
+#if CONFIG_SUPERTX
+      best_rate_nocoef = rate2;
+#endif
+      rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 1);
+    } else {
+#if CONFIG_SUPERTX
+      best_rate_nocoef = rate2 - (rate_y + rate_uv_tokenonly[uv_tx]);
+#endif
+      rate2 += vp10_cost_bit(vp10_get_skip_prob(cm, xd), 0);
+    }
+    this_rd = RDCOST(x->rdmult, x->rddiv, rate2, distortion2);
+    if (this_rd < best_rd) {
+      best_mode_index = 3;
+      mbmi->mv[0].as_int = 0;
+      rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+      *returnrate_nocoef = best_rate_nocoef;
+#endif
+      rd_cost->dist = distortion2;
+      rd_cost->rdcost = this_rd;
+      best_rd = this_rd;
+      best_mbmode = *mbmi;
+      best_skip2 = 0;
+      best_mode_skippable = skippable;
+      memcpy(ctx->zcoeff_blk, x->zcoeff_blk[mbmi->tx_size],
+             sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+    }
+  }
+  PALETTE_EXIT:
+
+#if CONFIG_EXT_INTRA
+  // TODO(huisu): ext-intra is turned off in lossless mode for now to
+  // avoid a unit test failure.
+  if (!xd->lossless[mbmi->segment_id] &&
+      mbmi->palette_mode_info.palette_size[0] == 0 && !dc_skipped &&
+      best_mode_index >= 0 && (best_intra_rd >> 1) < best_rd) {
+    pick_ext_intra_iframe(cpi, x, ctx, bsize, rate_uv_intra,
+                          rate_uv_tokenonly, dist_uv, skip_uv,
+                          mode_uv, ext_intra_mode_info_uv,
+                          pmi_uv, uv_angle_delta, palette_ctx, 0,
+                          ref_costs_single, &best_rd, &best_intra_rd,
+                          &best_intra_mode, &best_mode_index,
+                          &best_skip2, &best_mode_skippable,
+#if CONFIG_SUPERTX
+                          returnrate_nocoef,
+#endif  // CONFIG_SUPERTX
+                          best_pred_rd, &best_mbmode, rd_cost);
+  }
+#endif  // CONFIG_EXT_INTRA
+
   // The inter modes' rate costs are not calculated precisely in some cases.
   // Therefore, sometimes, NEWMV is chosen instead of NEARESTMV, NEARMV, and
   // ZEROMV. Here, checks are added for those cases, and the mode decisions
   // are corrected.
-  if (best_mbmode.mode == NEWMV) {
+  if (best_mbmode.mode == NEWMV
+#if CONFIG_EXT_INTER
+      || best_mbmode.mode == NEWFROMNEARMV
+      || best_mbmode.mode == NEW_NEWMV
+#endif  // CONFIG_EXT_INTER
+  ) {
     const MV_REFERENCE_FRAME refs[2] = {best_mbmode.ref_frame[0],
         best_mbmode.ref_frame[1]};
     int comp_pred_mode = refs[1] > INTRA_FRAME;
+#if CONFIG_REF_MV
+    const uint8_t rf_type = vp10_ref_frame_type(best_mbmode.ref_frame);
+    if (!comp_pred_mode) {
+      int i;
+      int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) ?
+          VPXMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2) : INT_MAX;
 
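+      // If the coded MV equals a lower-indexed ref_mv_stack entry, NEARMV
+      // plus that DRL index signals the same motion more cheaply than
+      // NEWMV, so the mode decision is corrected.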
+      for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
+        int_mv cur_mv = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
+        if (cur_mv.as_int == best_mbmode.mv[0].as_int) {
+          best_mbmode.mode = NEARMV;
+          best_mbmode.ref_mv_idx = i;
+        }
+      }
+
+      if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int)
+        best_mbmode.mode = NEARESTMV;
+      else if (best_mbmode.mv[0].as_int == 0)
+        best_mbmode.mode = ZEROMV;
+    } else {
+      int_mv nearestmv[2];
+      int_mv nearmv[2];
+
+#if CONFIG_EXT_INTER
+      if (mbmi_ext->ref_mv_count[rf_type] > 1) {
+        nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][1].this_mv;
+        nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][1].comp_mv;
+      } else {
+        nearmv[0] = frame_mv[NEARMV][refs[0]];
+        nearmv[1] = frame_mv[NEARMV][refs[1]];
+      }
+#else
+      int i;
+      int ref_set = (mbmi_ext->ref_mv_count[rf_type] >= 2) ?
+          VPXMIN(2, mbmi_ext->ref_mv_count[rf_type] - 2) : INT_MAX;
+
+      for (i = 0; i <= ref_set && ref_set != INT_MAX; ++i) {
+        nearmv[0] = mbmi_ext->ref_mv_stack[rf_type][i + 1].this_mv;
+        nearmv[1] = mbmi_ext->ref_mv_stack[rf_type][i + 1].comp_mv;
+
+        if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
+            nearmv[1].as_int == best_mbmode.mv[1].as_int) {
+          best_mbmode.mode = NEARMV;
+          best_mbmode.ref_mv_idx = i;
+        }
+      }
+#endif
+      if (mbmi_ext->ref_mv_count[rf_type] >= 1) {
+        nearestmv[0] = mbmi_ext->ref_mv_stack[rf_type][0].this_mv;
+        nearestmv[1] = mbmi_ext->ref_mv_stack[rf_type][0].comp_mv;
+      } else {
+        nearestmv[0] = frame_mv[NEARESTMV][refs[0]];
+        nearestmv[1] = frame_mv[NEARESTMV][refs[1]];
+      }
+
+      if (nearestmv[0].as_int == best_mbmode.mv[0].as_int &&
+          nearestmv[1].as_int == best_mbmode.mv[1].as_int)
+#if CONFIG_EXT_INTER
+        best_mbmode.mode = NEAREST_NEARESTMV;
+      else if (nearestmv[0].as_int == best_mbmode.mv[0].as_int &&
+               nearmv[1].as_int == best_mbmode.mv[1].as_int)
+        best_mbmode.mode = NEAREST_NEARMV;
+      else if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
+               nearestmv[1].as_int == best_mbmode.mv[1].as_int)
+        best_mbmode.mode = NEAR_NEARESTMV;
+      else if (nearmv[0].as_int == best_mbmode.mv[0].as_int &&
+               nearmv[1].as_int == best_mbmode.mv[1].as_int)
+        best_mbmode.mode = NEAR_NEARMV;
+      else if (best_mbmode.mv[0].as_int == 0 && best_mbmode.mv[1].as_int == 0)
+        best_mbmode.mode = ZERO_ZEROMV;
+#else
+        best_mbmode.mode = NEARESTMV;
+      else if (best_mbmode.mv[0].as_int == 0 && best_mbmode.mv[1].as_int == 0)
+        best_mbmode.mode = ZEROMV;
+#endif  // CONFIG_EXT_INTER
+    }
+#else
+#if CONFIG_EXT_INTER
+    if (!comp_pred_mode) {
+#endif  // CONFIG_EXT_INTER
     if (frame_mv[NEARESTMV][refs[0]].as_int == best_mbmode.mv[0].as_int &&
         ((comp_pred_mode && frame_mv[NEARESTMV][refs[1]].as_int ==
             best_mbmode.mv[1].as_int) || !comp_pred_mode))
@@ -3516,8 +9886,58 @@
     else if (best_mbmode.mv[0].as_int == 0 &&
         ((comp_pred_mode && best_mbmode.mv[1].as_int == 0) || !comp_pred_mode))
       best_mbmode.mode = ZEROMV;
+#if CONFIG_EXT_INTER
+    } else {
+      const MV_REFERENCE_FRAME refs[2] = {best_mbmode.ref_frame[0],
+          best_mbmode.ref_frame[1]};
+
+      if (frame_mv[NEAREST_NEARESTMV][refs[0]].as_int ==
+            best_mbmode.mv[0].as_int &&
+          frame_mv[NEAREST_NEARESTMV][refs[1]].as_int ==
+            best_mbmode.mv[1].as_int)
+        best_mbmode.mode = NEAREST_NEARESTMV;
+      else if (frame_mv[NEAREST_NEARMV][refs[0]].as_int ==
+                 best_mbmode.mv[0].as_int &&
+               frame_mv[NEAREST_NEARMV][refs[1]].as_int ==
+                 best_mbmode.mv[1].as_int)
+        best_mbmode.mode = NEAREST_NEARMV;
+      else if (frame_mv[NEAR_NEARESTMV][refs[0]].as_int ==
+                 best_mbmode.mv[0].as_int &&
+               frame_mv[NEAR_NEARESTMV][refs[1]].as_int ==
+                 best_mbmode.mv[1].as_int)
+        best_mbmode.mode = NEAR_NEARESTMV;
+      else if (frame_mv[NEAR_NEARMV][refs[0]].as_int ==
+                 best_mbmode.mv[0].as_int &&
+               frame_mv[NEAR_NEARMV][refs[1]].as_int ==
+                 best_mbmode.mv[1].as_int)
+        best_mbmode.mode = NEAR_NEARMV;
+      else if (best_mbmode.mv[0].as_int == 0 && best_mbmode.mv[1].as_int == 0)
+        best_mbmode.mode = ZERO_ZEROMV;
+    }
+#endif  // CONFIG_EXT_INTER
+#endif
   }
 
+#if CONFIG_REF_MV
+  if (best_mbmode.ref_frame[0] > INTRA_FRAME &&
+      best_mbmode.mv[0].as_int == 0 &&
+#if CONFIG_EXT_INTER
+      (best_mbmode.ref_frame[1] <= INTRA_FRAME)
+#else
+      (best_mbmode.ref_frame[1] == NONE || best_mbmode.mv[1].as_int == 0)
+#endif  // CONFIG_EXT_INTER
+     ) {
+    int16_t mode_ctx = mbmi_ext->mode_context[best_mbmode.ref_frame[0]];
+#if !CONFIG_EXT_INTER
+    if (best_mbmode.ref_frame[1] > NONE)
+      mode_ctx &= (mbmi_ext->mode_context[best_mbmode.ref_frame[1]] | 0x00ff);
+#endif  // !CONFIG_EXT_INTER
+
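+  // The mode context can flag that every candidate MV is zero; in that
+  // case a zero coded MV is canonically signaled as ZEROMV.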
+    if (mode_ctx & (1 << ALL_ZERO_FLAG_OFFSET))
+      best_mbmode.mode = ZEROMV;
+  }
+#endif
+
   if (best_mode_index < 0 || best_rd >= best_rd_so_far) {
     rd_cost->rate = INT_MAX;
     rd_cost->rdcost = INT64_MAX;
@@ -3531,7 +9951,7 @@
       TX_SIZE uv_tx_size;
       *mbmi = best_mbmode;
       uv_tx_size = get_uv_tx_size(mbmi, &xd->plane[1]);
-      rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra[uv_tx_size],
+      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra[uv_tx_size],
                               &rate_uv_tokenonly[uv_tx_size],
                               &dist_uv[uv_tx_size],
                               &skip_uv[uv_tx_size],
@@ -3540,18 +9960,44 @@
     }
   }
 
+#if CONFIG_DUAL_FILTER
+  assert((cm->interp_filter == SWITCHABLE) ||
+         (cm->interp_filter == best_mbmode.interp_filter[0]) ||
+         !is_inter_block(&best_mbmode));
+  assert((cm->interp_filter == SWITCHABLE) ||
+         (cm->interp_filter == best_mbmode.interp_filter[1]) ||
+         !is_inter_block(&best_mbmode));
+  if (best_mbmode.ref_frame[1] > INTRA_FRAME) {
+    assert((cm->interp_filter == SWITCHABLE) ||
+           (cm->interp_filter == best_mbmode.interp_filter[2]) ||
+           !is_inter_block(&best_mbmode));
+    assert((cm->interp_filter == SWITCHABLE) ||
+           (cm->interp_filter == best_mbmode.interp_filter[3]) ||
+           !is_inter_block(&best_mbmode));
+  }
+#else
   assert((cm->interp_filter == SWITCHABLE) ||
          (cm->interp_filter == best_mbmode.interp_filter) ||
          !is_inter_block(&best_mbmode));
+#endif
 
   if (!cpi->rc.is_src_frame_alt_ref)
-    vp10_update_rd_thresh_fact(tile_data->thresh_freq_fact,
-                              sf->adaptive_rd_thresh, bsize, best_mode_index);
+    vp10_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+                               sf->adaptive_rd_thresh, bsize, best_mode_index);
 
   // macroblock modes
   *mbmi = best_mbmode;
   x->skip |= best_skip2;
 
+#if CONFIG_REF_MV
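+  // Store the MV predictor for each reference: the coded MV itself for
+  // non-NEWMV modes, otherwise the best reference MV it was coded against.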
+  for (i = 0; i < 1 + has_second_ref(mbmi); ++i) {
+    if (mbmi->mode != NEWMV)
+      mbmi->pred_mv[i].as_int = mbmi->mv[i].as_int;
+    else
+      mbmi->pred_mv[i].as_int = mbmi_ext->ref_mvs[mbmi->ref_frame[i]][0].as_int;
+  }
+#endif
+
   for (i = 0; i < REFERENCE_MODES; ++i) {
     if (best_pred_rd[i] == INT64_MAX)
       best_pred_diff[i] = INT_MIN;
@@ -3559,45 +10005,16 @@
       best_pred_diff[i] = best_rd - best_pred_rd[i];
   }
 
-  if (!x->skip) {
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-      if (best_filter_rd[i] == INT64_MAX)
-        best_filter_diff[i] = 0;
-      else
-        best_filter_diff[i] = best_rd - best_filter_rd[i];
-    }
-    if (cm->interp_filter == SWITCHABLE)
-      assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
-  } else {
-    vp10_zero(best_filter_diff);
-  }
-
-  // TODO(yunqingwang): Moving this line in front of the above best_filter_diff
-  // updating code causes PSNR loss. Need to figure out the confliction.
   x->skip |= best_mode_skippable;
 
-  if (!x->skip && !x->select_tx_size) {
-    int has_high_freq_coeff = 0;
-    int plane;
-    int max_plane = is_inter_block(&xd->mi[0]->mbmi)
-                        ? MAX_MB_PLANE : 1;
-    for (plane = 0; plane < max_plane; ++plane) {
-      x->plane[plane].eobs = ctx->eobs_pbuf[plane][1];
-      has_high_freq_coeff |= vp10_has_high_freq_in_plane(x, bsize, plane);
-    }
-
-    for (plane = max_plane; plane < MAX_MB_PLANE; ++plane) {
-      x->plane[plane].eobs = ctx->eobs_pbuf[plane][2];
-      has_high_freq_coeff |= vp10_has_high_freq_in_plane(x, bsize, plane);
-    }
-
-    best_mode_skippable |= !has_high_freq_coeff;
-  }
-
   assert(best_mode_index >= 0);
 
   store_coding_context(x, ctx, best_mode_index, best_pred_diff,
-                       best_filter_diff, best_mode_skippable);
+                       best_mode_skippable);
+
+  if (cm->allow_screen_content_tools && pmi->palette_size[1] > 0) {
+    restore_uv_color_map(cpi, x);
+  }
 }
 
 void vp10_rd_pick_inter_mode_sb_seg_skip(VP10_COMP *cpi,
@@ -3614,7 +10031,6 @@
   const int comp_pred = 0;
   int i;
   int64_t best_pred_diff[REFERENCE_MODES];
-  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
   vpx_prob comp_mode_p;
   INTERP_FILTER best_filter = SWITCHABLE;
@@ -3634,35 +10050,69 @@
 
   assert(segfeature_active(&cm->seg, segment_id, SEG_LVL_SKIP));
 
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+#if CONFIG_EXT_INTRA
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+#endif  // CONFIG_EXT_INTRA
   mbmi->mode = ZEROMV;
+  mbmi->motion_variation = SIMPLE_TRANSLATION;
   mbmi->uv_mode = DC_PRED;
   mbmi->ref_frame[0] = LAST_FRAME;
   mbmi->ref_frame[1] = NONE;
   mbmi->mv[0].as_int = 0;
+#if CONFIG_REF_MV
+  mbmi->ref_mv_idx = 0;
+  mbmi->pred_mv[0].as_int = 0;
+#endif
   x->skip = 1;
 
   if (cm->interp_filter != BILINEAR) {
-    best_filter = EIGHTTAP;
+    best_filter = EIGHTTAP_REGULAR;
     if (cm->interp_filter == SWITCHABLE &&
+#if CONFIG_EXT_INTERP
+        vp10_is_interp_needed(xd) &&
+#endif  // CONFIG_EXT_INTERP
         x->source_variance >= cpi->sf.disable_filter_search_var_thresh) {
       int rs;
       int best_rs = INT_MAX;
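+      // With SEG_LVL_SKIP there is no residual to code, so only the filter
+      // signaling cost matters: pick the cheapest switchable filter.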
       for (i = 0; i < SWITCHABLE_FILTERS; ++i) {
+#if CONFIG_DUAL_FILTER
+        int k;
+        for (k = 0; k < 4; ++k)
+          mbmi->interp_filter[k] = i;
+#else
         mbmi->interp_filter = i;
+#endif
         rs = vp10_get_switchable_rate(cpi, xd);
         if (rs < best_rs) {
           best_rs = rs;
+#if CONFIG_DUAL_FILTER
+          best_filter = mbmi->interp_filter[0];
+#else
           best_filter = mbmi->interp_filter;
+#endif
         }
       }
     }
   }
   // Set the appropriate filter
   if (cm->interp_filter == SWITCHABLE) {
+#if CONFIG_DUAL_FILTER
+    for (i = 0; i < 4; ++i)
+      mbmi->interp_filter[i] = best_filter;
+#else
     mbmi->interp_filter = best_filter;
+#endif
     rate2 += vp10_get_switchable_rate(cpi, xd);
   } else {
+#if CONFIG_DUAL_FILTER
+    for (i = 0; i < 4; ++i)
+      mbmi->interp_filter[i] = cm->interp_filter;
+#else
     mbmi->interp_filter = cm->interp_filter;
+#endif
   }
 
   if (cm->reference_mode == REFERENCE_MODE_SELECT)
@@ -3683,29 +10133,34 @@
     return;
   }
 
+#if CONFIG_DUAL_FILTER
+  assert((cm->interp_filter == SWITCHABLE) ||
+         (cm->interp_filter == mbmi->interp_filter[0]));
+#else
   assert((cm->interp_filter == SWITCHABLE) ||
          (cm->interp_filter == mbmi->interp_filter));
+#endif
 
-  vp10_update_rd_thresh_fact(tile_data->thresh_freq_fact,
-                            cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
+  vp10_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+                             cpi->sf.adaptive_rd_thresh, bsize, THR_ZEROMV);
 
   vp10_zero(best_pred_diff);
-  vp10_zero(best_filter_diff);
 
-  if (!x->select_tx_size)
-    swap_block_ptr(x, ctx, 1, 0, 0, MAX_MB_PLANE);
   store_coding_context(x, ctx, THR_ZEROMV,
-                       best_pred_diff, best_filter_diff, 0);
+                       best_pred_diff, 0);
 }
 
-void vp10_rd_pick_inter_mode_sub8x8(VP10_COMP *cpi,
-                                   TileDataEnc *tile_data,
-                                   MACROBLOCK *x,
-                                   int mi_row, int mi_col,
-                                   RD_COST *rd_cost,
-                                   BLOCK_SIZE bsize,
-                                   PICK_MODE_CONTEXT *ctx,
-                                   int64_t best_rd_so_far) {
+void vp10_rd_pick_inter_mode_sub8x8(struct VP10_COMP *cpi,
+                                    TileDataEnc *tile_data,
+                                    struct macroblock *x,
+                                    int mi_row, int mi_col,
+                                    struct RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+                                    int *returnrate_nocoef,
+#endif  // CONFIG_SUPERTX
+                                    BLOCK_SIZE bsize,
+                                    PICK_MODE_CONTEXT *ctx,
+                                    int64_t best_rd_so_far) {
   VP10_COMMON *const cm = &cpi->common;
   RD_OPT *const rd_opt = &cpi->rd;
   SPEED_FEATURES *const sf = &cpi->sf;
@@ -3716,45 +10171,80 @@
   unsigned char segment_id = mbmi->segment_id;
   int comp_pred, i;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
-  struct buf_2d yv12_mb[4][MAX_MB_PLANE];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
+  struct buf_2d yv12_mb[MAX_REF_FRAMES][MAX_MB_PLANE];
+  static const int flag_list[REFS_PER_FRAME + 1] = {
+    0,
+    VP9_LAST_FLAG,
+#if CONFIG_EXT_REFS
+    VP9_LAST2_FLAG,
+    VP9_LAST3_FLAG,
+#endif  // CONFIG_EXT_REFS
+    VP9_GOLD_FLAG,
+#if CONFIG_EXT_REFS
+    VP9_BWD_FLAG,
+#endif  // CONFIG_EXT_REFS
+    VP9_ALT_FLAG
+  };
   int64_t best_rd = best_rd_so_far;
   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
   int64_t best_pred_diff[REFERENCE_MODES];
   int64_t best_pred_rd[REFERENCE_MODES];
-  int64_t best_filter_rd[SWITCHABLE_FILTER_CONTEXTS];
-  int64_t best_filter_diff[SWITCHABLE_FILTER_CONTEXTS];
   MB_MODE_INFO best_mbmode;
   int ref_index, best_ref_index = 0;
   unsigned int ref_costs_single[MAX_REF_FRAMES], ref_costs_comp[MAX_REF_FRAMES];
   vpx_prob comp_mode_p;
+#if CONFIG_DUAL_FILTER
+  INTERP_FILTER tmp_best_filter[4] = { 0 };
+#else
   INTERP_FILTER tmp_best_filter = SWITCHABLE;
+#endif
   int rate_uv_intra, rate_uv_tokenonly;
   int64_t dist_uv;
   int skip_uv;
   PREDICTION_MODE mode_uv = DC_PRED;
   const int intra_cost_penalty = vp10_get_intra_cost_penalty(
     cm->base_qindex, cm->y_dc_delta_q, cm->bit_depth);
+#if CONFIG_EXT_INTER
+  int_mv seg_mvs[4][2][MAX_REF_FRAMES];
+#else
   int_mv seg_mvs[4][MAX_REF_FRAMES];
+#endif  // CONFIG_EXT_INTER
   b_mode_info best_bmodes[4];
   int best_skip2 = 0;
   int ref_frame_skip_mask[2] = { 0 };
-  int64_t mask_filter = 0;
-  int64_t filter_cache[SWITCHABLE_FILTER_CONTEXTS];
   int internal_active_edge =
     vp10_active_edge_sb(cpi, mi_row, mi_col) && vp10_internal_image_edge(cpi);
 
+#if CONFIG_SUPERTX
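+  // Presumably SuperTX needs a complete sub8x8 evaluation, so the pruning
+  // thresholds inherited from the caller are reset here.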
+  best_rd_so_far = INT64_MAX;
+  best_rd = best_rd_so_far;
+  best_yrd = best_rd_so_far;
+#endif  // CONFIG_SUPERTX
   memset(x->zcoeff_blk[TX_4X4], 0, 4);
   vp10_zero(best_mbmode);
 
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-    filter_cache[i] = INT64_MAX;
+#if CONFIG_EXT_INTRA
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[0] = 0;
+  mbmi->ext_intra_mode_info.use_ext_intra_mode[1] = 0;
+#endif  // CONFIG_EXT_INTRA
+  mbmi->motion_variation = SIMPLE_TRANSLATION;
+#if CONFIG_EXT_INTER
+  mbmi->use_wedge_interinter = 0;
+  mbmi->use_wedge_interintra = 0;
+#endif  // CONFIG_EXT_INTER
 
   for (i = 0; i < 4; i++) {
     int j;
+#if CONFIG_EXT_INTER
+    int k;
+
+    for (k = 0; k < 2; k++)
+      for (j = 0; j < MAX_REF_FRAMES; j++)
+        seg_mvs[i][k][j].as_int = INVALID_MV;
+#else
     for (j = 0; j < MAX_REF_FRAMES; j++)
       seg_mvs[i][j].as_int = INVALID_MV;
+#endif  // CONFIG_EXT_INTER
   }
 
   estimate_ref_frame_costs(cm, xd, segment_id, ref_costs_single, ref_costs_comp,
@@ -3762,13 +10252,18 @@
 
   for (i = 0; i < REFERENCE_MODES; ++i)
     best_pred_rd[i] = INT64_MAX;
-  for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-    best_filter_rd[i] = INT64_MAX;
   rate_uv_intra = INT_MAX;
 
   rd_cost->rate = INT_MAX;
+#if CONFIG_SUPERTX
+  *returnrate_nocoef = INT_MAX;
+#endif
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
+    x->mbmi_ext->mode_context[ref_frame] = 0;
+#if CONFIG_REF_MV && CONFIG_EXT_INTER
+    x->mbmi_ext->compound_mode_context[ref_frame] = 0;
+#endif  // CONFIG_REF_MV && CONFIG_EXT_INTER
     if (cpi->ref_frame_flags & flag_list[ref_frame]) {
       setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
                          frame_mv[NEARESTMV], frame_mv[NEARMV],
@@ -3778,9 +10273,15 @@
       ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
     }
     frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
+#if CONFIG_EXT_INTER
+    frame_mv[NEWFROMNEARMV][ref_frame].as_int = INVALID_MV;
+#endif  // CONFIG_EXT_INTER
     frame_mv[ZEROMV][ref_frame].as_int = 0;
   }
 
+  mbmi->palette_mode_info.palette_size[0] = 0;
+  mbmi->palette_mode_info.palette_size[1] = 0;
+
   for (ref_index = 0; ref_index < MAX_REFS; ++ref_index) {
     int mode_excluded = 0;
     int64_t this_rd = INT64_MAX;
@@ -3805,15 +10306,64 @@
           case INTRA_FRAME:
             break;
           case LAST_FRAME:
-            ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) |
+#if CONFIG_EXT_REFS
+                                      (1 << LAST2_FRAME) |
+                                      (1 << LAST3_FRAME) |
+                                      (1 << BWDREF_FRAME) |
+#endif  // CONFIG_EXT_REFS
+                                      (1 << ALTREF_FRAME);
             ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
             break;
+#if CONFIG_EXT_REFS
+          case LAST2_FRAME:
+            ref_frame_skip_mask[0] |= (1 << LAST_FRAME) |
+                                      (1 << LAST3_FRAME) |
+                                      (1 << GOLDEN_FRAME) |
+                                      (1 << BWDREF_FRAME) |
+                                      (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+            break;
+          case LAST3_FRAME:
+            ref_frame_skip_mask[0] |= (1 << LAST_FRAME) |
+                                      (1 << LAST2_FRAME) |
+                                      (1 << GOLDEN_FRAME) |
+                                      (1 << BWDREF_FRAME) |
+                                      (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
+            break;
+#endif  // CONFIG_EXT_REFS
           case GOLDEN_FRAME:
-            ref_frame_skip_mask[0] |= (1 << LAST_FRAME) | (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[0] |= (1 << LAST_FRAME) |
+#if CONFIG_EXT_REFS
+                                      (1 << LAST2_FRAME) |
+                                      (1 << LAST3_FRAME) |
+                                      (1 << BWDREF_FRAME) |
+#endif  // CONFIG_EXT_REFS
+                                      (1 << ALTREF_FRAME);
             ref_frame_skip_mask[1] |= SECOND_REF_FRAME_MASK;
             break;
+#if CONFIG_EXT_REFS
+          case BWDREF_FRAME:
+            ref_frame_skip_mask[0] |= (1 << LAST_FRAME) |
+                                      (1 << LAST2_FRAME) |
+                                      (1 << LAST3_FRAME) |
+                                      (1 << GOLDEN_FRAME) |
+                                      (1 << ALTREF_FRAME);
+            ref_frame_skip_mask[1] |= (1 << ALTREF_FRAME) | 0x01;
+            break;
+#endif  // CONFIG_EXT_REFS
           case ALTREF_FRAME:
-            ref_frame_skip_mask[0] |= (1 << GOLDEN_FRAME) | (1 << LAST_FRAME);
+            ref_frame_skip_mask[0] |= (1 << LAST_FRAME) |
+#if CONFIG_EXT_REFS
+                                      (1 << LAST2_FRAME) |
+                                      (1 << LAST3_FRAME) |
+                                      (1 << BWDREF_FRAME) |
+#endif  // CONFIG_EXT_REFS
+                                      (1 << GOLDEN_FRAME);
+#if CONFIG_EXT_REFS
+            ref_frame_skip_mask[1] |= (1 << BWDREF_FRAME) | 0x01;
+#endif  // CONFIG_EXT_REFS
             break;
           case NONE:
           case MAX_REF_FRAMES:
@@ -3888,8 +10438,14 @@
     mbmi->ref_frame[1] = second_ref_frame;
     // Evaluate all sub-pel filters irrespective of whether we can use
     // them for this frame.
-    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP
+#if CONFIG_DUAL_FILTER
+    for (i = 0; i < 4; ++i)
+      mbmi->interp_filter[i] = cm->interp_filter == SWITCHABLE ?
+          EIGHTTAP_REGULAR : cm->interp_filter;
+#else
+    mbmi->interp_filter = cm->interp_filter == SWITCHABLE ? EIGHTTAP_REGULAR
                                                           : cm->interp_filter;
+#endif
     x->skip = 0;
     set_ref_ptrs(cm, xd, ref_frame, second_ref_frame);
 
@@ -3900,6 +10456,10 @@
         xd->plane[i].pre[1] = yv12_mb[second_ref_frame][i];
     }
 
+#if CONFIG_VAR_TX
+    mbmi->inter_tx_size[0][0] = mbmi->tx_size;
+#endif
+
     if (ref_frame == INTRA_FRAME) {
       int rate;
       if (rd_pick_intra_sub_8x8_y_mode(cpi, x, &rate, &rate_y,
@@ -3932,70 +10492,152 @@
       int switchable_filter_index;
       int_mv *second_ref = comp_pred ?
                              &x->mbmi_ext->ref_mvs[second_ref_frame][0] : NULL;
-      b_mode_info tmp_best_bmodes[16];
+      b_mode_info tmp_best_bmodes[16];  // TODO: should this be 4?
       MB_MODE_INFO tmp_best_mbmode;
+#if CONFIG_DUAL_FILTER
+#if CONFIG_EXT_INTERP
+      BEST_SEG_INFO bsi[25];
+#else
+      BEST_SEG_INFO bsi[9];
+#endif
+#else
       BEST_SEG_INFO bsi[SWITCHABLE_FILTERS];
+#endif
       int pred_exists = 0;
       int uv_skippable;
+#if CONFIG_EXT_INTER
+      int_mv compound_seg_newmvs[4][2];
+      for (i = 0; i < 4; i++) {
+        compound_seg_newmvs[i][0].as_int = INVALID_MV;
+        compound_seg_newmvs[i][1].as_int = INVALID_MV;
+      }
+#endif  // CONFIG_EXT_INTER
 
       this_rd_thresh = (ref_frame == LAST_FRAME) ?
           rd_opt->threshes[segment_id][bsize][THR_LAST] :
           rd_opt->threshes[segment_id][bsize][THR_ALTR];
+#if CONFIG_EXT_REFS
+      this_rd_thresh = (ref_frame == LAST2_FRAME) ?
+          rd_opt->threshes[segment_id][bsize][THR_LAST2] : this_rd_thresh;
+      this_rd_thresh = (ref_frame == LAST3_FRAME) ?
+          rd_opt->threshes[segment_id][bsize][THR_LAST3] : this_rd_thresh;
+#endif  // CONFIG_EXT_REFS
       this_rd_thresh = (ref_frame == GOLDEN_FRAME) ?
-      rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
-      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; ++i)
-        filter_cache[i] = INT64_MAX;
+          rd_opt->threshes[segment_id][bsize][THR_GOLD] : this_rd_thresh;
+#if CONFIG_EXT_REFS
+      // TODO(zoeliu): To explore whether this_rd_thresh should consider
+      //               BWDREF_FRAME and ALTREF_FRAME
+#endif  // CONFIG_EXT_REFS
 
       // TODO(any): Add search of the tx_type to improve rd performance at the
       // expense of speed.
       mbmi->tx_type = DCT_DCT;
 
       if (cm->interp_filter != BILINEAR) {
-        tmp_best_filter = EIGHTTAP;
+#if CONFIG_DUAL_FILTER
+        tmp_best_filter[0] = EIGHTTAP_REGULAR;
+        tmp_best_filter[1] = EIGHTTAP_REGULAR;
+        tmp_best_filter[2] = EIGHTTAP_REGULAR;
+        tmp_best_filter[3] = EIGHTTAP_REGULAR;
+#else
+        tmp_best_filter = EIGHTTAP_REGULAR;
+#endif
         if (x->source_variance < sf->disable_filter_search_var_thresh) {
-          tmp_best_filter = EIGHTTAP;
+#if CONFIG_DUAL_FILTER
+          tmp_best_filter[0] = EIGHTTAP_REGULAR;
+#else
+          tmp_best_filter = EIGHTTAP_REGULAR;
+#endif
         } else if (sf->adaptive_pred_interp_filter == 1 &&
                    ctx->pred_interp_filter < SWITCHABLE) {
+#if CONFIG_DUAL_FILTER
+          tmp_best_filter[0] = ctx->pred_interp_filter;
+#else
           tmp_best_filter = ctx->pred_interp_filter;
+#endif
         } else if (sf->adaptive_pred_interp_filter == 2) {
-          tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ?
+#if CONFIG_DUAL_FILTER
+          tmp_best_filter[0] = ctx->pred_interp_filter < SWITCHABLE ?
                               ctx->pred_interp_filter : 0;
+#else
+          tmp_best_filter = ctx->pred_interp_filter < SWITCHABLE ?
+              ctx->pred_interp_filter : 0;
+#endif
         } else {
+#if CONFIG_DUAL_FILTER
+          for (switchable_filter_index = 0;
+#if CONFIG_EXT_INTERP
+               switchable_filter_index < 25;
+#else
+               switchable_filter_index < 9;
+#endif
+               ++switchable_filter_index) {
+#else
           for (switchable_filter_index = 0;
                switchable_filter_index < SWITCHABLE_FILTERS;
                ++switchable_filter_index) {
+#endif
             int newbest, rs;
             int64_t rs_rd;
             MB_MODE_INFO_EXT *mbmi_ext = x->mbmi_ext;
+#if CONFIG_DUAL_FILTER
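+            // filter_sets presumably enumerates the filter pairs searched
+            // here: 3x3 = 9 combinations, or 5x5 = 25 when CONFIG_EXT_INTERP
+            // widens the switchable set; the pair is replicated across
+            // slots [0]/[2] and [1]/[3].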
+            mbmi->interp_filter[0] = filter_sets[switchable_filter_index][0];
+            mbmi->interp_filter[1] = filter_sets[switchable_filter_index][1];
+            mbmi->interp_filter[2] = filter_sets[switchable_filter_index][0];
+            mbmi->interp_filter[3] = filter_sets[switchable_filter_index][1];
+#else
             mbmi->interp_filter = switchable_filter_index;
+#endif
             tmp_rd = rd_pick_best_sub8x8_mode(cpi, x,
                                               &mbmi_ext->ref_mvs[ref_frame][0],
                                               second_ref, best_yrd, &rate,
                                               &rate_y, &distortion,
                                               &skippable, &total_sse,
                                               (int) this_rd_thresh, seg_mvs,
+#if CONFIG_EXT_INTER
+                                              compound_seg_newmvs,
+#endif  // CONFIG_EXT_INTER
                                               bsi, switchable_filter_index,
                                               mi_row, mi_col);
-
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+            if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
+                (mbmi->interp_filter[0] != EIGHTTAP_REGULAR ||
+                 mbmi->interp_filter[1] != EIGHTTAP_REGULAR))  // invalid config
+              continue;
+#else
+            if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
+                mbmi->interp_filter != EIGHTTAP_REGULAR)  // invalid config
+              continue;
+#endif
+#endif  // CONFIG_EXT_INTERP
             if (tmp_rd == INT64_MAX)
               continue;
             rs = vp10_get_switchable_rate(cpi, xd);
             rs_rd = RDCOST(x->rdmult, x->rddiv, rs, 0);
-            filter_cache[switchable_filter_index] = tmp_rd;
-            filter_cache[SWITCHABLE_FILTERS] =
-                VPXMIN(filter_cache[SWITCHABLE_FILTERS], tmp_rd + rs_rd);
             if (cm->interp_filter == SWITCHABLE)
               tmp_rd += rs_rd;
 
-            mask_filter = VPXMAX(mask_filter, tmp_rd);
-
             newbest = (tmp_rd < tmp_best_rd);
             if (newbest) {
+#if CONFIG_DUAL_FILTER
+              tmp_best_filter[0] = mbmi->interp_filter[0];
+              tmp_best_filter[1] = mbmi->interp_filter[1];
+              tmp_best_filter[2] = mbmi->interp_filter[2];
+              tmp_best_filter[3] = mbmi->interp_filter[3];
+#else
               tmp_best_filter = mbmi->interp_filter;
+#endif
               tmp_best_rd = tmp_rd;
             }
             if ((newbest && cm->interp_filter == SWITCHABLE) ||
-                (mbmi->interp_filter == cm->interp_filter &&
+                (
+#if CONFIG_DUAL_FILTER
+                 mbmi->interp_filter[0] == cm->interp_filter
+#else
+                 mbmi->interp_filter == cm->interp_filter
+#endif
+                 &&
                  cm->interp_filter != SWITCHABLE)) {
               tmp_best_rdu = tmp_rd;
               tmp_best_rate = rate;
@@ -4009,17 +10651,6 @@
                 x->zcoeff_blk[TX_4X4][i] = !x->plane[0].eobs[i];
               }
               pred_exists = 1;
-              if (switchable_filter_index == 0 &&
-                  sf->use_rd_breakout &&
-                  best_rd < INT64_MAX) {
-                if (tmp_best_rdu / 2 > best_rd) {
-                  // skip searching the other filters if the first is
-                  // already substantially larger than the best so far
-                  tmp_best_filter = mbmi->interp_filter;
-                  tmp_best_rdu = INT64_MAX;
-                  break;
-                }
-              }
             }
           }  // switchable_filter_index loop
         }
@@ -4028,17 +10659,47 @@
       if (tmp_best_rdu == INT64_MAX && pred_exists)
         continue;
 
+#if CONFIG_DUAL_FILTER
+      mbmi->interp_filter[0] = (cm->interp_filter == SWITCHABLE ?
+                             tmp_best_filter[0] : cm->interp_filter);
+      mbmi->interp_filter[1] = (cm->interp_filter == SWITCHABLE ?
+                             tmp_best_filter[1] : cm->interp_filter);
+      mbmi->interp_filter[2] = (cm->interp_filter == SWITCHABLE ?
+                             tmp_best_filter[2] : cm->interp_filter);
+      mbmi->interp_filter[3] = (cm->interp_filter == SWITCHABLE ?
+                             tmp_best_filter[3] : cm->interp_filter);
+#else
       mbmi->interp_filter = (cm->interp_filter == SWITCHABLE ?
                              tmp_best_filter : cm->interp_filter);
+#endif
+
       if (!pred_exists) {
         // Handles the special case when a filter that is not in the
-        // switchable list (bilinear, 6-tap) is indicated at the frame level
+        // switchable list (bilinear) is indicated at the frame level
         tmp_rd = rd_pick_best_sub8x8_mode(cpi, x,
                                           &x->mbmi_ext->ref_mvs[ref_frame][0],
                                           second_ref, best_yrd, &rate, &rate_y,
                                           &distortion, &skippable, &total_sse,
-                                          (int) this_rd_thresh, seg_mvs, bsi, 0,
+                                          (int) this_rd_thresh, seg_mvs,
+#if CONFIG_EXT_INTER
+                                          compound_seg_newmvs,
+#endif  // CONFIG_EXT_INTER
+                                          bsi, 0,
                                           mi_row, mi_col);
+#if CONFIG_EXT_INTERP
+#if CONFIG_DUAL_FILTER
+        if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
+            (mbmi->interp_filter[0] != EIGHTTAP_REGULAR ||
+             mbmi->interp_filter[1] != EIGHTTAP_REGULAR)) {
+          mbmi->interp_filter[0] = EIGHTTAP_REGULAR;
+          mbmi->interp_filter[1] = EIGHTTAP_REGULAR;
+        }
+#else
+        if (!vp10_is_interp_needed(xd) && cm->interp_filter == SWITCHABLE &&
+            mbmi->interp_filter != EIGHTTAP_REGULAR)
+          mbmi->interp_filter = EIGHTTAP_REGULAR;
+#endif  // CONFIG_DUAL_FILTER
+#endif  // CONFIG_EXT_INTERP
         if (tmp_rd == INT64_MAX)
           continue;
       } else {
@@ -4051,6 +10712,23 @@
         for (i = 0; i < 4; i++)
           xd->mi[0]->bmi[i] = tmp_best_bmodes[i];
       }
+      // Add in the cost of the transform type
+      if (!xd->lossless[mbmi->segment_id]) {
+        int rate_tx_type = 0;
+#if CONFIG_EXT_TX
+        if (get_ext_tx_types(mbmi->tx_size, bsize, 1) > 1) {
+          const int eset = get_ext_tx_set(mbmi->tx_size, bsize, 1);
+          rate_tx_type =
+              cpi->inter_tx_type_costs[eset][mbmi->tx_size][mbmi->tx_type];
+        }
+#else
+        if (mbmi->tx_size < TX_32X32) {
+          rate_tx_type = cpi->inter_tx_type_costs[mbmi->tx_size][mbmi->tx_type];
+        }
+#endif
+        rate += rate_tx_type;
+        rate_y += rate_tx_type;
+      }
 
       rate2 += rate;
       distortion2 += distortion;
@@ -4073,15 +10751,21 @@
         // then don't bother looking at UV
         vp10_build_inter_predictors_sbuv(&x->e_mbd, mi_row, mi_col,
                                         BLOCK_8X8);
-        memset(x->skip_txfm, SKIP_TXFM_NONE, sizeof(x->skip_txfm));
+#if CONFIG_VAR_TX
+        if (!inter_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
+                              &uv_sse, BLOCK_8X8, tmp_best_rdu))
+          continue;
+#else
         if (!super_block_uvrd(cpi, x, &rate_uv, &distortion_uv, &uv_skippable,
                               &uv_sse, BLOCK_8X8, tmp_best_rdu))
           continue;
-
+#endif
         rate2 += rate_uv;
         distortion2 += distortion_uv;
         skippable = skippable && uv_skippable;
         total_sse += uv_sse;
+      } else {
+        continue;
       }
     }
 
@@ -4127,24 +10811,29 @@
     if (!disable_skip && ref_frame == INTRA_FRAME) {
       for (i = 0; i < REFERENCE_MODES; ++i)
         best_pred_rd[i] = VPXMIN(best_pred_rd[i], this_rd);
-      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++)
-        best_filter_rd[i] = VPXMIN(best_filter_rd[i], this_rd);
     }
 
     // Did this mode help, i.e. is it the new best mode?
     if (this_rd < best_rd || x->skip) {
       if (!mode_excluded) {
-        int max_plane = MAX_MB_PLANE;
         // Note index of best mode so far
         best_ref_index = ref_index;
 
         if (ref_frame == INTRA_FRAME) {
           /* required for left and above block mv */
           mbmi->mv[0].as_int = 0;
-          max_plane = 1;
         }
 
         rd_cost->rate = rate2;
+#if CONFIG_SUPERTX
+        *returnrate_nocoef = rate2 - rate_y - rate_uv;
+        if (!disable_skip)
+          *returnrate_nocoef -= vp10_cost_bit(vp10_get_skip_prob(cm, xd),
+                                              this_skip2);
+        *returnrate_nocoef -= vp10_cost_bit(vp10_get_intra_inter_prob(cm, xd),
+                                            mbmi->ref_frame[0] != INTRA_FRAME);
+        assert(*returnrate_nocoef > 0);
+#endif  // CONFIG_SUPERTX
         rd_cost->dist = distortion2;
         rd_cost->rdcost = this_rd;
         best_rd = this_rd;
@@ -4152,10 +10841,14 @@
                    RDCOST(x->rdmult, x->rddiv, rate_uv, distortion_uv);
         best_mbmode = *mbmi;
         best_skip2 = this_skip2;
-        if (!x->select_tx_size)
-          swap_block_ptr(x, ctx, 1, 0, 0, max_plane);
+
+#if CONFIG_VAR_TX
+        for (i = 0; i < MAX_MB_PLANE; ++i)
+          memset(ctx->blk_skip[i], 0, sizeof(uint8_t) * ctx->num_4x4_blk);
+#else
         memcpy(ctx->zcoeff_blk, x->zcoeff_blk[TX_4X4],
                sizeof(ctx->zcoeff_blk[0]) * ctx->num_4x4_blk);
+#endif
 
         for (i = 0; i < 4; i++)
           best_bmodes[i] = xd->mi[0]->bmi[i];
@@ -4208,29 +10901,6 @@
         best_pred_rd[REFERENCE_MODE_SELECT] = hybrid_rd;
     }
 
-    /* keep record of best filter type */
-    if (!mode_excluded && !disable_skip && ref_frame != INTRA_FRAME &&
-        cm->interp_filter != BILINEAR) {
-      int64_t ref = filter_cache[cm->interp_filter == SWITCHABLE ?
-                              SWITCHABLE_FILTERS : cm->interp_filter];
-      int64_t adj_rd;
-      for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-        if (ref == INT64_MAX)
-          adj_rd = 0;
-        else if (filter_cache[i] == INT64_MAX)
-          // when early termination is triggered, the encoder does not have
-          // access to the rate-distortion cost. it only knows that the cost
-          // should be above the maximum valid value. hence it takes the known
-          // maximum plus an arbitrary constant as the rate-distortion cost.
-          adj_rd = mask_filter - ref + 10;
-        else
-          adj_rd = filter_cache[i] - ref;
-
-        adj_rd += this_rd;
-        best_filter_rd[i] = VPXMIN(best_filter_rd[i], adj_rd);
-      }
-    }
-
     if (early_term)
       break;
 
@@ -4241,6 +10911,9 @@
   if (best_rd >= best_rd_so_far) {
     rd_cost->rate = INT_MAX;
     rd_cost->rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+    *returnrate_nocoef = INT_MAX;
+#endif  // CONFIG_SUPERTX
     return;
   }
 
@@ -4249,7 +10922,7 @@
     // Do Intra UV best rd mode selection if best mode choice above was intra.
     if (best_mbmode.ref_frame[0] == INTRA_FRAME) {
       *mbmi = best_mbmode;
-      rd_pick_intra_sbuv_mode(cpi, x, ctx, &rate_uv_intra,
+      rd_pick_intra_sbuv_mode(cpi, x, &rate_uv_intra,
                               &rate_uv_tokenonly,
                               &dist_uv,
                               &skip_uv,
@@ -4261,15 +10934,24 @@
     rd_cost->rate = INT_MAX;
     rd_cost->dist = INT64_MAX;
     rd_cost->rdcost = INT64_MAX;
+#if CONFIG_SUPERTX
+    *returnrate_nocoef = INT_MAX;
+#endif  // CONFIG_SUPERTX
     return;
   }
 
+#if CONFIG_DUAL_FILTER
+  assert((cm->interp_filter == SWITCHABLE) ||
+         (cm->interp_filter == best_mbmode.interp_filter[0]) ||
+         !is_inter_block(&best_mbmode));
+#else
   assert((cm->interp_filter == SWITCHABLE) ||
          (cm->interp_filter == best_mbmode.interp_filter) ||
          !is_inter_block(&best_mbmode));
+#endif
 
-  vp10_update_rd_thresh_fact(tile_data->thresh_freq_fact,
-                            sf->adaptive_rd_thresh, bsize, best_ref_index);
+  vp10_update_rd_thresh_fact(cm, tile_data->thresh_freq_fact,
+                             sf->adaptive_rd_thresh, bsize, best_ref_index);
 
   // macroblock modes
   *mbmi = best_mbmode;
@@ -4283,6 +10965,10 @@
 
     mbmi->mv[0].as_int = xd->mi[0]->bmi[3].as_mv[0].as_int;
     mbmi->mv[1].as_int = xd->mi[0]->bmi[3].as_mv[1].as_int;
+#if CONFIG_REF_MV
+    mbmi->pred_mv[0].as_int = xd->mi[0]->bmi[3].pred_mv_s8[0].as_int;
+    mbmi->pred_mv[1].as_int = xd->mi[0]->bmi[3].pred_mv_s8[1].as_int;
+#endif
   }
 
   for (i = 0; i < REFERENCE_MODES; ++i) {
@@ -4292,19 +10978,230 @@
       best_pred_diff[i] = best_rd - best_pred_rd[i];
   }
 
-  if (!x->skip) {
-    for (i = 0; i < SWITCHABLE_FILTER_CONTEXTS; i++) {
-      if (best_filter_rd[i] == INT64_MAX)
-        best_filter_diff[i] = 0;
-      else
-        best_filter_diff[i] = best_rd - best_filter_rd[i];
-    }
-    if (cm->interp_filter == SWITCHABLE)
-      assert(best_filter_diff[SWITCHABLE_FILTERS] == 0);
-  } else {
-    vp10_zero(best_filter_diff);
+  store_coding_context(x, ctx, best_ref_index,
+                       best_pred_diff, 0);
+}
+
+#if CONFIG_OBMC
+// This function has a structure similar to vp10_build_obmc_inter_prediction
+//
+// The OBMC predictor is computed as:
+//
+//  PObmc(x,y) =
+//    VPX_BLEND_A64(Mh(x),
+//                  VPX_BLEND_A64(Mv(y), P(x,y), PAbove(x,y)),
+//                  PLeft(x, y))
+//
+// Scaling up by VPX_BLEND_A64_MAX_ALPHA ** 2 and omitting the intermediate
+// rounding, this can be written as:
+//
+//  VPX_BLEND_A64_MAX_ALPHA * VPX_BLEND_A64_MAX_ALPHA * Pobmc(x,y) =
+//    Mh(x) * Mv(y) * P(x,y) +
+//      Mh(x) * Cv(y) * Pabove(x,y) +
+//      VPX_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+// Where:
+//
+//  Cv(y) = VPX_BLEND_A64_MAX_ALPHA - Mv(y)
+//  Ch(x) = VPX_BLEND_A64_MAX_ALPHA - Mh(x)
+//
+// This function computes 'wsrc' and 'mask' as:
+//
+//  wsrc(x, y) =
+//    VPX_BLEND_A64_MAX_ALPHA * VPX_BLEND_A64_MAX_ALPHA * src(x, y) -
+      Mh(x) * Cv(y) * Pabove(x,y) -
+//      VPX_BLEND_A64_MAX_ALPHA * Ch(x) * PLeft(x, y)
+//
+//  mask(x, y) = Mh(x) * Mv(y)
+//
+// These can then be used to efficiently approximate the error for any
+// predictor P in the context of the provided neighbouring predictors by
+// computing:
+//
+//  error(x, y) =
+//    (wsrc(x, y) - mask(x, y) * P(x, y)) / (VPX_BLEND_A64_MAX_ALPHA ** 2)
+//
+static void calc_target_weighted_pred(
+    const VP10_COMMON *cm,
+    const MACROBLOCK *x,
+    const MACROBLOCKD *xd,
+    int mi_row, int mi_col,
+    const uint8_t *above, int above_stride,
+    const uint8_t *left,  int left_stride,
+    int32_t *mask_buf,
+    int32_t *wsrc_buf) {
+  const BLOCK_SIZE bsize = xd->mi[0]->mbmi.sb_type;
+  int row, col, i;
+  const int bw = 8 * xd->n8_w;
+  const int bh = 8 * xd->n8_h;
+  const int wsrc_stride = bw;
+  const int mask_stride = bw;
+  const int src_scale = VPX_BLEND_A64_MAX_ALPHA * VPX_BLEND_A64_MAX_ALPHA;
+#if CONFIG_VP9_HIGHBITDEPTH
+  const int is_hbd = (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) ? 1 : 0;
+#else
+  const int is_hbd = 0;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+  // plane 0 should not be subsampled
+  assert(xd->plane[0].subsampling_x == 0);
+  assert(xd->plane[0].subsampling_y == 0);
+
+  vp10_zero_array(wsrc_buf, bw * bh);
+  for (i = 0; i < bw * bh; ++i)
+    mask_buf[i] = VPX_BLEND_A64_MAX_ALPHA;
+
+  // handle above row
+  if (xd->up_available) {
+    const int overlap = num_4x4_blocks_high_lookup[bsize] * 2;
+    const int miw = VPXMIN(xd->n8_w, cm->mi_cols - mi_col);
+    const int mi_row_offset = -1;
+    const uint8_t *const mask1d = vp10_get_obmc_mask(overlap);
+
+    assert(miw > 0);
+
+    i = 0;
+    do {  // for each mi in the above row
+      const int mi_col_offset = i;
+      const MB_MODE_INFO *const above_mbmi =
+          &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
+      const int mi_step =
+          VPXMIN(xd->n8_w, num_8x8_blocks_wide_lookup[above_mbmi->sb_type]);
+      const int neighbor_bw = mi_step * MI_SIZE;
+
+      if (is_neighbor_overlappable(above_mbmi)) {
+        const int tmp_stride = above_stride;
+        int32_t *wsrc = wsrc_buf + (i * MI_SIZE);
+        int32_t *mask = mask_buf + (i * MI_SIZE);
+
+        if (!is_hbd) {
+          const uint8_t *tmp = above;
+
+          for (row = 0; row < overlap; ++row) {
+            const uint8_t m0 = mask1d[row];
+            const uint8_t m1 = VPX_BLEND_A64_MAX_ALPHA - m0;
+            for (col = 0; col < neighbor_bw; ++col) {
+              wsrc[col] = m1 * tmp[col];
+              mask[col] = m0;
+            }
+            wsrc += wsrc_stride;
+            mask += mask_stride;
+            tmp += tmp_stride;
+          }
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          const uint16_t *tmp = CONVERT_TO_SHORTPTR(above);
+
+          for (row = 0; row < overlap; ++row) {
+            const uint8_t m0 = mask1d[row];
+            const uint8_t m1 = VPX_BLEND_A64_MAX_ALPHA - m0;
+            for (col = 0; col < neighbor_bw; ++col) {
+              wsrc[col] = m1 * tmp[col];
+              mask[col] = m0;
+            }
+            wsrc += wsrc_stride;
+            mask += mask_stride;
+            tmp += tmp_stride;
+          }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+
+      above += neighbor_bw;
+      i += mi_step;
+    } while (i < miw);
   }
 
-  store_coding_context(x, ctx, best_ref_index,
-                       best_pred_diff, best_filter_diff, 0);
+  for (i = 0; i < bw * bh; ++i) {
+    wsrc_buf[i] *= VPX_BLEND_A64_MAX_ALPHA;
+    mask_buf[i] *= VPX_BLEND_A64_MAX_ALPHA;
+  }
+
+  // handle left column
+  if (xd->left_available) {
+    const int overlap = num_4x4_blocks_wide_lookup[bsize] * 2;
+    const int mih = VPXMIN(xd->n8_h, cm->mi_rows - mi_row);
+    const int mi_col_offset = -1;
+    const uint8_t *const mask1d = vp10_get_obmc_mask(overlap);
+
+    assert(mih > 0);
+
+    i = 0;
+    do {  // for each mi in the left column
+      const int mi_row_offset = i;
+      const MB_MODE_INFO *const left_mbmi =
+          &xd->mi[mi_col_offset + mi_row_offset * xd->mi_stride]->mbmi;
+      const int mi_step =
+          VPXMIN(xd->n8_h, num_8x8_blocks_high_lookup[left_mbmi->sb_type]);
+      const int neighbor_bh = mi_step * MI_SIZE;
+
+      if (is_neighbor_overlappable(left_mbmi)) {
+        const int tmp_stride = left_stride;
+        int32_t *wsrc = wsrc_buf + (i * MI_SIZE * wsrc_stride);
+        int32_t *mask = mask_buf + (i * MI_SIZE * mask_stride);
+
+        if (!is_hbd) {
+          const uint8_t *tmp = left;
+
+          for (row = 0; row < neighbor_bh; ++row) {
+            for (col = 0; col < overlap; ++col) {
+              const uint8_t m0 = mask1d[col];
+              const uint8_t m1 = VPX_BLEND_A64_MAX_ALPHA - m0;
+              wsrc[col] = (wsrc[col] >> VPX_BLEND_A64_ROUND_BITS) * m0 +
+                          (tmp[col] << VPX_BLEND_A64_ROUND_BITS) * m1;
+              mask[col] = (mask[col] >> VPX_BLEND_A64_ROUND_BITS) * m0;
+            }
+            wsrc += wsrc_stride;
+            mask += mask_stride;
+            tmp += tmp_stride;
+          }
+#if CONFIG_VP9_HIGHBITDEPTH
+        } else {
+          const uint16_t *tmp = CONVERT_TO_SHORTPTR(left);
+
+          for (row = 0; row < neighbor_bh; ++row) {
+            for (col = 0; col < overlap; ++col) {
+              const uint8_t m0 = mask1d[col];
+              const uint8_t m1 = VPX_BLEND_A64_MAX_ALPHA - m0;
+              wsrc[col] = (wsrc[col] >> VPX_BLEND_A64_ROUND_BITS) * m0 +
+                          (tmp[col] << VPX_BLEND_A64_ROUND_BITS) * m1;
+              mask[col] = (mask[col] >> VPX_BLEND_A64_ROUND_BITS) * m0;
+            }
+            wsrc += wsrc_stride;
+            mask += mask_stride;
+            tmp += tmp_stride;
+          }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+        }
+      }
+
+      left += neighbor_bh * left_stride;
+      i += mi_step;
+    } while (i < mih);
+  }
+
+  if (!is_hbd) {
+    const uint8_t *src = x->plane[0].src.buf;
+
+    for (row = 0; row < bh; ++row) {
+      for (col = 0; col < bw; ++col) {
+        wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+      }
+      wsrc_buf += wsrc_stride;
+      src += x->plane[0].src.stride;
+    }
+#if CONFIG_VP9_HIGHBITDEPTH
+  } else {
+    const uint16_t *src = CONVERT_TO_SHORTPTR(x->plane[0].src.buf);
+
+    for (row = 0; row < bh; ++row) {
+      for (col = 0; col < bw; ++col) {
+        wsrc_buf[col] = src[col] * src_scale - wsrc_buf[col];
+      }
+      wsrc_buf += wsrc_stride;
+      src += x->plane[0].src.stride;
+    }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+  }
 }
+#endif  // CONFIG_OBMC
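
The derivation above bottles the neighbour contributions into two buffers so
that any candidate predictor can be scored without re-blending. A minimal
sketch of that final step (the helper name obmc_weighted_sse and the
stand-alone MAX_ALPHA constant are illustrative only, not part of this
change; the encoder uses dedicated SAD/SSE kernels for this):

    #include <stdint.h>

    #define MAX_ALPHA 64  /* stand-in for VPX_BLEND_A64_MAX_ALPHA */

    /* Sum of squared OBMC-weighted errors of a candidate predictor against
     * the wsrc/mask buffers built by calc_target_weighted_pred() above:
     *   error(x, y) = wsrc(x, y) - mask(x, y) * pred(x, y)
     * wsrc and mask both carry a MAX_ALPHA**2 scale, so the squared error
     * carries that scale twice. */
    static int64_t obmc_weighted_sse(const int32_t *wsrc, const int32_t *mask,
                                     const uint8_t *pred, int pred_stride,
                                     int bw, int bh) {
      int64_t sse = 0;
      int r, c;
      for (r = 0; r < bh; ++r) {
        for (c = 0; c < bw; ++c) {
          const int64_t e = (int64_t)wsrc[c] - (int64_t)mask[c] * pred[c];
          sse += e * e;
        }
        wsrc += bw;  /* wsrc/mask use stride == bw, as in the function above */
        mask += bw;
        pred += pred_stride;
      }
      /* Undo the (MAX_ALPHA**2)**2 scale on the squared error. */
      return sse / ((int64_t)MAX_ALPHA * MAX_ALPHA * MAX_ALPHA * MAX_ALPHA);
    }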
diff --git a/vp10/encoder/rdopt.h b/vp10/encoder/rdopt.h
index b1a8036..da70a22 100644
--- a/vp10/encoder/rdopt.h
+++ b/vp10/encoder/rdopt.h
@@ -43,6 +43,9 @@
                                struct macroblock *x,
                                int mi_row, int mi_col,
                                struct RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+                               int *returnrate_nocoef,
+#endif  // CONFIG_SUPERTX
                                BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
                                int64_t best_rd_so_far);
 
@@ -60,12 +63,32 @@
 int vp10_active_edge_sb(struct VP10_COMP *cpi, int mi_row, int mi_col);
 
 void vp10_rd_pick_inter_mode_sub8x8(struct VP10_COMP *cpi,
-                                   struct TileDataEnc *tile_data,
-                                   struct macroblock *x,
-                                   int mi_row, int mi_col,
-                                   struct RD_COST *rd_cost,
-                                   BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
-                                   int64_t best_rd_so_far);
+                                    struct TileDataEnc *tile_data,
+                                    struct macroblock *x,
+                                    int mi_row, int mi_col,
+                                    struct RD_COST *rd_cost,
+#if CONFIG_SUPERTX
+                                    int *returnrate_nocoef,
+#endif  // CONFIG_SUPERTX
+                                    BLOCK_SIZE bsize, PICK_MODE_CONTEXT *ctx,
+                                    int64_t best_rd_so_far);
+
+#if CONFIG_SUPERTX
+#if CONFIG_VAR_TX
+void vp10_tx_block_rd_b(const VP10_COMP *cpi, MACROBLOCK *x, TX_SIZE tx_size,
+                        int blk_row, int blk_col, int plane, int block,
+                        int plane_bsize, int coeff_ctx,
+                        int *rate, int64_t *dist, int64_t *bsse, int *skip);
+#endif
+
+void vp10_txfm_rd_in_plane_supertx(MACROBLOCK *x,
+                                   const VP10_COMP *cpi,
+                                   int *rate, int64_t *distortion,
+                                   int *skippable, int64_t *sse,
+                                   int64_t ref_best_rd, int plane,
+                                   BLOCK_SIZE bsize, TX_SIZE tx_size,
+                                   int use_fast_coef_casting);
+#endif  // CONFIG_SUPERTX
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/encoder/segmentation.c b/vp10/encoder/segmentation.c
index 677910f..bb6e4c4 100644
--- a/vp10/encoder/segmentation.c
+++ b/vp10/encoder/segmentation.c
@@ -58,9 +58,7 @@
     segcounts[4] + segcounts[5], segcounts[6] + segcounts[7]
   };
   const unsigned ccc[2] = { cc[0] + cc[1], cc[2] + cc[3] };
-#if CONFIG_MISC_FIXES
   int i;
-#endif
 
   segment_tree_probs[0] = get_binary_prob(ccc[0], ccc[1]);
   segment_tree_probs[1] = get_binary_prob(cc[0], cc[1]);
@@ -70,16 +68,12 @@
   segment_tree_probs[5] = get_binary_prob(segcounts[4], segcounts[5]);
   segment_tree_probs[6] = get_binary_prob(segcounts[6], segcounts[7]);
 
-#if CONFIG_MISC_FIXES
   for (i = 0; i < 7; i++) {
     const unsigned *ct = i == 0 ? ccc : i < 3 ? cc + (i & 2)
         : segcounts + (i - 3) * 2;
     vp10_prob_diff_update_savings_search(ct,
         cur_tree_probs[i], &segment_tree_probs[i], DIFF_UPDATE_PROB);
   }
-#else
-  (void) cur_tree_probs;
-#endif
 }
 
 // Based on set of segment counts and probabilities calculate a cost estimate
@@ -170,12 +164,103 @@
                           int mi_row, int mi_col,
                           BLOCK_SIZE bsize) {
   const int mis = cm->mi_stride;
-  int bw, bh;
   const int bs = num_8x8_blocks_wide_lookup[bsize], hbs = bs / 2;
+#if CONFIG_EXT_PARTITION_TYPES
+  PARTITION_TYPE partition;
+#else
+  int bw, bh;
+#endif  // CONFIG_EXT_PARTITION_TYPES
 
   if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
     return;
 
+#if CONFIG_EXT_PARTITION_TYPES
+  if (bsize == BLOCK_8X8)
+    partition = PARTITION_NONE;
+  else
+    partition = get_partition(cm, mi_row, mi_col, bsize);
+  switch (partition) {
+    case PARTITION_NONE:
+      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, bs, bs, mi_row, mi_col);
+      break;
+    case PARTITION_HORZ:
+      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+      count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+                 temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
+                 mi_row + hbs, mi_col);
+      break;
+    case PARTITION_VERT:
+      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+      count_segs(cm, xd, tile, mi + hbs,
+                 no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs);
+      break;
+    case PARTITION_HORZ_A:
+      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, hbs, hbs, mi_row, mi_col);
+      count_segs(cm, xd, tile, mi + hbs, no_pred_segcounts,
+                 temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+                 mi_row, mi_col + hbs);
+      count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+                 temporal_predictor_count, t_unpred_seg_counts, bs, hbs,
+                 mi_row + hbs, mi_col);
+      break;
+    case PARTITION_HORZ_B:
+      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, bs, hbs, mi_row, mi_col);
+      count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+                 temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+                 mi_row + hbs, mi_col);
+      count_segs(cm, xd, tile, mi + hbs + hbs * mis, no_pred_segcounts,
+                 temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+                 mi_row + hbs, mi_col + hbs);
+      break;
+    case PARTITION_VERT_A:
+      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, hbs, hbs, mi_row, mi_col);
+      count_segs(cm, xd, tile, mi + hbs * mis, no_pred_segcounts,
+                 temporal_predictor_count, t_unpred_seg_counts, hbs, hbs,
+                 mi_row + hbs, mi_col);
+      count_segs(cm, xd, tile, mi + hbs,
+                 no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, hbs, bs, mi_row, mi_col + hbs);
+      break;
+    case PARTITION_VERT_B:
+      count_segs(cm, xd, tile, mi, no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, hbs, bs, mi_row, mi_col);
+      count_segs(cm, xd, tile, mi + hbs,
+                 no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, hbs, hbs, mi_row, mi_col + hbs);
+      count_segs(cm, xd, tile, mi + hbs + hbs * mis,
+                 no_pred_segcounts, temporal_predictor_count,
+                 t_unpred_seg_counts, hbs, hbs, mi_row + hbs, mi_col + hbs);
+      break;
+    case PARTITION_SPLIT:
+      {
+        const BLOCK_SIZE subsize = subsize_lookup[PARTITION_SPLIT][bsize];
+        int n;
+
+        assert(num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type] < bs &&
+               num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type] < bs);
+
+        for (n = 0; n < 4; n++) {
+          const int mi_dc = hbs * (n & 1);
+          const int mi_dr = hbs * (n >> 1);
+
+          count_segs_sb(cm, xd, tile, &mi[mi_dr * mis + mi_dc],
+                        no_pred_segcounts, temporal_predictor_count,
+                        t_unpred_seg_counts,
+                        mi_row + mi_dr, mi_col + mi_dc, subsize);
+        }
+      }
+      break;
+    default:
+      assert(0);
+  }
+#else
   bw = num_8x8_blocks_wide_lookup[mi[0]->mbmi.sb_type];
   bh = num_8x8_blocks_high_lookup[mi[0]->mbmi.sb_type];
 
@@ -210,63 +295,55 @@
                     mi_row + mi_dr, mi_col + mi_dc, subsize);
     }
   }
+#endif  // CONFIG_EXT_PARTITION_TYPES
 }
 
 void vp10_choose_segmap_coding_method(VP10_COMMON *cm, MACROBLOCKD *xd) {
   struct segmentation *seg = &cm->seg;
-#if CONFIG_MISC_FIXES
   struct segmentation_probs *segp = &cm->fc->seg;
-#else
-  struct segmentation_probs *segp = &cm->segp;
-#endif
 
   int no_pred_cost;
   int t_pred_cost = INT_MAX;
 
-  int i, tile_col, mi_row, mi_col;
+  int i, tile_col, tile_row, mi_row, mi_col;
 
-#if CONFIG_MISC_FIXES
   unsigned (*temporal_predictor_count)[2] = cm->counts.seg.pred;
   unsigned *no_pred_segcounts = cm->counts.seg.tree_total;
   unsigned *t_unpred_seg_counts = cm->counts.seg.tree_mispred;
-#else
-  unsigned temporal_predictor_count[PREDICTION_PROBS][2] = { { 0 } };
-  unsigned no_pred_segcounts[MAX_SEGMENTS] = { 0 };
-  unsigned t_unpred_seg_counts[MAX_SEGMENTS] = { 0 };
-#endif
 
   vpx_prob no_pred_tree[SEG_TREE_PROBS];
   vpx_prob t_pred_tree[SEG_TREE_PROBS];
   vpx_prob t_nopred_prob[PREDICTION_PROBS];
 
-#if CONFIG_MISC_FIXES
   (void) xd;
-#else
-  // Set default state for the segment tree probabilities and the
-  // temporal coding probabilities
-  memset(segp->tree_probs, 255, sizeof(segp->tree_probs));
-  memset(segp->pred_probs, 255, sizeof(segp->pred_probs));
-#endif
+
+  // We are about to recompute all the segment counts, so zero the accumulators.
+  vp10_zero(cm->counts.seg);
 
   // First of all generate stats regarding how well the last segment map
   // predicts this one
-  for (tile_col = 0; tile_col < 1 << cm->log2_tile_cols; tile_col++) {
-    TileInfo tile;
-    MODE_INFO **mi_ptr;
-    vp10_tile_init(&tile, cm, 0, tile_col);
-
-    mi_ptr = cm->mi_grid_visible + tile.mi_col_start;
-    for (mi_row = 0; mi_row < cm->mi_rows;
-         mi_row += 8, mi_ptr += 8 * cm->mi_stride) {
-      MODE_INFO **mi = mi_ptr;
-      for (mi_col = tile.mi_col_start; mi_col < tile.mi_col_end;
-           mi_col += 8, mi += 8)
-        count_segs_sb(cm, xd, &tile, mi, no_pred_segcounts,
-                      temporal_predictor_count, t_unpred_seg_counts,
-                      mi_row, mi_col, BLOCK_64X64);
+  for (tile_row = 0; tile_row < cm->tile_rows; tile_row++) {
+    TileInfo tile_info;
+    vp10_tile_set_row(&tile_info, cm, tile_row);
+    for (tile_col = 0; tile_col < cm->tile_cols; tile_col++) {
+      MODE_INFO **mi_ptr;
+      vp10_tile_set_col(&tile_info, cm, tile_col);
+      mi_ptr = cm->mi_grid_visible + tile_info.mi_row_start * cm->mi_stride +
+                 tile_info.mi_col_start;
+      for (mi_row = tile_info.mi_row_start; mi_row < tile_info.mi_row_end;
+           mi_row += cm->mib_size, mi_ptr += cm->mib_size * cm->mi_stride) {
+        MODE_INFO **mi = mi_ptr;
+        for (mi_col = tile_info.mi_col_start; mi_col < tile_info.mi_col_end;
+             mi_col += cm->mib_size, mi += cm->mib_size) {
+          count_segs_sb(cm, xd, &tile_info, mi, no_pred_segcounts,
+                        temporal_predictor_count, t_unpred_seg_counts,
+                        mi_row, mi_col, cm->sb_size);
+        }
+      }
     }
   }
 
+
   // Work out probability tree for coding segments without prediction
   // and the cost.
   calc_segtree_probs(no_pred_segcounts, no_pred_tree, segp->tree_probs);
@@ -284,13 +361,10 @@
       const int count0 = temporal_predictor_count[i][0];
       const int count1 = temporal_predictor_count[i][1];
 
-#if CONFIG_MISC_FIXES
+      t_nopred_prob[i] = get_binary_prob(count0, count1);
       vp10_prob_diff_update_savings_search(temporal_predictor_count[i],
                                            segp->pred_probs[i],
                                            &t_nopred_prob[i], DIFF_UPDATE_PROB);
-#else
-      t_nopred_prob[i] = get_binary_prob(count0, count1);
-#endif
 
       // Add in the predictor signaling cost
       t_pred_cost += count0 * vp10_cost_zero(t_nopred_prob[i]) +
@@ -302,30 +376,17 @@
   if (t_pred_cost < no_pred_cost) {
     assert(!cm->error_resilient_mode);
     seg->temporal_update = 1;
-#if !CONFIG_MISC_FIXES
-    memcpy(segp->tree_probs, t_pred_tree, sizeof(t_pred_tree));
-    memcpy(segp->pred_probs, t_nopred_prob, sizeof(t_nopred_prob));
-#endif
   } else {
     seg->temporal_update = 0;
-#if !CONFIG_MISC_FIXES
-    memcpy(segp->tree_probs, no_pred_tree, sizeof(no_pred_tree));
-#endif
   }
 }
 
 void vp10_reset_segment_features(VP10_COMMON *cm) {
   struct segmentation *seg = &cm->seg;
-#if !CONFIG_MISC_FIXES
-  struct segmentation_probs *segp = &cm->segp;
-#endif
 
   // Set up default state for MB feature flags
   seg->enabled = 0;
   seg->update_map = 0;
   seg->update_data = 0;
-#if !CONFIG_MISC_FIXES
-  memset(segp->tree_probs, 255, sizeof(segp->tree_probs));
-#endif
   vp10_clearall_segfeatures(seg);
 }
diff --git a/vp10/encoder/speed_features.c b/vp10/encoder/speed_features.c
index ce0aebe..8f4f11d 100644
--- a/vp10/encoder/speed_features.c
+++ b/vp10/encoder/speed_features.c
@@ -131,7 +131,6 @@
   const int boosted = frame_is_boosted(cpi);
 
   sf->adaptive_rd_thresh = 1;
-  sf->allow_skip_recode = 1;
 
   if (speed >= 1) {
     if ((cpi->twopass.fr_content_type == FC_GRAPHICS_ANIMATION) ||
@@ -159,12 +158,21 @@
 
     sf->tx_size_search_breakout = 1;
     sf->partition_search_breakout_rate_thr = 80;
+    sf->tx_type_search.prune_mode = PRUNE_ONE;
+    sf->tx_type_search.fast_intra_tx_type_search = 1;
+    sf->tx_type_search.fast_inter_tx_type_search = 1;
+    // Use transform domain distortion.
+    // Note: the var-tx experiment always uses pixel domain distortion.
+    sf->use_transform_domain_distortion = 1;
+#if CONFIG_EXT_INTER
+    sf->disable_wedge_search_var_thresh = 100;
+    sf->fast_wedge_sign_estimate = 1;
+#endif  // CONFIG_EXT_INTER
   }
 
   if (speed >= 2) {
     sf->tx_size_search_method = frame_is_boosted(cpi) ? USE_FULL_RD
                                                       : USE_LARGESTALL;
-
     sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 :
                                  FLAG_SKIP_INTRA_DIRMISMATCH |
                                  FLAG_SKIP_INTRA_BESTINTER |
@@ -174,6 +182,10 @@
     sf->comp_inter_joint_search_thresh = BLOCK_SIZES;
     sf->auto_min_max_partition_size = RELAXED_NEIGHBORING_MIN_MAX;
     sf->allow_partition_search_skip = 1;
+    sf->use_upsampled_references = 0;
+#if CONFIG_EXT_TX
+    sf->tx_type_search.prune_mode = PRUNE_TWO;
+#endif
   }
 
   if (speed >= 3) {
@@ -227,7 +239,6 @@
 static void set_rt_speed_feature_framesize_dependent(VP10_COMP *cpi,
     SPEED_FEATURES *sf, int speed) {
   VP10_COMMON *const cm = &cpi->common;
-
   if (speed >= 1) {
     if (VPXMIN(cm->width, cm->height) >= 720) {
       sf->disable_split_mask = cm->show_frame ? DISABLE_ALL_SPLIT
@@ -270,6 +281,15 @@
   sf->use_fast_coef_costing = 1;
   sf->allow_exhaustive_searches = 0;
   sf->exhaustive_searches_thresh = INT_MAX;
+  sf->use_upsampled_references = 0;
+#if CONFIG_EXT_INTER
+  sf->disable_wedge_search_var_thresh = 100;
+  sf->fast_wedge_sign_estimate = 1;
+#endif  // CONFIG_EXT_INTER
+
+  // Use transform domain distortion computation.
+  // Note: the var-tx experiment always uses pixel domain distortion.
+  sf->use_transform_domain_distortion = 1;
 
   if (speed >= 1) {
     sf->use_square_partition_only = !frame_is_intra_only(cm);
@@ -288,6 +308,7 @@
     sf->intra_uv_mode_mask[TX_16X16] = INTRA_DC_H_V;
   }
 
+
   if (speed >= 2) {
     sf->mode_search_skip_flags = (cm->frame_type == KEY_FRAME) ? 0 :
                                  FLAG_SKIP_INTRA_DIRMISMATCH |
@@ -313,7 +334,6 @@
     sf->mv.subpel_iters_per_step = 1;
     sf->adaptive_rd_thresh = 4;
     sf->mode_skip_start = 6;
-    sf->allow_skip_recode = 0;
     sf->optimize_coefficients = 0;
     sf->disable_split_mask = DISABLE_ALL_SPLIT;
     sf->lpf_pick = LPF_PICK_FROM_Q;
@@ -341,12 +361,15 @@
     sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST;
     sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST;
     sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST;
+#if CONFIG_EXT_PARTITION
+    sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST;
+    sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST;
+    sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST;
+#endif  // CONFIG_EXT_PARTITION
     sf->max_intra_bsize = BLOCK_32X32;
-    sf->allow_skip_recode = 1;
   }
 
   if (speed >= 5) {
-    sf->use_quant_fp = !is_keyframe;
     sf->auto_min_max_partition_size = is_keyframe ? RELAXED_NEIGHBORING_MIN_MAX
                                                   : STRICT_NEIGHBORING_MIN_MAX;
     sf->default_max_partition_size = BLOCK_32X32;
@@ -355,11 +378,15 @@
         (frames_since_key % (sf->last_partitioning_redo_frequency << 1) == 1);
     sf->max_delta_qindex = is_keyframe ? 20 : 15;
     sf->partition_search_type = REFERENCE_PARTITION;
-    sf->allow_skip_recode = 0;
     sf->inter_mode_mask[BLOCK_32X32] = INTER_NEAREST_NEW_ZERO;
     sf->inter_mode_mask[BLOCK_32X64] = INTER_NEAREST_NEW_ZERO;
     sf->inter_mode_mask[BLOCK_64X32] = INTER_NEAREST_NEW_ZERO;
     sf->inter_mode_mask[BLOCK_64X64] = INTER_NEAREST_NEW_ZERO;
+#if CONFIG_EXT_PARTITION
+    sf->inter_mode_mask[BLOCK_64X128] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_128X64] = INTER_NEAREST_NEW_ZERO;
+    sf->inter_mode_mask[BLOCK_128X128] = INTER_NEAREST_NEW_ZERO;
+#endif  // CONFIG_EXT_PARTITION
     sf->adaptive_rd_thresh = 2;
     // This feature is only enabled when partition search is disabled.
     sf->reuse_inter_pred_sby = 1;
@@ -464,13 +491,15 @@
   sf->cb_pred_filter_search = 0;
   sf->cb_partition_search = 0;
   sf->alt_ref_search_fp = 0;
-  sf->use_quant_fp = 0;
   sf->partition_search_type = SEARCH_PARTITION;
+  sf->tx_type_search.prune_mode = NO_PRUNE;
+  sf->tx_type_search.fast_intra_tx_type_search = 0;
+  sf->tx_type_search.fast_inter_tx_type_search = 0;
   sf->less_rectangular_check = 0;
   sf->use_square_partition_only = 0;
   sf->auto_min_max_partition_size = NOT_IN_USE;
   sf->rd_auto_partition_min_limit = BLOCK_4X4;
-  sf->default_max_partition_size = BLOCK_64X64;
+  sf->default_max_partition_size = BLOCK_LARGEST;
   sf->default_min_partition_size = BLOCK_4X4;
   sf->adjust_partitioning_from_last_frame = 0;
   sf->last_partitioning_redo_frequency = 4;
@@ -481,6 +510,15 @@
   sf->disable_filter_search_var_thresh = 0;
   sf->adaptive_interp_filter_search = 0;
   sf->allow_partition_search_skip = 0;
+#if CONFIG_EXT_TILE
+  sf->use_upsampled_references = 0;
+#else
+  sf->use_upsampled_references = 1;
+#endif  // CONFIG_EXT_TILE
+#if CONFIG_EXT_INTER
+  sf->disable_wedge_search_var_thresh = 0;
+  sf->fast_wedge_sign_estimate = 0;
+#endif  // CONFIG_EXT_INTER
 
   for (i = 0; i < TX_SIZES; i++) {
     sf->intra_y_mode_mask[i] = INTRA_ALL;
@@ -488,7 +526,6 @@
   }
   sf->use_rd_breakout = 0;
   sf->use_uv_intra_rd_estimate = 0;
-  sf->allow_skip_recode = 0;
   sf->lpf_pick = LPF_PICK_FROM_FULL_IMAGE;
   sf->use_fast_coef_updates = TWO_LOOP;
   sf->use_fast_coef_costing = 0;
@@ -496,7 +533,7 @@
   sf->schedule_mode_search = 0;
   for (i = 0; i < BLOCK_SIZES; ++i)
     sf->inter_mode_mask[i] = INTER_ALL;
-  sf->max_intra_bsize = BLOCK_64X64;
+  sf->max_intra_bsize = BLOCK_LARGEST;
   sf->reuse_inter_pred_sby = 0;
   // This setting only takes effect when partition_search_type is set
   // to FIXED_PARTITION.
@@ -511,11 +548,24 @@
   sf->partition_search_breakout_rate_thr = 0;
   sf->simple_model_rd_from_var = 0;
 
+  // Set this at the appropriate speed levels
+#if CONFIG_EXT_TILE
+  sf->use_transform_domain_distortion = 1;
+#else
+  sf->use_transform_domain_distortion = 0;
+#endif  // CONFIG_EXT_TILE
+
   if (oxcf->mode == REALTIME)
     set_rt_speed_feature(cpi, sf, oxcf->speed, oxcf->content);
   else if (oxcf->mode == GOOD)
     set_good_speed_feature(cpi, cm, sf, oxcf->speed);
 
+  // sf->partition_search_breakout_dist_thr is set assuming max 64x64
+  // blocks. Normalise this if the blocks are bigger.
+  if (MAX_SB_SIZE_LOG2 > 6) {
+    sf->partition_search_breakout_dist_thr <<= 2 * (MAX_SB_SIZE_LOG2 - 6);
+  }
+
   cpi->full_search_sad = vp10_full_search_sad;
   cpi->diamond_search_sad = vp10_diamond_search_sad;
 
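
A worked instance of the normalisation just above: the breakout threshold
gates a distortion accumulated over MAX_SB_SIZE * MAX_SB_SIZE pixels, so it
must scale with superblock area. With 128x128 superblocks (MAX_SB_SIZE_LOG2
== 7) the shift is 2 * (7 - 6) == 2, i.e. the threshold grows 4x, matching
the 4x pixel-count increase over a 64x64 superblock.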
diff --git a/vp10/encoder/speed_features.h b/vp10/encoder/speed_features.h
index 3b91999..c1d1f81 100644
--- a/vp10/encoder/speed_features.h
+++ b/vp10/encoder/speed_features.h
@@ -31,6 +31,47 @@
                     (1 << H_PRED)
 };
 
+#if CONFIG_EXT_INTER
+enum {
+  INTER_ALL =
+      (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) |
+      (1 << NEWMV) | (1 << NEWFROMNEARMV) |
+      (1 << NEAREST_NEARESTMV) | (1 << NEAR_NEARMV) | (1 << NEAREST_NEARMV) |
+      (1 << NEAR_NEARESTMV) | (1 << NEW_NEWMV) | (1 << NEAREST_NEWMV) |
+      (1 << NEAR_NEWMV) | (1 << NEW_NEARMV) | (1 << NEW_NEARESTMV) |
+      (1 << ZERO_ZEROMV),
+  INTER_NEAREST = (1 << NEARESTMV) | (1 << NEAREST_NEARESTMV) |
+                  (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) |
+                  (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV),
+  INTER_NEAREST_NEW = (1 << NEARESTMV) | (1 << NEWMV) | (1 << NEWFROMNEARMV) |
+                      (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) |
+                      (1 << NEAR_NEARESTMV) | (1 << NEAREST_NEARMV) |
+                      (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
+                      (1 << NEW_NEARMV) | (1 << NEAR_NEWMV),
+  INTER_NEAREST_ZERO = (1 << NEARESTMV) | (1 << ZEROMV) |
+                       (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) |
+                       (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) |
+                       (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV),
+  INTER_NEAREST_NEW_ZERO =
+      (1 << NEARESTMV) | (1 << ZEROMV) | (1 << NEWMV) | (1 << NEWFROMNEARMV) |
+      (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) | (1 << NEW_NEWMV) |
+      (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) |
+      (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
+      (1 << NEW_NEARMV) | (1 << NEAR_NEWMV),
+  INTER_NEAREST_NEAR_NEW =
+      (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV) | (1 << NEWFROMNEARMV) |
+      (1 << NEAREST_NEARESTMV) | (1 << NEW_NEWMV) |
+      (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) |
+      (1 << NEW_NEARESTMV) | (1 << NEAREST_NEWMV) |
+      (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV),
+  INTER_NEAREST_NEAR_ZERO =
+      (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) |
+      (1 << NEAREST_NEARESTMV) | (1 << ZERO_ZEROMV) |
+      (1 << NEAREST_NEARMV) | (1 << NEAR_NEARESTMV) |
+      (1 << NEAREST_NEWMV) | (1 << NEW_NEARESTMV) |
+      (1 << NEW_NEARMV) | (1 << NEAR_NEWMV) | (1 << NEAR_NEARMV),
+};
+#else
 enum {
   INTER_ALL = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV) | (1 << NEWMV),
   INTER_NEAREST = (1 << NEARESTMV),
@@ -40,6 +81,7 @@
   INTER_NEAREST_NEAR_NEW = (1 << NEARESTMV) | (1 << NEARMV) | (1 << NEWMV),
   INTER_NEAREST_NEAR_ZERO = (1 << NEARESTMV) | (1 << NEARMV) | (1 << ZEROMV),
 };
+#endif  // CONFIG_EXT_INTER
 
 enum {
   DISABLE_ALL_INTER_SPLIT   = (1 << THR_COMP_GA) |
@@ -135,12 +177,28 @@
 } MODE_SEARCH_SKIP_LOGIC;
 
 typedef enum {
-  FLAG_SKIP_EIGHTTAP = 1 << EIGHTTAP,
+  FLAG_SKIP_EIGHTTAP_REGULAR = 1 << EIGHTTAP_REGULAR,
   FLAG_SKIP_EIGHTTAP_SMOOTH = 1 << EIGHTTAP_SMOOTH,
-  FLAG_SKIP_EIGHTTAP_SHARP = 1 << EIGHTTAP_SHARP,
+  FLAG_SKIP_MULTITAP_SHARP = 1 << MULTITAP_SHARP,
 } INTERP_FILTER_MASK;
 
 typedef enum {
+  NO_PRUNE = 0,
+  // eliminates one tx type in each direction (vertical and horizontal)
+  PRUNE_ONE = 1,
+#if CONFIG_EXT_TX
+  // eliminates two tx types in each direction
+  PRUNE_TWO = 2,
+#endif
+} TX_TYPE_PRUNE_MODE;
+
+typedef struct {
+  TX_TYPE_PRUNE_MODE prune_mode;
+  int fast_intra_tx_type_search;
+  int fast_inter_tx_type_search;
+} TX_TYPE_SEARCH;
+
+typedef enum {
   // Search partitions using RD criterion
   SEARCH_PARTITION,
 
@@ -230,10 +288,6 @@
   // mode to be evaluated. A high value means we will be faster.
   int adaptive_rd_thresh;
 
-  // Speed feature to allow or disallow skipping of recode at block
-  // level within a frame.
-  int allow_skip_recode;
-
   // Coefficient probability model approximation step size
   int coeff_prob_appx_step;
 
@@ -257,6 +311,8 @@
 
   PARTITION_SEARCH_TYPE partition_search_type;
 
+  TX_TYPE_SEARCH tx_type_search;
+
   // Used if partition_search_type = FIXED_SIZE_PARTITION
   BLOCK_SIZE always_this_block_size;
 
@@ -267,8 +323,8 @@
   // Disable testing non square partitions. (eg 16x32)
   int use_square_partition_only;
 
-  // Sets min and max partition sizes for this 64x64 region based on the
-  // same 64x64 in last encoded frame, and the left and above neighbor.
+  // Sets min and max partition sizes for this superblock based on the
+  // same superblock in last encoded frame, and the left and above neighbor.
   AUTO_MIN_MAX_MODE auto_min_max_partition_size;
   // Ensures the rd based auto partition search will always
   // go down at least to the specified level.
@@ -327,9 +383,6 @@
 
   int alt_ref_search_fp;
 
-  // Fast quantization process path
-  int use_quant_fp;
-
   // Use a finer quantizer in the periodic frames that run variable block
   // partition type search.
   int force_frame_boost;
@@ -346,6 +399,14 @@
   // Choose a very large value (UINT_MAX) to use 8-tap always
   unsigned int disable_filter_search_var_thresh;
 
+#if CONFIG_EXT_INTER
+  // A source variance threshold below which wedge search is disabled
+  unsigned int disable_wedge_search_var_thresh;
+
+  // Whether fast wedge sign estimate is used
+  int fast_wedge_sign_estimate;
+#endif  // CONFIG_EXT_INTER
+
   // These bit masks allow you to enable or disable intra modes for each
   // transform size separately.
   int intra_y_mode_mask[TX_SIZES];
@@ -424,6 +485,13 @@
 
   // Fast approximation of vp10_model_rd_from_var_lapndz
   int simple_model_rd_from_var;
+
+  // Do sub-pixel search in up-sampled reference frames
+  int use_upsampled_references;
+
+  // Whether to compute distortion in the pixel domain (slower but more
+  // accurate) or in the transform domain (faster but less accurate).
+  int use_transform_domain_distortion;
 } SPEED_FEATURES;
 
 struct VP10_COMP;
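
The inter_mode_mask values defined at the top of this header are plain
bit-sets over prediction modes, keyed by the same (1 << mode) expressions.
A sketch of the intended test (mode_allowed is a made-up name; the real
mode loop applies an equivalent guard inline, and the types come from this
header):

    /* A mode is searched for a given block size iff its bit is set. */
    static int mode_allowed(const SPEED_FEATURES *sf, BLOCK_SIZE bsize,
                            PREDICTION_MODE mode) {
      return (sf->inter_mode_mask[bsize] & (1 << mode)) != 0;
    }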
diff --git a/vp10/encoder/subexp.c b/vp10/encoder/subexp.c
index eccee8e..4aaffae 100644
--- a/vp10/encoder/subexp.c
+++ b/vp10/encoder/subexp.c
@@ -7,7 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
-#include "vpx_dsp/bitwriter.h"
+#include "vp10/encoder/bitwriter.h"
 
 #include "vp10/common/common.h"
 #include "vp10/common/entropy.h"
@@ -25,8 +25,7 @@
   10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
   10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
   10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
-  10, 11 - CONFIG_MISC_FIXES,
-          11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
+  10, 10, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
   11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
   11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
   11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
@@ -81,38 +80,38 @@
 
 static int prob_diff_update_cost(vpx_prob newp, vpx_prob oldp) {
   int delp = remap_prob(newp, oldp);
-  return update_bits[delp] * 256;
+  return update_bits[delp] << VP9_PROB_COST_SHIFT;
 }
 
-static void encode_uniform(vpx_writer *w, int v) {
+static void encode_uniform(vp10_writer *w, int v) {
   const int l = 8;
-  const int m = (1 << l) - 191 + CONFIG_MISC_FIXES;
+  const int m = (1 << l) - 190;
   if (v < m) {
-    vpx_write_literal(w, v, l - 1);
+    vp10_write_literal(w, v, l - 1);
   } else {
-    vpx_write_literal(w, m + ((v - m) >> 1), l - 1);
-    vpx_write_literal(w, (v - m) & 1, 1);
+    vp10_write_literal(w, m + ((v - m) >> 1), l - 1);
+    vp10_write_literal(w, (v - m) & 1, 1);
   }
 }
 
-static INLINE int write_bit_gte(vpx_writer *w, int word, int test) {
-  vpx_write_literal(w, word >= test, 1);
+static INLINE int write_bit_gte(vp10_writer *w, int word, int test) {
+  vp10_write_literal(w, word >= test, 1);
   return word >= test;
 }
 
-static void encode_term_subexp(vpx_writer *w, int word) {
+static void encode_term_subexp(vp10_writer *w, int word) {
   if (!write_bit_gte(w, word, 16)) {
-    vpx_write_literal(w, word, 4);
+    vp10_write_literal(w, word, 4);
   } else if (!write_bit_gte(w, word, 32)) {
-    vpx_write_literal(w, word - 16, 4);
+    vp10_write_literal(w, word - 16, 4);
   } else if (!write_bit_gte(w, word, 64)) {
-    vpx_write_literal(w, word - 32, 5);
+    vp10_write_literal(w, word - 32, 5);
   } else {
     encode_uniform(w, word - 64);
   }
 }
 
-void vp10_write_prob_diff_update(vpx_writer *w, vpx_prob newp, vpx_prob oldp) {
+void vp10_write_prob_diff_update(vp10_writer *w, vpx_prob newp, vpx_prob oldp) {
   const int delp = remap_prob(newp, oldp);
   encode_term_subexp(w, delp);
 }
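
For concreteness, the terminated subexponential code written by
encode_term_subexp() above costs a predictable number of bits per remapped
delta. A sketch of that count (term_subexp_bits is illustrative only; with
l == 8 the uniform tail has m == 256 - 190 == 66, which appears to be why
the update_bits entry earlier in this file changes from "11 -
CONFIG_MISC_FIXES" to a plain 10: the 7-vs-8-bit boundary of the tail moves
by one):

    /* Bits written for a remapped delta `word`, including range flags. */
    static int term_subexp_bits(int word) {
      if (word < 16) return 1 + 4;  /* flag(0)      + 4-bit literal         */
      if (word < 32) return 2 + 4;  /* flags(1,0)   + 4-bit literal of w-16 */
      if (word < 64) return 3 + 5;  /* flags(1,1,0) + 5-bit literal of w-32 */
      /* flags(1,1,1) + uniform code of w-64: values below m take
       * l - 1 == 7 bits, the rest take 8. */
      return 3 + ((word - 64) < 66 ? 7 : 8);
    }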
@@ -181,7 +180,89 @@
   return bestsavings;
 }
 
-void vp10_cond_prob_diff_update(vpx_writer *w, vpx_prob *oldp,
+#if CONFIG_ENTROPY
+static int get_cost(unsigned int ct[][2], vpx_prob p, int n) {
+  int i, p0 = p;
+  unsigned int total_ct[2] = { 0, 0 };
+  int cost = 0;
+
+  for (i = 0; i <= n; ++i) {
+    cost += cost_branch256(ct[i], p);
+    total_ct[0] += ct[i][0];
+    total_ct[1] += ct[i][1];
+    if (i < n)
+      p = vp10_merge_probs(p0, total_ct,
+                           COEF_COUNT_SAT_BITS, COEF_MAX_UPDATE_FACTOR_BITS);
+  }
+  return cost;
+}
+
+int vp10_prob_update_search_subframe(unsigned int ct[][2],
+                                     vpx_prob oldp, vpx_prob *bestp,
+                                     vpx_prob upd, int n) {
+  const int old_b = get_cost(ct, oldp, n);
+  int bestsavings = 0;
+  vpx_prob newp, bestnewp = oldp;
+  const int step = *bestp > oldp ? -1 : 1;
+
+  for (newp = *bestp; newp != oldp; newp += step) {
+    const int new_b = get_cost(ct, newp, n);
+    const int update_b = prob_diff_update_cost(newp, oldp) + vp10_cost_upd256;
+    const int savings = old_b - new_b - update_b;
+    if (savings > bestsavings) {
+      bestsavings = savings;
+      bestnewp = newp;
+    }
+  }
+  *bestp = bestnewp;
+  return bestsavings;
+}
+
+int vp10_prob_update_search_model_subframe(unsigned int ct[ENTROPY_NODES]
+                                                          [COEF_PROBS_BUFS][2],
+                                           const vpx_prob *oldp,
+                                           vpx_prob *bestp, vpx_prob upd,
+                                           int stepsize, int n) {
+  int i, old_b, new_b, update_b, savings, bestsavings;
+  int newp;
+  const int step_sign = *bestp > oldp[PIVOT_NODE] ? -1 : 1;
+  const int step = stepsize * step_sign;
+  vpx_prob bestnewp, newplist[ENTROPY_NODES], oldplist[ENTROPY_NODES];
+  vp10_model_to_full_probs(oldp, oldplist);
+  memcpy(newplist, oldp, sizeof(vpx_prob) * UNCONSTRAINED_NODES);
+  for (i = UNCONSTRAINED_NODES, old_b = 0; i < ENTROPY_NODES; ++i)
+    old_b += get_cost(ct[i], oldplist[i], n);
+  old_b += get_cost(ct[PIVOT_NODE], oldplist[PIVOT_NODE], n);
+
+  bestsavings = 0;
+  bestnewp = oldp[PIVOT_NODE];
+
+  assert(stepsize > 0);
+
+  for (newp = *bestp; (newp - oldp[PIVOT_NODE]) * step_sign < 0;
+      newp += step) {
+    if (newp < 1 || newp > 255)
+      continue;
+    newplist[PIVOT_NODE] = newp;
+    vp10_model_to_full_probs(newplist, newplist);
+    for (i = UNCONSTRAINED_NODES, new_b = 0; i < ENTROPY_NODES; ++i)
+      new_b += get_cost(ct[i], newplist[i], n);
+    new_b += get_cost(ct[PIVOT_NODE], newplist[PIVOT_NODE], n);
+    update_b = prob_diff_update_cost(newp, oldp[PIVOT_NODE]) +
+        vp10_cost_upd256;
+    savings = old_b - new_b - update_b;
+    if (savings > bestsavings) {
+      bestsavings = savings;
+      bestnewp = newp;
+    }
+  }
+
+  *bestp = bestnewp;
+  return bestsavings;
+}
+#endif  // CONFIG_ENTROPY
+
+void vp10_cond_prob_diff_update(vp10_writer *w, vpx_prob *oldp,
                                const unsigned int ct[2]) {
   const vpx_prob upd = DIFF_UPDATE_PROB;
   vpx_prob newp = get_binary_prob(ct[0], ct[1]);
@@ -189,11 +270,11 @@
                                                           upd);
   assert(newp >= 1);
   if (savings > 0) {
-    vpx_write(w, 1, upd);
+    vp10_write(w, 1, upd);
     vp10_write_prob_diff_update(w, newp, *oldp);
     *oldp = newp;
   } else {
-    vpx_write(w, 0, upd);
+    vp10_write(w, 0, upd);
   }
 }
 
diff --git a/vp10/encoder/subexp.h b/vp10/encoder/subexp.h
index 091334f..756b499 100644
--- a/vp10/encoder/subexp.h
+++ b/vp10/encoder/subexp.h
@@ -18,12 +18,12 @@
 
 #include "vpx_dsp/prob.h"
 
-struct vpx_writer;
+struct vp10_writer;
 
-void vp10_write_prob_diff_update(struct vpx_writer *w,
+void vp10_write_prob_diff_update(struct vp10_writer *w,
                                 vpx_prob newp, vpx_prob oldp);
 
-void vp10_cond_prob_diff_update(struct vpx_writer *w, vpx_prob *oldp,
+void vp10_cond_prob_diff_update(struct vp10_writer *w, vpx_prob *oldp,
                                const unsigned int ct[2]);
 
 int vp10_prob_diff_update_savings_search(const unsigned int *ct,
@@ -36,9 +36,20 @@
                                               vpx_prob *bestp,
                                               vpx_prob upd,
                                               int stepsize);
-
 int vp10_cond_prob_diff_update_savings(vpx_prob *oldp,
                                        const unsigned int ct[2]);
+
+#if CONFIG_ENTROPY
+int vp10_prob_update_search_subframe(unsigned int ct[][2],
+                                     vpx_prob oldp, vpx_prob *bestp,
+                                     vpx_prob upd, int n);
+int vp10_prob_update_search_model_subframe(unsigned int ct[ENTROPY_NODES]
+                                                          [COEF_PROBS_BUFS][2],
+                                           const vpx_prob *oldp,
+                                           vpx_prob *bestp, vpx_prob upd,
+                                           int stepsize, int n);
+#endif  // CONFIG_ENTROPY
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/encoder/temporal_filter.c b/vp10/encoder/temporal_filter.c
index 5278d3b..d125dae 100644
--- a/vp10/encoder/temporal_filter.c
+++ b/vp10/encoder/temporal_filter.c
@@ -45,11 +45,23 @@
                                             int x, int y) {
   const int which_mv = 0;
   const MV mv = { mv_row, mv_col };
-  const InterpKernel *const kernel =
-    vp10_filter_kernels[xd->mi[0]->mbmi.interp_filter];
-
   enum mv_precision mv_precision_uv;
   int uv_stride;
+
+#if USE_TEMPORALFILTER_12TAP
+#if CONFIG_DUAL_FILTER
+  const INTERP_FILTER interp_filter[4] = {
+      TEMPORALFILTER_12TAP, TEMPORALFILTER_12TAP,
+      TEMPORALFILTER_12TAP, TEMPORALFILTER_12TAP
+  };
+#else
+  const INTERP_FILTER interp_filter = TEMPORALFILTER_12TAP;
+#endif
+  (void)xd;
+#else
+  const INTERP_FILTER interp_filter = xd->mi[0]->mbmi.interp_filter;
+#endif  // USE_TEMPORALFILTER_12TAP
+
   if (uv_block_width == 8) {
     uv_stride = (stride + 1) >> 1;
     mv_precision_uv = MV_PRECISION_Q4;
@@ -66,7 +78,8 @@
                                      scale,
                                      16, 16,
                                      which_mv,
-                                     kernel, MV_PRECISION_Q3, x, y, xd->bd);
+                                     interp_filter,
+                                     MV_PRECISION_Q3, x, y, xd->bd);
 
     vp10_highbd_build_inter_predictor(u_mb_ptr, uv_stride,
                                      &pred[256], uv_block_width,
@@ -74,7 +87,8 @@
                                      scale,
                                      uv_block_width, uv_block_height,
                                      which_mv,
-                                     kernel, mv_precision_uv, x, y, xd->bd);
+                                     interp_filter,
+                                     mv_precision_uv, x, y, xd->bd);
 
     vp10_highbd_build_inter_predictor(v_mb_ptr, uv_stride,
                                      &pred[512], uv_block_width,
@@ -82,33 +96,34 @@
                                      scale,
                                      uv_block_width, uv_block_height,
                                      which_mv,
-                                     kernel, mv_precision_uv, x, y, xd->bd);
+                                     interp_filter,
+                                     mv_precision_uv, x, y, xd->bd);
     return;
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   vp10_build_inter_predictor(y_mb_ptr, stride,
-                            &pred[0], 16,
-                            &mv,
-                            scale,
-                            16, 16,
-                            which_mv,
-                            kernel, MV_PRECISION_Q3, x, y);
+                             &pred[0], 16,
+                             &mv,
+                             scale,
+                             16, 16,
+                             which_mv,
+                             interp_filter, MV_PRECISION_Q3, x, y);
 
   vp10_build_inter_predictor(u_mb_ptr, uv_stride,
-                            &pred[256], uv_block_width,
-                            &mv,
-                            scale,
-                            uv_block_width, uv_block_height,
-                            which_mv,
-                            kernel, mv_precision_uv, x, y);
+                             &pred[256], uv_block_width,
+                             &mv,
+                             scale,
+                             uv_block_width, uv_block_height,
+                             which_mv,
+                             interp_filter, mv_precision_uv, x, y);
 
   vp10_build_inter_predictor(v_mb_ptr, uv_stride,
-                            &pred[512], uv_block_width,
-                            &mv,
-                            scale,
-                            uv_block_width, uv_block_height,
-                            which_mv,
-                            kernel, mv_precision_uv, x, y);
+                             &pred[512], uv_block_width,
+                             &mv,
+                             scale,
+                             uv_block_width, uv_block_height,
+                             which_mv,
+                             interp_filter, mv_precision_uv, x, y);
 }
 
 void vp10_temporal_filter_init(void) {
@@ -120,14 +135,14 @@
 }
 
 void vp10_temporal_filter_apply_c(uint8_t *frame1,
-                                 unsigned int stride,
-                                 uint8_t *frame2,
-                                 unsigned int block_width,
-                                 unsigned int block_height,
-                                 int strength,
-                                 int filter_weight,
-                                 unsigned int *accumulator,
-                                 uint16_t *count) {
+                                  unsigned int stride,
+                                  uint8_t *frame2,
+                                  unsigned int block_width,
+                                  unsigned int block_height,
+                                  int strength,
+                                  int filter_weight,
+                                  unsigned int *accumulator,
+                                  uint16_t *count) {
   unsigned int i, j, k;
   int modifier;
   int byte = 0;
@@ -135,15 +150,38 @@
 
   for (i = 0, k = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++, k++) {
-      int src_byte = frame1[byte];
-      int pixel_value = *frame2++;
+      int pixel_value = *frame2;
 
-      modifier   = src_byte - pixel_value;
-      // This is an integer approximation of:
-      // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
-      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
-      modifier  *= modifier;
-      modifier  *= 3;
+      // Non-local mean approach: weight from the 3x3 mean squared difference
+      int diff_sse[9] = { 0 };
+      int idx, idy, index = 0;
+
+      for (idy = -1; idy <= 1; ++idy) {
+        for (idx = -1; idx <= 1; ++idx) {
+          int row = i + idy;
+          int col = j + idx;
+
+          if (row >= 0 && row < (int)block_height &&
+              col >= 0 && col < (int)block_width) {
+            int diff = frame1[byte + idy * (int)stride + idx] -
+                frame2[idy * (int)block_width + idx];
+            diff_sse[index] = diff * diff;
+            ++index;
+          }
+        }
+      }
+
+      assert(index > 0);
+
+      modifier = 0;
+      for (idx = 0; idx < 9; ++idx)
+        modifier += diff_sse[idx];
+
+      modifier *= 3;
+      modifier /= index;
+
+      ++frame2;
+
       modifier  += rounding;
       modifier >>= strength;
 
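As a reading aid, here is a minimal standalone sketch of the per-pixel weight
computation introduced in this hunk. The helper name and the rounding
definition (assumed to be 1 << (strength - 1)) are illustrative, not library
code; the cap/invert tail of the loop lies outside this hunk and is unchanged.

    #include <stdint.h>

    /* Non-local-mean style modifier for the pixel at (i, j): average the
     * squared prediction error over the valid taps of its 3x3 neighborhood,
     * then round and shift exactly as in the loop above.  A smaller result
     * means a better match, which later maps to a larger blend weight. */
    static int nlm_modifier(const uint8_t *frame1, int stride,
                            const uint8_t *frame2, int width, int height,
                            int i, int j, int strength) {
      const int rounding = strength > 0 ? 1 << (strength - 1) : 0;
      int sum = 0, taps = 0, idy, idx;
      for (idy = -1; idy <= 1; ++idy) {
        for (idx = -1; idx <= 1; ++idx) {
          const int row = i + idy, col = j + idx;
          if (row >= 0 && row < height && col >= 0 && col < width) {
            const int diff =
                frame1[row * stride + col] - frame2[row * width + col];
            sum += diff * diff;
            ++taps;
          }
        }
      }
      return (sum * 3 / taps + rounding) >> strength;
    }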
@@ -165,14 +203,14 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp10_highbd_temporal_filter_apply_c(uint8_t *frame1_8,
-                                        unsigned int stride,
-                                        uint8_t *frame2_8,
-                                        unsigned int block_width,
-                                        unsigned int block_height,
-                                        int strength,
-                                        int filter_weight,
-                                        unsigned int *accumulator,
-                                        uint16_t *count) {
+                                         unsigned int stride,
+                                         uint8_t *frame2_8,
+                                         unsigned int block_width,
+                                         unsigned int block_height,
+                                         int strength,
+                                         int filter_weight,
+                                         unsigned int *accumulator,
+                                         uint16_t *count) {
   uint16_t *frame1 = CONVERT_TO_SHORTPTR(frame1_8);
   uint16_t *frame2 = CONVERT_TO_SHORTPTR(frame2_8);
   unsigned int i, j, k;
@@ -182,15 +220,38 @@
 
   for (i = 0, k = 0; i < block_height; i++) {
     for (j = 0; j < block_width; j++, k++) {
-      int src_byte = frame1[byte];
-      int pixel_value = *frame2++;
+      int pixel_value = *frame2;
 
-      modifier   = src_byte - pixel_value;
-      // This is an integer approximation of:
-      // float coeff = (3.0 * modifer * modifier) / pow(2, strength);
-      // modifier =  (int)roundf(coeff > 16 ? 0 : 16-coeff);
-      modifier *= modifier;
+      // non-local mean approach
+      int diff_sse[9] = { 0 };
+      int idx, idy, index = 0;
+
+      for (idy = -1; idy <= 1; ++idy) {
+        for (idx = -1; idx <= 1; ++idx) {
+          int row = i + idy;
+          int col = j + idx;
+
+          if (row >= 0 && row < (int)block_height &&
+              col >= 0 && col < (int)block_width) {
+            int diff = frame1[byte + idy * (int)stride + idx] -
+                frame2[idy * (int)block_width + idx];
+            diff_sse[index] = diff * diff;
+            ++index;
+          }
+        }
+      }
+
+      assert(index > 0);
+
+      modifier = 0;
+      for (idx = 0; idx < 9; ++idx)
+        modifier += diff_sse[idx];
+
       modifier *= 3;
+      modifier /= index;
+
+      ++frame2;
+
       modifier += rounding;
       modifier >>= strength;
 
@@ -227,7 +288,6 @@
 
   MV best_ref_mv1 = {0, 0};
   MV best_ref_mv1_full; /* full-pixel value of best_ref_mv1 */
-  MV *ref_mv = &x->e_mbd.mi[0]->bmi[0].as_mv[0].as_mv;
 
   // Save input state
   struct buf_2d src = x->plane[0].src;
@@ -245,21 +305,29 @@
   step_param = mv_sf->reduce_first_step_size;
   step_param = VPXMIN(step_param, MAX_MVSEARCH_STEPS - 2);
 
+#if CONFIG_REF_MV
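+  // The MV rate tables live in per-reference cost stacks here; point the
+  // generic cost pointers (and their SAD-search aliases) at the first entry
+  // before running the search below.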
+  x->mvcost = x->mv_cost_stack[0];
+  x->nmvjointcost = x->nmv_vec_cost[0];
+  x->mvsadcost = x->mvcost;
+  x->nmvjointsadcost = x->nmvjointcost;
+#endif
+
   // Ignore mv costing by sending NULL pointer instead of cost arrays
   vp10_hex_search(x, &best_ref_mv1_full, step_param, sadpb, 1,
-                 cond_cost_list(cpi, cost_list),
-                 &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1, ref_mv);
+                  cond_cost_list(cpi, cost_list),
+                  &cpi->fn_ptr[BLOCK_16X16], 0, &best_ref_mv1);
 
   // Ignore mv costing by sending NULL pointer instead of cost array
-  bestsme = cpi->find_fractional_mv_step(x, ref_mv,
-                                         &best_ref_mv1,
+  bestsme = cpi->find_fractional_mv_step(x, &best_ref_mv1,
                                          cpi->common.allow_high_precision_mv,
                                          x->errorperbit,
                                          &cpi->fn_ptr[BLOCK_16X16],
                                          0, mv_sf->subpel_iters_per_step,
                                          cond_cost_list(cpi, cost_list),
                                          NULL, NULL,
-                                         &distortion, &sse, NULL, 0, 0);
+                                         &distortion, &sse, NULL, 0, 0, 0);
+
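+  // The fractional search reports its result through x->best_mv rather than
+  // an output parameter, so write it back into the block's mode info here.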
+  x->e_mbd.mi[0]->bmi[0].as_mv[0] = x->best_mv;
 
   // Restore input state
   x->plane[0].src = src;
@@ -382,50 +450,50 @@
             int adj_strength = strength + 2 * (mbd->bd - 8);
             // Apply the filter (YUV)
             vp10_highbd_temporal_filter_apply(f->y_buffer + mb_y_offset,
-                                             f->y_stride,
-                                             predictor, 16, 16, adj_strength,
-                                             filter_weight,
-                                             accumulator, count);
+                                              f->y_stride,
+                                              predictor, 16, 16, adj_strength,
+                                              filter_weight,
+                                              accumulator, count);
             vp10_highbd_temporal_filter_apply(f->u_buffer + mb_uv_offset,
-                                             f->uv_stride, predictor + 256,
-                                             mb_uv_width, mb_uv_height,
-                                             adj_strength,
-                                             filter_weight, accumulator + 256,
-                                             count + 256);
+                                              f->uv_stride, predictor + 256,
+                                              mb_uv_width, mb_uv_height,
+                                              adj_strength,
+                                              filter_weight, accumulator + 256,
+                                              count + 256);
             vp10_highbd_temporal_filter_apply(f->v_buffer + mb_uv_offset,
-                                             f->uv_stride, predictor + 512,
-                                             mb_uv_width, mb_uv_height,
-                                             adj_strength, filter_weight,
-                                             accumulator + 512, count + 512);
+                                              f->uv_stride, predictor + 512,
+                                              mb_uv_width, mb_uv_height,
+                                              adj_strength, filter_weight,
+                                              accumulator + 512, count + 512);
           } else {
             // Apply the filter (YUV)
-            vp10_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
-                                      predictor, 16, 16,
-                                      strength, filter_weight,
-                                      accumulator, count);
-            vp10_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
-                                      predictor + 256,
-                                      mb_uv_width, mb_uv_height, strength,
-                                      filter_weight, accumulator + 256,
-                                      count + 256);
-            vp10_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
-                                      predictor + 512,
-                                      mb_uv_width, mb_uv_height, strength,
-                                      filter_weight, accumulator + 512,
-                                      count + 512);
+            vp10_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
+                                         predictor, 16, 16,
+                                         strength, filter_weight,
+                                         accumulator, count);
+            vp10_temporal_filter_apply_c(f->u_buffer + mb_uv_offset,
+                                         f->uv_stride, predictor + 256,
+                                         mb_uv_width, mb_uv_height, strength,
+                                         filter_weight, accumulator + 256,
+                                         count + 256);
+            vp10_temporal_filter_apply_c(f->v_buffer + mb_uv_offset,
+                                         f->uv_stride, predictor + 512,
+                                         mb_uv_width, mb_uv_height, strength,
+                                         filter_weight, accumulator + 512,
+                                         count + 512);
           }
 #else
           // Apply the filter (YUV)
-          vp10_temporal_filter_apply(f->y_buffer + mb_y_offset, f->y_stride,
+          vp10_temporal_filter_apply_c(f->y_buffer + mb_y_offset, f->y_stride,
                                     predictor, 16, 16,
                                     strength, filter_weight,
                                     accumulator, count);
-          vp10_temporal_filter_apply(f->u_buffer + mb_uv_offset, f->uv_stride,
+          vp10_temporal_filter_apply_c(f->u_buffer + mb_uv_offset, f->uv_stride,
                                     predictor + 256,
                                     mb_uv_width, mb_uv_height, strength,
                                     filter_weight, accumulator + 256,
                                     count + 256);
-          vp10_temporal_filter_apply(f->v_buffer + mb_uv_offset, f->uv_stride,
+          vp10_temporal_filter_apply_c(f->v_buffer + mb_uv_offset, f->uv_stride,
                                     predictor + 512,
                                     mb_uv_width, mb_uv_height, strength,
                                     filter_weight, accumulator + 512,
diff --git a/vp10/encoder/tokenize.c b/vp10/encoder/tokenize.c
index a665a3c..c25f8bc 100644
--- a/vp10/encoder/tokenize.c
+++ b/vp10/encoder/tokenize.c
@@ -50,6 +50,35 @@
 const TOKENVALUE *vp10_dct_cat_lt_10_value_tokens = dct_cat_lt_10_value_tokens +
     (sizeof(dct_cat_lt_10_value_tokens) / sizeof(*dct_cat_lt_10_value_tokens))
     / 2;
+// The costs of the extra bits for the tokens in the table above are stored
+// in the table below. Each value is obtained by looking up the specified
+// extra bits in the cost table of the corresponding token, as defined in
+// the cost element of vp10_extra_bits,
+// e.g. {9, 63} maps to cat5_cost[63 >> 1], {1, 1} maps to sign_cost[1 >> 1]
+static const int dct_cat_lt_10_value_cost[] = {
+  3773, 3750, 3704, 3681, 3623, 3600, 3554, 3531,
+  3432, 3409, 3363, 3340, 3282, 3259, 3213, 3190,
+  3136, 3113, 3067, 3044, 2986, 2963, 2917, 2894,
+  2795, 2772, 2726, 2703, 2645, 2622, 2576, 2553,
+  3197, 3116, 3058, 2977, 2881, 2800,
+  2742, 2661, 2615, 2534, 2476, 2395,
+  2299, 2218, 2160, 2079,
+  2566, 2427, 2334, 2195, 2023, 1884, 1791, 1652,
+  1893, 1696, 1453, 1256, 1229, 864,
+  512, 512, 512, 512, 0,
+  512, 512, 512, 512,
+  864, 1229, 1256, 1453, 1696, 1893,
+  1652, 1791, 1884, 2023, 2195, 2334, 2427, 2566,
+  2079, 2160, 2218, 2299, 2395, 2476, 2534, 2615,
+  2661, 2742, 2800, 2881, 2977, 3058, 3116, 3197,
+  2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795,
+  2894, 2917, 2963, 2986, 3044, 3067, 3113, 3136,
+  3190, 3213, 3259, 3282, 3340, 3363, 3409, 3432,
+  3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773,
+};
+const int *vp10_dct_cat_lt_10_value_cost = dct_cat_lt_10_value_cost +
+    (sizeof(dct_cat_lt_10_value_cost) / sizeof(*dct_cat_lt_10_value_cost))
+    / 2;
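As with the token table above it, the cost table is exported through a
pointer to its middle (zero) entry, so it can be indexed directly by the
signed coefficient value. A hedged sketch of the intended lookup follows;
the helper name is illustrative, not part of the library API.

    /* Illustrative only: the exported pointer is centered on the zero
     * entry, so the extra-bits cost of a small coefficient is a direct
     * lookup by its signed value (within the cat<6 range). */
    static int small_coeff_extra_cost(int v) {
      return vp10_dct_cat_lt_10_value_cost[v];
    }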
 
 // Array indices are identical to previously-existing CONTEXT_NODE indices
 const vpx_tree_index vp10_coef_tree[TREE_SIZE(ENTROPY_TOKENS)] = {
@@ -75,292 +104,170 @@
     14, 14, 16, 16, 18, 18, 20, 20, 22, 22, 24, 24, 26, 26, 0, 0};
 
 static const int16_t zero_cost[] = {0};
-static const int16_t sign_cost[] = {255, 257};
-static const int16_t cat1_cost[] = {429, 431, 616, 618};
-static const int16_t cat2_cost[] = {624, 626, 727, 729, 848, 850, 951, 953};
-static const int16_t cat3_cost[] = {
-  820, 822, 893, 895, 940, 942, 1013, 1015, 1096, 1098, 1169, 1171, 1216, 1218,
-  1289, 1291
-};
-static const int16_t cat4_cost[] = {
-  1032, 1034, 1075, 1077, 1105, 1107, 1148, 1150, 1194, 1196, 1237, 1239,
-  1267, 1269, 1310, 1312, 1328, 1330, 1371, 1373, 1401, 1403, 1444, 1446,
-  1490, 1492, 1533, 1535, 1563, 1565, 1606, 1608
-};
-static const int16_t cat5_cost[] = {
-  1269, 1271, 1283, 1285, 1306, 1308, 1320,
-  1322, 1347, 1349, 1361, 1363, 1384, 1386, 1398, 1400, 1443, 1445, 1457,
-  1459, 1480, 1482, 1494, 1496, 1521, 1523, 1535, 1537, 1558, 1560, 1572,
-  1574, 1592, 1594, 1606, 1608, 1629, 1631, 1643, 1645, 1670, 1672, 1684,
-  1686, 1707, 1709, 1721, 1723, 1766, 1768, 1780, 1782, 1803, 1805, 1817,
-  1819, 1844, 1846, 1858, 1860, 1881, 1883, 1895, 1897
-};
+static const int16_t sign_cost[1] = {512};
+static const int16_t cat1_cost[1 << 1] = {864, 1229};
+static const int16_t cat2_cost[1 << 2] = {1256, 1453, 1696, 1893};
+static const int16_t cat3_cost[1 << 3] = {1652, 1791, 1884, 2023,
+                                          2195, 2334, 2427, 2566};
+static const int16_t cat4_cost[1 << 4] = {2079, 2160, 2218, 2299, 2395, 2476,
+                                          2534, 2615, 2661, 2742, 2800, 2881,
+                                          2977, 3058, 3116, 3197};
+static const int16_t cat5_cost[1 << 5] = {
+    2553, 2576, 2622, 2645, 2703, 2726, 2772, 2795, 2894, 2917, 2963,
+    2986, 3044, 3067, 3113, 3136, 3190, 3213, 3259, 3282, 3340, 3363,
+    3409, 3432, 3531, 3554, 3600, 3623, 3681, 3704, 3750, 3773};
 const int16_t vp10_cat6_low_cost[256] = {
-  1638, 1640, 1646, 1648, 1652, 1654, 1660, 1662,
-  1670, 1672, 1678, 1680, 1684, 1686, 1692, 1694, 1711, 1713, 1719, 1721,
-  1725, 1727, 1733, 1735, 1743, 1745, 1751, 1753, 1757, 1759, 1765, 1767,
-  1787, 1789, 1795, 1797, 1801, 1803, 1809, 1811, 1819, 1821, 1827, 1829,
-  1833, 1835, 1841, 1843, 1860, 1862, 1868, 1870, 1874, 1876, 1882, 1884,
-  1892, 1894, 1900, 1902, 1906, 1908, 1914, 1916, 1940, 1942, 1948, 1950,
-  1954, 1956, 1962, 1964, 1972, 1974, 1980, 1982, 1986, 1988, 1994, 1996,
-  2013, 2015, 2021, 2023, 2027, 2029, 2035, 2037, 2045, 2047, 2053, 2055,
-  2059, 2061, 2067, 2069, 2089, 2091, 2097, 2099, 2103, 2105, 2111, 2113,
-  2121, 2123, 2129, 2131, 2135, 2137, 2143, 2145, 2162, 2164, 2170, 2172,
-  2176, 2178, 2184, 2186, 2194, 2196, 2202, 2204, 2208, 2210, 2216, 2218,
-  2082, 2084, 2090, 2092, 2096, 2098, 2104, 2106, 2114, 2116, 2122, 2124,
-  2128, 2130, 2136, 2138, 2155, 2157, 2163, 2165, 2169, 2171, 2177, 2179,
-  2187, 2189, 2195, 2197, 2201, 2203, 2209, 2211, 2231, 2233, 2239, 2241,
-  2245, 2247, 2253, 2255, 2263, 2265, 2271, 2273, 2277, 2279, 2285, 2287,
-  2304, 2306, 2312, 2314, 2318, 2320, 2326, 2328, 2336, 2338, 2344, 2346,
-  2350, 2352, 2358, 2360, 2384, 2386, 2392, 2394, 2398, 2400, 2406, 2408,
-  2416, 2418, 2424, 2426, 2430, 2432, 2438, 2440, 2457, 2459, 2465, 2467,
-  2471, 2473, 2479, 2481, 2489, 2491, 2497, 2499, 2503, 2505, 2511, 2513,
-  2533, 2535, 2541, 2543, 2547, 2549, 2555, 2557, 2565, 2567, 2573, 2575,
-  2579, 2581, 2587, 2589, 2606, 2608, 2614, 2616, 2620, 2622, 2628, 2630,
-  2638, 2640, 2646, 2648, 2652, 2654, 2660, 2662
-};
-const int16_t vp10_cat6_high_cost[128] = {
-  72, 892, 1183, 2003, 1448, 2268, 2559, 3379,
-  1709, 2529, 2820, 3640, 3085, 3905, 4196, 5016, 2118, 2938, 3229, 4049,
-  3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062,
-  2118, 2938, 3229, 4049, 3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686,
-  5131, 5951, 6242, 7062, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471,
-  5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 2118, 2938, 3229, 4049,
-  3494, 4314, 4605, 5425, 3755, 4575, 4866, 5686, 5131, 5951, 6242, 7062,
-  4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471, 5801, 6621, 6912, 7732,
-  7177, 7997, 8288, 9108, 4164, 4984, 5275, 6095, 5540, 6360, 6651, 7471,
-  5801, 6621, 6912, 7732, 7177, 7997, 8288, 9108, 6210, 7030, 7321, 8141,
-  7586, 8406, 8697, 9517, 7847, 8667, 8958, 9778, 9223, 10043, 10334, 11154
-};
+    3378, 3390, 3401, 3413, 3435, 3447, 3458, 3470, 3517, 3529, 3540, 3552,
+    3574, 3586, 3597, 3609, 3671, 3683, 3694, 3706, 3728, 3740, 3751, 3763,
+    3810, 3822, 3833, 3845, 3867, 3879, 3890, 3902, 3973, 3985, 3996, 4008,
+    4030, 4042, 4053, 4065, 4112, 4124, 4135, 4147, 4169, 4181, 4192, 4204,
+    4266, 4278, 4289, 4301, 4323, 4335, 4346, 4358, 4405, 4417, 4428, 4440,
+    4462, 4474, 4485, 4497, 4253, 4265, 4276, 4288, 4310, 4322, 4333, 4345,
+    4392, 4404, 4415, 4427, 4449, 4461, 4472, 4484, 4546, 4558, 4569, 4581,
+    4603, 4615, 4626, 4638, 4685, 4697, 4708, 4720, 4742, 4754, 4765, 4777,
+    4848, 4860, 4871, 4883, 4905, 4917, 4928, 4940, 4987, 4999, 5010, 5022,
+    5044, 5056, 5067, 5079, 5141, 5153, 5164, 5176, 5198, 5210, 5221, 5233,
+    5280, 5292, 5303, 5315, 5337, 5349, 5360, 5372, 4988, 5000, 5011, 5023,
+    5045, 5057, 5068, 5080, 5127, 5139, 5150, 5162, 5184, 5196, 5207, 5219,
+    5281, 5293, 5304, 5316, 5338, 5350, 5361, 5373, 5420, 5432, 5443, 5455,
+    5477, 5489, 5500, 5512, 5583, 5595, 5606, 5618, 5640, 5652, 5663, 5675,
+    5722, 5734, 5745, 5757, 5779, 5791, 5802, 5814, 5876, 5888, 5899, 5911,
+    5933, 5945, 5956, 5968, 6015, 6027, 6038, 6050, 6072, 6084, 6095, 6107,
+    5863, 5875, 5886, 5898, 5920, 5932, 5943, 5955, 6002, 6014, 6025, 6037,
+    6059, 6071, 6082, 6094, 6156, 6168, 6179, 6191, 6213, 6225, 6236, 6248,
+    6295, 6307, 6318, 6330, 6352, 6364, 6375, 6387, 6458, 6470, 6481, 6493,
+    6515, 6527, 6538, 6550, 6597, 6609, 6620, 6632, 6654, 6666, 6677, 6689,
+    6751, 6763, 6774, 6786, 6808, 6820, 6831, 6843, 6890, 6902, 6913, 6925,
+    6947, 6959, 6970, 6982};
+const int vp10_cat6_high_cost[64] = {
+    88,    2251,  2727,  4890,  3148,  5311,  5787,  7950,  3666,  5829,  6305,
+    8468,  6726,  8889,  9365,  11528, 3666,  5829,  6305,  8468,  6726,  8889,
+    9365,  11528, 7244,  9407,  9883,  12046, 10304, 12467, 12943, 15106, 3666,
+    5829,  6305,  8468,  6726,  8889,  9365,  11528, 7244,  9407,  9883,  12046,
+    10304, 12467, 12943, 15106, 7244,  9407,  9883,  12046, 10304, 12467, 12943,
+    15106, 10822, 12985, 13461, 15624, 13882, 16045, 16521, 18684};
 
 #if CONFIG_VP9_HIGHBITDEPTH
-const int16_t vp10_cat6_high10_high_cost[512] = {
-  74, 894, 1185, 2005, 1450, 2270, 2561,
-  3381, 1711, 2531, 2822, 3642, 3087, 3907, 4198, 5018, 2120, 2940, 3231,
-  4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244,
-  7064, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868,
-  5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
-  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 2120, 2940, 3231,
-  4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133, 5953, 6244,
-  7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914,
-  7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
-  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323,
-  8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336,
-  11156, 2120, 2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868,
-  5688, 5133, 5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
-  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277,
-  6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290,
-  9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
-  9780, 9225, 10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653,
-  7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323,
-  8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336,
-  11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
-  9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454,
-  10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 2120,
-  2940, 3231, 4051, 3496, 4316, 4607, 5427, 3757, 4577, 4868, 5688, 5133,
-  5953, 6244, 7064, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803,
-  6623, 6914, 7734, 7179, 7999, 8290, 9110, 4166, 4986, 5277, 6097, 5542,
-  6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212,
-  7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225,
-  10045, 10336, 11156, 4166, 4986, 5277, 6097, 5542, 6362, 6653, 7473, 5803,
-  6623, 6914, 7734, 7179, 7999, 8290, 9110, 6212, 7032, 7323, 8143, 7588,
-  8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 6212,
-  7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960, 9780, 9225,
-  10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454, 10745, 11565,
-  9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 4166, 4986, 5277,
-  6097, 5542, 6362, 6653, 7473, 5803, 6623, 6914, 7734, 7179, 7999, 8290,
-  9110, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669, 8960,
-  9780, 9225, 10045, 10336, 11156, 6212, 7032, 7323, 8143, 7588, 8408, 8699,
-  9519, 7849, 8669, 8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369,
-  10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091,
-  12382, 13202, 6212, 7032, 7323, 8143, 7588, 8408, 8699, 9519, 7849, 8669,
-  8960, 9780, 9225, 10045, 10336, 11156, 8258, 9078, 9369, 10189, 9634, 10454,
-  10745, 11565, 9895, 10715, 11006, 11826, 11271, 12091, 12382, 13202, 8258,
-  9078, 9369, 10189, 9634, 10454, 10745, 11565, 9895, 10715, 11006, 11826,
-  11271, 12091, 12382, 13202, 10304, 11124, 11415, 12235, 11680, 12500, 12791,
-  13611, 11941, 12761, 13052, 13872, 13317, 14137, 14428, 15248,
-};
-const int16_t vp10_cat6_high12_high_cost[2048] = {
-  76, 896, 1187, 2007, 1452, 2272, 2563,
-  3383, 1713, 2533, 2824, 3644, 3089, 3909, 4200, 5020, 2122, 2942, 3233,
-  4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246,
-  7066, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870,
-  5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
-  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 2122, 2942, 3233,
-  4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246,
-  7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
-  7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
-  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325,
-  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
-  11158, 2122, 2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870,
-  5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
-  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279,
-  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
-  9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
-  9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655,
-  7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325,
-  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
-  11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
-  9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
-  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 2122,
-  2942, 3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135,
-  5955, 6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805,
-  6625, 6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544,
-  6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214,
-  7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
-  10047, 10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805,
-  6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590,
-  8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214,
-  7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
-  10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
-  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279,
-  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
-  9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
-  9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
-  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371,
-  10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
-  12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671,
-  8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
-  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260,
-  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
-  11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
-  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 2122, 2942,
-  3233, 4053, 3498, 4318, 4609, 5429, 3759, 4579, 4870, 5690, 5135, 5955,
-  6246, 7066, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
-  6916, 7736, 7181, 8001, 8292, 9112, 4168, 4988, 5279, 6099, 5544, 6364,
-  6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034,
-  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
-  10338, 11158, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
-  6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410,
-  8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034,
-  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
-  10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
-  10717, 11008, 11828, 11273, 12093, 12384, 13204, 4168, 4988, 5279, 6099,
-  5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112,
-  6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782,
-  9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521,
-  7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191,
-  9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
-  13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
-  9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456,
-  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260,
-  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
-  11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
-  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 4168, 4988,
-  5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001,
-  8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671,
-  8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410,
-  8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080,
-  9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273,
-  12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851,
-  8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636,
-  10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
-  8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
-  11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502,
-  12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 6214,
-  7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227,
-  10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
-  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371,
-  10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
-  12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
-  12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191,
-  9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
-  13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
-  13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682,
-  12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250,
-  12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100,
-  15920, 15365, 16185, 16476, 17296, 2122, 2942, 3233, 4053, 3498, 4318, 4609,
-  5429, 3759, 4579, 4870, 5690, 5135, 5955, 6246, 7066, 4168, 4988, 5279,
-  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
-  9112, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
-  7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
-  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 4168, 4988, 5279,
-  6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916, 7736, 7181, 8001, 8292,
-  9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962,
-  9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
-  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371,
-  10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093,
-  12384, 13204, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625,
-  6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410,
-  8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034,
-  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
-  10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
-  10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145,
-  7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158,
-  8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
-  11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456,
-  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306,
-  11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874,
-  13319, 14139, 14430, 15250, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475,
-  5805, 6625, 6916, 7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145,
-  7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158,
-  6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782,
-  9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
-  11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034,
-  7325, 8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047,
-  10338, 11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897,
-  10717, 11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191,
-  9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384,
-  13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
-  13054, 13874, 13319, 14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590,
-  8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260,
-  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
-  11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
-  11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126,
-  11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
-  14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
-  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417,
-  12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139,
-  14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
-  12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283,
-  13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476,
-  17296, 4168, 4988, 5279, 6099, 5544, 6364, 6655, 7475, 5805, 6625, 6916,
-  7736, 7181, 8001, 8292, 9112, 6214, 7034, 7325, 8145, 7590, 8410, 8701,
-  9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 6214, 7034, 7325,
-  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
-  11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717,
-  11008, 11828, 11273, 12093, 12384, 13204, 6214, 7034, 7325, 8145, 7590,
-  8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260,
-  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
-  11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636, 10456, 10747,
-  11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126,
-  11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
-  14139, 14430, 15250, 6214, 7034, 7325, 8145, 7590, 8410, 8701, 9521, 7851,
-  8671, 8962, 9782, 9227, 10047, 10338, 11158, 8260, 9080, 9371, 10191, 9636,
-  10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
-  8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008,
-  11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502,
-  12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 8260,
-  9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717, 11008, 11828,
-  11273, 12093, 12384, 13204, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
-  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 10306, 11126,
-  11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319,
-  14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659,
-  13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296, 6214, 7034, 7325,
-  8145, 7590, 8410, 8701, 9521, 7851, 8671, 8962, 9782, 9227, 10047, 10338,
-  11158, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567, 9897, 10717,
-  11008, 11828, 11273, 12093, 12384, 13204, 8260, 9080, 9371, 10191, 9636,
-  10456, 10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204,
-  10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054,
-  13874, 13319, 14139, 14430, 15250, 8260, 9080, 9371, 10191, 9636, 10456,
-  10747, 11567, 9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306,
-  11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874,
-  13319, 14139, 14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793,
-  13613, 11943, 12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172,
-  13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365,
-  16185, 16476, 17296, 8260, 9080, 9371, 10191, 9636, 10456, 10747, 11567,
-  9897, 10717, 11008, 11828, 11273, 12093, 12384, 13204, 10306, 11126, 11417,
-  12237, 11682, 12502, 12793, 13613, 11943, 12763, 13054, 13874, 13319, 14139,
-  14430, 15250, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943,
-  12763, 13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283,
-  13728, 14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476,
-  17296, 10306, 11126, 11417, 12237, 11682, 12502, 12793, 13613, 11943, 12763,
-  13054, 13874, 13319, 14139, 14430, 15250, 12352, 13172, 13463, 14283, 13728,
-  14548, 14839, 15659, 13989, 14809, 15100, 15920, 15365, 16185, 16476, 17296,
-  12352, 13172, 13463, 14283, 13728, 14548, 14839, 15659, 13989, 14809, 15100,
-  15920, 15365, 16185, 16476, 17296, 14398, 15218, 15509, 16329, 15774, 16594,
-  16885, 17705, 16035, 16855, 17146, 17966, 17411, 18231, 18522, 19342
-};
+const int vp10_cat6_high10_high_cost[256] = {
+    94,    2257,  2733,  4896,  3154,  5317,  5793,  7956,  3672,  5835,  6311,
+    8474,  6732,  8895,  9371,  11534, 3672,  5835,  6311,  8474,  6732,  8895,
+    9371,  11534, 7250,  9413,  9889,  12052, 10310, 12473, 12949, 15112, 3672,
+    5835,  6311,  8474,  6732,  8895,  9371,  11534, 7250,  9413,  9889,  12052,
+    10310, 12473, 12949, 15112, 7250,  9413,  9889,  12052, 10310, 12473, 12949,
+    15112, 10828, 12991, 13467, 15630, 13888, 16051, 16527, 18690, 4187,  6350,
+    6826,  8989,  7247,  9410,  9886,  12049, 7765,  9928,  10404, 12567, 10825,
+    12988, 13464, 15627, 7765,  9928,  10404, 12567, 10825, 12988, 13464, 15627,
+    11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205, 7765,  9928,  10404,
+    12567, 10825, 12988, 13464, 15627, 11343, 13506, 13982, 16145, 14403, 16566,
+    17042, 19205, 11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205, 14921,
+    17084, 17560, 19723, 17981, 20144, 20620, 22783, 4187,  6350,  6826,  8989,
+    7247,  9410,  9886,  12049, 7765,  9928,  10404, 12567, 10825, 12988, 13464,
+    15627, 7765,  9928,  10404, 12567, 10825, 12988, 13464, 15627, 11343, 13506,
+    13982, 16145, 14403, 16566, 17042, 19205, 7765,  9928,  10404, 12567, 10825,
+    12988, 13464, 15627, 11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205,
+    11343, 13506, 13982, 16145, 14403, 16566, 17042, 19205, 14921, 17084, 17560,
+    19723, 17981, 20144, 20620, 22783, 8280,  10443, 10919, 13082, 11340, 13503,
+    13979, 16142, 11858, 14021, 14497, 16660, 14918, 17081, 17557, 19720, 11858,
+    14021, 14497, 16660, 14918, 17081, 17557, 19720, 15436, 17599, 18075, 20238,
+    18496, 20659, 21135, 23298, 11858, 14021, 14497, 16660, 14918, 17081, 17557,
+    19720, 15436, 17599, 18075, 20238, 18496, 20659, 21135, 23298, 15436, 17599,
+    18075, 20238, 18496, 20659, 21135, 23298, 19014, 21177, 21653, 23816, 22074,
+    24237, 24713, 26876};
+const int vp10_cat6_high12_high_cost[1024] = {
+    100,   2263,  2739,  4902,  3160,  5323,  5799,  7962,  3678,  5841,  6317,
+    8480,  6738,  8901,  9377,  11540, 3678,  5841,  6317,  8480,  6738,  8901,
+    9377,  11540, 7256,  9419,  9895,  12058, 10316, 12479, 12955, 15118, 3678,
+    5841,  6317,  8480,  6738,  8901,  9377,  11540, 7256,  9419,  9895,  12058,
+    10316, 12479, 12955, 15118, 7256,  9419,  9895,  12058, 10316, 12479, 12955,
+    15118, 10834, 12997, 13473, 15636, 13894, 16057, 16533, 18696, 4193,  6356,
+    6832,  8995,  7253,  9416,  9892,  12055, 7771,  9934,  10410, 12573, 10831,
+    12994, 13470, 15633, 7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633,
+    11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 7771,  9934,  10410,
+    12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572,
+    17048, 19211, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927,
+    17090, 17566, 19729, 17987, 20150, 20626, 22789, 4193,  6356,  6832,  8995,
+    7253,  9416,  9892,  12055, 7771,  9934,  10410, 12573, 10831, 12994, 13470,
+    15633, 7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512,
+    13988, 16151, 14409, 16572, 17048, 19211, 7771,  9934,  10410, 12573, 10831,
+    12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211,
+    11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, 17566,
+    19729, 17987, 20150, 20626, 22789, 8286,  10449, 10925, 13088, 11346, 13509,
+    13985, 16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864,
+    14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244,
+    18502, 20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563,
+    19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605,
+    18081, 20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080,
+    24243, 24719, 26882, 4193,  6356,  6832,  8995,  7253,  9416,  9892,  12055,
+    7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633, 7771,  9934,  10410,
+    12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572,
+    17048, 19211, 7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633, 11349,
+    13512, 13988, 16151, 14409, 16572, 17048, 19211, 11349, 13512, 13988, 16151,
+    14409, 16572, 17048, 19211, 14927, 17090, 17566, 19729, 17987, 20150, 20626,
+    22789, 8286,  10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027,
+    14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924,
+    17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304,
+    11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081,
+    20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665,
+    21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 8286,
+    10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, 14503, 16666,
+    14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, 17087, 17563,
+    19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 11864, 14027,
+    14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502,
+    20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304,
+    19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, 14542, 15018,
+    17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180,
+    21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535,
+    21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759,
+    19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234,
+    27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276,
+    25752, 27915, 26173, 28336, 28812, 30975, 4193,  6356,  6832,  8995,  7253,
+    9416,  9892,  12055, 7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633,
+    7771,  9934,  10410, 12573, 10831, 12994, 13470, 15633, 11349, 13512, 13988,
+    16151, 14409, 16572, 17048, 19211, 7771,  9934,  10410, 12573, 10831, 12994,
+    13470, 15633, 11349, 13512, 13988, 16151, 14409, 16572, 17048, 19211, 11349,
+    13512, 13988, 16151, 14409, 16572, 17048, 19211, 14927, 17090, 17566, 19729,
+    17987, 20150, 20626, 22789, 8286,  10449, 10925, 13088, 11346, 13509, 13985,
+    16148, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027,
+    14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502,
+    20665, 21141, 23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726,
+    15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081,
+    20244, 18502, 20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243,
+    24719, 26882, 8286,  10449, 10925, 13088, 11346, 13509, 13985, 16148, 11864,
+    14027, 14503, 16666, 14924, 17087, 17563, 19726, 11864, 14027, 14503, 16666,
+    14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665, 21141,
+    23304, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726, 15442, 17605,
+    18081, 20244, 18502, 20665, 21141, 23304, 15442, 17605, 18081, 20244, 18502,
+    20665, 21141, 23304, 19020, 21183, 21659, 23822, 22080, 24243, 24719, 26882,
+    12379, 14542, 15018, 17181, 15439, 17602, 18078, 20241, 15957, 18120, 18596,
+    20759, 19017, 21180, 21656, 23819, 15957, 18120, 18596, 20759, 19017, 21180,
+    21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 15957,
+    18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337,
+    22595, 24758, 25234, 27397, 19535, 21698, 22174, 24337, 22595, 24758, 25234,
+    27397, 23113, 25276, 25752, 27915, 26173, 28336, 28812, 30975, 8286,  10449,
+    10925, 13088, 11346, 13509, 13985, 16148, 11864, 14027, 14503, 16666, 14924,
+    17087, 17563, 19726, 11864, 14027, 14503, 16666, 14924, 17087, 17563, 19726,
+    15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 11864, 14027, 14503,
+    16666, 14924, 17087, 17563, 19726, 15442, 17605, 18081, 20244, 18502, 20665,
+    21141, 23304, 15442, 17605, 18081, 20244, 18502, 20665, 21141, 23304, 19020,
+    21183, 21659, 23822, 22080, 24243, 24719, 26882, 12379, 14542, 15018, 17181,
+    15439, 17602, 18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, 21656,
+    23819, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698,
+    22174, 24337, 22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759, 19017,
+    21180, 21656, 23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397,
+    19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, 25752,
+    27915, 26173, 28336, 28812, 30975, 12379, 14542, 15018, 17181, 15439, 17602,
+    18078, 20241, 15957, 18120, 18596, 20759, 19017, 21180, 21656, 23819, 15957,
+    18120, 18596, 20759, 19017, 21180, 21656, 23819, 19535, 21698, 22174, 24337,
+    22595, 24758, 25234, 27397, 15957, 18120, 18596, 20759, 19017, 21180, 21656,
+    23819, 19535, 21698, 22174, 24337, 22595, 24758, 25234, 27397, 19535, 21698,
+    22174, 24337, 22595, 24758, 25234, 27397, 23113, 25276, 25752, 27915, 26173,
+    28336, 28812, 30975, 16472, 18635, 19111, 21274, 19532, 21695, 22171, 24334,
+    20050, 22213, 22689, 24852, 23110, 25273, 25749, 27912, 20050, 22213, 22689,
+    24852, 23110, 25273, 25749, 27912, 23628, 25791, 26267, 28430, 26688, 28851,
+    29327, 31490, 20050, 22213, 22689, 24852, 23110, 25273, 25749, 27912, 23628,
+    25791, 26267, 28430, 26688, 28851, 29327, 31490, 23628, 25791, 26267, 28430,
+    26688, 28851, 29327, 31490, 27206, 29369, 29845, 32008, 30266, 32429, 32905,
+    35068};
 #endif
 
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -428,11 +335,12 @@
 };
 #endif
 
+#if !CONFIG_ANS
 const struct vp10_token vp10_coef_encodings[ENTROPY_TOKENS] = {
   {2, 2}, {6, 3}, {28, 5}, {58, 6}, {59, 6}, {60, 6}, {61, 6}, {124, 7},
   {125, 7}, {126, 7}, {127, 7}, {0, 1}
 };
-
+#endif  // !CONFIG_ANS
 
 struct tokenize_b_args {
   VP10_COMP *cpi;
@@ -455,12 +363,17 @@
 }
 
 static INLINE void add_token(TOKENEXTRA **t, const vpx_prob *context_tree,
+#if CONFIG_ANS
+                             const rans_dec_lut *token_cdf,
+#endif  // CONFIG_ANS
                              int32_t extra, uint8_t token,
-                             uint8_t skip_eob_node,
-                             unsigned int *counts) {
+                             uint8_t skip_eob_node, unsigned int *counts) {
   (*t)->token = token;
   (*t)->extra = extra;
   (*t)->context_tree = context_tree;
+#if CONFIG_ANS
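+  // Stash the rANS CDF alongside the token so the later pack stage can
+  // entropy-code it without re-deriving the distribution.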
+  (*t)->token_cdf = token_cdf;
+#endif  // CONFIG_ANS
   (*t)->skip_eob_node = skip_eob_node;
   (*t)++;
   ++counts[token];
@@ -484,6 +397,43 @@
   return segfeature_active(seg, segment_id, SEG_LVL_SKIP) ? 0 : eob_max;
 }
 
+void vp10_tokenize_palette_sb(struct ThreadData *const td,
+                              BLOCK_SIZE bsize, int plane,
+                              TOKENEXTRA **t) {
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
+  uint8_t *color_map = xd->plane[plane != 0].color_index_map;
+  PALETTE_MODE_INFO *pmi = &mbmi->palette_mode_info;
+  int n = pmi->palette_size[plane != 0];
+  int i, j, k;
+  int color_new_idx = -1, color_ctx, color_order[PALETTE_MAX_SIZE];
+  const int rows = (4 * num_4x4_blocks_high_lookup[bsize]) >>
+      (xd->plane[plane != 0].subsampling_y);
+  const int cols = (4 * num_4x4_blocks_wide_lookup[bsize]) >>
+      (xd->plane[plane != 0].subsampling_x);
+  const vpx_prob (* const probs)[PALETTE_COLOR_CONTEXTS][PALETTE_COLORS - 1] =
+      plane == 0 ? vp10_default_palette_y_color_prob :
+          vp10_default_palette_uv_color_prob;
+
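+  // The top-left color index is coded separately (it has no coded
+  // neighbors), so the scan starts at (0, 1); every other index is coded
+  // with a context derived from its previously-coded neighbors.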
+  for (i = 0; i < rows; ++i) {
+    for (j = (i == 0 ? 1 : 0); j < cols; ++j) {
+      color_ctx = vp10_get_palette_color_context(color_map, cols, i, j, n,
+                                                 color_order);
+      for (k = 0; k < n; ++k)
+        if (color_map[i * cols + j] == color_order[k]) {
+          color_new_idx = k;
+          break;
+        }
+      assert(color_new_idx >= 0 && color_new_idx < n);
+      (*t)->token = color_new_idx;
+      (*t)->context_tree = probs[n - 2][color_ctx];
+      (*t)->skip_eob_node = 0;
+      ++(*t);
+    }
+  }
+}
+
 static void tokenize_b(int plane, int block, int blk_row, int blk_col,
                        BLOCK_SIZE plane_bsize,
                        TX_SIZE tx_size, void *arg) {
@@ -493,7 +443,7 @@
   MACROBLOCK *const x = &td->mb;
   MACROBLOCKD *const xd = &x->e_mbd;
   TOKENEXTRA **tp = args->tp;
-  uint8_t token_cache[32 * 32];
+  uint8_t token_cache[MAX_TX_SQUARE];
   struct macroblock_plane *p = &x->plane[plane];
   struct macroblockd_plane *pd = &xd->plane[plane];
   MB_MODE_INFO *mbmi = &xd->mi[0]->mbmi;
@@ -503,19 +453,34 @@
   int eob = p->eobs[block];
   const PLANE_TYPE type = pd->plane_type;
   const tran_low_t *qcoeff = BLOCK_OFFSET(p->qcoeff, block);
+#if CONFIG_SUPERTX
+  const int segment_id = VPXMIN(mbmi->segment_id, mbmi->segment_id_supertx);
+#else
   const int segment_id = mbmi->segment_id;
+#endif  // CONFIG_SUPERTX
   const int16_t *scan, *nb;
-  const TX_TYPE tx_type = get_tx_type(type, xd, block);
-  const scan_order *const so = get_scan(tx_size, tx_type);
+  const TX_TYPE tx_type = get_tx_type(type, xd, block, tx_size);
+  const scan_order *const so = get_scan(tx_size, tx_type, is_inter_block(mbmi));
   const int ref = is_inter_block(mbmi);
   unsigned int (*const counts)[COEFF_CONTEXTS][ENTROPY_TOKENS] =
       td->rd_counts.coef_counts[tx_size][type][ref];
+#if CONFIG_ENTROPY
+  vpx_prob (*coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
+      cpi->subframe_stats.coef_probs_buf[cpi->common.coef_probs_update_idx]
+                                        [tx_size][type][ref];
+#else
   vpx_prob (*const coef_probs)[COEFF_CONTEXTS][UNCONSTRAINED_NODES] =
       cpi->common.fc->coef_probs[tx_size][type][ref];
+#endif  // CONFIG_ENTROPY
+#if CONFIG_ANS
+  rans_dec_lut (*const coef_cdfs)[COEFF_CONTEXTS] =
+      cpi->common.fc->coef_cdfs[tx_size][type][ref];
+#endif  // CONFIG_ANS
   unsigned int (*const eob_branch)[COEFF_CONTEXTS] =
       td->counts->eob_branch[tx_size][type][ref];
   const uint8_t *const band = get_band_translate(tx_size);
   const int seg_eob = get_tx_eob(&cpi->common.seg, segment_id, tx_size);
+  int skip_eob = 0;
   int16_t token;
   EXTRABIT extra;
   pt = get_entropy_context(tx_size, pd->above_context + blk_col,
@@ -525,31 +490,21 @@
   c = 0;
 
   while (c < eob) {
-    int v = 0;
-    int skip_eob = 0;
-    v = qcoeff[scan[c]];
-
-    while (!v) {
-      add_token_no_extra(&t, coef_probs[band[c]][pt], ZERO_TOKEN, skip_eob,
-                         counts[band[c]][pt]);
-      eob_branch[band[c]][pt] += !skip_eob;
-
-      skip_eob = 1;
-      token_cache[scan[c]] = 0;
-      ++c;
-      pt = get_coef_context(nb, token_cache, c);
-      v = qcoeff[scan[c]];
-    }
+    const int v = qcoeff[scan[c]];
+    eob_branch[band[c]][pt] += !skip_eob;
 
     vp10_get_token_extra(v, &token, &extra);
 
-    add_token(&t, coef_probs[band[c]][pt], extra, (uint8_t)token,
-              (uint8_t)skip_eob, counts[band[c]][pt]);
-    eob_branch[band[c]][pt] += !skip_eob;
+    add_token(&t, coef_probs[band[c]][pt],
+#if CONFIG_ANS
+              (const rans_dec_lut*)&coef_cdfs[band[c]][pt],
+#endif  // CONFIG_ANS
+              extra, (uint8_t)token, (uint8_t)skip_eob, counts[band[c]][pt]);
 
     token_cache[scan[c]] = vp10_pt_energy_class[token];
     ++c;
     pt = get_coef_context(nb, token_cache, c);
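+    // An EOB token cannot immediately follow a zero token, so the next
+    // symbol's EOB probability node is skipped whenever this one was zero.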
+    skip_eob = (token == ZERO_TOKEN);
   }
   if (c < seg_eob) {
     add_token_no_extra(&t, coef_probs[band[c]][pt], EOB_TOKEN, 0,
@@ -609,6 +564,118 @@
   return result;
 }
 
+#if CONFIG_VAR_TX
+void tokenize_tx(ThreadData *td, TOKENEXTRA **t,
+                 int dry_run, TX_SIZE tx_size, BLOCK_SIZE plane_bsize,
+                 int blk_row, int blk_col, int block, int plane,
+                 void *arg) {
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  const struct macroblockd_plane *const pd = &xd->plane[plane];
+  const BLOCK_SIZE bsize = txsize_to_bsize[tx_size];
+  const int tx_row = blk_row >> (1 - pd->subsampling_y);
+  const int tx_col = blk_col >> (1 - pd->subsampling_x);
+  const TX_SIZE plane_tx_size = plane ?
+      get_uv_tx_size_impl(mbmi->inter_tx_size[tx_row][tx_col], bsize, 0, 0) :
+      mbmi->inter_tx_size[tx_row][tx_col];
+
+  int max_blocks_high = num_4x4_blocks_high_lookup[plane_bsize];
+  int max_blocks_wide = num_4x4_blocks_wide_lookup[plane_bsize];
+  if (xd->mb_to_bottom_edge < 0)
+    max_blocks_high += xd->mb_to_bottom_edge >> (5 + pd->subsampling_y);
+  if (xd->mb_to_right_edge < 0)
+    max_blocks_wide += xd->mb_to_right_edge >> (5 + pd->subsampling_x);
+
+  if (blk_row >= max_blocks_high || blk_col >= max_blocks_wide)
+    return;
+
+  if (tx_size == plane_tx_size) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    BLOCK_SIZE plane_bsize = get_plane_block_size(mbmi->sb_type, pd);
+    if (!dry_run)
+      tokenize_b(plane, block, blk_row, blk_col, plane_bsize, tx_size, arg);
+    else
+      set_entropy_context_b(plane, block, blk_row, blk_col,
+                            plane_bsize, tx_size, arg);
+  } else {
+    int bsl = b_width_log2_lookup[bsize];
+    int i;
+
+    assert(bsl > 0);
+    --bsl;
+
+    for (i = 0; i < 4; ++i) {
+      const int offsetr = blk_row + ((i >> 1) << bsl);
+      const int offsetc = blk_col + ((i & 0x01) << bsl);
+      int step = 1 << (2 * (tx_size - 1));
+
+      if (offsetr >= max_blocks_high || offsetc >= max_blocks_wide)
+        continue;
+
+      tokenize_tx(td, t, dry_run, tx_size - 1, plane_bsize,
+                  offsetr, offsetc, block + i * step, plane, arg);
+    }
+  }
+}
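For concreteness, the split arithmetic above works out as follows for one
level of recursion on a 32x32 transform. The numbers assume the usual libvpx
conventions (TX_32X32 == 3, block coordinates measured in 4x4 units), so
treat this as a worked sketch rather than normative documentation.

    /* tokenize_tx() split example, one level of recursion:
     *   tx_size = TX_32X32, bsize = BLOCK_32X32
     *   bsl     = b_width_log2_lookup[BLOCK_32X32] - 1 = 3 - 1 = 2
     *   step    = 1 << (2 * (TX_32X32 - 1)) = 16   (4x4 blocks per child)
     * so the four TX_16X16 children sit at offsets (+0,+0), (+0,+4),
     * (+4,+0), (+4,+4) and cover coefficient blocks 0, 16, 32 and 48. */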
+
+void vp10_tokenize_sb_inter(VP10_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+                            int dry_run, int mi_row, int mi_col,
+                            BLOCK_SIZE bsize) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCK *const x = &td->mb;
+  MACROBLOCKD *const xd = &x->e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  TOKENEXTRA *t_backup = *t;
+  const int ctx = vp10_get_skip_context(xd);
+  const int skip_inc = !segfeature_active(&cm->seg, mbmi->segment_id,
+                                          SEG_LVL_SKIP);
+  struct tokenize_b_args arg = {cpi, td, t};
+  int plane;
+  if (mi_row >= cm->mi_rows || mi_col >= cm->mi_cols)
+    return;
+
+  if (mbmi->skip) {
+    if (!dry_run)
+      td->counts->skip[ctx][1] += skip_inc;
+    reset_skip_context(xd, bsize);
+    if (dry_run)
+      *t = t_backup;
+    return;
+  }
+
+  if (!dry_run)
+    td->counts->skip[ctx][0] += skip_inc;
+  else
+    *t = t_backup;
+
+  for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+    const struct macroblockd_plane *const pd = &xd->plane[plane];
+    const BLOCK_SIZE plane_bsize = get_plane_block_size(bsize, pd);
+    const int mi_width = num_4x4_blocks_wide_lookup[plane_bsize];
+    const int mi_height = num_4x4_blocks_high_lookup[plane_bsize];
+    const TX_SIZE max_tx_size = max_txsize_lookup[plane_bsize];
+    const BLOCK_SIZE txb_size = txsize_to_bsize[max_tx_size];
+    int bh = num_4x4_blocks_wide_lookup[txb_size];
+    int idx, idy;
+    int block = 0;
+    int step = 1 << (max_tx_size * 2);
+    for (idy = 0; idy < mi_height; idy += bh) {
+      for (idx = 0; idx < mi_width; idx += bh) {
+        tokenize_tx(td, t, dry_run, max_tx_size, plane_bsize, idy, idx,
+                    block, plane, &arg);
+        block += step;
+      }
+    }
+
+    if (!dry_run) {
+      (*t)->token = EOSB_TOKEN;
+      (*t)++;
+    }
+  }
+}
+#endif  // CONFIG_VAR_TX
+
 void vp10_tokenize_sb(VP10_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
                      int dry_run, BLOCK_SIZE bsize) {
   VP10_COMMON *const cm = &cpi->common;
@@ -641,3 +708,40 @@
     vp10_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
   }
 }
+
+#if CONFIG_SUPERTX
+void vp10_tokenize_sb_supertx(VP10_COMP *cpi, ThreadData *td, TOKENEXTRA **t,
+                              int dry_run, BLOCK_SIZE bsize) {
+  VP10_COMMON *const cm = &cpi->common;
+  MACROBLOCKD *const xd = &td->mb.e_mbd;
+  MB_MODE_INFO *const mbmi = &xd->mi[0]->mbmi;
+  TOKENEXTRA *t_backup = *t;
+  const int ctx = vp10_get_skip_context(xd);
+  const int skip_inc = !segfeature_active(&cm->seg, mbmi->segment_id_supertx,
+                                          SEG_LVL_SKIP);
+  struct tokenize_b_args arg = {cpi, td, t};
+  if (mbmi->skip) {
+    if (!dry_run)
+      td->counts->skip[ctx][1] += skip_inc;
+    reset_skip_context(xd, bsize);
+    if (dry_run)
+      *t = t_backup;
+    return;
+  }
+
+  if (!dry_run) {
+    int plane;
+    td->counts->skip[ctx][0] += skip_inc;
+
+    for (plane = 0; plane < MAX_MB_PLANE; ++plane) {
+      vp10_foreach_transformed_block_in_plane(xd, bsize, plane, tokenize_b,
+                                              &arg);
+      (*t)->token = EOSB_TOKEN;
+      (*t)++;
+    }
+  } else {
+    vp10_foreach_transformed_block(xd, bsize, set_entropy_context_b, &arg);
+    *t = t_backup;
+  }
+}
+#endif  // CONFIG_SUPERTX
diff --git a/vp10/encoder/tokenize.h b/vp10/encoder/tokenize.h
index 5bad415..c9b20df 100644
--- a/vp10/encoder/tokenize.h
+++ b/vp10/encoder/tokenize.h
@@ -36,6 +36,9 @@
 
 typedef struct {
   const vpx_prob *context_tree;
+#if CONFIG_ANS
+  const rans_dec_lut *token_cdf;
+#endif  // CONFIG_ANS
   EXTRABIT extra;
   uint8_t token;
   uint8_t skip_eob_node;
@@ -43,7 +46,9 @@
 
 extern const vpx_tree_index vp10_coef_tree[];
 extern const vpx_tree_index vp10_coef_con_tree[];
+#if !CONFIG_ANS
 extern const struct vp10_token vp10_coef_encodings[];
+#endif  // !CONFIG_ANS
 
 int vp10_is_skippable_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
 int vp10_has_high_freq_in_plane(MACROBLOCK *x, BLOCK_SIZE bsize, int plane);
@@ -51,8 +56,21 @@
 struct VP10_COMP;
 struct ThreadData;
 
+#if CONFIG_VAR_TX
+void vp10_tokenize_sb_inter(struct VP10_COMP *cpi, struct ThreadData *td,
+                            TOKENEXTRA **t, int dry_run, int mi_row, int mi_col,
+                            BLOCK_SIZE bsize);
+#endif  // CONFIG_VAR_TX
+
+void vp10_tokenize_palette_sb(struct ThreadData *const td,
+                              BLOCK_SIZE bsize, int plane,
+                              TOKENEXTRA **t);
 void vp10_tokenize_sb(struct VP10_COMP *cpi, struct ThreadData *td,
                      TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
+#if CONFIG_SUPERTX
+void vp10_tokenize_sb_supertx(struct VP10_COMP *cpi, struct ThreadData *td,
+                              TOKENEXTRA **t, int dry_run, BLOCK_SIZE bsize);
+#endif  // CONFIG_SUPERTX
 
 extern const int16_t *vp10_dct_value_cost_ptr;
 /* TODO: The Token field should be broken out into a separate char array to
@@ -61,26 +79,27 @@
  */
 extern const TOKENVALUE *vp10_dct_value_tokens_ptr;
 extern const TOKENVALUE *vp10_dct_cat_lt_10_value_tokens;
+extern const int *vp10_dct_cat_lt_10_value_cost;
 extern const int16_t vp10_cat6_low_cost[256];
-extern const int16_t vp10_cat6_high_cost[128];
-extern const int16_t vp10_cat6_high10_high_cost[512];
-extern const int16_t vp10_cat6_high12_high_cost[2048];
-static INLINE int16_t vp10_get_cost(int16_t token, EXTRABIT extrabits,
-                                   const int16_t *cat6_high_table) {
+extern const int vp10_cat6_high_cost[64];
+extern const int vp10_cat6_high10_high_cost[256];
+extern const int vp10_cat6_high12_high_cost[1024];
+static INLINE int vp10_get_cost(int16_t token, EXTRABIT extrabits,
+                               const int *cat6_high_table) {
   if (token != CATEGORY6_TOKEN)
-    return vp10_extra_bits[token].cost[extrabits];
-  return vp10_cat6_low_cost[extrabits & 0xff]
-      + cat6_high_table[extrabits >> 8];
+    return vp10_extra_bits[token].cost[extrabits >> 1];
+  return vp10_cat6_low_cost[(extrabits >> 1) & 0xff]
+      + cat6_high_table[extrabits >> 9];
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-static INLINE const int16_t* vp10_get_high_cost_table(int bit_depth) {
+static INLINE const int* vp10_get_high_cost_table(int bit_depth) {
   return bit_depth == 8 ? vp10_cat6_high_cost
       : (bit_depth == 10 ? vp10_cat6_high10_high_cost :
          vp10_cat6_high12_high_cost);
 }
 #else
-static INLINE const int16_t* vp10_get_high_cost_table(int bit_depth) {
+static INLINE const int* vp10_get_high_cost_table(int bit_depth) {
   (void) bit_depth;
   return vp10_cat6_high_cost;
 }
@@ -104,6 +123,18 @@
   return vp10_dct_cat_lt_10_value_tokens[v].token;
 }
 
+// Combined lookup: returns the token for coefficient value 'v' through
+// *token and the cost of coding it as the return value; CAT6-magnitude
+// values split their extra bits into a low byte (table lookup) and a
+// high part costed via cat6_high_table.
+static INLINE int vp10_get_token_cost(int v, int16_t *token,
+                                      const int *cat6_high_table) {
+  if (v >= CAT6_MIN_VAL || v <= -CAT6_MIN_VAL) {
+    EXTRABIT extrabits;
+    *token = CATEGORY6_TOKEN;
+    extrabits = abs(v) - CAT6_MIN_VAL;
+    return vp10_cat6_low_cost[extrabits & 0xff]
+        + cat6_high_table[extrabits >> 8];
+  }
+  *token = vp10_dct_cat_lt_10_value_tokens[v].token;
+  return vp10_dct_cat_lt_10_value_cost[v];
+}
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/vp10/encoder/treewriter.h b/vp10/encoder/treewriter.h
index 6b76a03..eeb5a6d 100644
--- a/vp10/encoder/treewriter.h
+++ b/vp10/encoder/treewriter.h
@@ -11,7 +11,15 @@
 #ifndef VP10_ENCODER_TREEWRITER_H_
 #define VP10_ENCODER_TREEWRITER_H_
 
+#ifdef VP10_FORCE_VPXBOOL_TREEWRITER
 #include "vpx_dsp/bitwriter.h"
+#define tree_writer vpx_writer
+#define tree_bit_write vpx_write
+#else
+#include "vp10/encoder/bitwriter.h"
+#define tree_writer vp10_writer
+#define tree_bit_write vp10_write
+#endif
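+
+/* tree_writer/tree_bit_write alias the active bit writer for the tree
+ * helpers below: the plain vpx bool writer when
+ * VP10_FORCE_VPXBOOL_TREEWRITER is defined, otherwise the vp10 writer.
+ * Both macros are #undef'd at the end of this header so the aliases do
+ * not leak into including files. */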
 
 #ifdef __cplusplus
 extern "C" {
@@ -28,22 +36,24 @@
 
 void vp10_tokens_from_tree(struct vp10_token*, const vpx_tree_index *);
 
-static INLINE void vp10_write_tree(vpx_writer *w, const vpx_tree_index *tree,
+static INLINE void vp10_write_tree(tree_writer *w, const vpx_tree_index *tree,
                                   const vpx_prob *probs, int bits, int len,
                                   vpx_tree_index i) {
   do {
     const int bit = (bits >> --len) & 1;
-    vpx_write(w, bit, probs[i >> 1]);
+    tree_bit_write(w, bit, probs[i >> 1]);
     i = tree[i + bit];
   } while (len);
 }
 
-static INLINE void vp10_write_token(vpx_writer *w, const vpx_tree_index *tree,
+static INLINE void vp10_write_token(tree_writer *w, const vpx_tree_index *tree,
                                    const vpx_prob *probs,
                                    const struct vp10_token *token) {
   vp10_write_tree(w, tree, probs, token->value, token->len, 0);
 }
 
+#undef tree_writer
+#undef tree_bit_write
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vp10/encoder/variance_tree.c b/vp10/encoder/variance_tree.c
new file mode 100644
index 0000000..d11ef2d
--- /dev/null
+++ b/vp10/encoder/variance_tree.c
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vp10/encoder/variance_tree.h"
+#include "vp10/encoder/encoder.h"
+
+
+void vp10_setup_var_tree(struct VP10Common *cm, ThreadData *td) {
+  int i, j;
+#if CONFIG_EXT_PARTITION
+  const int leaf_nodes = 1024;
+  const int tree_nodes = 1024 + 256 + 64 + 16 + 4 + 1;
+#else
+  const int leaf_nodes = 256;
+  const int tree_nodes = 256 + 64 + 16 + 4 + 1;
+#endif  // CONFIG_EXT_PARTITION
+  int index = 0;
+  VAR_TREE *this_var;
+  int nodes;
+
+  vpx_free(td->var_tree);
+  CHECK_MEM_ERROR(cm, td->var_tree, vpx_calloc(tree_nodes,
+                                              sizeof(*td->var_tree)));
+
+  this_var = &td->var_tree[0];
+
+  // Sets up all the leaf nodes in the tree.
+  for (index = 0; index < leaf_nodes; ++index) {
+    VAR_TREE *const leaf = &td->var_tree[index];
+    leaf->split[0] = NULL;
+  }
+
+  // Each internal node has 4 children; fill in the child pointers
+  // from the leaves up to the root.
+  for (nodes = leaf_nodes >> 2; nodes > 0; nodes >>= 2) {
+    for (i = 0; i < nodes; ++i, ++index) {
+      VAR_TREE *const node = &td->var_tree[index];
+      for (j = 0; j < 4; j++)
+        node->split[j] = this_var++;
+    }
+  }
+
+  // Set up the root node for the largest superblock size
+  i = MAX_MIB_SIZE_LOG2 - MIN_MIB_SIZE_LOG2;
+  td->var_root[i] = &td->var_tree[tree_nodes - 1];
+  // Set up the root nodes for the rest of the possible superblock sizes
+  while (--i >= 0) {
+    td->var_root[i] = td->var_root[i+1]->split[0];
+  }
+}
+
+void vp10_free_var_tree(ThreadData *td) {
+  vpx_free(td->var_tree);
+  td->var_tree = NULL;
+}
diff --git a/vp10/encoder/variance_tree.h b/vp10/encoder/variance_tree.h
new file mode 100644
index 0000000..a10f7e7
--- /dev/null
+++ b/vp10/encoder/variance_tree.h
@@ -0,0 +1,98 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VP10_ENCODER_VARIANCE_TREE_H_
+#define VP10_ENCODER_VARIANCE_TREE_H_
+
+#include <assert.h>
+
+#include "./vpx_config.h"
+
+#include "vpx/vpx_integer.h"
+
+#include "vp10/common/enums.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP10Common;
+struct ThreadData;
+
+typedef struct {
+  int64_t sum_square_error;
+  int64_t sum_error;
+  int log2_count;
+  int variance;
+} var;
+
+typedef struct {
+  var none;
+  var horz[2];
+  var vert[2];
+} partition_variance;
+
+typedef struct VAR_TREE {
+  int force_split;
+  partition_variance variances;
+  struct VAR_TREE *split[4];
+  BLOCK_SIZE bsize;
+  const uint8_t *src;
+  const uint8_t *ref;
+  int src_stride;
+  int ref_stride;
+  int width;
+  int height;
+#if CONFIG_VP9_HIGHBITDEPTH
+  int highbd;
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+} VAR_TREE;
+
+void vp10_setup_var_tree(struct VP10Common *cm, struct ThreadData *td);
+void vp10_free_var_tree(struct ThreadData *td);
+
+// Set variance values given sum square error, sum error, and log2 of count.
+static INLINE void fill_variance(int64_t s2, int64_t s, int c, var *v) {
+  v->sum_square_error = s2;
+  v->sum_error = s;
+  v->log2_count = c;
+  v->variance = (int)(256 * (v->sum_square_error -
+      ((v->sum_error * v->sum_error) >> v->log2_count)) >> v->log2_count);
+}
+
+static INLINE void sum_2_variances(const var *a, const var *b, var *r) {
+  assert(a->log2_count == b->log2_count);
+  fill_variance(a->sum_square_error + b->sum_square_error,
+                a->sum_error + b->sum_error, a->log2_count + 1, r);
+}
+
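+// Assuming the usual raster order of split[] (top-left, top-right,
+// bottom-left, bottom-right), horz[0]/horz[1] combine the top/bottom
+// halves, vert[0]/vert[1] the left/right halves, and 'none' their total.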
+static INLINE void fill_variance_node(VAR_TREE *vt) {
+  sum_2_variances(&vt->split[0]->variances.none,
+                  &vt->split[1]->variances.none,
+                  &vt->variances.horz[0]);
+  sum_2_variances(&vt->split[2]->variances.none,
+                  &vt->split[3]->variances.none,
+                  &vt->variances.horz[1]);
+  sum_2_variances(&vt->split[0]->variances.none,
+                  &vt->split[2]->variances.none,
+                  &vt->variances.vert[0]);
+  sum_2_variances(&vt->split[1]->variances.none,
+                  &vt->split[3]->variances.none,
+                  &vt->variances.vert[1]);
+  sum_2_variances(&vt->variances.vert[0],
+                  &vt->variances.vert[1],
+                  &vt->variances.none);
+}
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif /* VP10_ENCODER_VARIANCE_TREE_H_ */
diff --git a/vp10/encoder/wedge_utils.c b/vp10/encoder/wedge_utils.c
new file mode 100644
index 0000000..d97008d
--- /dev/null
+++ b/vp10/encoder/wedge_utils.c
@@ -0,0 +1,135 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+
+#include "vpx_ports/mem.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include "vp10/common/reconinter.h"
+
+#define MAX_MASK_VALUE  (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * Computes SSE of a compound predictor constructed from 2 fundamental
+ * predictors p0 and p1 using blending with mask.
+ *
+ * r1:  Residuals of p1.
+ *      (source - p1)
+ * d:   Difference of p1 and p0.
+ *      (p1 - p0)
+ * m:   The blending mask
+ * N:   Number of pixels
+ *
+ * 'r1', 'd', and 'm' are contiguous.
+ *
+ * Computes:
+ *  Sum((MAX_MASK_VALUE*r1 + mask*d)**2), which is equivalent to:
+ *  Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2),
+ *    where r0 is (source - p0) and r1 is (source - p1), which in turn
+ *    is equivalent to:
+ *  Sum((source*MAX_MASK_VALUE - (mask*p0 + (MAX_MASK_VALUE-mask)*p1))**2),
+ *    which is the SSE of the residuals of the compound predictor scaled up by
+ *    MAX_MASK_VALUE**2.
+ *
+ * Note that we clamp the partial term in the loop to 16 bits signed. This is
+ * to facilitate an equivalent SIMD implementation. It should have no
+ * effect as long as the residuals fit in 16 - WEDGE_WEIGHT_BITS (= 10)
+ * signed bits, which always holds for 8-bit input; on real input it
+ * holds practically always, as residuals are expected to be small.
+ */
+uint64_t vp10_wedge_sse_from_residuals_c(const int16_t *r1,
+                                         const int16_t *d,
+                                         const uint8_t *m,
+                                         int N) {
+  uint64_t csse = 0;
+  int i;
+  assert(N % 64 == 0);
+  for (i = 0; i < N; i++) {
+    int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
+    t = clamp(t, INT16_MIN, INT16_MAX);
+    csse += t * t;
+  }
+  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
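+
+/* Illustrative sketch (hypothetical helper, compiled out): how the
+ * residual-domain inputs of vp10_wedge_sse_from_residuals_c() would be
+ * derived from the source and the two fundamental predictors. */
+#if 0
+static void build_wedge_residual_inputs(const uint8_t *src,
+                                        const uint8_t *p0, const uint8_t *p1,
+                                        int16_t *r1, int16_t *d, int N) {
+  int i;
+  for (i = 0; i < N; i++) {
+    r1[i] = (int16_t)src[i] - (int16_t)p1[i];  // r1 = source - p1
+    d[i] = (int16_t)p1[i] - (int16_t)p0[i];    // d = p1 - p0 = r0 - r1
+  }
+}
+#endif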
+
+/**
+ * Choose the mask sign for a compound predictor.
+ *
+ * ds:    Difference of the squares of the residuals.
+ *        r0**2 - r1**2
+ * m:     The blending mask
+ * N:     Number of pixels
+ * limit: Pre-computed threshold value.
+ *        MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ * 'ds' and 'm' are contiguous.
+ *
+ * Returns true if the negated mask has lower SSE compared to the positive
+ * mask. Computation is based on:
+ *  Sum((mask*r0 + (MAX_MASK_VALUE-mask)*r1)**2)
+ *                                     >
+ *                                Sum(((MAX_MASK_VALUE-mask)*r0 + mask*r1)**2)
+ *
+ *  which can be simplified to:
+ *
+ *  Sum(mask*(r0**2 - r1**2)) > MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+ *
+ *  The right hand side does not depend on the mask, and needs to be passed as
+ *  the 'limit' parameter.
+ *
+ *  After pre-computing (r0**2 - r1**2), which is passed in as 'ds', the left
+ *  hand side is simply a scalar product between an int16_t and uint8_t vector.
+ *
+ *  Note that for efficiency, ds is stored in 16 bits. Since real input
+ *  residuals are small, this should not cause a noticeable issue.
+ */
+int vp10_wedge_sign_from_residuals_c(const int16_t *ds,
+                                     const uint8_t *m,
+                                     int N,
+                                     int64_t limit) {
+  int64_t acc = 0;
+
+  assert(N % 64 == 0);
+
+  do {
+    acc += *ds++ * *m++;
+  } while (--N);
+
+  return acc > limit;
+}
+
+/**
+ * Compute the element-wise difference of the squares of 2 arrays.
+ *
+ * d: Difference of the squares of the inputs: a**2 - b**2
+ * a: First input array
+ * b: Second input array
+ * N: Number of elements
+ *
+ * 'd', 'a', and 'b' are contiguous.
+ *
+ * The result is saturated to signed 16 bits.
+ */
+void vp10_wedge_compute_delta_squares_c(int16_t *d,
+                                        const int16_t *a,
+                                        const int16_t *b,
+                                        int N) {
+  int i;
+
+  assert(N % 64 == 0);
+
+  for (i = 0; i < N; i++)
+    d[i] = clamp(a[i] * a[i] - b[i] * b[i], INT16_MIN, INT16_MAX);
+}
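+
+/* Illustrative caller sketch (hypothetical, compiled out): selecting the
+ * mask sign from the residuals r0 and r1 with the two helpers above.
+ * 'ds' must have room for N elements. */
+#if 0
+static int wedge_sign_example(const int16_t *r0, const int16_t *r1,
+                              const uint8_t *m, int16_t *ds, int N) {
+  int64_t limit = 0;
+  int i;
+  // limit = MAX_MASK_VALUE/2 * (sum(r0**2) - sum(r1**2))
+  for (i = 0; i < N; i++)
+    limit += (int64_t)r0[i] * r0[i] - (int64_t)r1[i] * r1[i];
+  limit *= MAX_MASK_VALUE / 2;
+  // ds = r0**2 - r1**2, element-wise, saturated to 16 bits
+  vp10_wedge_compute_delta_squares_c(ds, r0, r1, N);
+  return vp10_wedge_sign_from_residuals_c(ds, m, N, limit);
+}
+#endif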
+
diff --git a/vp10/encoder/x86/dct_intrin_sse2.c b/vp10/encoder/x86/dct_intrin_sse2.c
index e111157..ea0ccb8 100644
--- a/vp10/encoder/x86/dct_intrin_sse2.c
+++ b/vp10/encoder/x86/dct_intrin_sse2.c
@@ -19,15 +19,29 @@
 #include "vpx_ports/mem.h"
 
 static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
-                                   int stride) {
+                                   int stride, int flipud, int fliplr) {
   const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
   const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
   __m128i mask;
 
-  in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
-  in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
-  in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
-  in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+  if (!flipud) {
+    in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+    in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+    in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+    in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+  } else {
+    in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+    in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+    in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+    in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  }
+
+  if (fliplr) {
+    in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+    in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+    in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+    in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+  }
 
   in[0] = _mm_slli_epi16(in[0], 4);
   in[1] = _mm_slli_epi16(in[1], 4);
@@ -151,6 +165,41 @@
   transpose_4x4(in);
 }
 
+#if CONFIG_EXT_TX
+static void fidtx4_sse2(__m128i *in) {
+  const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
+  const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i v0, v1, v2, v3;
+  __m128i u0, u1, u2, u3;
+
+  v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
+  v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
+  v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
+  v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
+
+  u0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
+  u1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
+  u2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
+  u3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
+
+  v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(u0, u2);
+  in[1] = _mm_packs_epi32(u1, u3);
+  transpose_4x4(in);
+}
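+
+/* Scalar model of fidtx4_sse2() above (illustrative): each input value
+ * is scaled by Sqrt2 in fixed point,
+ *   out[i] = ROUND_POWER_OF_TWO(in[i] * Sqrt2, DCT_CONST_BITS),
+ * followed by the same 4x4 transpose as the other 1D kernels. */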
+#endif  // CONFIG_EXT_TX
+
 void vp10_fht4x4_sse2(const int16_t *input, tran_low_t *output,
                      int stride, int tx_type) {
   __m128i in[4];
@@ -160,26 +209,93 @@
       vpx_fdct4x4_sse2(input, output, stride);
       break;
     case ADST_DCT:
-      load_buffer_4x4(input, in, stride);
+      load_buffer_4x4(input, in, stride, 0, 0);
       fadst4_sse2(in);
       fdct4_sse2(in);
       write_buffer_4x4(output, in);
       break;
     case DCT_ADST:
-      load_buffer_4x4(input, in, stride);
+      load_buffer_4x4(input, in, stride, 0, 0);
       fdct4_sse2(in);
       fadst4_sse2(in);
       write_buffer_4x4(output, in);
       break;
     case ADST_ADST:
-      load_buffer_4x4(input, in, stride);
+      load_buffer_4x4(input, in, stride, 0, 0);
       fadst4_sse2(in);
       fadst4_sse2(in);
       write_buffer_4x4(output, in);
       break;
-   default:
-     assert(0);
-     break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      load_buffer_4x4(input, in, stride, 1, 0);
+      fadst4_sse2(in);
+      fdct4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_4x4(input, in, stride, 0, 1);
+      fdct4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_4x4(input, in, stride, 1, 1);
+      fadst4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_4x4(input, in, stride, 0, 1);
+      fadst4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_4x4(input, in, stride, 1, 0);
+      fadst4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case V_DCT:
+      load_buffer_4x4(input, in, stride, 0, 0);
+      fdct4_sse2(in);
+      fidtx4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case H_DCT:
+      load_buffer_4x4(input, in, stride, 0, 0);
+      fidtx4_sse2(in);
+      fdct4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case V_ADST:
+      load_buffer_4x4(input, in, stride, 0, 0);
+      fadst4_sse2(in);
+      fidtx4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case H_ADST:
+      load_buffer_4x4(input, in, stride, 0, 0);
+      fidtx4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case V_FLIPADST:
+      load_buffer_4x4(input, in, stride, 1, 0);
+      fadst4_sse2(in);
+      fidtx4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+    case H_FLIPADST:
+      load_buffer_4x4(input, in, stride, 0, 1);
+      fidtx4_sse2(in);
+      fadst4_sse2(in);
+      write_buffer_4x4(output, in);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
   }
 }
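+
+/* All EXT_TX cases above reuse the same three 1D kernels: the FLIPADST
+ * variants are an ADST applied to rows/columns loaded in reverse via the
+ * flipud/fliplr arguments of load_buffer_4x4(), and the V_*/H_* hybrids
+ * pair one 1D transform with the identity fidtx4_sse2(). */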
 
@@ -627,15 +743,37 @@
 
 // load 8x8 array
 static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
-                                   int stride) {
-  in[0]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
-  in[1]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
-  in[2]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
-  in[3]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
-  in[4]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
-  in[5]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
-  in[6]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
-  in[7]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+                                   int stride, int flipud, int fliplr) {
+  if (!flipud) {
+    in[0]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+    in[1]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    in[2]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    in[3]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+    in[4]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    in[5]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    in[6]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    in[7]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  } else {
+    in[0]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+    in[1]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    in[2]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    in[3]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    in[4]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+    in[5]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    in[6]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    in[7]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  }
+
+  if (fliplr) {
+    in[0] = mm_reverse_epi16(in[0]);
+    in[1] = mm_reverse_epi16(in[1]);
+    in[2] = mm_reverse_epi16(in[2]);
+    in[3] = mm_reverse_epi16(in[3]);
+    in[4] = mm_reverse_epi16(in[4]);
+    in[5] = mm_reverse_epi16(in[5]);
+    in[6] = mm_reverse_epi16(in[6]);
+    in[7] = mm_reverse_epi16(in[7]);
+  }
 
   in[0] = _mm_slli_epi16(in[0], 2);
   in[1] = _mm_slli_epi16(in[1], 2);
@@ -1135,6 +1273,21 @@
   array_transpose_8x8(in, in);
 }
 
+#if CONFIG_EXT_TX
+static void fidtx8_sse2(__m128i *in) {
+  in[0] = _mm_slli_epi16(in[0], 1);
+  in[1] = _mm_slli_epi16(in[1], 1);
+  in[2] = _mm_slli_epi16(in[2], 1);
+  in[3] = _mm_slli_epi16(in[3], 1);
+  in[4] = _mm_slli_epi16(in[4], 1);
+  in[5] = _mm_slli_epi16(in[5], 1);
+  in[6] = _mm_slli_epi16(in[6], 1);
+  in[7] = _mm_slli_epi16(in[7], 1);
+
+  array_transpose_8x8(in, in);
+}
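+
+/* fidtx8_sse2() above is the 8-point identity: a doubling (single left
+ * shift) followed by a transpose; no rounding is required since the
+ * scale factor is an exact power of two. */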
+#endif  // CONFIG_EXT_TX
+
 void vp10_fht8x8_sse2(const int16_t *input, tran_low_t *output,
                      int stride, int tx_type) {
   __m128i in[8];
@@ -1144,42 +1297,142 @@
       vpx_fdct8x8_sse2(input, output, stride);
       break;
     case ADST_DCT:
-      load_buffer_8x8(input, in, stride);
+      load_buffer_8x8(input, in, stride, 0, 0);
       fadst8_sse2(in);
       fdct8_sse2(in);
       right_shift_8x8(in, 1);
       write_buffer_8x8(output, in, 8);
       break;
     case DCT_ADST:
-      load_buffer_8x8(input, in, stride);
+      load_buffer_8x8(input, in, stride, 0, 0);
       fdct8_sse2(in);
       fadst8_sse2(in);
       right_shift_8x8(in, 1);
       write_buffer_8x8(output, in, 8);
       break;
     case ADST_ADST:
-      load_buffer_8x8(input, in, stride);
+      load_buffer_8x8(input, in, stride, 0, 0);
       fadst8_sse2(in);
       fadst8_sse2(in);
       right_shift_8x8(in, 1);
       write_buffer_8x8(output, in, 8);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      load_buffer_8x8(input, in, stride, 1, 0);
+      fadst8_sse2(in);
+      fdct8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1);
+      fdct8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_8x8(input, in, stride, 1, 1);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_8x8(input, in, stride, 1, 0);
+      fadst8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case V_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fdct8_sse2(in);
+      fidtx8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case H_DCT:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fidtx8_sse2(in);
+      fdct8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case V_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fadst8_sse2(in);
+      fidtx8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case H_ADST:
+      load_buffer_8x8(input, in, stride, 0, 0);
+      fidtx8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case V_FLIPADST:
+      load_buffer_8x8(input, in, stride, 1, 0);
+      fadst8_sse2(in);
+      fidtx8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case H_FLIPADST:
+      load_buffer_8x8(input, in, stride, 0, 1);
+      fidtx8_sse2(in);
+      fadst8_sse2(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
-      break;
   }
 }
 
 static INLINE void load_buffer_16x16(const int16_t* input, __m128i *in0,
-                                     __m128i *in1, int stride) {
-  // load first 8 columns
-  load_buffer_8x8(input, in0, stride);
-  load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
+                                     __m128i *in1, int stride,
+                                     int flipud, int fliplr) {
+  // Load the 16x16 block as four 8x8 sub-blocks; flips swap the sub-block
+  // pointers here and are completed inside load_buffer_8x8().
+  const int16_t *topL = input;
+  const int16_t *topR = input + 8;
+  const int16_t *botL = input + 8 * stride;
+  const int16_t *botR = input + 8 * stride + 8;
 
-  input += 8;
+  const int16_t *tmp;
+
+  if (flipud) {
+    // Swap left columns
+    tmp = topL; topL = botL; botL = tmp;
+    // Swap right columns
+    tmp = topR; topR = botR; botR = tmp;
+  }
+
+  if (fliplr) {
+    // Swap top rows
+    tmp = topL; topL = topR; topR = tmp;
+    // Swap bottom rows
+    tmp = botL; botL = botR; botR = tmp;
+  }
+
+  // load first 8 columns
+  load_buffer_8x8(topL, in0,     stride, flipud, fliplr);
+  load_buffer_8x8(botL, in0 + 8, stride, flipud, fliplr);
+
   // load second 8 columns
-  load_buffer_8x8(input, in1, stride);
-  load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
+  load_buffer_8x8(topR, in1,     stride, flipud, fliplr);
+  load_buffer_8x8(botR, in1 + 8, stride, flipud, fliplr);
 }
 
 static INLINE void write_buffer_16x16(tran_low_t *output, __m128i *in0,
@@ -1225,7 +1478,7 @@
   const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
   const __m128i k__cospi_m16_p16 = pair_set_epi16(-cospi_16_64, cospi_16_64);
   const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
-  const __m128i k__cospi_p08_m24 = pair_set_epi16(cospi_8_64, -cospi_24_64);
+  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
   const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
   const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
   const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
@@ -1429,10 +1682,10 @@
 
   v[0] = _mm_madd_epi16(u[0], k__cospi_m08_p24);
   v[1] = _mm_madd_epi16(u[1], k__cospi_m08_p24);
-  v[2] = _mm_madd_epi16(u[2], k__cospi_p24_p08);
-  v[3] = _mm_madd_epi16(u[3], k__cospi_p24_p08);
-  v[4] = _mm_madd_epi16(u[2], k__cospi_p08_m24);
-  v[5] = _mm_madd_epi16(u[3], k__cospi_p08_m24);
+  v[2] = _mm_madd_epi16(u[2], k__cospi_m24_m08);
+  v[3] = _mm_madd_epi16(u[3], k__cospi_m24_m08);
+  v[4] = _mm_madd_epi16(u[2], k__cospi_m08_p24);
+  v[5] = _mm_madd_epi16(u[3], k__cospi_m08_p24);
   v[6] = _mm_madd_epi16(u[0], k__cospi_p24_p08);
   v[7] = _mm_madd_epi16(u[1], k__cospi_p24_p08);
 
@@ -1462,10 +1715,10 @@
   // stage 5
   s[0] = _mm_add_epi16(p[0], t[1]);
   s[1] = _mm_sub_epi16(p[0], t[1]);
-  s[2] = _mm_add_epi16(p[3], t[2]);
-  s[3] = _mm_sub_epi16(p[3], t[2]);
-  s[4] = _mm_sub_epi16(p[4], t[5]);
-  s[5] = _mm_add_epi16(p[4], t[5]);
+  s[2] = _mm_sub_epi16(p[3], t[2]);
+  s[3] = _mm_add_epi16(p[3], t[2]);
+  s[4] = _mm_add_epi16(p[4], t[5]);
+  s[5] = _mm_sub_epi16(p[4], t[5]);
   s[6] = _mm_sub_epi16(p[7], t[6]);
   s[7] = _mm_add_epi16(p[7], t[6]);
 
@@ -2022,6 +2275,204 @@
   array_transpose_16x16(in0, in1);
 }
 
+#if CONFIG_EXT_TX
+static void fidtx16_8col(__m128i *in) {
+  const __m128i k__zero_epi16 = _mm_set1_epi16((int16_t)0);
+  const __m128i k__sqrt2_epi16 = _mm_set1_epi16((int16_t)Sqrt2);
+  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
+
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  __m128i y0, y1, y2, y3, y4, y5, y6, y7;
+
+  in[0] = _mm_slli_epi16(in[0], 1);
+  in[1] = _mm_slli_epi16(in[1], 1);
+  in[2] = _mm_slli_epi16(in[2], 1);
+  in[3] = _mm_slli_epi16(in[3], 1);
+  in[4] = _mm_slli_epi16(in[4], 1);
+  in[5] = _mm_slli_epi16(in[5], 1);
+  in[6] = _mm_slli_epi16(in[6], 1);
+  in[7] = _mm_slli_epi16(in[7], 1);
+  in[8] = _mm_slli_epi16(in[8], 1);
+  in[9] = _mm_slli_epi16(in[9], 1);
+  in[10] = _mm_slli_epi16(in[10], 1);
+  in[11] = _mm_slli_epi16(in[11], 1);
+  in[12] = _mm_slli_epi16(in[12], 1);
+  in[13] = _mm_slli_epi16(in[13], 1);
+  in[14] = _mm_slli_epi16(in[14], 1);
+  in[15] = _mm_slli_epi16(in[15], 1);
+
+  v0 = _mm_unpacklo_epi16(in[0], k__zero_epi16);
+  v1 = _mm_unpacklo_epi16(in[1], k__zero_epi16);
+  v2 = _mm_unpacklo_epi16(in[2], k__zero_epi16);
+  v3 = _mm_unpacklo_epi16(in[3], k__zero_epi16);
+  v4 = _mm_unpacklo_epi16(in[4], k__zero_epi16);
+  v5 = _mm_unpacklo_epi16(in[5], k__zero_epi16);
+  v6 = _mm_unpacklo_epi16(in[6], k__zero_epi16);
+  v7 = _mm_unpacklo_epi16(in[7], k__zero_epi16);
+
+  u0 = _mm_unpacklo_epi16(in[8], k__zero_epi16);
+  u1 = _mm_unpacklo_epi16(in[9], k__zero_epi16);
+  u2 = _mm_unpacklo_epi16(in[10], k__zero_epi16);
+  u3 = _mm_unpacklo_epi16(in[11], k__zero_epi16);
+  u4 = _mm_unpacklo_epi16(in[12], k__zero_epi16);
+  u5 = _mm_unpacklo_epi16(in[13], k__zero_epi16);
+  u6 = _mm_unpacklo_epi16(in[14], k__zero_epi16);
+  u7 = _mm_unpacklo_epi16(in[15], k__zero_epi16);
+
+  x0 = _mm_unpackhi_epi16(in[0], k__zero_epi16);
+  x1 = _mm_unpackhi_epi16(in[1], k__zero_epi16);
+  x2 = _mm_unpackhi_epi16(in[2], k__zero_epi16);
+  x3 = _mm_unpackhi_epi16(in[3], k__zero_epi16);
+  x4 = _mm_unpackhi_epi16(in[4], k__zero_epi16);
+  x5 = _mm_unpackhi_epi16(in[5], k__zero_epi16);
+  x6 = _mm_unpackhi_epi16(in[6], k__zero_epi16);
+  x7 = _mm_unpackhi_epi16(in[7], k__zero_epi16);
+
+  y0 = _mm_unpackhi_epi16(in[8], k__zero_epi16);
+  y1 = _mm_unpackhi_epi16(in[9], k__zero_epi16);
+  y2 = _mm_unpackhi_epi16(in[10], k__zero_epi16);
+  y3 = _mm_unpackhi_epi16(in[11], k__zero_epi16);
+  y4 = _mm_unpackhi_epi16(in[12], k__zero_epi16);
+  y5 = _mm_unpackhi_epi16(in[13], k__zero_epi16);
+  y6 = _mm_unpackhi_epi16(in[14], k__zero_epi16);
+  y7 = _mm_unpackhi_epi16(in[15], k__zero_epi16);
+
+  v0 = _mm_madd_epi16(v0, k__sqrt2_epi16);
+  v1 = _mm_madd_epi16(v1, k__sqrt2_epi16);
+  v2 = _mm_madd_epi16(v2, k__sqrt2_epi16);
+  v3 = _mm_madd_epi16(v3, k__sqrt2_epi16);
+  v4 = _mm_madd_epi16(v4, k__sqrt2_epi16);
+  v5 = _mm_madd_epi16(v5, k__sqrt2_epi16);
+  v6 = _mm_madd_epi16(v6, k__sqrt2_epi16);
+  v7 = _mm_madd_epi16(v7, k__sqrt2_epi16);
+
+  x0 = _mm_madd_epi16(x0, k__sqrt2_epi16);
+  x1 = _mm_madd_epi16(x1, k__sqrt2_epi16);
+  x2 = _mm_madd_epi16(x2, k__sqrt2_epi16);
+  x3 = _mm_madd_epi16(x3, k__sqrt2_epi16);
+  x4 = _mm_madd_epi16(x4, k__sqrt2_epi16);
+  x5 = _mm_madd_epi16(x5, k__sqrt2_epi16);
+  x6 = _mm_madd_epi16(x6, k__sqrt2_epi16);
+  x7 = _mm_madd_epi16(x7, k__sqrt2_epi16);
+
+  u0 = _mm_madd_epi16(u0, k__sqrt2_epi16);
+  u1 = _mm_madd_epi16(u1, k__sqrt2_epi16);
+  u2 = _mm_madd_epi16(u2, k__sqrt2_epi16);
+  u3 = _mm_madd_epi16(u3, k__sqrt2_epi16);
+  u4 = _mm_madd_epi16(u4, k__sqrt2_epi16);
+  u5 = _mm_madd_epi16(u5, k__sqrt2_epi16);
+  u6 = _mm_madd_epi16(u6, k__sqrt2_epi16);
+  u7 = _mm_madd_epi16(u7, k__sqrt2_epi16);
+
+  y0 = _mm_madd_epi16(y0, k__sqrt2_epi16);
+  y1 = _mm_madd_epi16(y1, k__sqrt2_epi16);
+  y2 = _mm_madd_epi16(y2, k__sqrt2_epi16);
+  y3 = _mm_madd_epi16(y3, k__sqrt2_epi16);
+  y4 = _mm_madd_epi16(y4, k__sqrt2_epi16);
+  y5 = _mm_madd_epi16(y5, k__sqrt2_epi16);
+  y6 = _mm_madd_epi16(y6, k__sqrt2_epi16);
+  y7 = _mm_madd_epi16(y7, k__sqrt2_epi16);
+
+  v0 = _mm_add_epi32(v0, k__DCT_CONST_ROUNDING);
+  v1 = _mm_add_epi32(v1, k__DCT_CONST_ROUNDING);
+  v2 = _mm_add_epi32(v2, k__DCT_CONST_ROUNDING);
+  v3 = _mm_add_epi32(v3, k__DCT_CONST_ROUNDING);
+  v4 = _mm_add_epi32(v4, k__DCT_CONST_ROUNDING);
+  v5 = _mm_add_epi32(v5, k__DCT_CONST_ROUNDING);
+  v6 = _mm_add_epi32(v6, k__DCT_CONST_ROUNDING);
+  v7 = _mm_add_epi32(v7, k__DCT_CONST_ROUNDING);
+
+  x0 = _mm_add_epi32(x0, k__DCT_CONST_ROUNDING);
+  x1 = _mm_add_epi32(x1, k__DCT_CONST_ROUNDING);
+  x2 = _mm_add_epi32(x2, k__DCT_CONST_ROUNDING);
+  x3 = _mm_add_epi32(x3, k__DCT_CONST_ROUNDING);
+  x4 = _mm_add_epi32(x4, k__DCT_CONST_ROUNDING);
+  x5 = _mm_add_epi32(x5, k__DCT_CONST_ROUNDING);
+  x6 = _mm_add_epi32(x6, k__DCT_CONST_ROUNDING);
+  x7 = _mm_add_epi32(x7, k__DCT_CONST_ROUNDING);
+
+  u0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
+  u1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
+  u2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
+  u3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
+  u4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
+  u5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
+  u6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
+  u7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
+
+  y0 = _mm_add_epi32(y0, k__DCT_CONST_ROUNDING);
+  y1 = _mm_add_epi32(y1, k__DCT_CONST_ROUNDING);
+  y2 = _mm_add_epi32(y2, k__DCT_CONST_ROUNDING);
+  y3 = _mm_add_epi32(y3, k__DCT_CONST_ROUNDING);
+  y4 = _mm_add_epi32(y4, k__DCT_CONST_ROUNDING);
+  y5 = _mm_add_epi32(y5, k__DCT_CONST_ROUNDING);
+  y6 = _mm_add_epi32(y6, k__DCT_CONST_ROUNDING);
+  y7 = _mm_add_epi32(y7, k__DCT_CONST_ROUNDING);
+
+  v0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
+  v1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
+  v2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
+  v3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
+  v4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
+  v5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
+  v6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
+  v7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
+
+  x0 = _mm_srai_epi32(x0, DCT_CONST_BITS);
+  x1 = _mm_srai_epi32(x1, DCT_CONST_BITS);
+  x2 = _mm_srai_epi32(x2, DCT_CONST_BITS);
+  x3 = _mm_srai_epi32(x3, DCT_CONST_BITS);
+  x4 = _mm_srai_epi32(x4, DCT_CONST_BITS);
+  x5 = _mm_srai_epi32(x5, DCT_CONST_BITS);
+  x6 = _mm_srai_epi32(x6, DCT_CONST_BITS);
+  x7 = _mm_srai_epi32(x7, DCT_CONST_BITS);
+
+  u0 = _mm_srai_epi32(u0, DCT_CONST_BITS);
+  u1 = _mm_srai_epi32(u1, DCT_CONST_BITS);
+  u2 = _mm_srai_epi32(u2, DCT_CONST_BITS);
+  u3 = _mm_srai_epi32(u3, DCT_CONST_BITS);
+  u4 = _mm_srai_epi32(u4, DCT_CONST_BITS);
+  u5 = _mm_srai_epi32(u5, DCT_CONST_BITS);
+  u6 = _mm_srai_epi32(u6, DCT_CONST_BITS);
+  u7 = _mm_srai_epi32(u7, DCT_CONST_BITS);
+
+  y0 = _mm_srai_epi32(y0, DCT_CONST_BITS);
+  y1 = _mm_srai_epi32(y1, DCT_CONST_BITS);
+  y2 = _mm_srai_epi32(y2, DCT_CONST_BITS);
+  y3 = _mm_srai_epi32(y3, DCT_CONST_BITS);
+  y4 = _mm_srai_epi32(y4, DCT_CONST_BITS);
+  y5 = _mm_srai_epi32(y5, DCT_CONST_BITS);
+  y6 = _mm_srai_epi32(y6, DCT_CONST_BITS);
+  y7 = _mm_srai_epi32(y7, DCT_CONST_BITS);
+
+  in[0] = _mm_packs_epi32(v0, x0);
+  in[1] = _mm_packs_epi32(v1, x1);
+  in[2] = _mm_packs_epi32(v2, x2);
+  in[3] = _mm_packs_epi32(v3, x3);
+  in[4] = _mm_packs_epi32(v4, x4);
+  in[5] = _mm_packs_epi32(v5, x5);
+  in[6] = _mm_packs_epi32(v6, x6);
+  in[7] = _mm_packs_epi32(v7, x7);
+
+  in[8] = _mm_packs_epi32(u0, y0);
+  in[9] = _mm_packs_epi32(u1, y1);
+  in[10] = _mm_packs_epi32(u2, y2);
+  in[11] = _mm_packs_epi32(u3, y3);
+  in[12] = _mm_packs_epi32(u4, y4);
+  in[13] = _mm_packs_epi32(u5, y5);
+  in[14] = _mm_packs_epi32(u6, y6);
+  in[15] = _mm_packs_epi32(u7, y7);
+}
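+
+/* fidtx16_8col() above applies the 16-point identity gain of 2*Sqrt2: a
+ * left shift by one, then a fixed-point multiply by Sqrt2 with rounding,
+ * computed on the low/high 32-bit halves of each register separately. */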
+
+static void fidtx16_sse2(__m128i *in0, __m128i *in1) {
+  fidtx16_8col(in0);
+  fidtx16_8col(in1);
+  array_transpose_16x16(in0, in1);
+}
+#endif  // CONFIG_EXT_TX
+
 void vp10_fht16x16_sse2(const int16_t *input, tran_low_t *output,
                        int stride, int tx_type) {
   __m128i in0[16], in1[16];
@@ -2031,26 +2482,105 @@
       vpx_fdct16x16_sse2(input, output, stride);
       break;
     case ADST_DCT:
-      load_buffer_16x16(input, in0, in1, stride);
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
       fadst16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
       fdct16_sse2(in0, in1);
       write_buffer_16x16(output, in0, in1, 16);
       break;
     case DCT_ADST:
-      load_buffer_16x16(input, in0, in1, stride);
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
       fdct16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
       fadst16_sse2(in0, in1);
       write_buffer_16x16(output, in0, in1, 16);
       break;
     case ADST_ADST:
-      load_buffer_16x16(input, in0, in1, stride);
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
       fadst16_sse2(in0, in1);
       right_shift_16x16(in0, in1);
       fadst16_sse2(in0, in1);
       write_buffer_16x16(output, in0, in1, 16);
       break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      load_buffer_16x16(input, in0, in1, stride, 1, 0);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fdct16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case DCT_FLIPADST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 1);
+      fdct16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case FLIPADST_FLIPADST:
+      load_buffer_16x16(input, in0, in1, stride, 1, 1);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case ADST_FLIPADST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 1);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case FLIPADST_ADST:
+      load_buffer_16x16(input, in0, in1, stride, 1, 0);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case V_DCT:
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
+      fdct16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fidtx16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case H_DCT:
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
+      fidtx16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fdct16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case V_ADST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fidtx16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case H_ADST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 0);
+      fidtx16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case V_FLIPADST:
+      load_buffer_16x16(input, in0, in1, stride, 1, 0);
+      fadst16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fidtx16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case H_FLIPADST:
+      load_buffer_16x16(input, in0, in1, stride, 0, 1);
+      fidtx16_sse2(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16_sse2(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+#endif  // CONFIG_EXT_TX
     default:
       assert(0);
       break;
diff --git a/vp10/encoder/x86/denoiser_sse2.c b/vp10/encoder/x86/denoiser_sse2.c
index 047974e..5c1303a 100644
--- a/vp10/encoder/x86/denoiser_sse2.c
+++ b/vp10/encoder/x86/denoiser_sse2.c
@@ -361,9 +361,7 @@
                                        avg, avg_stride,
                                        increase_denoising,
                                        bs, motion_magnitude, 8);
-  } else if (bs == BLOCK_16X8 || bs == BLOCK_16X16 || bs == BLOCK_16X32 ||
-             bs == BLOCK_32X16|| bs == BLOCK_32X32 || bs == BLOCK_32X64 ||
-             bs == BLOCK_64X32 || bs == BLOCK_64X64) {
+  } else if (bs < BLOCK_SIZES) {
     return vp10_denoiser_NxM_sse2_big(sig, sig_stride,
                                      mc_avg, mc_avg_stride,
                                      avg, avg_stride,
diff --git a/vp10/encoder/x86/highbd_fwd_txfm_sse4.c b/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
new file mode 100644
index 0000000..dffdf20
--- /dev/null
+++ b/vp10/encoder/x86/highbd_fwd_txfm_sse4.c
@@ -0,0 +1,1890 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./vp10_rtcd.h"
+#include "./vpx_config.h"
+#include "vp10/common/vp10_fwd_txfm2d_cfg.h"
+#include "vp10/common/vp10_txfm.h"
+#include "vp10/common/x86/highbd_txfm_utility_sse4.h"
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/x86/txfm_common_sse2.h"
+#include "vpx_ports/mem.h"
+
+static INLINE void load_buffer_4x4(const int16_t *input, __m128i *in,
+                                   int stride, int flipud, int fliplr,
+                                   int shift) {
+  if (!flipud) {
+    in[0] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+    in[1] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+    in[2] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+    in[3] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+  } else {
+    in[0] = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));
+    in[1] = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
+    in[2] = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
+    in[3] = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
+  }
+
+  if (fliplr) {
+    in[0] = _mm_shufflelo_epi16(in[0], 0x1b);
+    in[1] = _mm_shufflelo_epi16(in[1], 0x1b);
+    in[2] = _mm_shufflelo_epi16(in[2], 0x1b);
+    in[3] = _mm_shufflelo_epi16(in[3], 0x1b);
+  }
+
+  in[0] = _mm_cvtepi16_epi32(in[0]);
+  in[1] = _mm_cvtepi16_epi32(in[1]);
+  in[2] = _mm_cvtepi16_epi32(in[2]);
+  in[3] = _mm_cvtepi16_epi32(in[3]);
+
+  in[0] = _mm_slli_epi32(in[0], shift);
+  in[1] = _mm_slli_epi32(in[1], shift);
+  in[2] = _mm_slli_epi32(in[2], shift);
+  in[3] = _mm_slli_epi32(in[3], shift);
+}
+
+// Note: only the stage-2 cos bit of each config is used here;
+// shift[0] is applied in load_buffer_4x4(),
+// shift[1] in txfm_func_col(),
+// shift[2] in txfm_func_row().
+static void fdct4x4_sse4_1(__m128i *in, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  __m128i s0, s1, s2, s3;
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+
+  s0 = _mm_add_epi32(in[0], in[3]);
+  s1 = _mm_add_epi32(in[1], in[2]);
+  s2 = _mm_sub_epi32(in[1], in[2]);
+  s3 = _mm_sub_epi32(in[0], in[3]);
+
+  // btf_32_sse4_1_type0(cospi32, cospi32, s[01], u[02], bit);
+  u0 = _mm_mullo_epi32(s0, cospi32);
+  u1 = _mm_mullo_epi32(s1, cospi32);
+  u2 = _mm_add_epi32(u0, u1);
+  v0 = _mm_sub_epi32(u0, u1);
+
+  u3 = _mm_add_epi32(u2, rnding);
+  v1 = _mm_add_epi32(v0, rnding);
+
+  u0 = _mm_srai_epi32(u3, bit);
+  u2 = _mm_srai_epi32(v1, bit);
+
+  // btf_32_sse4_1_type1(cospi48, cospi16, s[23], u[13], bit);
+  v0 = _mm_mullo_epi32(s2, cospi48);
+  v1 = _mm_mullo_epi32(s3, cospi16);
+  v2 = _mm_add_epi32(v0, v1);
+
+  v3 = _mm_add_epi32(v2, rnding);
+  u1 = _mm_srai_epi32(v3, bit);
+
+  v0 = _mm_mullo_epi32(s2, cospi16);
+  v1 = _mm_mullo_epi32(s3, cospi48);
+  v2 = _mm_sub_epi32(v1, v0);
+
+  v3 = _mm_add_epi32(v2, rnding);
+  u3 = _mm_srai_epi32(v3, bit);
+
+  // Note: shift[1] and shift[2] are zeros
+
+  // Transpose 4x4 32-bit
+  v0 = _mm_unpacklo_epi32(u0, u1);
+  v1 = _mm_unpackhi_epi32(u0, u1);
+  v2 = _mm_unpacklo_epi32(u2, u3);
+  v3 = _mm_unpackhi_epi32(u2, u3);
+
+  in[0] = _mm_unpacklo_epi64(v0, v2);
+  in[1] = _mm_unpackhi_epi64(v0, v2);
+  in[2] = _mm_unpacklo_epi64(v1, v3);
+  in[3] = _mm_unpackhi_epi64(v1, v3);
+}
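+
+/* The two butterfly flavours inlined above (with r = 1 << (bit - 1)):
+ *   type0: out0 = (w0*in0 + w1*in1 + r) >> bit
+ *          out1 = (w0*in0 - w1*in1 + r) >> bit  (here w0 == w1 == cospi32)
+ *   type1: out0 = (w0*in0 + w1*in1 + r) >> bit
+ *          out1 = (w0*in1 - w1*in0 + r) >> bit  (w0 = cospi48, w1 = cospi16)
+ */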
+
+static INLINE void write_buffer_4x4(__m128i *res, tran_low_t *output) {
+  _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
+  _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
+  _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
+  _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
+}
+
+// Note:
+//  The real work is done by vp10_fwd_txfm2d_4x4_sse4_1() below; this stub
+//  is only kept until vp10_highbd_fht4x4_c() is removed.
+void vp10_highbd_fht4x4_sse4_1(const int16_t *input, tran_low_t *output,
+                               int stride, int tx_type) {
+  (void)input;
+  (void)output;
+  (void)stride;
+  (void)tx_type;
+  assert(0);
+}
+
+static void fadst4x4_sse4_1(__m128i *in, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i kZero = _mm_setzero_si128();
+  __m128i s0, s1, s2, s3;
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+
+  // stage 0
+  // stage 1
+  // stage 2
+  u0 = _mm_mullo_epi32(in[3], cospi8);
+  u1 = _mm_mullo_epi32(in[0], cospi56);
+  u2 = _mm_add_epi32(u0, u1);
+  s0 = _mm_add_epi32(u2, rnding);
+  s0 = _mm_srai_epi32(s0, bit);
+
+  v0 = _mm_mullo_epi32(in[3], cospi56);
+  v1 = _mm_mullo_epi32(in[0], cospi8);
+  v2 = _mm_sub_epi32(v0, v1);
+  s1 = _mm_add_epi32(v2, rnding);
+  s1 = _mm_srai_epi32(s1, bit);
+
+  u0 = _mm_mullo_epi32(in[1], cospi40);
+  u1 = _mm_mullo_epi32(in[2], cospi24);
+  u2 = _mm_add_epi32(u0, u1);
+  s2 = _mm_add_epi32(u2, rnding);
+  s2 = _mm_srai_epi32(s2, bit);
+
+  v0 = _mm_mullo_epi32(in[1], cospi24);
+  v1 = _mm_mullo_epi32(in[2], cospi40);
+  v2 = _mm_sub_epi32(v0, v1);
+  s3 = _mm_add_epi32(v2, rnding);
+  s3 = _mm_srai_epi32(s3, bit);
+
+  // stage 3
+  u0 = _mm_add_epi32(s0, s2);
+  u2 = _mm_sub_epi32(s0, s2);
+  u1 = _mm_add_epi32(s1, s3);
+  u3 = _mm_sub_epi32(s1, s3);
+
+  // stage 4
+  v0 = _mm_mullo_epi32(u2, cospi32);
+  v1 = _mm_mullo_epi32(u3, cospi32);
+  v2 = _mm_add_epi32(v0, v1);
+  s2 = _mm_add_epi32(v2, rnding);
+  u2 = _mm_srai_epi32(s2, bit);
+
+  v2 = _mm_sub_epi32(v0, v1);
+  s3 = _mm_add_epi32(v2, rnding);
+  u3 = _mm_srai_epi32(s3, bit);
+
+  // Negate u1 and u2 to produce the correct output signs.
+  u2 = _mm_sub_epi32(kZero, u2);
+  u1 = _mm_sub_epi32(kZero, u1);
+
+  // Transpose 4x4 32-bit; the column order written out is u0, u2, u3, u1.
+  v0 = _mm_unpacklo_epi32(u0, u2);
+  v1 = _mm_unpackhi_epi32(u0, u2);
+  v2 = _mm_unpacklo_epi32(u3, u1);
+  v3 = _mm_unpackhi_epi32(u3, u1);
+
+  in[0] = _mm_unpacklo_epi64(v0, v2);
+  in[1] = _mm_unpackhi_epi64(v0, v2);
+  in[2] = _mm_unpacklo_epi64(v1, v3);
+  in[3] = _mm_unpackhi_epi64(v1, v3);
+}
+
+void vp10_fwd_txfm2d_4x4_sse4_1(const int16_t *input, int32_t *coeff,
+                                int input_stride, int tx_type, int bd) {
+  __m128i in[4];
+  const TXFM_2D_CFG *cfg = NULL;
+
+  switch (tx_type) {
+    case DCT_DCT:
+      cfg = &fwd_txfm_2d_cfg_dct_dct_4;
+      load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
+      fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+    case ADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_4;
+      load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+    case DCT_ADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_4;
+      load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
+      fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+    case ADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(input, in, input_stride, 0, 0, cfg->shift[0]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_4;
+      load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fdct4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+    case DCT_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_4;
+      load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]);
+      fdct4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(input, in, input_stride, 1, 1, cfg->shift[0]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+    case ADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(input, in, input_stride, 0, 1, cfg->shift[0]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+    case FLIPADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_4;
+      load_buffer_4x4(input, in, input_stride, 1, 0, cfg->shift[0]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_col[2]);
+      fadst4x4_sse4_1(in, cfg->cos_bit_row[2]);
+      write_buffer_4x4(in, coeff);
+      break;
+#endif
+    default:
+      assert(0);
+  }
+  (void)bd;
+}
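+
+/* The FLIPADST cases above reuse the corresponding ADST configurations;
+ * the flips are realized entirely by the flipud/fliplr arguments of
+ * load_buffer_4x4(). */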
+
+static INLINE void load_buffer_8x8(const int16_t *input, __m128i *in,
+                                   int stride, int flipud, int fliplr,
+                                   int shift) {
+  __m128i u;
+  if (!flipud) {
+    in[0]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+    in[1]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    in[2]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    in[3]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+    in[4]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    in[5]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    in[6]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    in[7]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+  } else {
+    in[0]  = _mm_load_si128((const __m128i *)(input + 7 * stride));
+    in[1]  = _mm_load_si128((const __m128i *)(input + 6 * stride));
+    in[2]  = _mm_load_si128((const __m128i *)(input + 5 * stride));
+    in[3]  = _mm_load_si128((const __m128i *)(input + 4 * stride));
+    in[4]  = _mm_load_si128((const __m128i *)(input + 3 * stride));
+    in[5]  = _mm_load_si128((const __m128i *)(input + 2 * stride));
+    in[6]  = _mm_load_si128((const __m128i *)(input + 1 * stride));
+    in[7]  = _mm_load_si128((const __m128i *)(input + 0 * stride));
+  }
+
+  if (fliplr) {
+    in[0] = mm_reverse_epi16(in[0]);
+    in[1] = mm_reverse_epi16(in[1]);
+    in[2] = mm_reverse_epi16(in[2]);
+    in[3] = mm_reverse_epi16(in[3]);
+    in[4] = mm_reverse_epi16(in[4]);
+    in[5] = mm_reverse_epi16(in[5]);
+    in[6] = mm_reverse_epi16(in[6]);
+    in[7] = mm_reverse_epi16(in[7]);
+  }
+
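+  // Widen the eight 16-bit rows in-place to sixteen 32-bit registers:
+  // rows 4..7 are expanded first into in[8..15], then rows 3..0 are
+  // expanded downwards, so every source row is fully read before its
+  // slot is overwritten.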
+  u = _mm_unpackhi_epi64(in[4], in[4]);
+  in[8] = _mm_cvtepi16_epi32(in[4]);
+  in[9] = _mm_cvtepi16_epi32(u);
+
+  u = _mm_unpackhi_epi64(in[5], in[5]);
+  in[10] = _mm_cvtepi16_epi32(in[5]);
+  in[11] = _mm_cvtepi16_epi32(u);
+
+  u = _mm_unpackhi_epi64(in[6], in[6]);
+  in[12] = _mm_cvtepi16_epi32(in[6]);
+  in[13] = _mm_cvtepi16_epi32(u);
+
+  u = _mm_unpackhi_epi64(in[7], in[7]);
+  in[14] = _mm_cvtepi16_epi32(in[7]);
+  in[15] = _mm_cvtepi16_epi32(u);
+
+  u = _mm_unpackhi_epi64(in[3], in[3]);
+  in[6] = _mm_cvtepi16_epi32(in[3]);
+  in[7] = _mm_cvtepi16_epi32(u);
+
+  u = _mm_unpackhi_epi64(in[2], in[2]);
+  in[4] = _mm_cvtepi16_epi32(in[2]);
+  in[5] = _mm_cvtepi16_epi32(u);
+
+  u = _mm_unpackhi_epi64(in[1], in[1]);
+  in[2] = _mm_cvtepi16_epi32(in[1]);
+  in[3] = _mm_cvtepi16_epi32(u);
+
+  u = _mm_unpackhi_epi64(in[0], in[0]);
+  in[0] = _mm_cvtepi16_epi32(in[0]);
+  in[1] = _mm_cvtepi16_epi32(u);
+
+  in[0] = _mm_slli_epi32(in[0], shift);
+  in[1] = _mm_slli_epi32(in[1], shift);
+  in[2] = _mm_slli_epi32(in[2], shift);
+  in[3] = _mm_slli_epi32(in[3], shift);
+  in[4] = _mm_slli_epi32(in[4], shift);
+  in[5] = _mm_slli_epi32(in[5], shift);
+  in[6] = _mm_slli_epi32(in[6], shift);
+  in[7] = _mm_slli_epi32(in[7], shift);
+
+  in[8] = _mm_slli_epi32(in[8], shift);
+  in[9] = _mm_slli_epi32(in[9], shift);
+  in[10] = _mm_slli_epi32(in[10], shift);
+  in[11] = _mm_slli_epi32(in[11], shift);
+  in[12] = _mm_slli_epi32(in[12], shift);
+  in[13] = _mm_slli_epi32(in[13], shift);
+  in[14] = _mm_slli_epi32(in[14], shift);
+  in[15] = _mm_slli_epi32(in[15], shift);
+}
+
+static INLINE void col_txfm_8x8_rounding(__m128i *in, int shift) {
+  const __m128i rounding = _mm_set1_epi32(1 << (shift - 1));
+
+  in[0] = _mm_add_epi32(in[0], rounding);
+  in[1] = _mm_add_epi32(in[1], rounding);
+  in[2] = _mm_add_epi32(in[2], rounding);
+  in[3] = _mm_add_epi32(in[3], rounding);
+  in[4] = _mm_add_epi32(in[4], rounding);
+  in[5] = _mm_add_epi32(in[5], rounding);
+  in[6] = _mm_add_epi32(in[6], rounding);
+  in[7] = _mm_add_epi32(in[7], rounding);
+  in[8] = _mm_add_epi32(in[8], rounding);
+  in[9] = _mm_add_epi32(in[9], rounding);
+  in[10] = _mm_add_epi32(in[10], rounding);
+  in[11] = _mm_add_epi32(in[11], rounding);
+  in[12] = _mm_add_epi32(in[12], rounding);
+  in[13] = _mm_add_epi32(in[13], rounding);
+  in[14] = _mm_add_epi32(in[14], rounding);
+  in[15] = _mm_add_epi32(in[15], rounding);
+
+  in[0] = _mm_srai_epi32(in[0], shift);
+  in[1] = _mm_srai_epi32(in[1], shift);
+  in[2] = _mm_srai_epi32(in[2], shift);
+  in[3] = _mm_srai_epi32(in[3], shift);
+  in[4] = _mm_srai_epi32(in[4], shift);
+  in[5] = _mm_srai_epi32(in[5], shift);
+  in[6] = _mm_srai_epi32(in[6], shift);
+  in[7] = _mm_srai_epi32(in[7], shift);
+  in[8] = _mm_srai_epi32(in[8], shift);
+  in[9] = _mm_srai_epi32(in[9], shift);
+  in[10] = _mm_srai_epi32(in[10], shift);
+  in[11] = _mm_srai_epi32(in[11], shift);
+  in[12] = _mm_srai_epi32(in[12], shift);
+  in[13] = _mm_srai_epi32(in[13], shift);
+  in[14] = _mm_srai_epi32(in[14], shift);
+  in[15] = _mm_srai_epi32(in[15], shift);
+}
+
+static INLINE void write_buffer_8x8(const __m128i *res, tran_low_t *output) {
+  _mm_store_si128((__m128i *)(output + 0 * 4), res[0]);
+  _mm_store_si128((__m128i *)(output + 1 * 4), res[1]);
+  _mm_store_si128((__m128i *)(output + 2 * 4), res[2]);
+  _mm_store_si128((__m128i *)(output + 3 * 4), res[3]);
+
+  _mm_store_si128((__m128i *)(output + 4 * 4), res[4]);
+  _mm_store_si128((__m128i *)(output + 5 * 4), res[5]);
+  _mm_store_si128((__m128i *)(output + 6 * 4), res[6]);
+  _mm_store_si128((__m128i *)(output + 7 * 4), res[7]);
+
+  _mm_store_si128((__m128i *)(output + 8 * 4), res[8]);
+  _mm_store_si128((__m128i *)(output + 9 * 4), res[9]);
+  _mm_store_si128((__m128i *)(output + 10 * 4), res[10]);
+  _mm_store_si128((__m128i *)(output + 11 * 4), res[11]);
+
+  _mm_store_si128((__m128i *)(output + 12 * 4), res[12]);
+  _mm_store_si128((__m128i *)(output + 13 * 4), res[13]);
+  _mm_store_si128((__m128i *)(output + 14 * 4), res[14]);
+  _mm_store_si128((__m128i *)(output + 15 * 4), res[15]);
+}
+
+static void fdct8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
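+  // cospi[i] approximates cos(i * pi / 64) scaled by 2^bit, so each product
+  // below is brought back to scale with (x + (1 << (bit - 1))) >> bit.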
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  __m128i u[8], v[8];
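+
+  // The 8x8 block arrives as 16 vectors of four 32-bit lanes: in[2k] holds
+  // columns 0-3 of row k, in[2k + 1] columns 4-7. The two 4-column halves
+  // go through the same butterfly network independently.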
+
+  // Even 8 points: 0, 2, ..., 14
+  // stage 0
+  // stage 1
+  u[0] = _mm_add_epi32(in[0], in[14]);
+  v[7] = _mm_sub_epi32(in[0], in[14]);  // v[7]: carried unchanged to stage 3
+  u[1] = _mm_add_epi32(in[2], in[12]);
+  u[6] = _mm_sub_epi32(in[2], in[12]);
+  u[2] = _mm_add_epi32(in[4], in[10]);
+  u[5] = _mm_sub_epi32(in[4], in[10]);
+  u[3] = _mm_add_epi32(in[6], in[8]);
+  v[4] = _mm_sub_epi32(in[6], in[8]);   // v[4]: carried unchanged to stage 3
+
+  // stage 2
+  v[0] = _mm_add_epi32(u[0], u[3]);
+  v[3] = _mm_sub_epi32(u[0], u[3]);
+  v[1] = _mm_add_epi32(u[1], u[2]);
+  v[2] = _mm_sub_epi32(u[1], u[2]);
+
+  v[5] = _mm_mullo_epi32(u[5], cospim32);
+  v[6] = _mm_mullo_epi32(u[6], cospi32);
+  v[5] = _mm_add_epi32(v[5], v[6]);
+  v[5] = _mm_add_epi32(v[5], rnding);
+  v[5] = _mm_srai_epi32(v[5], bit);
+
+  u[0] = _mm_mullo_epi32(u[5], cospi32);
+  v[6] = _mm_mullo_epi32(u[6], cospim32);
+  v[6] = _mm_sub_epi32(u[0], v[6]);
+  v[6] = _mm_add_epi32(v[6], rnding);
+  v[6] = _mm_srai_epi32(v[6], bit);
+
+  // stage 3
+  // type 0
+  v[0] = _mm_mullo_epi32(v[0], cospi32);
+  v[1] = _mm_mullo_epi32(v[1], cospi32);
+  u[0] = _mm_add_epi32(v[0], v[1]);
+  u[0] = _mm_add_epi32(u[0], rnding);
+  u[0] = _mm_srai_epi32(u[0], bit);
+
+  u[1] = _mm_sub_epi32(v[0], v[1]);
+  u[1] = _mm_add_epi32(u[1], rnding);
+  u[1] = _mm_srai_epi32(u[1], bit);
+
+  // type 1
+  v[0] = _mm_mullo_epi32(v[2], cospi48);
+  v[1] = _mm_mullo_epi32(v[3], cospi16);
+  u[2] = _mm_add_epi32(v[0], v[1]);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
+
+  v[0] = _mm_mullo_epi32(v[2], cospi16);
+  v[1] = _mm_mullo_epi32(v[3], cospi48);
+  u[3] = _mm_sub_epi32(v[1], v[0]);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
+
+  u[4] = _mm_add_epi32(v[4], v[5]);
+  u[5] = _mm_sub_epi32(v[4], v[5]);
+  u[6] = _mm_sub_epi32(v[7], v[6]);
+  u[7] = _mm_add_epi32(v[7], v[6]);
+
+  // stage 4
+  // stage 5
+  v[0] = _mm_mullo_epi32(u[4], cospi56);
+  v[1] = _mm_mullo_epi32(u[7], cospi8);
+  v[0] = _mm_add_epi32(v[0], v[1]);
+  v[0] = _mm_add_epi32(v[0], rnding);
+  out[2] = _mm_srai_epi32(v[0], bit);   // buf0[4]
+
+  v[0] = _mm_mullo_epi32(u[4], cospi8);
+  v[1] = _mm_mullo_epi32(u[7], cospi56);
+  v[0] = _mm_sub_epi32(v[1], v[0]);
+  v[0] = _mm_add_epi32(v[0], rnding);
+  out[14] = _mm_srai_epi32(v[0], bit);  // buf0[7]
+
+  v[0] = _mm_mullo_epi32(u[5], cospi24);
+  v[1] = _mm_mullo_epi32(u[6], cospi40);
+  v[0] = _mm_add_epi32(v[0], v[1]);
+  v[0] = _mm_add_epi32(v[0], rnding);
+  out[10] = _mm_srai_epi32(v[0], bit);  // buf0[5]
+
+  v[0] = _mm_mullo_epi32(u[5], cospi40);
+  v[1] = _mm_mullo_epi32(u[6], cospi24);
+  v[0] = _mm_sub_epi32(v[1], v[0]);
+  v[0] = _mm_add_epi32(v[0], rnding);
+  out[6] = _mm_srai_epi32(v[0], bit);   // buf0[6]
+
+  out[0] = u[0];   // buf0[0]
+  out[8] = u[1];   // buf0[1]
+  out[4] = u[2];   // buf0[2]
+  out[12] = u[3];  // buf0[3]
+
+  // Odd 8 points: 1, 3, ..., 15
+  // stage 0
+  // stage 1
+  u[0] = _mm_add_epi32(in[1], in[15]);
+  v[7] = _mm_sub_epi32(in[1], in[15]);  // v[7]: carried unchanged to stage 3
+  u[1] = _mm_add_epi32(in[3], in[13]);
+  u[6] = _mm_sub_epi32(in[3], in[13]);
+  u[2] = _mm_add_epi32(in[5], in[11]);
+  u[5] = _mm_sub_epi32(in[5], in[11]);
+  u[3] = _mm_add_epi32(in[7], in[9]);
+  v[4] = _mm_sub_epi32(in[7], in[9]);   // v[4]: carried unchanged to stage 3
+
+  // stage 2
+  v[0] = _mm_add_epi32(u[0], u[3]);
+  v[3] = _mm_sub_epi32(u[0], u[3]);
+  v[1] = _mm_add_epi32(u[1], u[2]);
+  v[2] = _mm_sub_epi32(u[1], u[2]);
+
+  v[5] = _mm_mullo_epi32(u[5], cospim32);
+  v[6] = _mm_mullo_epi32(u[6], cospi32);
+  v[5] = _mm_add_epi32(v[5], v[6]);
+  v[5] = _mm_add_epi32(v[5], rnding);
+  v[5] = _mm_srai_epi32(v[5], bit);
+
+  u[0] = _mm_mullo_epi32(u[5], cospi32);
+  v[6] = _mm_mullo_epi32(u[6], cospim32);
+  v[6] = _mm_sub_epi32(u[0], v[6]);
+  v[6] = _mm_add_epi32(v[6], rnding);
+  v[6] = _mm_srai_epi32(v[6], bit);
+
+  // stage 3
+  // type 0
+  v[0] = _mm_mullo_epi32(v[0], cospi32);
+  v[1] = _mm_mullo_epi32(v[1], cospi32);
+  u[0] = _mm_add_epi32(v[0], v[1]);
+  u[0] = _mm_add_epi32(u[0], rnding);
+  u[0] = _mm_srai_epi32(u[0], bit);
+
+  u[1] = _mm_sub_epi32(v[0], v[1]);
+  u[1] = _mm_add_epi32(u[1], rnding);
+  u[1] = _mm_srai_epi32(u[1], bit);
+
+  // type 1
+  v[0] = _mm_mullo_epi32(v[2], cospi48);
+  v[1] = _mm_mullo_epi32(v[3], cospi16);
+  u[2] = _mm_add_epi32(v[0], v[1]);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
+
+  v[0] = _mm_mullo_epi32(v[2], cospi16);
+  v[1] = _mm_mullo_epi32(v[3], cospi48);
+  u[3] = _mm_sub_epi32(v[1], v[0]);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
+
+  u[4] = _mm_add_epi32(v[4], v[5]);
+  u[5] = _mm_sub_epi32(v[4], v[5]);
+  u[6] = _mm_sub_epi32(v[7], v[6]);
+  u[7] = _mm_add_epi32(v[7], v[6]);
+
+  // stage 4
+  // stage 5
+  v[0] = _mm_mullo_epi32(u[4], cospi56);
+  v[1] = _mm_mullo_epi32(u[7], cospi8);
+  v[0] = _mm_add_epi32(v[0], v[1]);
+  v[0] = _mm_add_epi32(v[0], rnding);
+  out[3] = _mm_srai_epi32(v[0], bit);   // buf0[4]
+
+  v[0] = _mm_mullo_epi32(u[4], cospi8);
+  v[1] = _mm_mullo_epi32(u[7], cospi56);
+  v[0] = _mm_sub_epi32(v[1], v[0]);
+  v[0] = _mm_add_epi32(v[0], rnding);
+  out[15] = _mm_srai_epi32(v[0], bit);  // buf0[7]
+
+  v[0] = _mm_mullo_epi32(u[5], cospi24);
+  v[1] = _mm_mullo_epi32(u[6], cospi40);
+  v[0] = _mm_add_epi32(v[0], v[1]);
+  v[0] = _mm_add_epi32(v[0], rnding);
+  out[11] = _mm_srai_epi32(v[0], bit);  // buf0[5]
+
+  v[0] = _mm_mullo_epi32(u[5], cospi40);
+  v[1] = _mm_mullo_epi32(u[6], cospi24);
+  v[0] = _mm_sub_epi32(v[1], v[0]);
+  v[0] = _mm_add_epi32(v[0], rnding);
+  out[7] = _mm_srai_epi32(v[0], bit);   // buf0[6]
+
+  out[1] = u[0];   // buf0[0]
+  out[9] = u[1];   // buf0[1]
+  out[5] = u[2];   // buf0[2]
+  out[13] = u[3];  // buf0[3]
+}
+
+static void fadst8x8_sse4_1(__m128i *in, __m128i *out, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  const __m128i kZero = _mm_setzero_si128();
+  __m128i u[8], v[8], x;
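+
+  // Same data layout as fdct8x8_sse4_1: in[2k] holds columns 0-3 of row k,
+  // in[2k + 1] columns 4-7; the two halves are transformed independently.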
+
+  // Even 8 points: 0, 2, ..., 14
+  // stage 0
+  // stage 1
+  // stage 2
+  // (1)
+  u[0] = _mm_mullo_epi32(in[14], cospi4);
+  x = _mm_mullo_epi32(in[0], cospi60);
+  u[0] = _mm_add_epi32(u[0], x);
+  u[0] = _mm_add_epi32(u[0], rnding);
+  u[0] = _mm_srai_epi32(u[0], bit);
+
+  u[1] = _mm_mullo_epi32(in[14], cospi60);
+  x = _mm_mullo_epi32(in[0], cospi4);
+  u[1] = _mm_sub_epi32(u[1], x);
+  u[1] = _mm_add_epi32(u[1], rnding);
+  u[1] = _mm_srai_epi32(u[1], bit);
+
+  // (2)
+  u[2] = _mm_mullo_epi32(in[10], cospi20);
+  x = _mm_mullo_epi32(in[4], cospi44);
+  u[2] = _mm_add_epi32(u[2], x);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
+
+  u[3] = _mm_mullo_epi32(in[10], cospi44);
+  x = _mm_mullo_epi32(in[4], cospi20);
+  u[3] = _mm_sub_epi32(u[3], x);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
+
+  // (3)
+  u[4] = _mm_mullo_epi32(in[6], cospi36);
+  x = _mm_mullo_epi32(in[8], cospi28);
+  u[4] = _mm_add_epi32(u[4], x);
+  u[4] = _mm_add_epi32(u[4], rnding);
+  u[4] = _mm_srai_epi32(u[4], bit);
+
+  u[5] = _mm_mullo_epi32(in[6], cospi28);
+  x = _mm_mullo_epi32(in[8], cospi36);
+  u[5] = _mm_sub_epi32(u[5], x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
+
+  // (4)
+  u[6] = _mm_mullo_epi32(in[2], cospi52);
+  x = _mm_mullo_epi32(in[12], cospi12);
+  u[6] = _mm_add_epi32(u[6], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
+
+  u[7] = _mm_mullo_epi32(in[2], cospi12);
+  x = _mm_mullo_epi32(in[12], cospi52);
+  u[7] = _mm_sub_epi32(u[7], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
+
+  // stage 3
+  v[0] = _mm_add_epi32(u[0], u[4]);
+  v[4] = _mm_sub_epi32(u[0], u[4]);
+  v[1] = _mm_add_epi32(u[1], u[5]);
+  v[5] = _mm_sub_epi32(u[1], u[5]);
+  v[2] = _mm_add_epi32(u[2], u[6]);
+  v[6] = _mm_sub_epi32(u[2], u[6]);
+  v[3] = _mm_add_epi32(u[3], u[7]);
+  v[7] = _mm_sub_epi32(u[3], u[7]);
+
+  // stage 4
+  u[0] = v[0];
+  u[1] = v[1];
+  u[2] = v[2];
+  u[3] = v[3];
+
+  u[4] = _mm_mullo_epi32(v[4], cospi16);
+  x = _mm_mullo_epi32(v[5], cospi48);
+  u[4] = _mm_add_epi32(u[4], x);
+  u[4] = _mm_add_epi32(u[4], rnding);
+  u[4] = _mm_srai_epi32(u[4], bit);
+
+  u[5] = _mm_mullo_epi32(v[4], cospi48);
+  x = _mm_mullo_epi32(v[5], cospi16);
+  u[5] = _mm_sub_epi32(u[5], x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
+
+  u[6] = _mm_mullo_epi32(v[6], cospim48);
+  x = _mm_mullo_epi32(v[7], cospi16);
+  u[6] = _mm_add_epi32(u[6], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
+
+  u[7] = _mm_mullo_epi32(v[6], cospi16);
+  x = _mm_mullo_epi32(v[7], cospim48);
+  u[7] = _mm_sub_epi32(u[7], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
+
+  // stage 5
+  v[0] = _mm_add_epi32(u[0], u[2]);
+  v[2] = _mm_sub_epi32(u[0], u[2]);
+  v[1] = _mm_add_epi32(u[1], u[3]);
+  v[3] = _mm_sub_epi32(u[1], u[3]);
+  v[4] = _mm_add_epi32(u[4], u[6]);
+  v[6] = _mm_sub_epi32(u[4], u[6]);
+  v[5] = _mm_add_epi32(u[5], u[7]);
+  v[7] = _mm_sub_epi32(u[5], u[7]);
+
+  // stage 6
+  u[0] = v[0];
+  u[1] = v[1];
+  u[4] = v[4];
+  u[5] = v[5];
+
+  v[0] = _mm_mullo_epi32(v[2], cospi32);
+  x = _mm_mullo_epi32(v[3], cospi32);
+  u[2] = _mm_add_epi32(v[0], x);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
+
+  u[3] = _mm_sub_epi32(v[0], x);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
+
+  v[0] = _mm_mullo_epi32(v[6], cospi32);
+  x = _mm_mullo_epi32(v[7], cospi32);
+  u[6] = _mm_add_epi32(v[0], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
+
+  u[7] = _mm_sub_epi32(v[0], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
+
+  // stage 7
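+  // Output rows 1, 3, 5, 7 are negated (out[2k] holds row k of this half):
+  // the ADST outputs alternate in sign.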
+  out[0] = u[0];
+  out[2] = _mm_sub_epi32(kZero, u[4]);
+  out[4] = u[6];
+  out[6] = _mm_sub_epi32(kZero, u[2]);
+  out[8] = u[3];
+  out[10] = _mm_sub_epi32(kZero, u[7]);
+  out[12] = u[5];
+  out[14] = _mm_sub_epi32(kZero, u[1]);
+
+  // Odd 8 points: 1, 3, ..., 15
+  // stage 0
+  // stage 1
+  // stage 2
+  // (1)
+  u[0] = _mm_mullo_epi32(in[15], cospi4);
+  x = _mm_mullo_epi32(in[1], cospi60);
+  u[0] = _mm_add_epi32(u[0], x);
+  u[0] = _mm_add_epi32(u[0], rnding);
+  u[0] = _mm_srai_epi32(u[0], bit);
+
+  u[1] = _mm_mullo_epi32(in[15], cospi60);
+  x = _mm_mullo_epi32(in[1], cospi4);
+  u[1] = _mm_sub_epi32(u[1], x);
+  u[1] = _mm_add_epi32(u[1], rnding);
+  u[1] = _mm_srai_epi32(u[1], bit);
+
+  // (2)
+  u[2] = _mm_mullo_epi32(in[11], cospi20);
+  x = _mm_mullo_epi32(in[5], cospi44);
+  u[2] = _mm_add_epi32(u[2], x);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
+
+  u[3] = _mm_mullo_epi32(in[11], cospi44);
+  x = _mm_mullo_epi32(in[5], cospi20);
+  u[3] = _mm_sub_epi32(u[3], x);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
+
+  // (3)
+  u[4] = _mm_mullo_epi32(in[7], cospi36);
+  x = _mm_mullo_epi32(in[9], cospi28);
+  u[4] = _mm_add_epi32(u[4], x);
+  u[4] = _mm_add_epi32(u[4], rnding);
+  u[4] = _mm_srai_epi32(u[4], bit);
+
+  u[5] = _mm_mullo_epi32(in[7], cospi28);
+  x = _mm_mullo_epi32(in[9], cospi36);
+  u[5] = _mm_sub_epi32(u[5], x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
+
+  // (4)
+  u[6] = _mm_mullo_epi32(in[3], cospi52);
+  x = _mm_mullo_epi32(in[13], cospi12);
+  u[6] = _mm_add_epi32(u[6], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
+
+  u[7] = _mm_mullo_epi32(in[3], cospi12);
+  x = _mm_mullo_epi32(in[13], cospi52);
+  u[7] = _mm_sub_epi32(u[7], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
+
+  // stage 3
+  v[0] = _mm_add_epi32(u[0], u[4]);
+  v[4] = _mm_sub_epi32(u[0], u[4]);
+  v[1] = _mm_add_epi32(u[1], u[5]);
+  v[5] = _mm_sub_epi32(u[1], u[5]);
+  v[2] = _mm_add_epi32(u[2], u[6]);
+  v[6] = _mm_sub_epi32(u[2], u[6]);
+  v[3] = _mm_add_epi32(u[3], u[7]);
+  v[7] = _mm_sub_epi32(u[3], u[7]);
+
+  // stage 4
+  u[0] = v[0];
+  u[1] = v[1];
+  u[2] = v[2];
+  u[3] = v[3];
+
+  u[4] = _mm_mullo_epi32(v[4], cospi16);
+  x = _mm_mullo_epi32(v[5], cospi48);
+  u[4] = _mm_add_epi32(u[4], x);
+  u[4] = _mm_add_epi32(u[4], rnding);
+  u[4] = _mm_srai_epi32(u[4], bit);
+
+  u[5] = _mm_mullo_epi32(v[4], cospi48);
+  x = _mm_mullo_epi32(v[5], cospi16);
+  u[5] = _mm_sub_epi32(u[5], x);
+  u[5] = _mm_add_epi32(u[5], rnding);
+  u[5] = _mm_srai_epi32(u[5], bit);
+
+  u[6] = _mm_mullo_epi32(v[6], cospim48);
+  x = _mm_mullo_epi32(v[7], cospi16);
+  u[6] = _mm_add_epi32(u[6], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
+
+  u[7] = _mm_mullo_epi32(v[6], cospi16);
+  x = _mm_mullo_epi32(v[7], cospim48);
+  u[7] = _mm_sub_epi32(u[7], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
+
+  // stage 5
+  v[0] = _mm_add_epi32(u[0], u[2]);
+  v[2] = _mm_sub_epi32(u[0], u[2]);
+  v[1] = _mm_add_epi32(u[1], u[3]);
+  v[3] = _mm_sub_epi32(u[1], u[3]);
+  v[4] = _mm_add_epi32(u[4], u[6]);
+  v[6] = _mm_sub_epi32(u[4], u[6]);
+  v[5] = _mm_add_epi32(u[5], u[7]);
+  v[7] = _mm_sub_epi32(u[5], u[7]);
+
+  // stage 6
+  u[0] = v[0];
+  u[1] = v[1];
+  u[4] = v[4];
+  u[5] = v[5];
+
+  v[0] = _mm_mullo_epi32(v[2], cospi32);
+  x = _mm_mullo_epi32(v[3], cospi32);
+  u[2] = _mm_add_epi32(v[0], x);
+  u[2] = _mm_add_epi32(u[2], rnding);
+  u[2] = _mm_srai_epi32(u[2], bit);
+
+  u[3] = _mm_sub_epi32(v[0], x);
+  u[3] = _mm_add_epi32(u[3], rnding);
+  u[3] = _mm_srai_epi32(u[3], bit);
+
+  v[0] = _mm_mullo_epi32(v[6], cospi32);
+  x = _mm_mullo_epi32(v[7], cospi32);
+  u[6] = _mm_add_epi32(v[0], x);
+  u[6] = _mm_add_epi32(u[6], rnding);
+  u[6] = _mm_srai_epi32(u[6], bit);
+
+  u[7] = _mm_sub_epi32(v[0], x);
+  u[7] = _mm_add_epi32(u[7], rnding);
+  u[7] = _mm_srai_epi32(u[7], bit);
+
+  // stage 7
+  out[1] = u[0];
+  out[3] = _mm_sub_epi32(kZero, u[4]);
+  out[5] = u[6];
+  out[7] = _mm_sub_epi32(kZero, u[2]);
+  out[9] = u[3];
+  out[11] = _mm_sub_epi32(kZero, u[7]);
+  out[13] = u[5];
+  out[15] = _mm_sub_epi32(kZero, u[1]);
+}
+
+void vp10_fwd_txfm2d_8x8_sse4_1(const int16_t *input, int32_t *coeff,
+                                int stride, int tx_type, int bd) {
+  __m128i in[16], out[16];
+  const TXFM_2D_CFG *cfg = NULL;
+
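+  // Every case below runs the same pipeline: load (with optional flips and
+  // the cfg->shift[0] pre-scale), column transform, rounding, transpose,
+  // row transform, transpose back, store. Only the transform kernels and
+  // the flip flags differ per tx_type.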
+  switch (tx_type) {
+    case DCT_DCT:
+      cfg = &fwd_txfm_2d_cfg_dct_dct_8;
+      load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]);
+      fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case ADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_8;
+      load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case DCT_ADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_8;
+      load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]);
+      fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case ADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(input, in, stride, 0, 0, cfg->shift[0]);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_8;
+      load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fdct8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case DCT_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_8;
+      load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]);
+      fdct8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(input, in, stride, 1, 1, cfg->shift[0]);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case ADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(input, in, stride, 0, 1, cfg->shift[0]);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+    case FLIPADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_8;
+      load_buffer_8x8(input, in, stride, 1, 0, cfg->shift[0]);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_col[2]);
+      col_txfm_8x8_rounding(out, -cfg->shift[1]);
+      transpose_8x8(out, in);
+      fadst8x8_sse4_1(in, out, cfg->cos_bit_row[2]);
+      transpose_8x8(out, in);
+      write_buffer_8x8(in, coeff);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+  }
+  (void)bd;
+}
+
+// Hybrid Transform 16x16
+
+static INLINE void convert_8x8_to_16x16(const __m128i *in, __m128i *out) {
+  int row_index = 0;
+  int dst_index = 0;
+  int src_index = 0;
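+
+  // in[] holds four 8x8 blocks of 16 vectors each: topL in[0..15], topR
+  // in[16..31], botL in[32..47], botR in[48..63]. Each output row of the
+  // 16x16 block is four vectors wide: two from a left block and two from
+  // the matching right block (at src_index + 16).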
+
+  // rows 0, 1, ..., 7
+  do {
+    out[dst_index] = in[src_index];
+    out[dst_index + 1] = in[src_index + 1];
+    out[dst_index + 2] = in[src_index + 16];
+    out[dst_index + 3] = in[src_index + 17];
+    dst_index += 4;
+    src_index += 2;
+    row_index += 1;
+  } while (row_index < 8);
+
+  // rows 8, 9, ..., 15
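+  // src_index is 16 here (the end of topL); skip over topR so the second
+  // loop reads botL/botR starting at in[32].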
+  src_index += 16;
+  do {
+    out[dst_index] = in[src_index];
+    out[dst_index + 1] = in[src_index + 1];
+    out[dst_index + 2] = in[src_index + 16];
+    out[dst_index + 3] = in[src_index + 17];
+    dst_index += 4;
+    src_index += 2;
+    row_index += 1;
+  } while (row_index < 16);
+}
+
+static INLINE void load_buffer_16x16(const int16_t *input, __m128i *out,
+                                     int stride, int flipud, int fliplr,
+                                     int shift) {
+  __m128i in[64];
+  // Load 4 8x8 blocks
+  const int16_t *topL = input;
+  const int16_t *topR = input + 8;
+  const int16_t *botL = input + 8 * stride;
+  const int16_t *botR = input + 8 * stride + 8;
+
+  const int16_t *tmp;
+
+  if (flipud) {
+    // Swap the top and bottom blocks in the left column
+    tmp = topL; topL = botL; botL = tmp;
+    // Swap the top and bottom blocks in the right column
+    tmp = topR; topR = botR; botR = tmp;
+  }
+
+  if (fliplr) {
+    // Swap the left and right blocks in the top row
+    tmp = topL; topL = topR; topR = tmp;
+    // Swap the left and right blocks in the bottom row
+    tmp = botL; botL = botR; botR = tmp;
+  }
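+
+  // The flip flags are passed down to load_buffer_8x8 as well, so the
+  // block-pointer swaps above combine with the within-block flips to flip
+  // the whole 16x16 input.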
+
+  // load the first 8 columns (in[0..15] and in[32..47])
+  load_buffer_8x8(topL, &in[0],  stride, flipud, fliplr, shift);
+  load_buffer_8x8(botL, &in[32], stride, flipud, fliplr, shift);
+
+  // load the second 8 columns (in[16..31] and in[48..63])
+  load_buffer_8x8(topR, &in[16], stride, flipud, fliplr, shift);
+  load_buffer_8x8(botR, &in[48], stride, flipud, fliplr, shift);
+
+  convert_8x8_to_16x16(in, out);
+}
+
+static void fdct16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i cospim32 = _mm_set1_epi32(-cospi[32]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospim16 = _mm_set1_epi32(-cospi[16]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi60 = _mm_set1_epi32(cospi[60]);
+  const __m128i cospi4 = _mm_set1_epi32(cospi[4]);
+  const __m128i cospi28 = _mm_set1_epi32(cospi[28]);
+  const __m128i cospi36 = _mm_set1_epi32(cospi[36]);
+  const __m128i cospi44 = _mm_set1_epi32(cospi[44]);
+  const __m128i cospi20 = _mm_set1_epi32(cospi[20]);
+  const __m128i cospi12 = _mm_set1_epi32(cospi[12]);
+  const __m128i cospi52 = _mm_set1_epi32(cospi[52]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  __m128i u[16], v[16], x;
+  const int col_num = 4;
+  int col;
+
+  // Process the four 4-lane column strips (col = 0, 1, 2, 3)
+  for (col = 0; col < col_num; ++col) {
+    // stage 0
+    // stage 1
+    u[0]  = _mm_add_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+    u[15] = _mm_sub_epi32(in[0 * col_num + col], in[15 * col_num + col]);
+    u[1]  = _mm_add_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+    u[14] = _mm_sub_epi32(in[1 * col_num + col], in[14 * col_num + col]);
+    u[2]  = _mm_add_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+    u[13] = _mm_sub_epi32(in[2 * col_num + col], in[13 * col_num + col]);
+    u[3]  = _mm_add_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+    u[12] = _mm_sub_epi32(in[3 * col_num + col], in[12 * col_num + col]);
+    u[4]  = _mm_add_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+    u[11] = _mm_sub_epi32(in[4 * col_num + col], in[11 * col_num + col]);
+    u[5]  = _mm_add_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+    u[10] = _mm_sub_epi32(in[5 * col_num + col], in[10 * col_num + col]);
+    u[6]  = _mm_add_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+    u[9]  = _mm_sub_epi32(in[6 * col_num + col], in[9 * col_num + col]);
+    u[7]  = _mm_add_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+    u[8]  = _mm_sub_epi32(in[7 * col_num + col], in[8 * col_num + col]);
+
+    // stage 2
+    v[0] = _mm_add_epi32(u[0], u[7]);
+    v[7] = _mm_sub_epi32(u[0], u[7]);
+    v[1] = _mm_add_epi32(u[1], u[6]);
+    v[6] = _mm_sub_epi32(u[1], u[6]);
+    v[2] = _mm_add_epi32(u[2], u[5]);
+    v[5] = _mm_sub_epi32(u[2], u[5]);
+    v[3] = _mm_add_epi32(u[3], u[4]);
+    v[4] = _mm_sub_epi32(u[3], u[4]);
+    v[8] = u[8];
+    v[9] = u[9];
+
+    v[10] = _mm_mullo_epi32(u[10], cospim32);
+    x = _mm_mullo_epi32(u[13], cospi32);
+    v[10] = _mm_add_epi32(v[10], x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[13] = _mm_mullo_epi32(u[10], cospi32);
+    x = _mm_mullo_epi32(u[13], cospim32);
+    v[13] = _mm_sub_epi32(v[13], x);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    v[11] = _mm_mullo_epi32(u[11], cospim32);
+    x = _mm_mullo_epi32(u[12], cospi32);
+    v[11] = _mm_add_epi32(v[11], x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = _mm_mullo_epi32(u[11], cospi32);
+    x = _mm_mullo_epi32(u[12], cospim32);
+    v[12] = _mm_sub_epi32(v[12], x);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+    v[14] = u[14];
+    v[15] = u[15];
+
+    // stage 3
+    u[0] = _mm_add_epi32(v[0], v[3]);
+    u[3] = _mm_sub_epi32(v[0], v[3]);
+    u[1] = _mm_add_epi32(v[1], v[2]);
+    u[2] = _mm_sub_epi32(v[1], v[2]);
+    u[4] = v[4];
+
+    u[5] = _mm_mullo_epi32(v[5], cospim32);
+    x = _mm_mullo_epi32(v[6], cospi32);
+    u[5] = _mm_add_epi32(u[5], x);
+    u[5] = _mm_add_epi32(u[5], rnding);
+    u[5] = _mm_srai_epi32(u[5], bit);
+
+    u[6] = _mm_mullo_epi32(v[5], cospi32);
+    x = _mm_mullo_epi32(v[6], cospim32);
+    u[6] = _mm_sub_epi32(u[6], x);
+    u[6] = _mm_add_epi32(u[6], rnding);
+    u[6] = _mm_srai_epi32(u[6], bit);
+
+    u[7] = v[7];
+    u[8]  = _mm_add_epi32(v[8], v[11]);
+    u[11] = _mm_sub_epi32(v[8], v[11]);
+    u[9]  = _mm_add_epi32(v[9], v[10]);
+    u[10] = _mm_sub_epi32(v[9], v[10]);
+    u[12] = _mm_sub_epi32(v[15], v[12]);
+    u[15] = _mm_add_epi32(v[15], v[12]);
+    u[13] = _mm_sub_epi32(v[14], v[13]);
+    u[14] = _mm_add_epi32(v[14], v[13]);
+
+    // stage 4
+    u[0] = _mm_mullo_epi32(u[0], cospi32);
+    u[1] = _mm_mullo_epi32(u[1], cospi32);
+    v[0] = _mm_add_epi32(u[0], u[1]);
+    v[0] = _mm_add_epi32(v[0], rnding);
+    v[0] = _mm_srai_epi32(v[0], bit);
+
+    v[1] = _mm_sub_epi32(u[0], u[1]);
+    v[1] = _mm_add_epi32(v[1], rnding);
+    v[1] = _mm_srai_epi32(v[1], bit);
+
+    v[2] = _mm_mullo_epi32(u[2], cospi48);
+    x = _mm_mullo_epi32(u[3], cospi16);
+    v[2] = _mm_add_epi32(v[2], x);
+    v[2] = _mm_add_epi32(v[2], rnding);
+    v[2] = _mm_srai_epi32(v[2], bit);
+
+    v[3] = _mm_mullo_epi32(u[2], cospi16);
+    x = _mm_mullo_epi32(u[3], cospi48);
+    v[3] = _mm_sub_epi32(x, v[3]);
+    v[3] = _mm_add_epi32(v[3], rnding);
+    v[3] = _mm_srai_epi32(v[3], bit);
+
+    v[4] = _mm_add_epi32(u[4], u[5]);
+    v[5] = _mm_sub_epi32(u[4], u[5]);
+    v[6] = _mm_sub_epi32(u[7], u[6]);
+    v[7] = _mm_add_epi32(u[7], u[6]);
+    v[8] = u[8];
+
+    v[9] = _mm_mullo_epi32(u[9], cospim16);
+    x = _mm_mullo_epi32(u[14], cospi48);
+    v[9] = _mm_add_epi32(v[9], x);
+    v[9] = _mm_add_epi32(v[9], rnding);
+    v[9] = _mm_srai_epi32(v[9], bit);
+
+    v[14] = _mm_mullo_epi32(u[9], cospi48);
+    x = _mm_mullo_epi32(u[14], cospim16);
+    v[14] = _mm_sub_epi32(v[14], x);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[10] = _mm_mullo_epi32(u[10], cospim48);
+    x = _mm_mullo_epi32(u[13], cospim16);
+    v[10] = _mm_add_epi32(v[10], x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[13] = _mm_mullo_epi32(u[10], cospim16);
+    x = _mm_mullo_epi32(u[13], cospim48);
+    v[13] = _mm_sub_epi32(v[13], x);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    v[11] = u[11];
+    v[12] = u[12];
+    v[15] = u[15];
+
+    // stage 5
+    u[0] = v[0];
+    u[1] = v[1];
+    u[2] = v[2];
+    u[3] = v[3];
+
+    u[4] = _mm_mullo_epi32(v[4], cospi56);
+    x = _mm_mullo_epi32(v[7], cospi8);
+    u[4] = _mm_add_epi32(u[4], x);
+    u[4] = _mm_add_epi32(u[4], rnding);
+    u[4] = _mm_srai_epi32(u[4], bit);
+
+    u[7] = _mm_mullo_epi32(v[4], cospi8);
+    x = _mm_mullo_epi32(v[7], cospi56);
+    u[7] = _mm_sub_epi32(x, u[7]);
+    u[7] = _mm_add_epi32(u[7], rnding);
+    u[7] = _mm_srai_epi32(u[7], bit);
+
+    u[5] = _mm_mullo_epi32(v[5], cospi24);
+    x = _mm_mullo_epi32(v[6], cospi40);
+    u[5] = _mm_add_epi32(u[5], x);
+    u[5] = _mm_add_epi32(u[5], rnding);
+    u[5] = _mm_srai_epi32(u[5], bit);
+
+    u[6] = _mm_mullo_epi32(v[5], cospi40);
+    x = _mm_mullo_epi32(v[6], cospi24);
+    u[6] = _mm_sub_epi32(x, u[6]);
+    u[6] = _mm_add_epi32(u[6], rnding);
+    u[6] = _mm_srai_epi32(u[6], bit);
+
+    u[8] = _mm_add_epi32(v[8], v[9]);
+    u[9] = _mm_sub_epi32(v[8], v[9]);
+    u[10] = _mm_sub_epi32(v[11], v[10]);
+    u[11] = _mm_add_epi32(v[11], v[10]);
+    u[12] = _mm_add_epi32(v[12], v[13]);
+    u[13] = _mm_sub_epi32(v[12], v[13]);
+    u[14] = _mm_sub_epi32(v[15], v[14]);
+    u[15] = _mm_add_epi32(v[15], v[14]);
+
+    // stage 6
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = _mm_mullo_epi32(u[8], cospi60);
+    x = _mm_mullo_epi32(u[15], cospi4);
+    v[8] = _mm_add_epi32(v[8], x);
+    v[8] = _mm_add_epi32(v[8], rnding);
+    v[8] = _mm_srai_epi32(v[8], bit);
+
+    v[15] = _mm_mullo_epi32(u[8], cospi4);
+    x = _mm_mullo_epi32(u[15], cospi60);
+    v[15] = _mm_sub_epi32(x, v[15]);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
+
+    v[9] = _mm_mullo_epi32(u[9], cospi28);
+    x = _mm_mullo_epi32(u[14], cospi36);
+    v[9] = _mm_add_epi32(v[9], x);
+    v[9] = _mm_add_epi32(v[9], rnding);
+    v[9] = _mm_srai_epi32(v[9], bit);
+
+    v[14] = _mm_mullo_epi32(u[9], cospi36);
+    x = _mm_mullo_epi32(u[14], cospi28);
+    v[14] = _mm_sub_epi32(x, v[14]);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[10] = _mm_mullo_epi32(u[10], cospi44);
+    x = _mm_mullo_epi32(u[13], cospi20);
+    v[10] = _mm_add_epi32(v[10], x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[13] = _mm_mullo_epi32(u[10], cospi20);
+    x = _mm_mullo_epi32(u[13], cospi44);
+    v[13] = _mm_sub_epi32(x, v[13]);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    v[11] = _mm_mullo_epi32(u[11], cospi12);
+    x = _mm_mullo_epi32(u[12], cospi52);
+    v[11] = _mm_add_epi32(v[11], x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = _mm_mullo_epi32(u[11], cospi52);
+    x = _mm_mullo_epi32(u[12], cospi12);
+    v[12] = _mm_sub_epi32(x, v[12]);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+
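+    // v[] holds the DCT outputs in bit-reversed index order; storing
+    // v[bitrev(r)] into row r restores natural frequency order.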
+    out[0 * col_num + col] = v[0];
+    out[1 * col_num + col] = v[8];
+    out[2 * col_num + col] = v[4];
+    out[3 * col_num + col] = v[12];
+    out[4 * col_num + col] = v[2];
+    out[5 * col_num + col] = v[10];
+    out[6 * col_num + col] = v[6];
+    out[7 * col_num + col] = v[14];
+    out[8 * col_num + col] = v[1];
+    out[9 * col_num + col] = v[9];
+    out[10 * col_num + col] = v[5];
+    out[11 * col_num + col] = v[13];
+    out[12 * col_num + col] = v[3];
+    out[13 * col_num + col] = v[11];
+    out[14 * col_num + col] = v[7];
+    out[15 * col_num + col] = v[15];
+  }
+}
+
+static void fadst16x16_sse4_1(__m128i *in, __m128i *out, int bit) {
+  const int32_t *cospi = cospi_arr[bit - cos_bit_min];
+  const __m128i cospi2 = _mm_set1_epi32(cospi[2]);
+  const __m128i cospi62 = _mm_set1_epi32(cospi[62]);
+  const __m128i cospi10 = _mm_set1_epi32(cospi[10]);
+  const __m128i cospi54 = _mm_set1_epi32(cospi[54]);
+  const __m128i cospi18 = _mm_set1_epi32(cospi[18]);
+  const __m128i cospi46 = _mm_set1_epi32(cospi[46]);
+  const __m128i cospi26 = _mm_set1_epi32(cospi[26]);
+  const __m128i cospi38 = _mm_set1_epi32(cospi[38]);
+  const __m128i cospi34 = _mm_set1_epi32(cospi[34]);
+  const __m128i cospi30 = _mm_set1_epi32(cospi[30]);
+  const __m128i cospi42 = _mm_set1_epi32(cospi[42]);
+  const __m128i cospi22 = _mm_set1_epi32(cospi[22]);
+  const __m128i cospi50 = _mm_set1_epi32(cospi[50]);
+  const __m128i cospi14 = _mm_set1_epi32(cospi[14]);
+  const __m128i cospi58 = _mm_set1_epi32(cospi[58]);
+  const __m128i cospi6 = _mm_set1_epi32(cospi[6]);
+  const __m128i cospi8 = _mm_set1_epi32(cospi[8]);
+  const __m128i cospi56 = _mm_set1_epi32(cospi[56]);
+  const __m128i cospi40 = _mm_set1_epi32(cospi[40]);
+  const __m128i cospi24 = _mm_set1_epi32(cospi[24]);
+  const __m128i cospim56 = _mm_set1_epi32(-cospi[56]);
+  const __m128i cospim24 = _mm_set1_epi32(-cospi[24]);
+  const __m128i cospi48 = _mm_set1_epi32(cospi[48]);
+  const __m128i cospi16 = _mm_set1_epi32(cospi[16]);
+  const __m128i cospim48 = _mm_set1_epi32(-cospi[48]);
+  const __m128i cospi32 = _mm_set1_epi32(cospi[32]);
+  const __m128i rnding = _mm_set1_epi32(1 << (bit - 1));
+  __m128i u[16], v[16], x, y;
+  const int col_num = 4;
+  int col;
+
+  // Process the four 4-lane column strips (col = 0, 1, 2, 3)
+  for (col = 0; col < col_num; ++col) {
+    // stage 0
+    // stage 1
+    // stage 2
+    v[0] = _mm_mullo_epi32(in[15 * col_num + col], cospi2);
+    x = _mm_mullo_epi32(in[0 * col_num + col], cospi62);
+    v[0] = _mm_add_epi32(v[0], x);
+    v[0] = _mm_add_epi32(v[0], rnding);
+    v[0] = _mm_srai_epi32(v[0], bit);
+
+    v[1] = _mm_mullo_epi32(in[15 * col_num + col], cospi62);
+    x = _mm_mullo_epi32(in[0 * col_num + col], cospi2);
+    v[1] = _mm_sub_epi32(v[1], x);
+    v[1] = _mm_add_epi32(v[1], rnding);
+    v[1] = _mm_srai_epi32(v[1], bit);
+
+    v[2] = _mm_mullo_epi32(in[13 * col_num + col], cospi10);
+    x = _mm_mullo_epi32(in[2 * col_num + col], cospi54);
+    v[2] = _mm_add_epi32(v[2], x);
+    v[2] = _mm_add_epi32(v[2], rnding);
+    v[2] = _mm_srai_epi32(v[2], bit);
+
+    v[3] = _mm_mullo_epi32(in[13 * col_num + col], cospi54);
+    x = _mm_mullo_epi32(in[2 * col_num + col], cospi10);
+    v[3] = _mm_sub_epi32(v[3], x);
+    v[3] = _mm_add_epi32(v[3], rnding);
+    v[3] = _mm_srai_epi32(v[3], bit);
+
+    v[4] = _mm_mullo_epi32(in[11 * col_num + col], cospi18);
+    x = _mm_mullo_epi32(in[4 * col_num + col], cospi46);
+    v[4] = _mm_add_epi32(v[4], x);
+    v[4] = _mm_add_epi32(v[4], rnding);
+    v[4] = _mm_srai_epi32(v[4], bit);
+
+    v[5] = _mm_mullo_epi32(in[11 * col_num + col], cospi46);
+    x = _mm_mullo_epi32(in[4 * col_num + col], cospi18);
+    v[5] = _mm_sub_epi32(v[5], x);
+    v[5] = _mm_add_epi32(v[5], rnding);
+    v[5] = _mm_srai_epi32(v[5], bit);
+
+    v[6] = _mm_mullo_epi32(in[9 * col_num + col], cospi26);
+    x = _mm_mullo_epi32(in[6 * col_num + col], cospi38);
+    v[6] = _mm_add_epi32(v[6], x);
+    v[6] = _mm_add_epi32(v[6], rnding);
+    v[6] = _mm_srai_epi32(v[6], bit);
+
+    v[7] = _mm_mullo_epi32(in[9 * col_num + col], cospi38);
+    x = _mm_mullo_epi32(in[6 * col_num + col], cospi26);
+    v[7] = _mm_sub_epi32(v[7], x);
+    v[7] = _mm_add_epi32(v[7], rnding);
+    v[7] = _mm_srai_epi32(v[7], bit);
+
+    v[8] = _mm_mullo_epi32(in[7 * col_num + col], cospi34);
+    x = _mm_mullo_epi32(in[8 * col_num + col], cospi30);
+    v[8] = _mm_add_epi32(v[8], x);
+    v[8] = _mm_add_epi32(v[8], rnding);
+    v[8] = _mm_srai_epi32(v[8], bit);
+
+    v[9] = _mm_mullo_epi32(in[7 * col_num + col], cospi30);
+    x = _mm_mullo_epi32(in[8 * col_num + col], cospi34);
+    v[9] = _mm_sub_epi32(v[9], x);
+    v[9] = _mm_add_epi32(v[9], rnding);
+    v[9] = _mm_srai_epi32(v[9], bit);
+
+    v[10] = _mm_mullo_epi32(in[5 * col_num + col], cospi42);
+    x = _mm_mullo_epi32(in[10 * col_num + col], cospi22);
+    v[10] = _mm_add_epi32(v[10], x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[11] = _mm_mullo_epi32(in[5 * col_num + col], cospi22);
+    x = _mm_mullo_epi32(in[10 * col_num + col], cospi42);
+    v[11] = _mm_sub_epi32(v[11], x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = _mm_mullo_epi32(in[3 * col_num + col], cospi50);
+    x = _mm_mullo_epi32(in[12 * col_num + col], cospi14);
+    v[12] = _mm_add_epi32(v[12], x);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+
+    v[13] = _mm_mullo_epi32(in[3 * col_num + col], cospi14);
+    x = _mm_mullo_epi32(in[12 * col_num + col], cospi50);
+    v[13] = _mm_sub_epi32(v[13], x);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    v[14] = _mm_mullo_epi32(in[1 * col_num + col], cospi58);
+    x = _mm_mullo_epi32(in[14 * col_num + col], cospi6);
+    v[14] = _mm_add_epi32(v[14], x);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_mullo_epi32(in[1 * col_num + col], cospi6);
+    x = _mm_mullo_epi32(in[14 * col_num + col], cospi58);
+    v[15] = _mm_sub_epi32(v[15], x);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
+
+    // stage 3
+    u[0] = _mm_add_epi32(v[0], v[8]);
+    u[8] = _mm_sub_epi32(v[0], v[8]);
+    u[1] = _mm_add_epi32(v[1], v[9]);
+    u[9] = _mm_sub_epi32(v[1], v[9]);
+    u[2] = _mm_add_epi32(v[2], v[10]);
+    u[10] = _mm_sub_epi32(v[2], v[10]);
+    u[3] = _mm_add_epi32(v[3], v[11]);
+    u[11] = _mm_sub_epi32(v[3], v[11]);
+    u[4] = _mm_add_epi32(v[4], v[12]);
+    u[12] = _mm_sub_epi32(v[4], v[12]);
+    u[5] = _mm_add_epi32(v[5], v[13]);
+    u[13] = _mm_sub_epi32(v[5], v[13]);
+    u[6] = _mm_add_epi32(v[6], v[14]);
+    u[14] = _mm_sub_epi32(v[6], v[14]);
+    u[7] = _mm_add_epi32(v[7], v[15]);
+    u[15] = _mm_sub_epi32(v[7], v[15]);
+
+    // stage 4
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+    v[4] = u[4];
+    v[5] = u[5];
+    v[6] = u[6];
+    v[7] = u[7];
+
+    v[8] = _mm_mullo_epi32(u[8], cospi8);
+    x = _mm_mullo_epi32(u[9], cospi56);
+    v[8] = _mm_add_epi32(v[8], x);
+    v[8] = _mm_add_epi32(v[8], rnding);
+    v[8] = _mm_srai_epi32(v[8], bit);
+
+    v[9] = _mm_mullo_epi32(u[8], cospi56);
+    x = _mm_mullo_epi32(u[9], cospi8);
+    v[9] = _mm_sub_epi32(v[9], x);
+    v[9] = _mm_add_epi32(v[9], rnding);
+    v[9] = _mm_srai_epi32(v[9], bit);
+
+    v[10] = _mm_mullo_epi32(u[10], cospi40);
+    x = _mm_mullo_epi32(u[11], cospi24);
+    v[10] = _mm_add_epi32(v[10], x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[11] = _mm_mullo_epi32(u[10], cospi24);
+    x = _mm_mullo_epi32(u[11], cospi40);
+    v[11] = _mm_sub_epi32(v[11], x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = _mm_mullo_epi32(u[12], cospim56);
+    x = _mm_mullo_epi32(u[13], cospi8);
+    v[12] = _mm_add_epi32(v[12], x);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+
+    v[13] = _mm_mullo_epi32(u[12], cospi8);
+    x = _mm_mullo_epi32(u[13], cospim56);
+    v[13] = _mm_sub_epi32(v[13], x);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    v[14] = _mm_mullo_epi32(u[14], cospim24);
+    x = _mm_mullo_epi32(u[15], cospi40);
+    v[14] = _mm_add_epi32(v[14], x);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_mullo_epi32(u[14], cospi40);
+    x = _mm_mullo_epi32(u[15], cospim24);
+    v[15] = _mm_sub_epi32(v[15], x);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
+
+    // stage 5
+    u[0] = _mm_add_epi32(v[0], v[4]);
+    u[4] = _mm_sub_epi32(v[0], v[4]);
+    u[1] = _mm_add_epi32(v[1], v[5]);
+    u[5] = _mm_sub_epi32(v[1], v[5]);
+    u[2] = _mm_add_epi32(v[2], v[6]);
+    u[6] = _mm_sub_epi32(v[2], v[6]);
+    u[3] = _mm_add_epi32(v[3], v[7]);
+    u[7] = _mm_sub_epi32(v[3], v[7]);
+    u[8] = _mm_add_epi32(v[8], v[12]);
+    u[12] = _mm_sub_epi32(v[8], v[12]);
+    u[9] = _mm_add_epi32(v[9], v[13]);
+    u[13] = _mm_sub_epi32(v[9], v[13]);
+    u[10] = _mm_add_epi32(v[10], v[14]);
+    u[14] = _mm_sub_epi32(v[10], v[14]);
+    u[11] = _mm_add_epi32(v[11], v[15]);
+    u[15] = _mm_sub_epi32(v[11], v[15]);
+
+    // stage 6
+    v[0] = u[0];
+    v[1] = u[1];
+    v[2] = u[2];
+    v[3] = u[3];
+
+    v[4] = _mm_mullo_epi32(u[4], cospi16);
+    x = _mm_mullo_epi32(u[5], cospi48);
+    v[4] = _mm_add_epi32(v[4], x);
+    v[4] = _mm_add_epi32(v[4], rnding);
+    v[4] = _mm_srai_epi32(v[4], bit);
+
+    v[5] = _mm_mullo_epi32(u[4], cospi48);
+    x = _mm_mullo_epi32(u[5], cospi16);
+    v[5] = _mm_sub_epi32(v[5], x);
+    v[5] = _mm_add_epi32(v[5], rnding);
+    v[5] = _mm_srai_epi32(v[5], bit);
+
+    v[6] = _mm_mullo_epi32(u[6], cospim48);
+    x = _mm_mullo_epi32(u[7], cospi16);
+    v[6] = _mm_add_epi32(v[6], x);
+    v[6] = _mm_add_epi32(v[6], rnding);
+    v[6] = _mm_srai_epi32(v[6], bit);
+
+    v[7] = _mm_mullo_epi32(u[6], cospi16);
+    x = _mm_mullo_epi32(u[7], cospim48);
+    v[7] = _mm_sub_epi32(v[7], x);
+    v[7] = _mm_add_epi32(v[7], rnding);
+    v[7] = _mm_srai_epi32(v[7], bit);
+
+    v[8] = u[8];
+    v[9] = u[9];
+    v[10] = u[10];
+    v[11] = u[11];
+
+    v[12] = _mm_mullo_epi32(u[12], cospi16);
+    x = _mm_mullo_epi32(u[13], cospi48);
+    v[12] = _mm_add_epi32(v[12], x);
+    v[12] = _mm_add_epi32(v[12], rnding);
+    v[12] = _mm_srai_epi32(v[12], bit);
+
+    v[13] = _mm_mullo_epi32(u[12], cospi48);
+    x = _mm_mullo_epi32(u[13], cospi16);
+    v[13] = _mm_sub_epi32(v[13], x);
+    v[13] = _mm_add_epi32(v[13], rnding);
+    v[13] = _mm_srai_epi32(v[13], bit);
+
+    v[14] = _mm_mullo_epi32(u[14], cospim48);
+    x = _mm_mullo_epi32(u[15], cospi16);
+    v[14] = _mm_add_epi32(v[14], x);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_mullo_epi32(u[14], cospi16);
+    x = _mm_mullo_epi32(u[15], cospim48);
+    v[15] = _mm_sub_epi32(v[15], x);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
+
+    // stage 7
+    u[0] = _mm_add_epi32(v[0], v[2]);
+    u[2] = _mm_sub_epi32(v[0], v[2]);
+    u[1] = _mm_add_epi32(v[1], v[3]);
+    u[3] = _mm_sub_epi32(v[1], v[3]);
+    u[4] = _mm_add_epi32(v[4], v[6]);
+    u[6] = _mm_sub_epi32(v[4], v[6]);
+    u[5] = _mm_add_epi32(v[5], v[7]);
+    u[7] = _mm_sub_epi32(v[5], v[7]);
+    u[8] = _mm_add_epi32(v[8], v[10]);
+    u[10] = _mm_sub_epi32(v[8], v[10]);
+    u[9] = _mm_add_epi32(v[9], v[11]);
+    u[11] = _mm_sub_epi32(v[9], v[11]);
+    u[12] = _mm_add_epi32(v[12], v[14]);
+    u[14] = _mm_sub_epi32(v[12], v[14]);
+    u[13] = _mm_add_epi32(v[13], v[15]);
+    u[15] = _mm_sub_epi32(v[13], v[15]);
+
+    // stage 8
+    v[0] = u[0];
+    v[1] = u[1];
+
+    y = _mm_mullo_epi32(u[2], cospi32);
+    x = _mm_mullo_epi32(u[3], cospi32);
+    v[2] = _mm_add_epi32(y, x);
+    v[2] = _mm_add_epi32(v[2], rnding);
+    v[2] = _mm_srai_epi32(v[2], bit);
+
+    v[3] = _mm_sub_epi32(y, x);
+    v[3] = _mm_add_epi32(v[3], rnding);
+    v[3] = _mm_srai_epi32(v[3], bit);
+
+    v[4] = u[4];
+    v[5] = u[5];
+
+    y = _mm_mullo_epi32(u[6], cospi32);
+    x = _mm_mullo_epi32(u[7], cospi32);
+    v[6] = _mm_add_epi32(y, x);
+    v[6] = _mm_add_epi32(v[6], rnding);
+    v[6] = _mm_srai_epi32(v[6], bit);
+
+    v[7] = _mm_sub_epi32(y, x);
+    v[7] = _mm_add_epi32(v[7], rnding);
+    v[7] = _mm_srai_epi32(v[7], bit);
+
+    v[8] = u[8];
+    v[9] = u[9];
+
+    y = _mm_mullo_epi32(u[10], cospi32);
+    x = _mm_mullo_epi32(u[11], cospi32);
+    v[10] = _mm_add_epi32(y, x);
+    v[10] = _mm_add_epi32(v[10], rnding);
+    v[10] = _mm_srai_epi32(v[10], bit);
+
+    v[11] = _mm_sub_epi32(y, x);
+    v[11] = _mm_add_epi32(v[11], rnding);
+    v[11] = _mm_srai_epi32(v[11], bit);
+
+    v[12] = u[12];
+    v[13] = u[13];
+
+    y = _mm_mullo_epi32(u[14], cospi32);
+    x = _mm_mullo_epi32(u[15], cospi32);
+    v[14] = _mm_add_epi32(y, x);
+    v[14] = _mm_add_epi32(v[14], rnding);
+    v[14] = _mm_srai_epi32(v[14], bit);
+
+    v[15] = _mm_sub_epi32(y, x);
+    v[15] = _mm_add_epi32(v[15], rnding);
+    v[15] = _mm_srai_epi32(v[15], bit);
+
+    // stage 9
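+    // Odd-indexed output rows are negated: the ADST outputs alternate in
+    // sign.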
+    out[0 * col_num + col] = v[0];
+    out[1 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[8]);
+    out[2 * col_num + col] = v[12];
+    out[3 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[4]);
+    out[4 * col_num + col] = v[6];
+    out[5 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[14]);
+    out[6 * col_num + col] = v[10];
+    out[7 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[2]);
+    out[8 * col_num + col] = v[3];
+    out[9 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[11]);
+    out[10 * col_num + col] = v[15];
+    out[11 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[7]);
+    out[12 * col_num + col] = v[5];
+    out[13 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[13]);
+    out[14 * col_num + col] = v[9];
+    out[15 * col_num + col] = _mm_sub_epi32(_mm_set1_epi32(0), v[1]);
+  }
+}
+
+static void col_txfm_16x16_rounding(__m128i *in, int shift) {
+  // Note: the 16x16 rounding is split into four 8x8 sub-block roundings
+  // (16 vectors each) rather than into four column strips.
+  col_txfm_8x8_rounding(&in[0], shift);
+  col_txfm_8x8_rounding(&in[16], shift);
+  col_txfm_8x8_rounding(&in[32], shift);
+  col_txfm_8x8_rounding(&in[48], shift);
+}
+
+static void write_buffer_16x16(const __m128i *in, tran_low_t *output) {
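+  // Each 8x8 sub-block spans 16 vectors, i.e. 16 * 4 = 64 coefficients.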
+  const int size_8x8 = 16 * 4;
+  write_buffer_8x8(&in[0], output);
+  output += size_8x8;
+  write_buffer_8x8(&in[16], output);
+  output += size_8x8;
+  write_buffer_8x8(&in[32], output);
+  output += size_8x8;
+  write_buffer_8x8(&in[48], output);
+}
+
+void vp10_fwd_txfm2d_16x16_sse4_1(const int16_t *input, int32_t *coeff,
+                                  int stride, int tx_type, int bd) {
+  __m128i in[64], out[64];
+  const TXFM_2D_CFG *cfg = NULL;
+
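+  // Same per-case pipeline as the 8x8 version: load (with optional flips
+  // and the cfg->shift[0] pre-scale), column transform, rounding,
+  // transpose, row transform, transpose back, store.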
+  switch (tx_type) {
+    case DCT_DCT:
+      cfg = &fwd_txfm_2d_cfg_dct_dct_16;
+      load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]);
+      fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case ADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_16;
+      load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case DCT_ADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_16;
+      load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]);
+      fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case ADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(input, in, stride, 0, 0, cfg->shift[0]);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+#if CONFIG_EXT_TX
+    case FLIPADST_DCT:
+      cfg = &fwd_txfm_2d_cfg_adst_dct_16;
+      load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fdct16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case DCT_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_dct_adst_16;
+      load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]);
+      fdct16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case FLIPADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(input, in, stride, 1, 1, cfg->shift[0]);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case ADST_FLIPADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(input, in, stride, 0, 1, cfg->shift[0]);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+    case FLIPADST_ADST:
+      cfg = &fwd_txfm_2d_cfg_adst_adst_16;
+      load_buffer_16x16(input, in, stride, 1, 0, cfg->shift[0]);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_col[0]);
+      col_txfm_16x16_rounding(out, -cfg->shift[1]);
+      transpose_16x16(out, in);
+      fadst16x16_sse4_1(in, out, cfg->cos_bit_row[0]);
+      transpose_16x16(out, in);
+      write_buffer_16x16(in, coeff);
+      break;
+#endif  // CONFIG_EXT_TX
+    default:
+      assert(0);
+  }
+  (void)bd;
+}
diff --git a/vp10/encoder/x86/wedge_utils_sse2.c b/vp10/encoder/x86/wedge_utils_sse2.c
new file mode 100644
index 0000000..b881d58
--- /dev/null
+++ b/vp10/encoder/x86/wedge_utils_sse2.c
@@ -0,0 +1,260 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "vpx_dsp/x86/synonyms.h"
+
+#include "vpx/vpx_integer.h"
+
+#include "vp10/common/reconinter.h"
+
+#define MAX_MASK_VALUE  (1 << WEDGE_WEIGHT_BITS)
+
+/**
+ * See vp10_wedge_sse_from_residuals_c
+ */
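+// A scalar sketch of the computation below (illustrative; saturate16()
+// stands in for the saturating pack used by the kernel):
+//
+//   uint64_t csse = 0;
+//   for (int i = 0; i < N; ++i) {
+//     int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
+//     t = saturate16(t);
+//     csse += (uint64_t)((int64_t)t * t);
+//   }
+//   return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);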
+uint64_t vp10_wedge_sse_from_residuals_sse2(const int16_t *r1,
+                                            const int16_t *d,
+                                            const uint8_t *m,
+                                            int N) {
+  int n = -N;
+  int n8 = n + 8;
+
+  uint64_t csse;
+
+  const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE);
+  const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+
+  __m128i v_acc0_q = _mm_setzero_si128();
+
+  assert(N % 64 == 0);
+
+  r1 += N;
+  d += N;
+  m += N;
+
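+  // The pointers sit at the buffer ends and n runs from -N up to 0, so the
+  // loop condition is a plain test against zero. Each iteration covers 16
+  // pixels: madd of (d, r1) pairs with (m, MAX_MASK_VALUE) forms
+  // m * d + MAX_MASK_VALUE * r1, a saturating pack narrows to 16 bits, a
+  // second madd squares, and the and/shift pair zero-extends the 32-bit
+  // squares (all non-negative) into 64-bit accumulator lanes.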
+  do {
+    const __m128i v_r0_w = xx_load_128(r1 + n);
+    const __m128i v_r1_w = xx_load_128(r1 + n8);
+    const __m128i v_d0_w = xx_load_128(d + n);
+    const __m128i v_d1_w = xx_load_128(d + n8);
+    const __m128i v_m01_b = xx_load_128(m + n);
+
+    const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w);
+    const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w);
+    const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w);
+    const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w);
+    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+
+    const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w);
+    const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w);
+    const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w);
+    const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w);
+
+    const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w);
+    const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w);
+    const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w);
+    const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w);
+
+    const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d);
+    const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d);
+
+    const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w);
+    const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w);
+
+    const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q),
+                                           _mm_srli_epi64(v_sq0_d, 32));
+    const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q),
+                                           _mm_srli_epi64(v_sq1_d, 32));
+
+    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q);
+    v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q);
+
+    n8 += 16;
+    n += 16;
+  } while (n);
+
+  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
+
+#if ARCH_X86_64
+  csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
+#else
+  xx_storel_64(&csse, v_acc0_q);
+#endif
+
+  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
+}
+
+/**
+ * See vp10_wedge_sign_from_residuals_c
+ */
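+// Scalar sketch (illustrative): returns (sum_i m[i] * ds[i]) > limit, where
+// ds holds per-pixel differences of squared residuals.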
+int vp10_wedge_sign_from_residuals_sse2(const int16_t *ds,
+                                        const uint8_t *m,
+                                        int N,
+                                        int64_t limit) {
+  int64_t acc;
+
+  __m128i v_sign_d;
+  __m128i v_acc0_d = _mm_setzero_si128();
+  __m128i v_acc1_d = _mm_setzero_si128();
+  __m128i v_acc_q;
+
+  // Input size is limited to 8192 by the use of 32-bit accumulators and m
+  // taking values in [0, 64]. Overflow could occur for larger sizes, but
+  // that is practically impossible for real video input.
+  assert(N < 8192);
+  assert(N % 64 == 0);
+
+  do {
+    const __m128i v_m01_b = xx_load_128(m);
+    const __m128i v_m23_b = xx_load_128(m + 16);
+    const __m128i v_m45_b = xx_load_128(m + 32);
+    const __m128i v_m67_b = xx_load_128(m + 48);
+
+    const __m128i v_d0_w = xx_load_128(ds);
+    const __m128i v_d1_w = xx_load_128(ds + 8);
+    const __m128i v_d2_w = xx_load_128(ds + 16);
+    const __m128i v_d3_w = xx_load_128(ds + 24);
+    const __m128i v_d4_w = xx_load_128(ds + 32);
+    const __m128i v_d5_w = xx_load_128(ds + 40);
+    const __m128i v_d6_w = xx_load_128(ds + 48);
+    const __m128i v_d7_w = xx_load_128(ds + 56);
+
+    const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128());
+    const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128());
+    const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128());
+    const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128());
+    const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128());
+    const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128());
+    const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128());
+    const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128());
+
+    const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
+    const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
+    const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w);
+    const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w);
+    const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w);
+    const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w);
+    const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w);
+    const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w);
+
+    const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d);
+    const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d);
+    const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d);
+    const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d);
+
+    const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d);
+    const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d);
+
+    v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d);
+    v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d);
+
+    ds += 64;
+    m += 64;
+
+    N -= 64;
+  } while (N);
+
+  v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128());
+  v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d),
+                           _mm_unpackhi_epi32(v_acc0_d, v_sign_d));
+
+  v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128());
+  v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d),
+                           _mm_unpackhi_epi32(v_acc1_d, v_sign_d));
+
+  v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d);
+
+  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+
+#if ARCH_X86_64
+  acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q);
+#else
+  xx_storel_64(&acc, v_acc_q);
+#endif
+
+  return acc > limit;
+}
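
The sign decision reduces to a mask-weighted sum compared against the limit; a hypothetical scalar equivalent, derived from the intrinsics above (each _mm_madd_epi16 lane contributes m[i]*ds[i]):

```c
static int wedge_sign_scalar_sketch(const int16_t *ds, const uint8_t *m,
                                    int N, int64_t limit) {
  int64_t acc = 0;
  int i;
  for (i = 0; i < N; ++i)
    acc += (int64_t)m[i] * ds[i];  /* one _mm_madd_epi16 lane */
  return acc > limit;
}
```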
+
+// Negate under mask
+static INLINE __m128i negm_epi16(__m128i v_v_w, __m128i v_mask_w) {
+  return _mm_sub_epi16(_mm_xor_si128(v_v_w, v_mask_w), v_mask_w);
+}
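
This is the standard two's-complement conditional negate: with an all-ones lane mask (-1), (v ^ -1) - (-1) == ~v + 1 == -v, while a zero mask leaves the lane unchanged. A scalar illustration (hypothetical, for exposition only):

```c
static int16_t negm_scalar(int16_t v, int16_t mask) {
  /* mask == -1 (0xffff): returns -v;  mask == 0: returns v unchanged. */
  return (int16_t)((v ^ mask) - mask);
}
```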
+
+/**
+ * See vp10_wedge_compute_delta_squares_c
+ */
+void vp10_wedge_compute_delta_squares_sse2(int16_t *d,
+                                           const int16_t *a,
+                                           const int16_t *b,
+                                           int N) {
+  const __m128i v_neg_w = _mm_set_epi16(0xffff, 0, 0xffff, 0,
+                                        0xffff, 0, 0xffff, 0);
+
+  assert(N % 64 == 0);
+
+  do {
+    const __m128i v_a0_w = xx_load_128(a);
+    const __m128i v_b0_w = xx_load_128(b);
+    const __m128i v_a1_w = xx_load_128(a + 8);
+    const __m128i v_b1_w = xx_load_128(b + 8);
+    const __m128i v_a2_w = xx_load_128(a + 16);
+    const __m128i v_b2_w = xx_load_128(b + 16);
+    const __m128i v_a3_w = xx_load_128(a + 24);
+    const __m128i v_b3_w = xx_load_128(b + 24);
+
+    const __m128i v_ab0l_w = _mm_unpacklo_epi16(v_a0_w, v_b0_w);
+    const __m128i v_ab0h_w = _mm_unpackhi_epi16(v_a0_w, v_b0_w);
+    const __m128i v_ab1l_w = _mm_unpacklo_epi16(v_a1_w, v_b1_w);
+    const __m128i v_ab1h_w = _mm_unpackhi_epi16(v_a1_w, v_b1_w);
+    const __m128i v_ab2l_w = _mm_unpacklo_epi16(v_a2_w, v_b2_w);
+    const __m128i v_ab2h_w = _mm_unpackhi_epi16(v_a2_w, v_b2_w);
+    const __m128i v_ab3l_w = _mm_unpacklo_epi16(v_a3_w, v_b3_w);
+    const __m128i v_ab3h_w = _mm_unpackhi_epi16(v_a3_w, v_b3_w);
+
+    // Negate top word of pairs
+    const __m128i v_abl0n_w = negm_epi16(v_ab0l_w, v_neg_w);
+    const __m128i v_abh0n_w = negm_epi16(v_ab0h_w, v_neg_w);
+    const __m128i v_abl1n_w = negm_epi16(v_ab1l_w, v_neg_w);
+    const __m128i v_abh1n_w = negm_epi16(v_ab1h_w, v_neg_w);
+    const __m128i v_abl2n_w = negm_epi16(v_ab2l_w, v_neg_w);
+    const __m128i v_abh2n_w = negm_epi16(v_ab2h_w, v_neg_w);
+    const __m128i v_abl3n_w = negm_epi16(v_ab3l_w, v_neg_w);
+    const __m128i v_abh3n_w = negm_epi16(v_ab3h_w, v_neg_w);
+
+    const __m128i v_r0l_w = _mm_madd_epi16(v_ab0l_w, v_abl0n_w);
+    const __m128i v_r0h_w = _mm_madd_epi16(v_ab0h_w, v_abh0n_w);
+    const __m128i v_r1l_w = _mm_madd_epi16(v_ab1l_w, v_abl1n_w);
+    const __m128i v_r1h_w = _mm_madd_epi16(v_ab1h_w, v_abh1n_w);
+    const __m128i v_r2l_w = _mm_madd_epi16(v_ab2l_w, v_abl2n_w);
+    const __m128i v_r2h_w = _mm_madd_epi16(v_ab2h_w, v_abh2n_w);
+    const __m128i v_r3l_w = _mm_madd_epi16(v_ab3l_w, v_abl3n_w);
+    const __m128i v_r3h_w = _mm_madd_epi16(v_ab3h_w, v_abh3n_w);
+
+    const __m128i v_r0_w = _mm_packs_epi32(v_r0l_w, v_r0h_w);
+    const __m128i v_r1_w = _mm_packs_epi32(v_r1l_w, v_r1h_w);
+    const __m128i v_r2_w = _mm_packs_epi32(v_r2l_w, v_r2h_w);
+    const __m128i v_r3_w = _mm_packs_epi32(v_r3l_w, v_r3h_w);
+
+    xx_store_128(d, v_r0_w);
+    xx_store_128(d + 8, v_r1_w);
+    xx_store_128(d + 16, v_r2_w);
+    xx_store_128(d + 24, v_r3_w);
+
+    a += 32;
+    b += 32;
+    d += 32;
+    N -= 32;
+  } while (N);
+}
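
A hypothetical scalar equivalent of the kernel above, derived from the intrinsics: _mm_madd_epi16 over the interleaved pairs (a, b) and (a, -b) yields a*a - b*b per 32-bit lane, and _mm_packs_epi32 saturates the result to int16.

```c
static void delta_squares_scalar_sketch(int16_t *d, const int16_t *a,
                                        const int16_t *b, int N) {
  int i;
  for (i = 0; i < N; ++i) {
    int32_t t = (int32_t)a[i] * a[i] - (int32_t)b[i] * b[i];
    if (t > INT16_MAX) t = INT16_MAX;  /* packs saturation */
    if (t < INT16_MIN) t = INT16_MIN;
    d[i] = (int16_t)t;
  }
}
```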
+
diff --git a/vp10/vp10_common.mk b/vp10/vp10_common.mk
index 2eb3488..e68e083 100644
--- a/vp10/vp10_common.mk
+++ b/vp10/vp10_common.mk
@@ -63,15 +63,41 @@
 VP10_COMMON_SRCS-yes += common/scan.h
 VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm.h
 VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm.c
+VP10_COMMON_SRCS-yes += common/vp10_txfm.h
+VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm1d.h
+VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm1d.c
+VP10_COMMON_SRCS-yes += common/vp10_inv_txfm1d.h
+VP10_COMMON_SRCS-yes += common/vp10_inv_txfm1d.c
+VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm2d.c
+VP10_COMMON_SRCS-yes += common/vp10_fwd_txfm2d_cfg.h
+VP10_COMMON_SRCS-yes += common/vp10_inv_txfm2d.c
+VP10_COMMON_SRCS-yes += common/vp10_inv_txfm2d_cfg.h
+VP10_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp10_convolve_ssse3.c
+VP10_COMMON_SRCS-$(HAVE_SSSE3) += common/x86/vp10_convolve_filters_ssse3.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_highbd_convolve_sse4.c
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_highbd_convolve_filters_sse4.c
+endif
+VP10_COMMON_SRCS-yes += common/vp10_convolve.c
+VP10_COMMON_SRCS-yes += common/vp10_convolve.h
+VP10_COMMON_SRCS-$(CONFIG_ANS) += common/ans.h
+VP10_COMMON_SRCS-$(CONFIG_ANS) += common/divide.h
+VP10_COMMON_SRCS-$(CONFIG_ANS) += common/divide.c
 
 VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/postproc.h
 VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/postproc.c
+VP10_COMMON_SRCS-$(CONFIG_LOOP_RESTORATION) += common/restoration.h
+VP10_COMMON_SRCS-$(CONFIG_LOOP_RESTORATION) += common/restoration.c
 VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/mfqe.h
 VP10_COMMON_SRCS-$(CONFIG_VP9_POSTPROC) += common/mfqe.c
 ifeq ($(CONFIG_VP9_POSTPROC),yes)
 VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/mfqe_sse2.asm
 VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/postproc_sse2.asm
 endif
+ifeq (yes,$(filter yes,$(CONFIG_GLOBAL_MOTION) $(CONFIG_WARPED_MOTION)))
+VP10_COMMON_SRCS-yes += common/warped_motion.h
+VP10_COMMON_SRCS-yes += common/warped_motion.c
+endif
 
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP10_COMMON_SRCS-$(HAVE_DSPR2)  += common/mips/dspr2/itrans4_dspr2.c
@@ -92,6 +118,13 @@
 VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm_sse2.c
 VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_dct32x32_impl_sse2.h
 VP10_COMMON_SRCS-$(HAVE_SSE2) += common/x86/vp10_fwd_txfm_impl_sse2.h
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_txfm1d_sse4.h
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_fwd_txfm1d_sse4.c
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/vp10_fwd_txfm2d_sse4.c
+
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP10_COMMON_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_txfm_utility_sse4.h
+endif
 
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 VP10_COMMON_SRCS-$(HAVE_NEON) += common/arm/neon/iht4x4_add_neon.c
diff --git a/vp10/vp10_cx_iface.c b/vp10/vp10_cx_iface.c
index ea99d42..a707d11 100644
--- a/vp10/vp10_cx_iface.c
+++ b/vp10/vp10_cx_iface.c
@@ -25,6 +25,9 @@
 struct vp10_extracfg {
   int                         cpu_used;  // available cpu percentage in 1/16
   unsigned int                enable_auto_alt_ref;
+#if CONFIG_EXT_REFS
+  unsigned int                enable_auto_bwd_ref;
+#endif  // CONFIG_EXT_REFS
   unsigned int                noise_sensitivity;
   unsigned int                sharpness;
   unsigned int                static_thresh;
@@ -49,35 +52,45 @@
   int                         color_range;
   int                         render_width;
   int                         render_height;
+  vpx_superblock_size_t       superblock_size;
 };
 
 static struct vp10_extracfg default_extra_cfg = {
-  0,                          // cpu_used
-  1,                          // enable_auto_alt_ref
-  0,                          // noise_sensitivity
-  0,                          // sharpness
-  0,                          // static_thresh
-  6,                          // tile_columns
-  0,                          // tile_rows
-  7,                          // arnr_max_frames
-  5,                          // arnr_strength
-  0,                          // min_gf_interval; 0 -> default decision
-  0,                          // max_gf_interval; 0 -> default decision
-  VP8_TUNE_PSNR,              // tuning
-  10,                         // cq_level
-  0,                          // rc_max_intra_bitrate_pct
-  0,                          // rc_max_inter_bitrate_pct
-  0,                          // gf_cbr_boost_pct
-  0,                          // lossless
-  1,                          // frame_parallel_decoding_mode
-  NO_AQ,                      // aq_mode
-  0,                          // frame_periodic_delta_q
-  VPX_BITS_8,                 // Bit depth
-  VP9E_CONTENT_DEFAULT,       // content
-  VPX_CS_UNKNOWN,             // color space
-  0,                          // color range
-  0,                          // render width
-  0,                          // render height
+  0,                            // cpu_used
+  1,                            // enable_auto_alt_ref
+#if CONFIG_EXT_REFS
+  0,                            // enable_auto_bwd_ref
+#endif  // CONFIG_EXT_REFS
+  0,                            // noise_sensitivity
+  0,                            // sharpness
+  0,                            // static_thresh
+#if CONFIG_EXT_TILE
+  UINT_MAX,                     // tile_columns
+  UINT_MAX,                     // tile_rows
+#else
+  0,                            // tile_columns
+  0,                            // tile_rows
+#endif  // CONFIG_EXT_TILE
+  7,                            // arnr_max_frames
+  5,                            // arnr_strength
+  0,                            // min_gf_interval; 0 -> default decision
+  0,                            // max_gf_interval; 0 -> default decision
+  VP8_TUNE_PSNR,                // tuning
+  10,                           // cq_level
+  0,                            // rc_max_intra_bitrate_pct
+  0,                            // rc_max_inter_bitrate_pct
+  0,                            // gf_cbr_boost_pct
+  0,                            // lossless
+  1,                            // frame_parallel_decoding_mode
+  NO_AQ,                        // aq_mode
+  0,                            // frame_periodic_delta_q
+  VPX_BITS_8,                   // Bit depth
+  VP9E_CONTENT_DEFAULT,         // content
+  VPX_CS_UNKNOWN,               // color space
+  0,                            // color range
+  0,                            // render width
+  0,                            // render height
+  VPX_SUPERBLOCK_SIZE_DYNAMIC   // superblock_size
 };
 
 struct vpx_codec_alg_priv {
@@ -92,9 +105,6 @@
   size_t                  pending_cx_data_sz;
   int                     pending_frame_count;
   size_t                  pending_frame_sizes[8];
-#if !CONFIG_MISC_FIXES
-  size_t                  pending_frame_magnitude;
-#endif
   vpx_image_t             preview_img;
   vpx_enc_frame_flags_t   next_frame_flags;
   vp8_postproc_cfg_t      preview_ppcfg;
@@ -105,19 +115,6 @@
   BufferPool              *buffer_pool;
 };
 
-static VP9_REFFRAME ref_frame_to_vp10_reframe(vpx_ref_frame_type_t frame) {
-  switch (frame) {
-    case VP8_LAST_FRAME:
-      return VP9_LAST_FLAG;
-    case VP8_GOLD_FRAME:
-      return VP9_GOLD_FLAG;
-    case VP8_ALTR_FRAME:
-      return VP9_ALT_FLAG;
-  }
-  assert(0 && "Invalid Reference Frame");
-  return VP9_LAST_FLAG;
-}
-
 static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx,
     const struct vpx_internal_error_info *error) {
   const vpx_codec_err_t res = error->error_code;
@@ -208,10 +205,35 @@
           "or kf_max_dist instead.");
 
   RANGE_CHECK(extra_cfg, enable_auto_alt_ref, 0, 2);
+#if CONFIG_EXT_REFS
+  RANGE_CHECK(extra_cfg, enable_auto_bwd_ref, 0, 2);
+#endif  // CONFIG_EXT_REFS
   RANGE_CHECK(extra_cfg, cpu_used, -8, 8);
   RANGE_CHECK_HI(extra_cfg, noise_sensitivity, 6);
+  RANGE_CHECK(extra_cfg, superblock_size,
+              VPX_SUPERBLOCK_SIZE_64X64, VPX_SUPERBLOCK_SIZE_DYNAMIC);
+#if CONFIG_EXT_TILE
+  // TODO(any): Warning. If CONFIG_EXT_TILE is true, tile_columns really
+  // means tile_width, and tile_rows really means tile_height. The interface
+  // should be sanitized.
+#if CONFIG_EXT_PARTITION
+  if (extra_cfg->superblock_size != VPX_SUPERBLOCK_SIZE_64X64) {
+    if (extra_cfg->tile_columns != UINT_MAX)
+      RANGE_CHECK(extra_cfg, tile_columns, 1, 32);
+    if (extra_cfg->tile_rows != UINT_MAX)
+      RANGE_CHECK(extra_cfg, tile_rows, 1, 32);
+  } else
+#endif  // CONFIG_EXT_PARTITION
+  {
+    if (extra_cfg->tile_columns != UINT_MAX)
+      RANGE_CHECK(extra_cfg, tile_columns, 1, 64);
+    if (extra_cfg->tile_rows != UINT_MAX)
+      RANGE_CHECK(extra_cfg, tile_rows, 1, 64);
+  }
+#else
   RANGE_CHECK(extra_cfg, tile_columns, 0, 6);
   RANGE_CHECK(extra_cfg, tile_rows, 0, 2);
+#endif  // CONFIG_EXT_TILE
   RANGE_CHECK_HI(extra_cfg, sharpness, 7);
   RANGE_CHECK(extra_cfg, arnr_max_frames, 0, 15);
   RANGE_CHECK_HI(extra_cfg, arnr_strength, 6);
@@ -398,6 +420,9 @@
   oxcf->speed                  =  abs(extra_cfg->cpu_used);
   oxcf->encode_breakout        =  extra_cfg->static_thresh;
   oxcf->enable_auto_arf        =  extra_cfg->enable_auto_alt_ref;
+#if CONFIG_EXT_REFS
+  oxcf->enable_auto_brf        =  extra_cfg->enable_auto_bwd_ref;
+#endif  // CONFIG_EXT_REFS
   oxcf->noise_sensitivity      =  extra_cfg->noise_sensitivity;
   oxcf->sharpness              =  extra_cfg->sharpness;
 
@@ -419,8 +444,25 @@
   oxcf->tuning = extra_cfg->tuning;
   oxcf->content = extra_cfg->content;
 
+#if CONFIG_EXT_PARTITION
+  oxcf->superblock_size = extra_cfg->superblock_size;
+#endif  // CONFIG_EXT_PARTITION
+
+#if CONFIG_EXT_TILE
+  {
+#if CONFIG_EXT_PARTITION
+    const unsigned int max =
+      extra_cfg->superblock_size == VPX_SUPERBLOCK_SIZE_64X64 ? 64 : 32;
+#else
+    const unsigned int max = 64;
+#endif  // CONFIG_EXT_PARTITION
+    oxcf->tile_columns = VPXMIN(extra_cfg->tile_columns, max);
+    oxcf->tile_rows    = VPXMIN(extra_cfg->tile_rows, max);
+  }
+#else
   oxcf->tile_columns = extra_cfg->tile_columns;
   oxcf->tile_rows    = extra_cfg->tile_rows;
+#endif  // CONFIG_EXT_TILE
 
   oxcf->error_resilient_mode         = cfg->g_error_resilient;
   oxcf->frame_parallel_decoding_mode = extra_cfg->frame_parallel_decoding_mode;
@@ -544,6 +586,15 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+#if CONFIG_EXT_REFS
+static vpx_codec_err_t ctrl_set_enable_auto_bwd_ref(vpx_codec_alg_priv_t *ctx,
+                                                    va_list args) {
+  struct vp10_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.enable_auto_bwd_ref = CAST(VP8E_SET_ENABLEAUTOBWDREF, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
+#endif  // CONFIG_EXT_REFS
+
 static vpx_codec_err_t ctrl_set_noise_sensitivity(vpx_codec_alg_priv_t *ctx,
                                                   va_list args) {
   struct vp10_extracfg extra_cfg = ctx->extra_cfg;
@@ -784,39 +835,30 @@
   uint8_t marker = 0xc0;
   unsigned int mask;
   int mag, index_sz;
-#if CONFIG_MISC_FIXES
   int i;
   size_t max_frame_sz = 0;
-#endif
 
   assert(ctx->pending_frame_count);
   assert(ctx->pending_frame_count <= 8);
 
   // Add the number of frames to the marker byte
   marker |= ctx->pending_frame_count - 1;
-#if CONFIG_MISC_FIXES
   for (i = 0; i < ctx->pending_frame_count - 1; i++) {
     const size_t frame_sz = (unsigned int) ctx->pending_frame_sizes[i] - 1;
     max_frame_sz = frame_sz > max_frame_sz ? frame_sz : max_frame_sz;
   }
-#endif
 
   // Choose the magnitude
   for (mag = 0, mask = 0xff; mag < 4; mag++) {
-#if CONFIG_MISC_FIXES
     if (max_frame_sz <= mask)
       break;
-#else
-    if (ctx->pending_frame_magnitude < mask)
-      break;
-#endif
     mask <<= 8;
     mask |= 0xff;
   }
   marker |= mag << 3;
 
   // Write the index
-  index_sz = 2 + (mag + 1) * (ctx->pending_frame_count - CONFIG_MISC_FIXES);
+  index_sz = 2 + (mag + 1) * (ctx->pending_frame_count - 1);
   if (ctx->pending_cx_data_sz + index_sz < ctx->cx_data_sz) {
     uint8_t *x = ctx->pending_cx_data + ctx->pending_cx_data_sz;
     int i, j;
@@ -836,11 +878,11 @@
 #endif
 
     *x++ = marker;
-    for (i = 0; i < ctx->pending_frame_count - CONFIG_MISC_FIXES; i++) {
+    for (i = 0; i < ctx->pending_frame_count - 1; i++) {
       unsigned int this_sz;
 
       assert(ctx->pending_frame_sizes[i] > 0);
-      this_sz = (unsigned int)ctx->pending_frame_sizes[i] - CONFIG_MISC_FIXES;
+      this_sz = (unsigned int)ctx->pending_frame_sizes[i] - 1;
       for (j = 0; j <= mag; j++) {
         *x++ = this_sz & 0xff;
         this_sz >>= 8;
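
A worked example of the index arithmetic above, using made-up frame sizes (everything else follows directly from the code):

```c
/* pending_frame_count = 3, frame sizes {200, 300, 250} bytes.  Only the
 * first count-1 sizes are indexed, each stored minus one, so
 * max_frame_sz = 299.  299 > 0xff, hence mag = 1 (two bytes per size).
 *   marker   = 0xc0 | (3 - 1) | (1 << 3)             = 0xca
 *   index_sz = 2 + (mag + 1) * (count - 1) = 2 + 2*2 = 6 bytes */
```
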
@@ -901,10 +943,14 @@
     // TODO(jzern) the checks related to cpi's validity should be treated as a
     // failure condition, encoder setup is done fully in init() currently.
     if (res == VPX_CODEC_OK) {
+#if CONFIG_EXT_REFS
+      data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img);
+#else
       // There's no codec control for multiple alt-refs so check the encoder
       // instance for its status to determine the compressed data size.
       data_sz = ctx->cfg.g_w * ctx->cfg.g_h * get_image_bps(img) / 8 *
                 (cpi->multi_arf_allowed ? 8 : 2);
+#endif  // CONFIG_EXT_REFS
       if (data_sz < 4096)
         data_sz = 4096;
       if (ctx->cx_data == NULL || ctx->cx_data_sz < data_sz) {
@@ -1005,9 +1051,6 @@
             ctx->pending_cx_data = cx_data;
           ctx->pending_cx_data_sz += size;
           ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
-#if !CONFIG_MISC_FIXES
-          ctx->pending_frame_magnitude |= size;
-#endif
           cx_data += size;
           cx_data_sz -= size;
 
@@ -1024,9 +1067,6 @@
             ctx->pending_cx_data = NULL;
             ctx->pending_cx_data_sz = 0;
             ctx->pending_frame_count = 0;
-#if !CONFIG_MISC_FIXES
-            ctx->pending_frame_magnitude = 0;
-#endif
             ctx->output_cx_pkt_cb.output_cx_pkt(
                 &pkt, ctx->output_cx_pkt_cb.user_priv);
           }
@@ -1043,9 +1083,6 @@
 
         if (ctx->pending_cx_data) {
           ctx->pending_frame_sizes[ctx->pending_frame_count++] = size;
-#if !CONFIG_MISC_FIXES
-          ctx->pending_frame_magnitude |= size;
-#endif
           ctx->pending_cx_data_sz += size;
           // Write the superframe index only when no output packet callback
           // is registered (see the check below).
           if (!ctx->output_cx_pkt_cb.output_cx_pkt)
@@ -1055,9 +1092,6 @@
           ctx->pending_cx_data = NULL;
           ctx->pending_cx_data_sz = 0;
           ctx->pending_frame_count = 0;
-#if !CONFIG_MISC_FIXES
-          ctx->pending_frame_magnitude = 0;
-#endif
         } else {
           pkt.data.frame.buf = cx_data;
           pkt.data.frame.sz  = size;
@@ -1132,6 +1166,24 @@
   }
 }
 
+static vpx_codec_err_t ctrl_get_new_frame_image(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  vpx_image_t *const new_img = va_arg(args, vpx_image_t *);
+
+  if (new_img != NULL) {
+    YV12_BUFFER_CONFIG new_frame;
+
+    if (vp10_get_last_show_frame(ctx->cpi, &new_frame) == 0) {
+      yuvconfig2image(new_img, &new_frame, NULL);
+      return VPX_CODEC_OK;
+    } else {
+      return VPX_CODEC_ERROR;
+    }
+  } else {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+}
+
 static vpx_codec_err_t ctrl_set_previewpp(vpx_codec_alg_priv_t *ctx,
                                           va_list args) {
 #if CONFIG_VP9_POSTPROC
@@ -1169,6 +1221,14 @@
   }
 }
 
+static vpx_codec_err_t ctrl_use_reference(vpx_codec_alg_priv_t *ctx,
+                                          va_list args) {
+  const int reference_flag = va_arg(args, int);
+
+  vp10_use_as_reference(ctx->cpi, reference_flag);
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t ctrl_set_roi_map(vpx_codec_alg_priv_t *ctx,
                                         va_list args) {
   (void)ctx;
@@ -1263,8 +1323,16 @@
   return update_extra_cfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_superblock_size(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  struct vp10_extracfg extra_cfg = ctx->extra_cfg;
+  extra_cfg.superblock_size = CAST(VP10E_SET_SUPERBLOCK_SIZE, args);
+  return update_extra_cfg(ctx, &extra_cfg);
+}
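
Caller-side, the new control would be exercised roughly as follows (a sketch assuming the usual vpx_codec_control() pattern and an initialized encoder handle named codec):

```c
/* VPX_SUPERBLOCK_SIZE_DYNAMIC is the default seeded into default_extra_cfg
 * above; VPX_SUPERBLOCK_SIZE_64X64 pins the encoder to 64x64 superblocks. */
vpx_codec_control(&codec, VP10E_SET_SUPERBLOCK_SIZE,
                  VPX_SUPERBLOCK_SIZE_DYNAMIC);
```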
+
 static vpx_codec_ctrl_fn_map_t encoder_ctrl_maps[] = {
   {VP8_COPY_REFERENCE,                ctrl_copy_reference},
+  {VP8E_USE_REFERENCE,                ctrl_use_reference},
 
   // Setters
   {VP8_SET_REFERENCE,                 ctrl_set_reference},
@@ -1274,6 +1342,9 @@
   {VP8E_SET_SCALEMODE,                ctrl_set_scale_mode},
   {VP8E_SET_CPUUSED,                  ctrl_set_cpuused},
   {VP8E_SET_ENABLEAUTOALTREF,         ctrl_set_enable_auto_alt_ref},
+#if CONFIG_EXT_REFS
+  {VP8E_SET_ENABLEAUTOBWDREF,         ctrl_set_enable_auto_bwd_ref},
+#endif  // CONFIG_EXT_REFS
   {VP8E_SET_SHARPNESS,                ctrl_set_sharpness},
   {VP8E_SET_STATIC_THRESHOLD,         ctrl_set_static_thresh},
   {VP9E_SET_TILE_COLUMNS,             ctrl_set_tile_columns},
@@ -1298,12 +1369,14 @@
   {VP9E_SET_MIN_GF_INTERVAL,          ctrl_set_min_gf_interval},
   {VP9E_SET_MAX_GF_INTERVAL,          ctrl_set_max_gf_interval},
   {VP9E_SET_RENDER_SIZE,              ctrl_set_render_size},
+  {VP10E_SET_SUPERBLOCK_SIZE,         ctrl_set_superblock_size},
 
   // Getters
   {VP8E_GET_LAST_QUANTIZER,           ctrl_get_quantizer},
   {VP8E_GET_LAST_QUANTIZER_64,        ctrl_get_quantizer64},
   {VP9_GET_REFERENCE,                 ctrl_get_reference},
   {VP9E_GET_ACTIVEMAP,                ctrl_get_active_map},
+  {VP10_GET_NEW_FRAME_IMAGE,          ctrl_get_new_frame_image},
 
   { -1, NULL},
 };
diff --git a/vp10/vp10_dx_iface.c b/vp10/vp10_dx_iface.c
index 33337a4..d5c4c1c 100644
--- a/vp10/vp10_dx_iface.c
+++ b/vp10/vp10_dx_iface.c
@@ -23,6 +23,7 @@
 
 #include "vp10/common/alloccommon.h"
 #include "vp10/common/frame_buffers.h"
+#include "vp10/common/enums.h"
 
 #include "vp10/decoder/decoder.h"
 #include "vp10/decoder/decodeframe.h"
@@ -57,6 +58,8 @@
   int                     last_show_frame;  // Index of last output frame.
   int                     byte_alignment;
   int                     skip_loop_filter;
+  int                     decode_tile_row;
+  int                     decode_tile_col;
 
   // Frame parallel related.
   int                     frame_parallel_decode;  // frame-based threading.
@@ -122,6 +125,9 @@
 #if CONFIG_VP9_POSTPROC
       vp10_free_postproc_buffers(&frame_worker_data->pbi->common);
 #endif
+#if CONFIG_LOOP_RESTORATION
+      vp10_free_restoration_buffers(&frame_worker_data->pbi->common);
+#endif  // CONFIG_LOOP_RESTORATION
       vp10_decoder_remove(frame_worker_data->pbi);
       vpx_free(frame_worker_data->scratch_buffer);
 #if CONFIG_MULTITHREAD
@@ -496,6 +502,11 @@
     frame_worker_data->pbi->decrypt_cb = ctx->decrypt_cb;
     frame_worker_data->pbi->decrypt_state = ctx->decrypt_state;
 
+#if CONFIG_EXT_TILE
+    frame_worker_data->pbi->dec_tile_row = ctx->decode_tile_row;
+    frame_worker_data->pbi->dec_tile_col = ctx->decode_tile_col;
+#endif  // CONFIG_EXT_TILE
+
     worker->had_error = 0;
     winterface->execute(worker);
 
@@ -772,6 +783,39 @@
           if (ctx->need_resync)
             return NULL;
           yuvconfig2image(&ctx->img, &sd, frame_worker_data->user_priv);
+
+#if CONFIG_EXT_TILE
+          if (frame_worker_data->pbi->dec_tile_row >= 0) {
+            const int tile_row = VPXMIN(frame_worker_data->pbi->dec_tile_row,
+                                        cm->tile_rows - 1);
+            const int mi_row = tile_row * cm->tile_height;
+            const int ssy = ctx->img.y_chroma_shift;
+            int plane;
+            ctx->img.planes[0] += mi_row * MI_SIZE * ctx->img.stride[0];
+            for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+              ctx->img.planes[plane] += mi_row * (MI_SIZE >> ssy) *
+                                        ctx->img.stride[plane];
+            }
+            ctx->img.d_h = VPXMIN(cm->tile_height, cm->mi_rows - mi_row) *
+                           MI_SIZE;
+          }
+
+          if (frame_worker_data->pbi->dec_tile_col >= 0) {
+            const int tile_col = VPXMIN(frame_worker_data->pbi->dec_tile_col,
+                                        cm->tile_cols - 1);
+            const int mi_col = tile_col * cm->tile_width;
+            const int ssx = ctx->img.x_chroma_shift;
+            int plane;
+            ctx->img.planes[0] += mi_col * MI_SIZE;
+            for (plane = 1; plane < MAX_MB_PLANE; ++plane) {
+              ctx->img.planes[plane] += mi_col * (MI_SIZE >> ssx);
+            }
+            ctx->img.d_w = VPXMIN(cm->tile_width, cm->mi_cols - mi_col) *
+                           MI_SIZE;
+          }
+#endif  // CONFIG_EXT_TILE
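
A worked example of the crop arithmetic above (hypothetical geometry, assuming MI_SIZE == 8 and 4:2:0 subsampling):

```c
/* tile_col = 1, cm->tile_width = 8 MI units:
 *   mi_col        = 1 * 8 = 8
 *   luma offset   = mi_col * MI_SIZE        = 64 pixels
 *   chroma offset = mi_col * (MI_SIZE >> 1) = 32 pixels (ssx == 1)
 *   d_w           = VPXMIN(tile_width, mi_cols - mi_col) * MI_SIZE */
```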
+
           ctx->img.fb_priv = frame_bufs[cm->new_fb_idx].raw_frame_buffer.priv;
           img = &ctx->img;
           return img;
@@ -824,7 +868,8 @@
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
     image2yuvconfig(&frame->img, &sd);
     return vp10_set_reference_dec(&frame_worker_data->pbi->common,
-                                 (VP9_REFFRAME)frame->frame_type, &sd);
+                                  ref_frame_to_vp10_reframe(frame->frame_type),
+                                  &sd);
   } else {
     return VPX_CODEC_INVALID_PARAM;
   }
@@ -876,6 +921,32 @@
   }
 }
 
+static vpx_codec_err_t ctrl_get_new_frame_image(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  vpx_image_t *new_img = va_arg(args, vpx_image_t *);
+
+  // Only support this function in serial decode.
+  if (ctx->frame_parallel_decode) {
+    set_error_detail(ctx, "Not supported in frame parallel decode");
+    return VPX_CODEC_INCAPABLE;
+  }
+
+  if (new_img) {
+    YV12_BUFFER_CONFIG new_frame;
+    VPxWorker *const worker = ctx->frame_workers;
+    FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
+
+    if (vp10_get_frame_to_show(frame_worker_data->pbi, &new_frame) == 0) {
+      yuvconfig2image(new_img, &new_frame, NULL);
+      return VPX_CODEC_OK;
+    } else {
+      return VPX_CODEC_ERROR;
+    }
+  } else {
+    return VPX_CODEC_INVALID_PARAM;
+  }
+}
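
A minimal caller-side sketch (assuming an initialized decoder handle named decoder):

```c
vpx_image_t frame_img;
if (vpx_codec_control(&decoder, VP10_GET_NEW_FRAME_IMAGE, &frame_img) !=
    VPX_CODEC_OK) {
  /* VPX_CODEC_INCAPABLE in frame-parallel mode, VPX_CODEC_ERROR when no
   * show frame is available, per the checks above. */
}
```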
+
 static vpx_codec_err_t ctrl_set_postproc(vpx_codec_alg_priv_t *ctx,
                                          va_list args) {
 #if CONFIG_VP9_POSTPROC
@@ -1075,6 +1146,18 @@
   return VPX_CODEC_OK;
 }
 
+static vpx_codec_err_t ctrl_set_decode_tile_row(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  ctx->decode_tile_row = va_arg(args, int);
+  return VPX_CODEC_OK;
+}
+
+static vpx_codec_err_t ctrl_set_decode_tile_col(vpx_codec_alg_priv_t *ctx,
+                                                va_list args) {
+  ctx->decode_tile_col = va_arg(args, int);
+  return VPX_CODEC_OK;
+}
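
Usage sketch for the two tile controls (assuming an initialized decoder handle; a negative index leaves tile cropping disabled, per the dec_tile_row/dec_tile_col >= 0 checks earlier in this file):

```c
/* Decode and return only the tile at row 1, column 2. */
vpx_codec_control(&decoder, VP10_SET_DECODE_TILE_ROW, 1);
vpx_codec_control(&decoder, VP10_SET_DECODE_TILE_COL, 2);
```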
+
 static vpx_codec_ctrl_fn_map_t decoder_ctrl_maps[] = {
   {VP8_COPY_REFERENCE,            ctrl_copy_reference},
 
@@ -1089,6 +1172,8 @@
   {VPXD_SET_DECRYPTOR,            ctrl_set_decryptor},
   {VP9_SET_BYTE_ALIGNMENT,        ctrl_set_byte_alignment},
   {VP9_SET_SKIP_LOOP_FILTER,      ctrl_set_skip_loop_filter},
+  {VP10_SET_DECODE_TILE_ROW,      ctrl_set_decode_tile_row},
+  {VP10_SET_DECODE_TILE_COL,      ctrl_set_decode_tile_col},
 
   // Getters
   {VP8D_GET_LAST_REF_UPDATES,     ctrl_get_last_ref_updates},
@@ -1097,6 +1182,7 @@
   {VP9D_GET_DISPLAY_SIZE,         ctrl_get_render_size},
   {VP9D_GET_BIT_DEPTH,            ctrl_get_bit_depth},
   {VP9D_GET_FRAME_SIZE,           ctrl_get_frame_size},
+  {VP10_GET_NEW_FRAME_IMAGE,      ctrl_get_new_frame_image},
 
   { -1, NULL},
 };
diff --git a/vp10/vp10_iface_common.h b/vp10/vp10_iface_common.h
index b2b4b7d..8a80bbe 100644
--- a/vp10/vp10_iface_common.h
+++ b/vp10/vp10_iface_common.h
@@ -133,4 +133,16 @@
   return VPX_CODEC_OK;
 }
 
+static VP9_REFFRAME ref_frame_to_vp10_reframe(vpx_ref_frame_type_t frame) {
+  switch (frame) {
+    case VP8_LAST_FRAME:
+      return VP9_LAST_FLAG;
+    case VP8_GOLD_FRAME:
+      return VP9_GOLD_FLAG;
+    case VP8_ALTR_FRAME:
+      return VP9_ALT_FLAG;
+  }
+  assert(0 && "Invalid Reference Frame");
+  return VP9_LAST_FLAG;
+}
 #endif  // VP10_VP10_IFACE_COMMON_H_
diff --git a/vp10/vp10cx.mk b/vp10/vp10cx.mk
index 4f265b5..5d5c88a 100644
--- a/vp10/vp10cx.mk
+++ b/vp10/vp10cx.mk
@@ -18,11 +18,16 @@
 VP10_CX_SRCS-yes += vp10_cx_iface.c
 
 VP10_CX_SRCS-yes += encoder/bitstream.c
+VP10_CX_SRCS-yes += encoder/bitwriter.h
 VP10_CX_SRCS-yes += encoder/context_tree.c
 VP10_CX_SRCS-yes += encoder/context_tree.h
+VP10_CX_SRCS-yes += encoder/variance_tree.c
+VP10_CX_SRCS-yes += encoder/variance_tree.h
 VP10_CX_SRCS-yes += encoder/cost.h
 VP10_CX_SRCS-yes += encoder/cost.c
 VP10_CX_SRCS-yes += encoder/dct.c
+VP10_CX_SRCS-yes += encoder/hybrid_fwd_txfm.c
+VP10_CX_SRCS-yes += encoder/hybrid_fwd_txfm.h
 VP10_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/denoiser.c
 VP10_CX_SRCS-$(CONFIG_VP9_TEMPORAL_DENOISING) += encoder/denoiser.h
 VP10_CX_SRCS-yes += encoder/encodeframe.c
@@ -51,8 +56,12 @@
 VP10_CX_SRCS-yes += encoder/treewriter.h
 VP10_CX_SRCS-yes += encoder/mcomp.c
 VP10_CX_SRCS-yes += encoder/encoder.c
+VP10_CX_SRCS-yes += encoder/palette.h
+VP10_CX_SRCS-yes += encoder/palette.c
 VP10_CX_SRCS-yes += encoder/picklpf.c
 VP10_CX_SRCS-yes += encoder/picklpf.h
+VP10_CX_SRCS-$(CONFIG_LOOP_RESTORATION) += encoder/pickrst.c
+VP10_CX_SRCS-$(CONFIG_LOOP_RESTORATION) += encoder/pickrst.h
 VP10_CX_SRCS-yes += encoder/quantize.c
 VP10_CX_SRCS-yes += encoder/ratectrl.c
 VP10_CX_SRCS-yes += encoder/rd.c
@@ -66,6 +75,8 @@
 VP10_CX_SRCS-yes += encoder/resize.c
 VP10_CX_SRCS-yes += encoder/resize.h
 VP10_CX_SRCS-$(CONFIG_INTERNAL_STATS) += encoder/blockiness.c
+VP10_CX_SRCS-$(CONFIG_ANS) += encoder/buf_ans.h
+VP10_CX_SRCS-$(CONFIG_ANS) += encoder/buf_ans.c
 
 VP10_CX_SRCS-yes += encoder/tokenize.c
 VP10_CX_SRCS-yes += encoder/treewriter.c
@@ -105,10 +116,18 @@
 
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/dct_intrin_sse2.c
 VP10_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/dct_ssse3.c
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+VP10_CX_SRCS-$(HAVE_SSE4_1) += encoder/x86/highbd_fwd_txfm_sse4.c
+VP10_CX_SRCS-$(HAVE_SSE4_1) += common/x86/highbd_inv_txfm_sse4.c
+endif
 
 ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
 VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/denoiser_sse2.c
 endif
+ifeq ($(CONFIG_EXT_INTER),yes)
+VP10_CX_SRCS-yes += encoder/wedge_utils.c
+VP10_CX_SRCS-$(HAVE_SSE2) += encoder/x86/wedge_utils_sse2.c
+endif
 
 VP10_CX_SRCS-$(HAVE_AVX2) += encoder/x86/error_intrin_avx2.c
 
diff --git a/vp10/vp10dx.mk b/vp10/vp10dx.mk
index fce6d0d..ae68475 100644
--- a/vp10/vp10dx.mk
+++ b/vp10/vp10dx.mk
@@ -29,5 +29,6 @@
 VP10_DX_SRCS-yes += decoder/decoder.h
 VP10_DX_SRCS-yes += decoder/dsubexp.c
 VP10_DX_SRCS-yes += decoder/dsubexp.h
+VP10_DX_SRCS-yes += decoder/bitreader.h
 
 VP10_DX_SRCS-yes := $(filter-out $(VP10_DX_SRCS_REMOVE-yes),$(VP10_DX_SRCS-yes))
diff --git a/vp8/common/common.h b/vp8/common/common.h
index e58a9cc..c42e875 100644
--- a/vp8/common/common.h
+++ b/vp8/common/common.h
@@ -32,13 +32,13 @@
 /* Use this for variably-sized arrays. */
 
 #define vp8_copy_array( Dest, Src, N) { \
-        assert( sizeof( *Dest) == sizeof( *Src)); \
-        memcpy( Dest, Src, N * sizeof( *Src)); \
+        assert( sizeof( *(Dest)) == sizeof( *(Src))); \
+        memcpy( Dest, Src, N * sizeof( *(Src))); \
     }
 
-#define vp8_zero( Dest)  memset( &Dest, 0, sizeof( Dest));
+#define vp8_zero( Dest)  memset( &(Dest), 0, sizeof( Dest));
 
-#define vp8_zero_array( Dest, N)  memset( Dest, 0, N * sizeof( *Dest));
+#define vp8_zero_array( Dest, N)  memset( Dest, 0, N * sizeof( *(Dest)));
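
The added parentheses matter as soon as a macro argument is an expression rather than a plain identifier; a minimal illustration (hypothetical call):

```c
short buf[8];
/* Old expansion: memset(buf + 1, 0, 4 * sizeof(*buf + 1));
 * sizeof(*buf + 1) is sizeof(int) after integer promotion, not
 * sizeof(short), so the wrong byte count is cleared.  The fixed macro
 * expands to 4 * sizeof(*(buf + 1)) == 4 * sizeof(short). */
vp8_zero_array(buf + 1, 4);
```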
 
 
 #ifdef __cplusplus
diff --git a/vp8/encoder/onyx_if.c b/vp8/encoder/onyx_if.c
index d5a0fff..8511af2 100644
--- a/vp8/encoder/onyx_if.c
+++ b/vp8/encoder/onyx_if.c
@@ -21,7 +21,7 @@
 #include "vp8/common/alloccommon.h"
 #include "mcomp.h"
 #include "firstpass.h"
-#include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/psnr.h"
 #include "vpx_scale/vpx_scale.h"
 #include "vp8/common/extend.h"
 #include "ratectrl.h"
@@ -2001,8 +2001,6 @@
 
     cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
-    cpi->b_calculate_ssimg = 0;
-
     cpi->count = 0;
     cpi->bytes = 0;
 
@@ -2023,14 +2021,6 @@
         cpi->summed_weights = 0;
     }
 
-    if (cpi->b_calculate_ssimg)
-    {
-        cpi->total_ssimg_y = 0;
-        cpi->total_ssimg_u = 0;
-        cpi->total_ssimg_v = 0;
-        cpi->total_ssimg_all = 0;
-    }
-
 #endif
 
     cpi->first_time_stamp_ever = 0x7FFFFFFF;
@@ -2313,45 +2303,6 @@
                                rate_err, fabs(rate_err));
                 }
             }
-
-            if (cpi->b_calculate_ssimg)
-            {
-                if (cpi->oxcf.number_of_layers > 1)
-                {
-                    int i;
-
-                    fprintf(f, "Layer\tBitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t"
-                               "Time(us)\n");
-                    for (i=0; i<(int)cpi->oxcf.number_of_layers; i++)
-                    {
-                        double dr = (double)cpi->bytes_in_layer[i] *
-                                    8.0 / 1000.0  / time_encoded;
-                        fprintf(f, "%5d\t%7.3f\t%6.4f\t"
-                                "%6.4f\t%6.4f\t%6.4f\t%8.0f\n",
-                                i, dr,
-                                cpi->total_ssimg_y_in_layer[i] /
-                                     cpi->frames_in_layer[i],
-                                cpi->total_ssimg_u_in_layer[i] /
-                                     cpi->frames_in_layer[i],
-                                cpi->total_ssimg_v_in_layer[i] /
-                                     cpi->frames_in_layer[i],
-                                cpi->total_ssimg_all_in_layer[i] /
-                                     cpi->frames_in_layer[i],
-                                total_encode_time);
-                    }
-                }
-                else
-                {
-                    fprintf(f, "BitRate\tSSIM_Y\tSSIM_U\tSSIM_V\tSSIM_A\t"
-                               "Time(us)\n");
-                    fprintf(f, "%7.3f\t%6.4f\t%6.4f\t%6.4f\t%6.4f\t%8.0f\n", dr,
-                            cpi->total_ssimg_y / cpi->count,
-                            cpi->total_ssimg_u / cpi->count,
-                            cpi->total_ssimg_v / cpi->count,
-                            cpi->total_ssimg_all / cpi->count, total_encode_time);
-                }
-            }
-
             fclose(f);
 #if 0
             f = fopen("qskip.stt", "a");
@@ -5746,38 +5697,6 @@
                 }
 #endif
             }
-
-            if (cpi->b_calculate_ssimg)
-            {
-                double y, u, v, frame_all;
-                frame_all = vpx_calc_ssimg(cpi->Source, cm->frame_to_show,
-                    &y, &u, &v);
-
-                if (cpi->oxcf.number_of_layers > 1)
-                {
-                    unsigned int i;
-
-                    for (i=cpi->current_layer;
-                         i<cpi->oxcf.number_of_layers; i++)
-                    {
-                        if (!cpi->b_calculate_psnr)
-                            cpi->frames_in_layer[i]++;
-
-                        cpi->total_ssimg_y_in_layer[i] += y;
-                        cpi->total_ssimg_u_in_layer[i] += u;
-                        cpi->total_ssimg_v_in_layer[i] += v;
-                        cpi->total_ssimg_all_in_layer[i] += frame_all;
-                    }
-                }
-                else
-                {
-                    cpi->total_ssimg_y += y;
-                    cpi->total_ssimg_u += u;
-                    cpi->total_ssimg_v += v;
-                    cpi->total_ssimg_all += frame_all;
-                }
-            }
-
         }
     }
 
diff --git a/vp8/encoder/onyx_int.h b/vp8/encoder/onyx_int.h
index 44fbbd4..716f878c 100644
--- a/vp8/encoder/onyx_int.h
+++ b/vp8/encoder/onyx_int.h
@@ -636,13 +636,6 @@
     double summed_weights;
     unsigned int tot_recode_hits;
 
-
-    double total_ssimg_y;
-    double total_ssimg_u;
-    double total_ssimg_v;
-    double total_ssimg_all;
-
-    int b_calculate_ssimg;
 #endif
     int b_calculate_psnr;
 
@@ -688,11 +681,6 @@
     double sum_ssim[VPX_TS_MAX_LAYERS];
     double sum_weights[VPX_TS_MAX_LAYERS];
 
-    double total_ssimg_y_in_layer[VPX_TS_MAX_LAYERS];
-    double total_ssimg_u_in_layer[VPX_TS_MAX_LAYERS];
-    double total_ssimg_v_in_layer[VPX_TS_MAX_LAYERS];
-    double total_ssimg_all_in_layer[VPX_TS_MAX_LAYERS];
-
 #if CONFIG_MULTI_RES_ENCODING
     /* Number of MBs per row at lower-resolution level */
     int    mr_low_res_mb_cols;
diff --git a/vp9/common/vp9_common.h b/vp9/common/vp9_common.h
index 908fa80..38815ac 100644
--- a/vp9/common/vp9_common.h
+++ b/vp9/common/vp9_common.h
@@ -33,12 +33,12 @@
 
 // Use this for variably-sized arrays.
 #define vp9_copy_array(dest, src, n) {       \
-    assert(sizeof(*dest) == sizeof(*src));   \
-    memcpy(dest, src, n * sizeof(*src)); \
+    assert(sizeof(*(dest)) == sizeof(*(src)));   \
+    memcpy(dest, src, n * sizeof(*(src))); \
   }
 
 #define vp9_zero(dest) memset(&(dest), 0, sizeof(dest))
-#define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*dest))
+#define vp9_zero_array(dest, n) memset(dest, 0, n * sizeof(*(dest)))
 
 static INLINE int get_unsigned_bits(unsigned int num_values) {
   return num_values > 0 ? get_msb(num_values) + 1 : 0;
diff --git a/vp9/common/vp9_thread_common.c b/vp9/common/vp9_thread_common.c
index db78d6b..033326d 100644
--- a/vp9/common/vp9_thread_common.c
+++ b/vp9/common/vp9_thread_common.c
@@ -379,11 +379,11 @@
   for (i = 0; i < REF_CONTEXTS; i++)
     for (j = 0; j < 2; j++)
       for (k = 0; k < 2; k++)
-      accum->single_ref[i][j][k] += counts->single_ref[i][j][k];
+        accum->single_ref[i][j][k] += counts->single_ref[i][j][k];
 
   for (i = 0; i < REF_CONTEXTS; i++)
     for (j = 0; j < 2; j++)
-      accum->comp_ref[i][j] += counts->comp_ref[i][j];
+      accum->comp_ref[i][j] += counts->comp_ref[i][j];
 
   for (i = 0; i < TX_SIZE_CONTEXTS; i++) {
     for (j = 0; j < TX_SIZES; j++)
diff --git a/vp9/decoder/vp9_decoder.c b/vp9/decoder/vp9_decoder.c
index 935c04f..891613f 100644
--- a/vp9/decoder/vp9_decoder.c
+++ b/vp9/decoder/vp9_decoder.c
@@ -187,47 +187,46 @@
 vpx_codec_err_t vp9_set_reference_dec(VP9_COMMON *cm,
                                       VP9_REFFRAME ref_frame_flag,
                                       YV12_BUFFER_CONFIG *sd) {
-  RefBuffer *ref_buf = NULL;
-  RefCntBuffer *const frame_bufs = cm->buffer_pool->frame_bufs;
+  int idx;
+  YV12_BUFFER_CONFIG *ref_buf = NULL;
 
   // TODO(jkoleszar): The decoder doesn't have any real knowledge of what the
   // encoder is using the frame buffers for. This is just a stub to keep the
   // vpxenc --test-decode functionality working, and will be replaced in a
   // later commit that adds VP9-specific controls for this functionality.
+
+  // (Yunqing) The set_reference control depends on the following frame
+  // buffer assignments in the encoder:
+  // cpi->lst_fb_idx = 0;
+  // cpi->gld_fb_idx = 1;
+  // cpi->alt_fb_idx = 2;
   if (ref_frame_flag == VP9_LAST_FLAG) {
-    ref_buf = &cm->frame_refs[0];
+    idx = cm->ref_frame_map[0];
   } else if (ref_frame_flag == VP9_GOLD_FLAG) {
-    ref_buf = &cm->frame_refs[1];
+    idx = cm->ref_frame_map[1];
   } else if (ref_frame_flag == VP9_ALT_FLAG) {
-    ref_buf = &cm->frame_refs[2];
+    idx = cm->ref_frame_map[2];
   } else {
     vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                        "Invalid reference frame");
     return cm->error.error_code;
   }
 
-  if (!equal_dimensions(ref_buf->buf, sd)) {
+  if (idx < 0 || idx >= FRAME_BUFFERS) {
+    vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                       "Invalid reference frame map");
+    return cm->error.error_code;
+  }
+
+  // Get the destination reference buffer.
+  ref_buf = &cm->buffer_pool->frame_bufs[idx].buf;
+
+  if (!equal_dimensions(ref_buf, sd)) {
     vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
                        "Incorrect buffer dimensions");
   } else {
-    int *ref_fb_ptr = &ref_buf->idx;
-
-    // Find an empty frame buffer.
-    const int free_fb = get_free_fb(cm);
-    if (cm->new_fb_idx == INVALID_IDX) {
-      vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
-                         "Unable to find free frame buffer");
-      return cm->error.error_code;
-    }
-
-    // Decrease ref_count since it will be increased again in
-    // ref_cnt_fb() below.
-    --frame_bufs[free_fb].ref_count;
-
-    // Manage the reference counters and copy image.
-    ref_cnt_fb(frame_bufs, ref_fb_ptr, free_fb);
-    ref_buf->buf = &frame_bufs[*ref_fb_ptr].buf;
-    vp8_yv12_copy_frame(sd, ref_buf->buf);
+    // Overwrite the reference frame buffer.
+    vp8_yv12_copy_frame(sd, ref_buf);
   }
 
   return cm->error.error_code;
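
For context, the caller-side path that reaches this function looks roughly like the following (a sketch using the standard vpx_ref_frame_t control; the fixed encoder-side fb indices quoted in the comment above are what make map slots 0/1/2 meaningful):

```c
vpx_ref_frame_t ref;
ref.frame_type = VP8_LAST_FRAME;  /* resolves to cm->ref_frame_map[0] */
ref.img = *img;                   /* image to copy into the reference slot */
vpx_codec_control(&decoder, VP8_SET_REFERENCE, &ref);
```
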
@@ -247,9 +246,9 @@
     decrease_ref_count(old_idx, frame_bufs, pool);
 
     // Release the reference frame in reference map.
-    if (mask & 1) {
+    if (mask & 1)
       decrease_ref_count(old_idx, frame_bufs, pool);
-    }
+
     cm->ref_frame_map[ref_index] = cm->next_ref_frame_map[ref_index];
     ++ref_index;
   }
@@ -271,7 +270,7 @@
   }
 
   // Invalidate these references until the next frame starts.
-  for (ref_index = 0; ref_index < 3; ref_index++)
+  for (ref_index = 0; ref_index < REFS_PER_FRAME; ref_index++)
     cm->frame_refs[ref_index].idx = -1;
 }
 
@@ -332,7 +331,6 @@
     pbi->cur_buf = &frame_bufs[cm->new_fb_idx];
   }
 
-
   if (setjmp(cm->error.jmp)) {
     const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
     int i;
@@ -357,9 +355,8 @@
         decrease_ref_count(old_idx, frame_bufs, pool);
 
         // Release the reference frame in reference map.
-        if (mask & 1) {
+        if (mask & 1)
           decrease_ref_count(old_idx, frame_bufs, pool);
-        }
         ++ref_index;
       }
 
diff --git a/vp9/encoder/vp9_aq_cyclicrefresh.c b/vp9/encoder/vp9_aq_cyclicrefresh.c
index 3e1a0a5..d8920fb 100644
--- a/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -20,7 +20,6 @@
 
 #include "vp9/encoder/vp9_ratectrl.h"
 #include "vp9/encoder/vp9_segmentation.h"
-
 CYCLIC_REFRESH *vp9_cyclic_refresh_alloc(int mi_rows, int mi_cols) {
   size_t last_coded_q_map_size;
   CYCLIC_REFRESH *const cr = vpx_calloc(1, sizeof(*cr));
diff --git a/vp9/encoder/vp9_encodeframe.c b/vp9/encoder/vp9_encodeframe.c
index 984f98a..e6a75d9 100644
--- a/vp9/encoder/vp9_encodeframe.c
+++ b/vp9/encoder/vp9_encodeframe.c
@@ -4272,7 +4272,7 @@
     // either compound, single or hybrid prediction as per whatever has
     // worked best for that type of frame in the past.
     // It also predicts whether another coding mode would have worked
-    // better that this coding mode. If that is the case, it remembers
+    // better than this coding mode. If that is the case, it remembers
     // that for subsequent frames.
     // It does the same analysis for transform size selection also.
     const MV_REFERENCE_FRAME frame_type = get_frame_type(cpi);
diff --git a/vp9/encoder/vp9_encoder.c b/vp9/encoder/vp9_encoder.c
index 3f88d9c..fde1cb9 100644
--- a/vp9/encoder/vp9_encoder.c
+++ b/vp9/encoder/vp9_encoder.c
@@ -16,7 +16,7 @@
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
 #include "./vpx_scale_rtcd.h"
-#include "vpx/internal/vpx_psnr.h"
+#include "vpx_dsp/psnr.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/vpx_filter.h"
 #if CONFIG_INTERNAL_STATS
@@ -1804,7 +1804,6 @@
   init_level_info(&cpi->level_info);
 
 #if CONFIG_INTERNAL_STATS
-  cpi->b_calculate_ssimg = 0;
   cpi->b_calculate_blockiness = 1;
   cpi->b_calculate_consistency = 1;
   cpi->total_inconsistency = 0;
@@ -1828,11 +1827,7 @@
     cpi->summedp_weights = 0;
   }
 
-  if (cpi->b_calculate_ssimg) {
-    cpi->ssimg.worst= 100.0;
-  }
   cpi->fastssim.worst = 100.0;
-
   cpi->psnrhvs.worst = 100.0;
 
   if (cpi->b_calculate_blockiness) {
@@ -2126,13 +2121,6 @@
           SNPRINT2(results, "\t%7.3f", consistency);
           SNPRINT2(results, "\t%7.3f", cpi->worst_consistency);
         }
-
-        if (cpi->b_calculate_ssimg) {
-          SNPRINT(headings, "\t  SSIMG\tWtSSIMG");
-          SNPRINT2(results, "\t%7.3f", cpi->ssimg.stat[ALL] / cpi->count);
-          SNPRINT2(results, "\t%7.3f", cpi->ssimg.worst);
-        }
-
         fprintf(f, "%s\t    Time  Rc-Err Abs Err\n", headings);
         fprintf(f, "%s\t%8.0f %7.2f %7.2f\n", results,
                 total_encode_time, rate_err, fabs(rate_err));
@@ -2226,271 +2214,15 @@
 #endif
 }
 
-/* TODO(yaowu): The block_variance calls the unoptimized versions of variance()
- * and highbd_8_variance(). It should not.
- */
-static void encoder_variance(const uint8_t *a, int  a_stride,
-                             const uint8_t *b, int  b_stride,
-                             int  w, int  h, unsigned int *sse, int *sum) {
-  int i, j;
-
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void encoder_highbd_variance64(const uint8_t *a8, int  a_stride,
-                                      const uint8_t *b8, int  b_stride,
-                                      int w, int h, uint64_t *sse,
-                                      int64_t *sum) {
-  int i, j;
-
-  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  *sum = 0;
-  *sse = 0;
-
-  for (i = 0; i < h; i++) {
-    for (j = 0; j < w; j++) {
-      const int diff = a[j] - b[j];
-      *sum += diff;
-      *sse += diff * diff;
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-}
-
-static void encoder_highbd_8_variance(const uint8_t *a8, int  a_stride,
-                                      const uint8_t *b8, int  b_stride,
-                                      int w, int h,
-                                      unsigned int *sse, int *sum) {
-  uint64_t sse_long = 0;
-  int64_t sum_long = 0;
-  encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h,
-                            &sse_long, &sum_long);
-  *sse = (unsigned int)sse_long;
-  *sum = (int)sum_long;
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-static int64_t get_sse(const uint8_t *a, int a_stride,
-                       const uint8_t *b, int b_stride,
-                       int width, int height) {
-  const int dw = width % 16;
-  const int dh = height % 16;
-  int64_t total_sse = 0;
-  unsigned int sse = 0;
-  int sum = 0;
-  int x, y;
-
-  if (dw > 0) {
-    encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
-                     dw, height, &sse, &sum);
-    total_sse += sse;
-  }
-
-  if (dh > 0) {
-    encoder_variance(&a[(height - dh) * a_stride], a_stride,
-                     &b[(height - dh) * b_stride], b_stride,
-                     width - dw, dh, &sse, &sum);
-    total_sse += sse;
-  }
-
-  for (y = 0; y < height / 16; ++y) {
-    const uint8_t *pa = a;
-    const uint8_t *pb = b;
-    for (x = 0; x < width / 16; ++x) {
-      vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
-      total_sse += sse;
-
-      pa += 16;
-      pb += 16;
-    }
-
-    a += 16 * a_stride;
-    b += 16 * b_stride;
-  }
-
-  return total_sse;
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
-                                    const uint8_t *b8, int b_stride,
-                                    int width, int height,
-                                    unsigned int input_shift) {
-  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
-  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
-  int64_t total_sse = 0;
-  int x, y;
-  for (y = 0; y < height; ++y) {
-    for (x = 0; x < width; ++x) {
-      int64_t diff;
-      diff = (a[x] >> input_shift) - (b[x] >> input_shift);
-      total_sse += diff * diff;
-    }
-    a += a_stride;
-    b += b_stride;
-  }
-  return total_sse;
-}
-
-static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
-                              const uint8_t *b, int b_stride,
-                              int width, int height) {
-  int64_t total_sse = 0;
-  int x, y;
-  const int dw = width % 16;
-  const int dh = height % 16;
-  unsigned int sse = 0;
-  int sum = 0;
-  if (dw > 0) {
-    encoder_highbd_8_variance(&a[width - dw], a_stride,
-                              &b[width - dw], b_stride,
-                              dw, height, &sse, &sum);
-    total_sse += sse;
-  }
-  if (dh > 0) {
-    encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
-                              &b[(height - dh) * b_stride], b_stride,
-                              width - dw, dh, &sse, &sum);
-    total_sse += sse;
-  }
-  for (y = 0; y < height / 16; ++y) {
-    const uint8_t *pa = a;
-    const uint8_t *pb = b;
-    for (x = 0; x < width / 16; ++x) {
-      vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
-      total_sse += sse;
-      pa += 16;
-      pb += 16;
-    }
-    a += 16 * a_stride;
-    b += 16 * b_stride;
-  }
-  return total_sse;
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
-typedef struct {
-  double psnr[4];       // total/y/u/v
-  uint64_t sse[4];      // total/y/u/v
-  uint32_t samples[4];  // total/y/u/v
-} PSNR_STATS;
-
-#if CONFIG_VP9_HIGHBITDEPTH
-static void calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b,
-                             PSNR_STATS *psnr,
-                             unsigned int bit_depth,
-                             unsigned int in_bit_depth) {
-  const int widths[3] =
-      {a->y_crop_width,  a->uv_crop_width,  a->uv_crop_width };
-  const int heights[3] =
-      {a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
-  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer,  a->v_buffer };
-  const int a_strides[3] = {a->y_stride, a->uv_stride, a->uv_stride};
-  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer,  b->v_buffer };
-  const int b_strides[3] = {b->y_stride, b->uv_stride, b->uv_stride};
-  int i;
-  uint64_t total_sse = 0;
-  uint32_t total_samples = 0;
-  const double peak = (double)((1 << in_bit_depth) - 1);
-  const unsigned int input_shift = bit_depth - in_bit_depth;
-
-  for (i = 0; i < 3; ++i) {
-    const int w = widths[i];
-    const int h = heights[i];
-    const uint32_t samples = w * h;
-    uint64_t sse;
-    if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
-      if (input_shift) {
-        sse = highbd_get_sse_shift(a_planes[i], a_strides[i],
-                                   b_planes[i], b_strides[i], w, h,
-                                   input_shift);
-      } else {
-        sse = highbd_get_sse(a_planes[i], a_strides[i],
-                             b_planes[i], b_strides[i], w, h);
-      }
-    } else {
-      sse = get_sse(a_planes[i], a_strides[i],
-                    b_planes[i], b_strides[i],
-                    w, h);
-    }
-    psnr->sse[1 + i] = sse;
-    psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
-
-    total_sse += sse;
-    total_samples += samples;
-  }
-
-  psnr->sse[0] = total_sse;
-  psnr->samples[0] = total_samples;
-  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
-                                  (double)total_sse);
-}
-
-#else  // !CONFIG_VP9_HIGHBITDEPTH
-
-static void calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
-                      PSNR_STATS *psnr) {
-  static const double peak = 255.0;
-  const int widths[3]        = {
-      a->y_crop_width, a->uv_crop_width, a->uv_crop_width};
-  const int heights[3]       = {
-      a->y_crop_height, a->uv_crop_height, a->uv_crop_height};
-  const uint8_t *a_planes[3] = {a->y_buffer, a->u_buffer, a->v_buffer};
-  const int a_strides[3]     = {a->y_stride, a->uv_stride, a->uv_stride};
-  const uint8_t *b_planes[3] = {b->y_buffer, b->u_buffer, b->v_buffer};
-  const int b_strides[3]     = {b->y_stride, b->uv_stride, b->uv_stride};
-  int i;
-  uint64_t total_sse = 0;
-  uint32_t total_samples = 0;
-
-  for (i = 0; i < 3; ++i) {
-    const int w = widths[i];
-    const int h = heights[i];
-    const uint32_t samples = w * h;
-    const uint64_t sse = get_sse(a_planes[i], a_strides[i],
-                                 b_planes[i], b_strides[i],
-                                 w, h);
-    psnr->sse[1 + i] = sse;
-    psnr->samples[1 + i] = samples;
-    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
-
-    total_sse += sse;
-    total_samples += samples;
-  }
-
-  psnr->sse[0] = total_sse;
-  psnr->samples[0] = total_samples;
-  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
-                                  (double)total_sse);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 static void generate_psnr_packet(VP9_COMP *cpi) {
   struct vpx_codec_cx_pkt pkt;
   int i;
   PSNR_STATS psnr;
 #if CONFIG_VP9_HIGHBITDEPTH
-  calc_highbd_psnr(cpi->Source, cpi->common.frame_to_show, &psnr,
-                   cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
+  vpx_calc_highbd_psnr(cpi->Source, cpi->common.frame_to_show, &psnr,
+                       cpi->td.mb.e_mbd.bd, cpi->oxcf.input_bit_depth);
 #else
-  calc_psnr(cpi->Source, cpi->common.frame_to_show, &psnr);
+  vpx_calc_psnr(cpi->Source, cpi->common.frame_to_show, &psnr);
 #endif
 
   for (i = 0; i < 4; ++i) {
@@ -3160,12 +2892,12 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth) {
-    recon_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+    recon_err = vpx_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
   } else {
-    recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+    recon_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
   }
 #else
-  recon_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+  recon_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 
@@ -3721,12 +3453,12 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
         if (cm->use_highbitdepth) {
-          kf_err = vp9_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+          kf_err = vpx_highbd_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
         } else {
-          kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+          kf_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
         }
 #else
-        kf_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+        kf_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
         // Prevent possible divide by zero error below for perfect KF
@@ -4142,13 +3874,13 @@
   if (cpi->rc.next_key_frame_forced && cpi->rc.frames_to_key == 1) {
 #if CONFIG_VP9_HIGHBITDEPTH
     if (cm->use_highbitdepth) {
-      cpi->ambient_err = vp9_highbd_get_y_sse(cpi->Source,
+      cpi->ambient_err = vpx_highbd_get_y_sse(cpi->Source,
                                               get_frame_new_buffer(cm));
     } else {
-      cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+      cpi->ambient_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
     }
 #else
-    cpi->ambient_err = vp9_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
+    cpi->ambient_err = vpx_get_y_sse(cpi->Source, get_frame_new_buffer(cm));
 #endif  // CONFIG_VP9_HIGHBITDEPTH
   }
 
@@ -4872,7 +4604,15 @@
     cpi->bytes += (int)(*size);
 
     if (cm->show_frame) {
+      uint32_t bit_depth = 8;
+      uint32_t in_bit_depth = 8;
       cpi->count++;
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (cm->use_highbitdepth) {
+        in_bit_depth = cpi->oxcf.input_bit_depth;
+        bit_depth = cm->bit_depth;
+      }
+#endif  // CONFIG_VP9_HIGHBITDEPTH
 
       if (cpi->b_calculate_psnr) {
         YV12_BUFFER_CONFIG *orig = cpi->Source;
@@ -4880,10 +4620,10 @@
         YV12_BUFFER_CONFIG *pp = &cm->post_proc_buffer;
         PSNR_STATS psnr;
 #if CONFIG_VP9_HIGHBITDEPTH
-        calc_highbd_psnr(orig, recon, &psnr, cpi->td.mb.e_mbd.bd,
-                         cpi->oxcf.input_bit_depth);
+        vpx_calc_highbd_psnr(orig, recon, &psnr, cpi->td.mb.e_mbd.bd,
+                             in_bit_depth);
 #else
-        calc_psnr(orig, recon, &psnr);
+        vpx_calc_psnr(orig, recon, &psnr);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
         adjust_image_stat(psnr.psnr[1], psnr.psnr[2], psnr.psnr[3],
@@ -4896,7 +4636,7 @@
           PSNR_STATS psnr2;
           double frame_ssim2 = 0, weight = 0;
 #if CONFIG_VP9_POSTPROC
-          if (vpx_alloc_frame_buffer(&cm->post_proc_buffer,
+          if (vpx_alloc_frame_buffer(pp,
                                      recon->y_crop_width, recon->y_crop_height,
                                      cm->subsampling_x, cm->subsampling_y,
 #if CONFIG_VP9_HIGHBITDEPTH
@@ -4908,16 +4648,16 @@
                                "Failed to allocate post processing buffer");
           }
 
-          vp9_deblock(cm->frame_to_show, &cm->post_proc_buffer,
+          vp9_deblock(cm->frame_to_show, pp,
                       cm->lf.filter_level * 10 / 6);
 #endif
           vpx_clear_system_state();
 
 #if CONFIG_VP9_HIGHBITDEPTH
-          calc_highbd_psnr(orig, pp, &psnr2, cpi->td.mb.e_mbd.bd,
-                           cpi->oxcf.input_bit_depth);
+          vpx_calc_highbd_psnr(orig, pp, &psnr2, cpi->td.mb.e_mbd.bd,
+                               cpi->oxcf.input_bit_depth);
 #else
-          calc_psnr(orig, pp, &psnr2);
+          vpx_calc_psnr(orig, pp, &psnr2);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
           cpi->totalp_sq_error += psnr2.sse[0];
@@ -4928,7 +4668,7 @@
 #if CONFIG_VP9_HIGHBITDEPTH
           if (cm->use_highbitdepth) {
             frame_ssim2 = vpx_highbd_calc_ssim(orig, recon, &weight,
-                                               (int)cm->bit_depth);
+                                               bit_depth, in_bit_depth);
           } else {
             frame_ssim2 = vpx_calc_ssim(orig, recon, &weight);
           }
@@ -4943,12 +4683,12 @@
 #if CONFIG_VP9_HIGHBITDEPTH
           if (cm->use_highbitdepth) {
             frame_ssim2 = vpx_highbd_calc_ssim(
-                orig, &cm->post_proc_buffer, &weight, (int)cm->bit_depth);
+                orig, pp, &weight, bit_depth, in_bit_depth);
           } else {
-            frame_ssim2 = vpx_calc_ssim(orig, &cm->post_proc_buffer, &weight);
+            frame_ssim2 = vpx_calc_ssim(orig, pp, &weight);
           }
 #else
-          frame_ssim2 = vpx_calc_ssim(orig, &cm->post_proc_buffer, &weight);
+          frame_ssim2 = vpx_calc_ssim(orig, pp, &weight);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
           cpi->summedp_quality += frame_ssim2 * weight;
@@ -5000,37 +4740,16 @@
         }
       }
 
-      if (cpi->b_calculate_ssimg) {
-        double y, u, v, frame_all;
-#if CONFIG_VP9_HIGHBITDEPTH
-        if (cm->use_highbitdepth) {
-          frame_all = vpx_highbd_calc_ssimg(cpi->Source, cm->frame_to_show, &y,
-                                            &u, &v, (int)cm->bit_depth);
-        } else {
-          frame_all = vpx_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u,
-                                     &v);
-        }
-#else
-        frame_all = vpx_calc_ssimg(cpi->Source, cm->frame_to_show, &y, &u, &v);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-        adjust_image_stat(y, u, v, frame_all, &cpi->ssimg);
-      }
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (!cm->use_highbitdepth)
-#endif
       {
         double y, u, v, frame_all;
         frame_all = vpx_calc_fastssim(cpi->Source, cm->frame_to_show, &y, &u,
-                                      &v);
+                                      &v, bit_depth, in_bit_depth);
         adjust_image_stat(y, u, v, frame_all, &cpi->fastssim);
-        /* TODO(JBB): add 10/12 bit support */
       }
-#if CONFIG_VP9_HIGHBITDEPTH
-      if (!cm->use_highbitdepth)
-#endif
       {
         double y, u, v, frame_all;
-        frame_all = vpx_psnrhvs(cpi->Source, cm->frame_to_show, &y, &u, &v);
+        frame_all = vpx_psnrhvs(cpi->Source, cm->frame_to_show, &y, &u, &v,
+                                bit_depth, in_bit_depth);
         adjust_image_stat(y, u, v, frame_all, &cpi->psnrhvs);
       }
     }
@@ -5158,28 +4877,6 @@
   return;
 }
 
-int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                      const YV12_BUFFER_CONFIG *b) {
-  assert(a->y_crop_width == b->y_crop_width);
-  assert(a->y_crop_height == b->y_crop_height);
-
-  return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
-                 a->y_crop_width, a->y_crop_height);
-}
-
-#if CONFIG_VP9_HIGHBITDEPTH
-int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b) {
-  assert(a->y_crop_width == b->y_crop_width);
-  assert(a->y_crop_height == b->y_crop_height);
-  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
-
-  return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
-                        a->y_crop_width, a->y_crop_height);
-}
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 int vp9_get_quantizer(VP9_COMP *cpi) {
   return cpi->common.base_qindex;
 }
diff --git a/vp9/encoder/vp9_encoder.h b/vp9/encoder/vp9_encoder.h
index b65dfa8..9a8b15a 100644
--- a/vp9/encoder/vp9_encoder.h
+++ b/vp9/encoder/vp9_encoder.h
@@ -501,13 +501,10 @@
   unsigned int tot_recode_hits;
   double worst_ssim;
 
-  ImageStat ssimg;
   ImageStat fastssim;
   ImageStat psnrhvs;
 
-  int b_calculate_ssimg;
   int b_calculate_blockiness;
-
   int b_calculate_consistency;
 
   double total_inconsistency;
@@ -688,12 +685,6 @@
   return get_token_alloc(tile_mb_rows, tile_mb_cols);
 }
 
-int64_t vp9_get_y_sse(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b);
-#if CONFIG_VP9_HIGHBITDEPTH
-int64_t vp9_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
-                             const YV12_BUFFER_CONFIG *b);
-#endif  // CONFIG_VP9_HIGHBITDEPTH
-
 void vp9_scale_references(VP9_COMP *cpi);
 
 void vp9_update_reference_frames(VP9_COMP *cpi);
diff --git a/vp9/encoder/vp9_picklpf.c b/vp9/encoder/vp9_picklpf.c
index f6b1dfc..80ab238 100644
--- a/vp9/encoder/vp9_picklpf.c
+++ b/vp9/encoder/vp9_picklpf.c
@@ -12,7 +12,7 @@
 #include <limits.h>
 
 #include "./vpx_scale_rtcd.h"
-
+#include "vpx_dsp/psnr.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
 
@@ -52,12 +52,12 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (cm->use_highbitdepth) {
-    filt_err = vp9_highbd_get_y_sse(sd, cm->frame_to_show);
+    filt_err = vpx_highbd_get_y_sse(sd, cm->frame_to_show);
   } else {
-    filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
+    filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
   }
 #else
-  filt_err = vp9_get_y_sse(sd, cm->frame_to_show);
+  filt_err = vpx_get_y_sse(sd, cm->frame_to_show);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
   // Re-instate the unfiltered frame
diff --git a/vp9/encoder/vp9_rd.c b/vp9/encoder/vp9_rd.c
index fba45ad..5ec7b25 100644
--- a/vp9/encoder/vp9_rd.c
+++ b/vp9/encoder/vp9_rd.c
@@ -596,6 +596,12 @@
 
   rd->thresh_mult[THR_NEARMV] += 1000;
   rd->thresh_mult[THR_NEARA] += 1000;
+  rd->thresh_mult[THR_NEARG] += 1000;
+
+  rd->thresh_mult[THR_ZEROMV] += 2000;
+  rd->thresh_mult[THR_ZEROG] += 2000;
+  rd->thresh_mult[THR_ZEROA] += 2000;
+
   rd->thresh_mult[THR_COMP_NEARESTLA] += 1000;
   rd->thresh_mult[THR_COMP_NEARESTGA] += 1000;
 
@@ -603,13 +609,9 @@
 
   rd->thresh_mult[THR_COMP_NEARLA] += 1500;
   rd->thresh_mult[THR_COMP_NEWLA] += 2000;
-  rd->thresh_mult[THR_NEARG] += 1000;
   rd->thresh_mult[THR_COMP_NEARGA] += 1500;
   rd->thresh_mult[THR_COMP_NEWGA] += 2000;
 
-  rd->thresh_mult[THR_ZEROMV] += 2000;
-  rd->thresh_mult[THR_ZEROG] += 2000;
-  rd->thresh_mult[THR_ZEROA] += 2000;
   rd->thresh_mult[THR_COMP_ZEROLA] += 2500;
   rd->thresh_mult[THR_COMP_ZEROGA] += 2500;
 
@@ -624,9 +626,10 @@
 }
 
 void vp9_set_rd_speed_thresholds_sub8x8(VP9_COMP *cpi) {
-  static const int thresh_mult[2][MAX_REFS] =
-      {{2500, 2500, 2500, 4500, 4500, 2500},
-       {2000, 2000, 2000, 4000, 4000, 2000}};
+  static const int thresh_mult[2][MAX_REFS] = {
+    {2500, 2500, 2500, 4500, 4500, 2500},
+    {2000, 2000, 2000, 4000, 4000, 2000}
+  };
   RD_OPT *const rd = &cpi->rd;
   const int idx = cpi->oxcf.mode == BEST;
   memcpy(rd->thresh_mult_sub8x8, thresh_mult[idx], sizeof(thresh_mult[idx]));
diff --git a/vp9/encoder/x86/vp9_dct_ssse3.c b/vp9/encoder/x86/vp9_dct_ssse3.c
index b09eac0..1a1d4ea 100644
--- a/vp9/encoder/x86/vp9_dct_ssse3.c
+++ b/vp9/encoder/x86/vp9_dct_ssse3.c
@@ -9,11 +9,6 @@
  */
 
 #include <assert.h>
-#if defined(_MSC_VER) && _MSC_VER <= 1500
-// Need to include math.h before calling tmmintrin.h/intrin.h
-// in certain versions of MSVS.
-#include <math.h>
-#endif
 #include <tmmintrin.h>  // SSSE3
 
 #include "./vp9_rtcd.h"
diff --git a/vp9/encoder/x86/vp9_frame_scale_ssse3.c b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
index 38af3b1..23325d6 100644
--- a/vp9/encoder/x86/vp9_frame_scale_ssse3.c
+++ b/vp9/encoder/x86/vp9_frame_scale_ssse3.c
@@ -8,11 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#if defined(_MSC_VER) && _MSC_VER <= 1500
-// Need to include math.h before calling tmmintrin.h/intrin.h
-// in certain versions of MSVS.
-#include <math.h>
-#endif
 #include <tmmintrin.h>  // SSSE3
 
 #include "./vp9_rtcd.h"
diff --git a/vp9/vp9_cx_iface.c b/vp9/vp9_cx_iface.c
index 9ad86cb..51c6fbb 100644
--- a/vp9/vp9_cx_iface.c
+++ b/vp9/vp9_cx_iface.c
@@ -105,19 +105,6 @@
   BufferPool              *buffer_pool;
 };
 
-static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
-  switch (frame) {
-    case VP8_LAST_FRAME:
-      return VP9_LAST_FLAG;
-    case VP8_GOLD_FRAME:
-      return VP9_GOLD_FLAG;
-    case VP8_ALTR_FRAME:
-      return VP9_ALT_FLAG;
-  }
-  assert(0 && "Invalid Reference Frame");
-  return VP9_LAST_FLAG;
-}
-
 static vpx_codec_err_t update_error_state(vpx_codec_alg_priv_t *ctx,
     const struct vpx_internal_error_info *error) {
   const vpx_codec_err_t res = error->error_code;
diff --git a/vp9/vp9_dx_iface.c b/vp9/vp9_dx_iface.c
index be5d160..4ee5ce1 100644
--- a/vp9/vp9_dx_iface.c
+++ b/vp9/vp9_dx_iface.c
@@ -776,7 +776,8 @@
     FrameWorkerData *const frame_worker_data = (FrameWorkerData *)worker->data1;
     image2yuvconfig(&frame->img, &sd);
     return vp9_set_reference_dec(&frame_worker_data->pbi->common,
-                                 (VP9_REFFRAME)frame->frame_type, &sd);
+                                 ref_frame_to_vp9_reframe(frame->frame_type),
+                                 &sd);
   } else {
     return VPX_CODEC_INVALID_PARAM;
   }
diff --git a/vp9/vp9_iface_common.h b/vp9/vp9_iface_common.h
index 938d422..44a5e81 100644
--- a/vp9/vp9_iface_common.h
+++ b/vp9/vp9_iface_common.h
@@ -133,4 +133,16 @@
   return VPX_CODEC_OK;
 }
 
+static VP9_REFFRAME ref_frame_to_vp9_reframe(vpx_ref_frame_type_t frame) {
+  switch (frame) {
+    case VP8_LAST_FRAME:
+      return VP9_LAST_FLAG;
+    case VP8_GOLD_FRAME:
+      return VP9_GOLD_FLAG;
+    case VP8_ALTR_FRAME:
+      return VP9_ALT_FLAG;
+  }
+  assert(0 && "Invalid Reference Frame");
+  return VP9_LAST_FLAG;
+}
 #endif  // VP9_VP9_IFACE_COMMON_H_
diff --git a/vpx/internal/vpx_psnr.h b/vpx/internal/vpx_psnr.h
deleted file mode 100644
index 07d81bb..0000000
--- a/vpx/internal/vpx_psnr.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#ifndef VPX_INTERNAL_VPX_PSNR_H_
-#define VPX_INTERNAL_VPX_PSNR_H_
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-// TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t
-
-/*!\brief Converts SSE to PSNR
- *
- * Converts sum of squared errros (SSE) to peak signal-to-noise ratio (PNSR).
- *
- * \param[in]    samples       Number of samples
- * \param[in]    peak          Max sample value
- * \param[in]    sse           Sum of squared errors
- */
-double vpx_sse_to_psnr(double samples, double peak, double sse);
-
-#ifdef __cplusplus
-}  // extern "C"
-#endif
-
-#endif  // VPX_INTERNAL_VPX_PSNR_H_
diff --git a/vpx/src/vpx_psnr.c b/vpx/src/vpx_psnr.c
deleted file mode 100644
index 05843ac..0000000
--- a/vpx/src/vpx_psnr.c
+++ /dev/null
@@ -1,24 +0,0 @@
-/*
- *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
- *
- *  Use of this source code is governed by a BSD-style license
- *  that can be found in the LICENSE file in the root of the source
- *  tree. An additional intellectual property rights grant can be found
- *  in the file PATENTS.  All contributing project authors may
- *  be found in the AUTHORS file in the root of the source tree.
- */
-
-#include <math.h>
-
-#include "vpx/internal/vpx_psnr.h"
-
-#define MAX_PSNR 100.0
-
-double vpx_sse_to_psnr(double samples, double peak, double sse) {
-  if (sse > 0.0) {
-    const double psnr = 10.0 * log10(samples * peak * peak / sse);
-    return psnr > MAX_PSNR ? MAX_PSNR : psnr;
-  } else {
-    return MAX_PSNR;
-  }
-}
diff --git a/vpx/vp8.h b/vpx/vp8.h
index 8a035f9..ba67c38 100644
--- a/vpx/vp8.h
+++ b/vpx/vp8.h
@@ -56,6 +56,9 @@
    */
   VP9_GET_REFERENCE           = 128,  /**< get a pointer to a reference frame */
   VP8_COMMON_CTRL_ID_MAX,
+
+  VP10_GET_NEW_FRAME_IMAGE    = 192,  /**< get a pointer to the new frame */
+
   VP8_DECODER_CTRL_ID_START   = 256
 };
 
@@ -137,6 +140,8 @@
 #define VPX_CTRL_VP8_SET_DBG_DISPLAY_MV
 VPX_CTRL_USE_TYPE(VP9_GET_REFERENCE,           vp9_ref_frame_t *)
 #define VPX_CTRL_VP9_GET_REFERENCE
+VPX_CTRL_USE_TYPE(VP10_GET_NEW_FRAME_IMAGE,    vpx_image_t *)
+#define VPX_CTRL_VP10_GET_NEW_FRAME_IMAGE
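+/* Usage sketch (illustrative only; `ctx` is a hypothetical decoder context):
+ *   vpx_image_t img;
+ *   vpx_codec_control(&ctx, VP10_GET_NEW_FRAME_IMAGE, &img);
+ */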
 
 /*!\endcond */
 /*! @} - end defgroup vp8 */
diff --git a/vpx/vp8cx.h b/vpx/vp8cx.h
index 2752a86..dcc35c9 100644
--- a/vpx/vp8cx.h
+++ b/vpx/vp8cx.h
@@ -141,6 +141,12 @@
  * \sa #vpx_codec_control
  */
 enum vp8e_enc_control_id {
+  /*!\brief Codec control function to set which reference frame the encoder
+   * can use.
+   *
+   * Supported in codecs: VP8, VP9
+   */
+  VP8E_USE_REFERENCE         = 7,
+
   /*!\brief Codec control function to pass an ROI map to encoder.
    *
    * Supported in codecs: VP8, VP9
@@ -178,6 +184,15 @@
    */
   VP8E_SET_ENABLEAUTOALTREF,
 
+#if CONFIG_EXT_REFS
+  /*!\brief Codec control function to enable automatic setting and use of
+   * bwd-pred frames.
+   *
+   * Supported in codecs: VP10
+   */
+  VP8E_SET_ENABLEAUTOBWDREF,
+#endif  // CONFIG_EXT_REFS
+
   /*!\brief control function to set noise sensitivity
    *
    * 0: off, 1: OnYOnly, 2: OnYUV,
@@ -568,7 +583,16 @@
    *
    * Supported in codecs: VP9
    */
-  VP9E_GET_LEVEL
+  VP9E_GET_LEVEL,
+
+  /*!\brief Codec control function to set intended superblock size.
+   *
+   * By default, the superblock size is determined separately for each
+   * frame by the encoder.
+   *
+   * Supported in codecs: VP10
+   */
+  VP10E_SET_SUPERBLOCK_SIZE,
 };
 
 /*!\brief vpx 1-D scaling mode
@@ -718,6 +742,8 @@
  *
  */
 
+VPX_CTRL_USE_TYPE_DEPRECATED(VP8E_USE_REFERENCE, int)
+#define VPX_CTRL_VP8E_USE_REFERENCE
 VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS,        int)
 #define VPX_CTRL_VP8E_SET_FRAME_FLAGS
 VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID,  int)
@@ -742,6 +768,12 @@
 #define VPX_CTRL_VP8E_SET_CPUUSED
 VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF,   unsigned int)
 #define VPX_CTRL_VP8E_SET_ENABLEAUTOALTREF
+
+#if CONFIG_EXT_REFS
+VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOBWDREF,   unsigned int)
+#define VPX_CTRL_VP8E_SET_ENABLEAUTOBWDREF
+#endif  // CONFIG_EXT_REFS
+
 VPX_CTRL_USE_TYPE(VP8E_SET_NOISE_SENSITIVITY,  unsigned int)
 #define VPX_CTRL_VP8E_SET_NOISE_SENSITIVITY
 VPX_CTRL_USE_TYPE(VP8E_SET_SHARPNESS,          unsigned int)
@@ -821,15 +853,21 @@
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *)
 #define VPX_CTRL_VP9E_SET_SVC_REF_FRAME_CONFIG
 
-VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *)
+/*!\brief
+ *
+ * TODO(rbultje): add support for this control in ffmpeg
+ */
 #define VPX_CTRL_VP9E_SET_RENDER_SIZE
+VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *)
+
+VPX_CTRL_USE_TYPE(VP10E_SET_SUPERBLOCK_SIZE, unsigned int)
+#define VPX_CTRL_VP10E_SET_SUPERBLOCK_SIZE
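+/* Usage sketch (illustrative only; `ctx` is a hypothetical encoder context):
+ *   vpx_codec_control(&ctx, VP10E_SET_SUPERBLOCK_SIZE,
+ *                     VPX_SUPERBLOCK_SIZE_64X64);
+ */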
 
 VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL,  unsigned int)
 #define VPX_CTRL_VP9E_SET_TARGET_LEVEL
 
 VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *)
 #define VPX_CTRL_VP9E_GET_LEVEL
-
 /*!\endcond */
 /*! @} - end defgroup vp8_encoder */
 #ifdef __cplusplus
diff --git a/vpx/vp8dx.h b/vpx/vp8dx.h
index 1f02fd5..347521e 100644
--- a/vpx/vp8dx.h
+++ b/vpx/vp8dx.h
@@ -121,7 +121,16 @@
    */
   VP9_SET_SKIP_LOOP_FILTER,
 
-  VP8_DECODER_CTRL_ID_MAX
+  VP8_DECODER_CTRL_ID_MAX,
+
+  /** control function to set the range of tile decoding. A value greater
+   * than or equal to zero indicates that only the specified row/column is
+   * decoded. A value of -1 indicates that the whole row/column is decoded.
+   * As a special case, when both values are -1 the whole frame is decoded.
+   */
+  VP10_SET_DECODE_TILE_ROW,
+  VP10_SET_DECODE_TILE_COL
 };
 
 /** Decrypt n bytes of data from input -> output, using the decrypt_state
@@ -174,7 +183,10 @@
 #define VPX_CTRL_VP9D_GET_FRAME_SIZE
 VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
 #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER
-
+VPX_CTRL_USE_TYPE(VP10_SET_DECODE_TILE_ROW,     int)
+#define VPX_CTRL_VP10_SET_DECODE_TILE_ROW
+VPX_CTRL_USE_TYPE(VP10_SET_DECODE_TILE_COL,     int)
+#define VPX_CTRL_VP10_SET_DECODE_TILE_COL
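+/* Usage sketch (illustrative only; `ctx` is a hypothetical decoder context):
+ * decode only tile row 0 while decoding all tile columns:
+ *   vpx_codec_control(&ctx, VP10_SET_DECODE_TILE_ROW, 0);
+ *   vpx_codec_control(&ctx, VP10_SET_DECODE_TILE_COL, -1);
+ */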
 /*!\endcond */
 /*! @} - end defgroup vp8_decoder */
 
diff --git a/vpx/vpx_codec.h b/vpx/vpx_codec.h
index b6037bb..e65e3f4 100644
--- a/vpx/vpx_codec.h
+++ b/vpx/vpx_codec.h
@@ -222,6 +222,18 @@
     VPX_BITS_12 = 12,  /**< 12 bits */
   } vpx_bit_depth_t;
 
+  /*!\brief Superblock size selection.
+   *
+   * Defines the superblock size used for encoding. The superblock size can
+   * either be fixed at 64x64 or 128x128 pixels, or it can be dynamically
+   * selected by the encoder for each frame.
+   */
+  typedef enum vpx_superblock_size {
+    VPX_SUPERBLOCK_SIZE_64X64,    /**< Always use 64x64 superblocks. */
+    VPX_SUPERBLOCK_SIZE_128X128,  /**< Always use 128x128 superblocks. */
+    VPX_SUPERBLOCK_SIZE_DYNAMIC   /**< Select superblock size dynamically. */
+  } vpx_superblock_size_t;
+
   /*
    * Library Version Number Interface
    *
diff --git a/vpx/vpx_codec.mk b/vpx/vpx_codec.mk
index ccdef04..b77f458 100644
--- a/vpx/vpx_codec.mk
+++ b/vpx/vpx_codec.mk
@@ -36,10 +36,8 @@
 API_SRCS-yes += src/vpx_encoder.c
 API_SRCS-yes += vpx_encoder.h
 API_SRCS-yes += internal/vpx_codec_internal.h
-API_SRCS-yes += internal/vpx_psnr.h
 API_SRCS-yes += src/vpx_codec.c
 API_SRCS-yes += src/vpx_image.c
-API_SRCS-yes += src/vpx_psnr.c
 API_SRCS-yes += vpx_codec.h
 API_SRCS-yes += vpx_codec.mk
 API_SRCS-yes += vpx_frame_buffer.h
diff --git a/vpx/vpx_integer.h b/vpx/vpx_integer.h
index 829c9d1..2945c87 100644
--- a/vpx/vpx_integer.h
+++ b/vpx/vpx_integer.h
@@ -24,7 +24,7 @@
 #define VPX_INLINE inline
 #endif
 
-#if (defined(_MSC_VER) && (_MSC_VER < 1600)) || defined(VPX_EMULATE_INTTYPES)
+#if defined(VPX_EMULATE_INTTYPES)
 typedef signed char  int8_t;
 typedef signed short int16_t;
 typedef signed int   int32_t;
@@ -33,16 +33,6 @@
 typedef unsigned short uint16_t;
 typedef unsigned int   uint32_t;
 
-#if (defined(_MSC_VER) && (_MSC_VER < 1600))
-typedef signed __int64   int64_t;
-typedef unsigned __int64 uint64_t;
-#define INT64_MAX _I64_MAX
-#define INT32_MAX _I32_MAX
-#define INT32_MIN _I32_MIN
-#define INT16_MAX _I16_MAX
-#define INT16_MIN _I16_MIN
-#endif
-
 #ifndef _UINTPTR_T_DEFINED
 typedef size_t uintptr_t;
 #endif
diff --git a/vpx_dsp/avg.c b/vpx_dsp/avg.c
index a8c9966..cf7fd36 100644
--- a/vpx_dsp/avg.c
+++ b/vpx_dsp/avg.c
@@ -12,22 +12,22 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
-unsigned int vpx_avg_8x8_c(const uint8_t *s, int p) {
+unsigned int vpx_avg_8x8_c(const uint8_t *src, int stride) {
   int i, j;
   int sum = 0;
-  for (i = 0; i < 8; ++i, s+=p)
-    for (j = 0; j < 8; sum += s[j], ++j) {}
+  for (i = 0; i < 8; ++i, src += stride)
+    for (j = 0; j < 8; sum += src[j], ++j) {}
 
-  return (sum + 32) >> 6;
+  return ROUND_POWER_OF_TWO(sum, 6);
 }
 
-unsigned int vpx_avg_4x4_c(const uint8_t *s, int p) {
+unsigned int vpx_avg_4x4_c(const uint8_t *src, int stride) {
   int i, j;
   int sum = 0;
-  for (i = 0; i < 4; ++i, s+=p)
-    for (j = 0; j < 4; sum += s[j], ++j) {}
+  for (i = 0; i < 4; ++i, src += stride)
+    for (j = 0; j < 4; sum += src[j], ++j) {}
 
-  return (sum + 8) >> 4;
+  return ROUND_POWER_OF_TWO(sum, 4);
 }
 
 // src_diff: first pass, 9 bit, dynamic range [-255, 255]
@@ -178,14 +178,15 @@
   return var;
 }
 
-void vpx_minmax_8x8_c(const uint8_t *s, int p, const uint8_t *d, int dp,
+void vpx_minmax_8x8_c(const uint8_t *src, int src_stride,
+                      const uint8_t *ref, int ref_stride,
                       int *min, int *max) {
   int i, j;
   *min = 255;
   *max = 0;
-  for (i = 0; i < 8; ++i, s += p, d += dp) {
+  for (i = 0; i < 8; ++i, src += src_stride, ref += ref_stride) {
     for (j = 0; j < 8; ++j) {
-      int diff = abs(s[j]-d[j]);
+      int diff = abs(src[j] - ref[j]);
       *min = diff < *min ? diff : *min;
       *max = diff > *max ? diff : *max;
     }
@@ -193,24 +194,24 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-unsigned int vpx_highbd_avg_8x8_c(const uint8_t *s8, int p) {
+unsigned int vpx_highbd_avg_8x8_c(const uint8_t *src, int stride) {
   int i, j;
   int sum = 0;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  for (i = 0; i < 8; ++i, s+=p)
+  const uint16_t *s = CONVERT_TO_SHORTPTR(src);
+  for (i = 0; i < 8; ++i, s += stride)
     for (j = 0; j < 8; sum += s[j], ++j) {}
 
-  return (sum + 32) >> 6;
+  return ROUND_POWER_OF_TWO(sum, 6);
 }
 
-unsigned int vpx_highbd_avg_4x4_c(const uint8_t *s8, int p) {
+unsigned int vpx_highbd_avg_4x4_c(const uint8_t *src, int stride) {
   int i, j;
   int sum = 0;
-  const uint16_t* s = CONVERT_TO_SHORTPTR(s8);
-  for (i = 0; i < 4; ++i, s+=p)
+  const uint16_t *s = CONVERT_TO_SHORTPTR(src);
+  for (i = 0; i < 4; ++i, s += stride)
     for (j = 0; j < 4; sum += s[j], ++j) {}
 
-  return (sum + 8) >> 4;
+  return ROUND_POWER_OF_TWO(sum, 4);
 }
 
 void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8,
diff --git a/vpx_dsp/bitreader_buffer.c b/vpx_dsp/bitreader_buffer.c
index d7b55cf..595b9bb 100644
--- a/vpx_dsp/bitreader_buffer.c
+++ b/vpx_dsp/bitreader_buffer.c
@@ -43,11 +43,7 @@
 
 int vpx_rb_read_inv_signed_literal(struct vpx_read_bit_buffer *rb,
                                    int bits) {
-#if CONFIG_MISC_FIXES
   const int nbits = sizeof(unsigned) * 8 - bits - 1;
   const unsigned value = (unsigned)vpx_rb_read_literal(rb, bits + 1) << nbits;
   return ((int) value) >> nbits;
-#else
-  return vpx_rb_read_signed_literal(rb, bits);
-#endif
 }
diff --git a/vpx_dsp/bitwriter_buffer.c b/vpx_dsp/bitwriter_buffer.c
index 6182a72..8633372 100644
--- a/vpx_dsp/bitwriter_buffer.c
+++ b/vpx_dsp/bitwriter_buffer.c
@@ -39,10 +39,5 @@
 
 void vpx_wb_write_inv_signed_literal(struct vpx_write_bit_buffer *wb,
                                      int data, int bits) {
-#if CONFIG_MISC_FIXES
   vpx_wb_write_literal(wb, data, bits + 1);
-#else
-  vpx_wb_write_literal(wb, abs(data), bits);
-  vpx_wb_write_bit(wb, data < 0);
-#endif
 }
diff --git a/vpx_dsp/blend.h b/vpx_dsp/blend.h
new file mode 100644
index 0000000..109183a
--- /dev/null
+++ b/vpx_dsp/blend.h
@@ -0,0 +1,40 @@
+/*
+*  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+*
+*  Use of this source code is governed by a BSD-style license
+*  that can be found in the LICENSE file in the root of the source
+*  tree. An additional intellectual property rights grant can be found
+*  in the file PATENTS.  All contributing project authors may
+*  be found in the AUTHORS file in the root of the source tree.
+*/
+
+#ifndef VPX_DSP_BLEND_H_
+#define VPX_DSP_BLEND_H_
+
+#include "vpx_ports/mem.h"
+
+// Various blending functions and macros.
+// See also the vpx_blend_* functions in vpx_dsp_rtcd.h
+
+// Alpha blending with alpha values from the range [0, 64], where 64
+// means use the first input and 0 means use the second input.
+#define VPX_BLEND_A64_ROUND_BITS  6
+#define VPX_BLEND_A64_MAX_ALPHA   (1 << VPX_BLEND_A64_ROUND_BITS)   // 64
+
+#define VPX_BLEND_A64(a, v0, v1)                                              \
+  ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A64_MAX_ALPHA - (a)) * (v1),     \
+                     VPX_BLEND_A64_ROUND_BITS)
+
+// Alpha blending with alpha values from the range [0, 256], where 256
+// means use the first input and 0 means use the second input.
+#define VPX_BLEND_A256_ROUND_BITS 8
+#define VPX_BLEND_A256_MAX_ALPHA  (1 << VPX_BLEND_A256_ROUND_BITS)  // 256
+
+#define VPX_BLEND_A256(a, v0, v1)                                             \
+  ROUND_POWER_OF_TWO((a) * (v0) + (VPX_BLEND_A256_MAX_ALPHA - (a)) * (v1),    \
+                     VPX_BLEND_A256_ROUND_BITS)
+
+// Blending by averaging.
+#define VPX_BLEND_AVG(v0, v1)   ROUND_POWER_OF_TWO((v0) + (v1), 1)
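+
+// Worked example (values chosen for illustration): with a = 16, v0 = 200 and
+// v1 = 100, VPX_BLEND_A64 computes
+//   ROUND_POWER_OF_TWO(16 * 200 + 48 * 100, 6) = (8000 + 32) >> 6 = 125,
+// i.e. one quarter of the way from v1 towards v0.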
+
+#endif  // VPX_DSP_BLEND_H_
diff --git a/vpx_dsp/blend_a64_hmask.c b/vpx_dsp/blend_a64_hmask.c
new file mode 100644
index 0000000..90f3415
--- /dev/null
+++ b/vpx_dsp/blend_a64_hmask.c
@@ -0,0 +1,73 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+void vpx_blend_a64_hmask_c(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  int i, j;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      dst[i * dst_stride + j] = VPX_BLEND_A64(mask[j],
+                                              src0[i * src0_stride + j],
+                                              src1[i * src1_stride + j]);
+    }
+  }
+}
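+
+// Usage sketch (hypothetical buffers; strides equal to the 8-pixel width):
+// blend two 8x8 blocks with a left-to-right horizontal ramp, fading the
+// output from src1 to src0 across the columns:
+//   uint8_t dst[8 * 8], src0[8 * 8], src1[8 * 8];
+//   const uint8_t mask[8] = { 0, 9, 18, 27, 37, 46, 55, 64 };
+//   vpx_blend_a64_hmask_c(dst, 8, src0, 8, src1, 8, mask, 8, 8);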
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_hmask_c(
+    uint8_t *dst_8, uint32_t dst_stride,
+    const uint8_t *src0_8, uint32_t src0_stride,
+    const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, int bd) {
+  int i, j;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  for (i = 0; i < h; ++i) {
+    for (j = 0; j < w; ++j) {
+      dst[i * dst_stride + j] = VPX_BLEND_A64(mask[j],
+                                              src0[i * src0_stride + j],
+                                              src1[i * src1_stride + j]);
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/blend_a64_mask.c b/vpx_dsp/blend_a64_mask.c
new file mode 100644
index 0000000..1649798
--- /dev/null
+++ b/vpx_dsp/blend_a64_mask.c
@@ -0,0 +1,151 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/blend.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+// Blending with alpha mask. Mask values come from the range [0, 64],
+// as described for VPX_BLEND_A64 in vpx_dsp/blend.h. src0 or src1 can
+// be the same as dst, or dst can be different from both sources.
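+//
+// When subw or subh is 1 the mask is sampled at twice the destination
+// resolution in that dimension: e.g. with subw == 1 and subh == 0, the mask
+// values at columns 2 * j and 2 * j + 1 are averaged before blending output
+// column j.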
+
+void vpx_blend_a64_mask_c(uint8_t *dst, uint32_t dst_stride,
+                          const uint8_t *src0, uint32_t src0_stride,
+                          const uint8_t *src1, uint32_t src1_stride,
+                          const uint8_t *mask, uint32_t mask_stride,
+                          int h, int w, int subh, int subw) {
+  int i, j;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  if (subw == 0 && subh == 0) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = mask[i * mask_stride + j];
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else if (subw == 1 && subh == 1) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j)] +
+                               mask[(2 * i) * mask_stride + (2 * j + 1)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+                               2);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else if (subw == 1 && subh == 0) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = VPX_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+                                    mask[i * mask_stride + (2 * j + 1)]);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = VPX_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+                                    mask[(2 * i + 1) * mask_stride + j]);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_mask_c(uint8_t *dst_8, uint32_t dst_stride,
+                                 const uint8_t *src0_8, uint32_t src0_stride,
+                                 const uint8_t *src1_8, uint32_t src1_stride,
+                                 const uint8_t *mask, uint32_t mask_stride,
+                                 int h, int w, int subh, int subw, int bd) {
+  int i, j;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  if (subw == 0 && subh == 0) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = mask[i * mask_stride + j];
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else if (subw == 1 && subh == 1) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m =
+            ROUND_POWER_OF_TWO(mask[(2 * i) * mask_stride + (2 * j)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j)] +
+                               mask[(2 * i) * mask_stride + (2 * j + 1)] +
+                               mask[(2 * i + 1) * mask_stride + (2 * j + 1)],
+                               2);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else if (subw == 1 && subh == 0) {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = VPX_BLEND_AVG(mask[i * mask_stride + (2 * j)],
+                                    mask[i * mask_stride + (2 * j + 1)]);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  } else {
+    for (i = 0; i < h; ++i) {
+      for (j = 0; j < w; ++j) {
+        const int m = VPX_BLEND_AVG(mask[(2 * i) * mask_stride + j],
+                                    mask[(2 * i + 1) * mask_stride + j]);
+        dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                                src0[i * src0_stride + j],
+                                                src1[i * src1_stride + j]);
+      }
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/blend_a64_vmask.c b/vpx_dsp/blend_a64_vmask.c
new file mode 100644
index 0000000..5d48a83
--- /dev/null
+++ b/vpx_dsp/blend_a64_vmask.c
@@ -0,0 +1,75 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+void vpx_blend_a64_vmask_c(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  int i, j;
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  for (i = 0; i < h; ++i) {
+    const int m = mask[i];
+    for (j = 0; j < w; ++j) {
+      dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                              src0[i * src0_stride + j],
+                                              src1[i * src1_stride + j]);
+    }
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_vmask_c(
+    uint8_t *dst_8, uint32_t dst_stride,
+    const uint8_t *src0_8, uint32_t src0_stride,
+    const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, int bd) {
+  int i, j;
+  uint16_t *dst = CONVERT_TO_SHORTPTR(dst_8);
+  const uint16_t *src0 = CONVERT_TO_SHORTPTR(src0_8);
+  const uint16_t *src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  for (i = 0; i < h; ++i) {
+    const int m = mask[i];
+    for (j = 0; j < w; ++j) {
+      dst[i * dst_stride + j] = VPX_BLEND_A64(m,
+                                              src0[i * src0_stride + j],
+                                              src1[i * src1_stride + j]);
+    }
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/fastssim.c b/vpx_dsp/fastssim.c
index 1405a30..7d90891 100644
--- a/vpx_dsp/fastssim.c
+++ b/vpx_dsp/fastssim.c
@@ -10,6 +10,7 @@
  *  This code was originally written by: Nathan E. Egge, at the Daala
  *  project.
  */
+#include <assert.h>
 #include <math.h>
 #include <stdlib.h>
 #include <string.h>
@@ -17,19 +18,24 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/ssim.h"
 #include "vpx_ports/system_state.h"
-/* TODO(jbb): High bit depth version of this code needed */
+
 typedef struct fs_level fs_level;
 typedef struct fs_ctx fs_ctx;
 
 #define SSIM_C1 (255 * 255 * 0.01 * 0.01)
 #define SSIM_C2 (255 * 255 * 0.03 * 0.03)
-
+#if CONFIG_VP9_HIGHBITDEPTH
+#define SSIM_C1_10 (1023 * 1023 * 0.01 * 0.01)
+#define SSIM_C1_12 (4095 * 4095 * 0.01 * 0.01)
+#define SSIM_C2_10 (1023 * 1023 * 0.03 * 0.03)
+#define SSIM_C2_12 (4095 * 4095 * 0.03 * 0.03)
+#endif
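+// The constants above follow the standard SSIM stabilizers c1 = (k1 * L)^2
+// and c2 = (k2 * L)^2 with k1 = 0.01, k2 = 0.03 and L the peak sample value
+// (255, 1023 or 4095 for 8-, 10- and 12-bit input, respectively).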
 #define FS_MINI(_a, _b) ((_a) < (_b) ? (_a) : (_b))
 #define FS_MAXI(_a, _b) ((_a) > (_b) ? (_a) : (_b))
 
 struct fs_level {
-  uint16_t *im1;
-  uint16_t *im2;
+  uint32_t *im1;
+  uint32_t *im2;
   double *ssim;
   int w;
   int h;
@@ -80,7 +86,7 @@
     level_size += sizeof(*_ctx->level[l].ssim) - 1;
     level_size /= sizeof(*_ctx->level[l].ssim);
     level_size *= sizeof(*_ctx->level[l].ssim);
-    _ctx->level[l].im1 = (uint16_t *) data;
+    _ctx->level[l].im1 = (uint32_t *)data;
     _ctx->level[l].im2 = _ctx->level[l].im1 + im_size;
     data += level_size;
     _ctx->level[l].ssim = (double *) data;
@@ -96,10 +102,10 @@
 }
 
 static void fs_downsample_level(fs_ctx *_ctx, int _l) {
-  const uint16_t *src1;
-  const uint16_t *src2;
-  uint16_t *dst1;
-  uint16_t *dst2;
+  const uint32_t *src1;
+  const uint32_t *src2;
+  uint32_t *dst1;
+  uint32_t *dst2;
   int w2;
   int h2;
   int w;
@@ -132,11 +138,12 @@
   }
 }
 
-static void fs_downsample_level0(fs_ctx *_ctx, const unsigned char *_src1,
-                                 int _s1ystride, const unsigned char *_src2,
-                                 int _s2ystride, int _w, int _h) {
-  uint16_t *dst1;
-  uint16_t *dst2;
+static void fs_downsample_level0(fs_ctx *_ctx, const uint8_t *_src1,
+                                 int _s1ystride, const uint8_t *_src2,
+                                 int _s2ystride, int _w, int _h,
+                                 uint32_t bd, uint32_t shift) {
+  uint32_t *dst1;
+  uint32_t *dst2;
   int w;
   int h;
   int i;
@@ -155,21 +162,34 @@
       int i1;
       i0 = 2 * i;
       i1 = FS_MINI(i0 + 1, _w);
-      dst1[j * w + i] = _src1[j0 * _s1ystride + i0]
-          + _src1[j0 * _s1ystride + i1] + _src1[j1 * _s1ystride + i0]
-          + _src1[j1 * _s1ystride + i1];
-      dst2[j * w + i] = _src2[j0 * _s2ystride + i0]
-          + _src2[j0 * _s2ystride + i1] + _src2[j1 * _s2ystride + i0]
-          + _src2[j1 * _s2ystride + i1];
+      if (bd == 8 && shift == 0) {
+        dst1[j * w + i] = _src1[j0 * _s1ystride + i0]
+            + _src1[j0 * _s1ystride + i1] + _src1[j1 * _s1ystride + i0]
+            + _src1[j1 * _s1ystride + i1];
+        dst2[j * w + i] = _src2[j0 * _s2ystride + i0]
+            + _src2[j0 * _s2ystride + i1] + _src2[j1 * _s2ystride + i0]
+            + _src2[j1 * _s2ystride + i1];
+      } else {
+        uint16_t *src1s = CONVERT_TO_SHORTPTR(_src1);
+        uint16_t *src2s = CONVERT_TO_SHORTPTR(_src2);
+        dst1[j * w + i] = (src1s[j0 * _s1ystride + i0] >> shift)
+              + (src1s[j0 * _s1ystride + i1] >> shift)
+              + (src1s[j1 * _s1ystride + i0] >> shift)
+              + (src1s[j1 * _s1ystride + i1] >> shift);
+        dst2[j * w + i] = (src2s[j0 * _s2ystride + i0] >> shift)
+              + (src2s[j0 * _s2ystride + i1] >> shift)
+              + (src2s[j1 * _s2ystride + i0] >> shift)
+              + (src2s[j1 * _s2ystride + i1] >> shift);
+      }
     }
   }
 }
 
-static void fs_apply_luminance(fs_ctx *_ctx, int _l) {
+static void fs_apply_luminance(fs_ctx *_ctx, int _l, int bit_depth) {
   unsigned *col_sums_x;
   unsigned *col_sums_y;
-  uint16_t *im1;
-  uint16_t *im2;
+  uint32_t *im1;
+  uint32_t *im2;
   double *ssim;
   double c1;
   int w;
@@ -178,6 +198,15 @@
   int j1offs;
   int i;
   int j;
+  double ssim_c1 = SSIM_C1;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (bit_depth == 10)
+    ssim_c1 = SSIM_C1_10;
+  if (bit_depth == 12)
+    ssim_c1 = SSIM_C1_12;
+#else
+  assert(bit_depth == 8);
+#endif
   w = _ctx->level[_l].w;
   h = _ctx->level[_l].h;
   col_sums_x = _ctx->col_buf;
@@ -196,7 +225,7 @@
       col_sums_y[i] += im2[j1offs + i];
   }
   ssim = _ctx->level[_l].ssim;
-  c1 = (double) (SSIM_C1 * 4096 * (1 << 4 * _l));
+  c1 = (double) (ssim_c1 * 4096 * (1 << 4 * _l));
   for (j = 0; j < h; j++) {
     unsigned mux;
     unsigned muy;
@@ -294,9 +323,9 @@
   } \
   while (0)
 
-static void fs_calc_structure(fs_ctx *_ctx, int _l) {
-  uint16_t *im1;
-  uint16_t *im2;
+static void fs_calc_structure(fs_ctx *_ctx, int _l, int bit_depth) {
+  uint32_t *im1;
+  uint32_t *im2;
   unsigned *gx_buf;
   unsigned *gy_buf;
   double *ssim;
@@ -309,6 +338,16 @@
   int h;
   int i;
   int j;
+  double ssim_c2 = SSIM_C2;
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (bit_depth == 10)
+    ssim_c2 = SSIM_C2_10;
+  if (bit_depth == 12)
+    ssim_c2 = SSIM_C2_12;
+#else
+  assert(bit_depth == 8);
+#endif
+
   w = _ctx->level[_l].w;
   h = _ctx->level[_l].h;
   im1 = _ctx->level[_l].im1;
@@ -318,7 +357,7 @@
   stride = w + 8;
   gy_buf = gx_buf + 8 * stride;
   memset(gx_buf, 0, 2 * 8 * stride * sizeof(*gx_buf));
-  c2 = SSIM_C2 * (1 << 4 * _l) * 16 * 104;
+  c2 = ssim_c2 * (1 << 4 * _l) * 16 * 104;
   for (j = 0; j < h + 4; j++) {
     if (j < h - 1) {
       for (i = 0; i < w - 1; i++) {
@@ -326,11 +365,11 @@
         unsigned g2;
         unsigned gx;
         unsigned gy;
-        g1 = abs(im1[(j + 1) * w + i + 1] - im1[j * w + i]);
-        g2 = abs(im1[(j + 1) * w + i] - im1[j * w + i + 1]);
+        g1 = abs((int)im1[(j + 1) * w + i + 1] - (int)im1[j * w + i]);
+        g2 = abs((int)im1[(j + 1) * w + i] - (int)im1[j * w + i + 1]);
         gx = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
-        g1 = abs(im2[(j + 1) * w + i + 1] - im2[j * w + i]);
-        g2 = abs(im2[(j + 1) * w + i] - im2[j * w + i + 1]);
+        g1 = abs((int)im2[(j + 1) * w + i + 1] - (int)im2[j * w + i]);
+        g2 = abs((int)im2[(j + 1) * w + i] - (int)im2[j * w + i + 1]);
         gy = 4 * FS_MAXI(g1, g2) + FS_MINI(g1, g2);
         gx_buf[(j & 7) * stride + i + 4] = gx;
         gy_buf[(j & 7) * stride + i + 4] = gy;
@@ -421,48 +460,55 @@
   return pow(ret / (w * h), FS_WEIGHTS[_l]);
 }
 
-static double calc_ssim(const unsigned char *_src, int _systride,
-                 const unsigned char *_dst, int _dystride, int _w, int _h) {
+static double convert_ssim_db(double _ssim, double _weight) {
+  assert(_weight >= _ssim);
+  if ((_weight - _ssim) < 1e-10)
+    return MAX_SSIM_DB;
+  return 10 * (log10(_weight) - log10(_weight - _ssim));
+}
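+// Worked example: convert_ssim_db(0.99, 1.0) returns
+//   10 * (log10(1.0) - log10(0.01)) = 20 dB.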
+
+static double calc_ssim(const uint8_t *_src, int _systride,
+                        const uint8_t *_dst, int _dystride,
+                        int _w, int _h, uint32_t _bd, uint32_t _shift) {
   fs_ctx ctx;
   double ret;
   int l;
   ret = 1;
   fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
-  fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h);
+  fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride,
+                       _w, _h, _bd, _shift);
   for (l = 0; l < FS_NLEVELS - 1; l++) {
-    fs_calc_structure(&ctx, l);
+    fs_calc_structure(&ctx, l, _bd);
     ret *= fs_average(&ctx, l);
     fs_downsample_level(&ctx, l + 1);
   }
-  fs_calc_structure(&ctx, l);
-  fs_apply_luminance(&ctx, l);
+  fs_calc_structure(&ctx, l, _bd);
+  fs_apply_luminance(&ctx, l, _bd);
   ret *= fs_average(&ctx, l);
   fs_ctx_clear(&ctx);
   return ret;
 }
 
-static double convert_ssim_db(double _ssim, double _weight) {
-  return 10 * (log10(_weight) - log10(_weight - _ssim));
-}
-
 double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source,
                          const YV12_BUFFER_CONFIG *dest,
-                         double *ssim_y, double *ssim_u, double *ssim_v) {
+                         double *ssim_y, double *ssim_u, double *ssim_v,
+                         uint32_t bd, uint32_t in_bd) {
   double ssimv;
+  uint32_t bd_shift = 0;
   vpx_clear_system_state();
+  assert(bd >= in_bd);
+  bd_shift = bd - in_bd;
 
   *ssim_y = calc_ssim(source->y_buffer, source->y_stride, dest->y_buffer,
                       dest->y_stride, source->y_crop_width,
-                      source->y_crop_height);
-
+                      source->y_crop_height, in_bd, bd_shift);
   *ssim_u = calc_ssim(source->u_buffer, source->uv_stride, dest->u_buffer,
                       dest->uv_stride, source->uv_crop_width,
-                      source->uv_crop_height);
-
+                      source->uv_crop_height, in_bd, bd_shift);
   *ssim_v = calc_ssim(source->v_buffer, source->uv_stride, dest->v_buffer,
                       dest->uv_stride, source->uv_crop_width,
-                      source->uv_crop_height);
-  ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v));
+                      source->uv_crop_height, in_bd, bd_shift);
 
+  ssimv = (*ssim_y) * .8 + .1 * ((*ssim_u) + (*ssim_v));
   return convert_ssim_db(ssimv, 1.0);
 }
diff --git a/vpx_dsp/intrapred.c b/vpx_dsp/intrapred.c
index cc4a74b..b1076f8 100644
--- a/vpx_dsp/intrapred.c
+++ b/vpx_dsp/intrapred.c
@@ -44,7 +44,6 @@
       dst[r * stride + c] = dst[(r + 1) * stride + c - 2];
 }
 
-#if CONFIG_MISC_FIXES
 static INLINE void d207e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                    const uint8_t *above, const uint8_t *left) {
   int r, c;
@@ -59,7 +58,6 @@
     dst += stride;
   }
 }
-#endif  // CONFIG_MISC_FIXES
 
 static INLINE void d63_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                  const uint8_t *above, const uint8_t *left) {
@@ -78,7 +76,6 @@
   }
 }
 
-#if CONFIG_MISC_FIXES
 static INLINE void d63e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                   const uint8_t *above, const uint8_t *left) {
   int r, c;
@@ -92,7 +89,6 @@
     dst += stride;
   }
 }
-#endif  // CONFIG_MISC_FIXES
 
 static INLINE void d45_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                  const uint8_t *above, const uint8_t *left) {
@@ -113,7 +109,6 @@
   }
 }
 
-#if CONFIG_MISC_FIXES
 static INLINE void d45e_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                   const uint8_t *above, const uint8_t *left) {
   int r, c;
@@ -126,7 +121,6 @@
     dst += stride;
   }
 }
-#endif  // CONFIG_MISC_FIXES
 
 static INLINE void d117_predictor(uint8_t *dst, ptrdiff_t stride, int bs,
                                   const uint8_t *above, const uint8_t *left) {
@@ -544,7 +538,6 @@
   }
 }
 
-#if CONFIG_MISC_FIXES
 static INLINE void highbd_d207e_predictor(uint16_t *dst, ptrdiff_t stride,
                                           int bs, const uint16_t *above,
                                           const uint16_t *left, int bd) {
@@ -561,7 +554,6 @@
     dst += stride;
   }
 }
-#endif  // CONFIG_MISC_FIXES
 
 static INLINE void highbd_d63_predictor(uint16_t *dst, ptrdiff_t stride,
                                         int bs, const uint16_t *above,
@@ -597,7 +589,6 @@
   }
 }
 
-#if CONFIG_MISC_FIXES
 static INLINE void highbd_d45e_predictor(uint16_t *dst, ptrdiff_t stride,
                                          int bs, const uint16_t *above,
                                          const uint16_t *left, int bd) {
@@ -612,7 +603,6 @@
     dst += stride;
   }
 }
-#endif  // CONFIG_MISC_FIXES
 
 static INLINE void highbd_d117_predictor(uint16_t *dst, ptrdiff_t stride,
                                          int bs, const uint16_t *above,
@@ -852,11 +842,9 @@
 intra_pred_no_4x4(d207)
 intra_pred_no_4x4(d63)
 intra_pred_no_4x4(d45)
-#if CONFIG_MISC_FIXES
 intra_pred_allsizes(d207e)
 intra_pred_allsizes(d63e)
 intra_pred_no_4x4(d45e)
-#endif
 intra_pred_no_4x4(d117)
 intra_pred_no_4x4(d135)
 intra_pred_no_4x4(d153)
diff --git a/vpx_dsp/inv_txfm.c b/vpx_dsp/inv_txfm.c
index e18d31d..707cb92 100644
--- a/vpx_dsp/inv_txfm.c
+++ b/vpx_dsp/inv_txfm.c
@@ -2062,8 +2062,8 @@
   }
 }
 
-static void highbd_idct32_c(const tran_low_t *input,
-                            tran_low_t *output, int bd) {
+void vpx_highbd_idct32_c(const tran_low_t *input,
+                         tran_low_t *output, int bd) {
   tran_low_t step1[32], step2[32];
   tran_high_t temp1, temp2;
   (void) bd;
@@ -2452,7 +2452,7 @@
       zero_coeff[j] = zero_coeff[2 * j] | zero_coeff[2 * j + 1];
 
     if (zero_coeff[0] | zero_coeff[1])
-      highbd_idct32_c(input, outptr, bd);
+      vpx_highbd_idct32_c(input, outptr, bd);
     else
       memset(outptr, 0, sizeof(tran_low_t) * 32);
     input += 32;
@@ -2463,7 +2463,7 @@
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
-    highbd_idct32_c(temp_in, temp_out, bd);
+    vpx_highbd_idct32_c(temp_in, temp_out, bd);
     for (j = 0; j < 32; ++j) {
       dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
@@ -2482,7 +2482,7 @@
   // Rows
   // Only upper-left 8x8 has non-zero coeff.
   for (i = 0; i < 8; ++i) {
-    highbd_idct32_c(input, outptr, bd);
+    vpx_highbd_idct32_c(input, outptr, bd);
     input += 32;
     outptr += 32;
   }
@@ -2490,7 +2490,7 @@
   for (i = 0; i < 32; ++i) {
     for (j = 0; j < 32; ++j)
       temp_in[j] = out[j * 32 + i];
-    highbd_idct32_c(temp_in, temp_out, bd);
+    vpx_highbd_idct32_c(temp_in, temp_out, bd);
     for (j = 0; j < 32; ++j) {
       dest[j * stride + i] = highbd_clip_pixel_add(
           dest[j * stride + i], ROUND_POWER_OF_TWO(temp_out[j], 6), bd);
diff --git a/vpx_dsp/inv_txfm.h b/vpx_dsp/inv_txfm.h
index 9cfe1be..6397e66 100644
--- a/vpx_dsp/inv_txfm.h
+++ b/vpx_dsp/inv_txfm.h
@@ -110,6 +110,7 @@
 void vpx_highbd_idct4_c(const tran_low_t *input, tran_low_t *output, int bd);
 void vpx_highbd_idct8_c(const tran_low_t *input, tran_low_t *output, int bd);
 void vpx_highbd_idct16_c(const tran_low_t *input, tran_low_t *output, int bd);
+void vpx_highbd_idct32_c(const tran_low_t *input, tran_low_t *output, int bd);
 
 void vpx_highbd_iadst4_c(const tran_low_t *input, tran_low_t *output, int bd);
 void vpx_highbd_iadst8_c(const tran_low_t *input, tran_low_t *output, int bd);
diff --git a/vpx_dsp/psnr.c b/vpx_dsp/psnr.c
new file mode 100644
index 0000000..1655f11
--- /dev/null
+++ b/vpx_dsp/psnr.c
@@ -0,0 +1,296 @@
+/*
+*  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+*
+*  Use of this source code is governed by a BSD-style license
+*  that can be found in the LICENSE file in the root of the source
+*  tree. An additional intellectual property rights grant can be found
+*  in the file PATENTS.  All contributing project authors may
+*  be found in the AUTHORS file in the root of the source tree.
+*/
+
+#include <math.h>
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/psnr.h"
+#include "vpx_scale/yv12config.h"
+
+double vpx_sse_to_psnr(double samples, double peak, double sse) {
+  if (sse > 0.0) {
+    const double psnr = 10.0 * log10(samples * peak * peak / sse);
+    return psnr > MAX_PSNR ? MAX_PSNR : psnr;
+  } else {
+    return MAX_PSNR;
+  }
+}
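+
+// Worked example: a 64x64 8-bit block (samples = 4096, peak = 255) with an
+// SSE of 4096 has a mean squared error of 1.0, so the PSNR is
+//   10 * log10(4096 * 255 * 255 / 4096) = 20 * log10(255) ~= 48.13 dB.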
+
+/* TODO(yaowu): The block_variance calls below use the unoptimized versions
+ * of variance() and highbd_8_variance(). They should not.
+ */
+static void encoder_variance(const uint8_t *a, int a_stride,
+                             const uint8_t *b, int b_stride,
+                             int w, int h, unsigned int *sse, int *sum) {
+  int i, j;
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static void encoder_highbd_variance64(const uint8_t *a8, int a_stride,
+                                      const uint8_t *b8, int b_stride,
+                                      int w, int h, uint64_t *sse,
+                                      uint64_t *sum) {
+  int i, j;
+
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = a[j] - b[j];
+      *sum += diff;
+      *sse += diff * diff;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+}
+
+static void encoder_highbd_8_variance(const uint8_t *a8, int a_stride,
+                                      const uint8_t *b8, int b_stride,
+                                      int w, int h,
+                                      unsigned int *sse, int *sum) {
+  uint64_t sse_long = 0;
+  uint64_t sum_long = 0;
+  encoder_highbd_variance64(a8, a_stride, b8, b_stride, w, h,
+                            &sse_long, &sum_long);
+  *sse = (unsigned int)sse_long;
+  *sum = (int)sum_long;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
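+// get_sse() covers the frame with optimized 16x16 vpx_mse16x16() calls and
+// handles any right/bottom remainder (width % 16, height % 16) with the
+// generic encoder_variance() above, so arbitrary crop sizes are supported.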
+static int64_t get_sse(const uint8_t *a, int a_stride,
+                       const uint8_t *b, int b_stride,
+                       int width, int height) {
+  const int dw = width % 16;
+  const int dh = height % 16;
+  int64_t total_sse = 0;
+  unsigned int sse = 0;
+  int sum = 0;
+  int x, y;
+
+  if (dw > 0) {
+    encoder_variance(&a[width - dw], a_stride, &b[width - dw], b_stride,
+                     dw, height, &sse, &sum);
+    total_sse += sse;
+  }
+
+  if (dh > 0) {
+    encoder_variance(&a[(height - dh) * a_stride], a_stride,
+                     &b[(height - dh) * b_stride], b_stride,
+                     width - dw, dh, &sse, &sum);
+    total_sse += sse;
+  }
+
+  for (y = 0; y < height / 16; ++y) {
+    const uint8_t *pa = a;
+    const uint8_t *pb = b;
+    for (x = 0; x < width / 16; ++x) {
+      vpx_mse16x16(pa, a_stride, pb, b_stride, &sse);
+      total_sse += sse;
+
+      pa += 16;
+      pb += 16;
+    }
+
+    a += 16 * a_stride;
+    b += 16 * b_stride;
+  }
+
+  return total_sse;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static int64_t highbd_get_sse_shift(const uint8_t *a8, int a_stride,
+                                    const uint8_t *b8, int b_stride,
+                                    int width, int height,
+                                    unsigned int input_shift) {
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+  int64_t total_sse = 0;
+  int x, y;
+  for (y = 0; y < height; ++y) {
+    for (x = 0; x < width; ++x) {
+      int64_t diff;
+      diff = (a[x] >> input_shift) - (b[x] >> input_shift);
+      total_sse += diff * diff;
+    }
+    a += a_stride;
+    b += b_stride;
+  }
+  return total_sse;
+}
+
+static int64_t highbd_get_sse(const uint8_t *a, int a_stride,
+                              const uint8_t *b, int b_stride,
+                              int width, int height) {
+  int64_t total_sse = 0;
+  int x, y;
+  const int dw = width % 16;
+  const int dh = height % 16;
+  unsigned int sse = 0;
+  int sum = 0;
+  if (dw > 0) {
+    encoder_highbd_8_variance(&a[width - dw], a_stride,
+                              &b[width - dw], b_stride,
+                              dw, height, &sse, &sum);
+    total_sse += sse;
+  }
+  if (dh > 0) {
+    encoder_highbd_8_variance(&a[(height - dh) * a_stride], a_stride,
+                              &b[(height - dh) * b_stride], b_stride,
+                              width - dw, dh, &sse, &sum);
+    total_sse += sse;
+  }
+  for (y = 0; y < height / 16; ++y) {
+    const uint8_t *pa = a;
+    const uint8_t *pb = b;
+    for (x = 0; x < width / 16; ++x) {
+      vpx_highbd_8_mse16x16(pa, a_stride, pb, b_stride, &sse);
+      total_sse += sse;
+      pa += 16;
+      pb += 16;
+    }
+    a += 16 * a_stride;
+    b += 16 * b_stride;
+  }
+  return total_sse;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                      const YV12_BUFFER_CONFIG *b) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+
+  return get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+                 a->y_crop_width, a->y_crop_height);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+  const YV12_BUFFER_CONFIG *b) {
+  assert(a->y_crop_width == b->y_crop_width);
+  assert(a->y_crop_height == b->y_crop_height);
+  assert((a->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+  assert((b->flags & YV12_FLAG_HIGHBITDEPTH) != 0);
+
+  return highbd_get_sse(a->y_buffer, a->y_stride, b->y_buffer, b->y_stride,
+    a->y_crop_width, a->y_crop_height);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+                          const YV12_BUFFER_CONFIG *b,
+                          PSNR_STATS *psnr, uint32_t bit_depth,
+                          uint32_t in_bit_depth) {
+  const int widths[3] = {
+    a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+  const int heights[3] = {
+    a->y_crop_height, a->uv_crop_height, a->uv_crop_height };
+  const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer };
+  const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+  const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer };
+  const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+  int i;
+  uint64_t total_sse = 0;
+  uint32_t total_samples = 0;
+  const double peak = (double)((1 << in_bit_depth) - 1);
+  const unsigned int input_shift = bit_depth - in_bit_depth;
+
+  for (i = 0; i < 3; ++i) {
+    const int w = widths[i];
+    const int h = heights[i];
+    const uint32_t samples = w * h;
+    uint64_t sse;
+    if (a->flags & YV12_FLAG_HIGHBITDEPTH) {
+      if (input_shift) {
+        sse = highbd_get_sse_shift(a_planes[i], a_strides[i],
+          b_planes[i], b_strides[i], w, h,
+          input_shift);
+      } else {
+        sse = highbd_get_sse(a_planes[i], a_strides[i],
+          b_planes[i], b_strides[i], w, h);
+      }
+    } else {
+      sse = get_sse(a_planes[i], a_strides[i],
+        b_planes[i], b_strides[i],
+        w, h);
+    }
+    psnr->sse[1 + i] = sse;
+    psnr->samples[1 + i] = samples;
+    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
+
+    total_sse += sse;
+    total_samples += samples;
+  }
+
+  psnr->sse[0] = total_sse;
+  psnr->samples[0] = total_samples;
+  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
+    (double)total_sse);
+}
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a, const YV12_BUFFER_CONFIG *b,
+                   PSNR_STATS *psnr) {
+  static const double peak = 255.0;
+  const int widths[3] = {
+    a->y_crop_width, a->uv_crop_width, a->uv_crop_width };
+  const int heights[3] = {
+    a->y_crop_height, a->uv_crop_height, a->uv_crop_height };
+  const uint8_t *a_planes[3] = { a->y_buffer, a->u_buffer, a->v_buffer };
+  const int a_strides[3] = { a->y_stride, a->uv_stride, a->uv_stride };
+  const uint8_t *b_planes[3] = { b->y_buffer, b->u_buffer, b->v_buffer };
+  const int b_strides[3] = { b->y_stride, b->uv_stride, b->uv_stride };
+  int i;
+  uint64_t total_sse = 0;
+  uint32_t total_samples = 0;
+
+  for (i = 0; i < 3; ++i) {
+    const int w = widths[i];
+    const int h = heights[i];
+    const uint32_t samples = w * h;
+    const uint64_t sse = get_sse(a_planes[i], a_strides[i],
+      b_planes[i], b_strides[i],
+      w, h);
+    psnr->sse[1 + i] = sse;
+    psnr->samples[1 + i] = samples;
+    psnr->psnr[1 + i] = vpx_sse_to_psnr(samples, peak, (double)sse);
+
+    total_sse += sse;
+    total_samples += samples;
+  }
+
+  psnr->sse[0] = total_sse;
+  psnr->samples[0] = total_samples;
+  psnr->psnr[0] = vpx_sse_to_psnr((double)total_samples, peak,
+    (double)total_sse);
+}
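
For reference, the conversion performed by vpx_sse_to_psnr() above amounts to 10 * log10(samples * peak^2 / sse), clamped to MAX_PSNR for a zero or vanishing SSE. A self-contained sketch of that formula (an illustrative restatement, not the library routine itself):

    #include <math.h>
    #include <stdio.h>

    #define MAX_PSNR 100.0

    static double sse_to_psnr_sketch(double samples, double peak, double sse) {
      if (sse > 0.0) {
        const double psnr = 10.0 * log10(samples * peak * peak / sse);
        return psnr > MAX_PSNR ? MAX_PSNR : psnr;
      }
      return MAX_PSNR;
    }

    int main(void) {
      // 16 samples at peak 255 with total SSE 64 -> MSE of 4 -> ~42.11 dB.
      printf("%.2f dB\n", sse_to_psnr_sketch(16, 255.0, 64.0));
      return 0;
    }
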
diff --git a/vpx_dsp/psnr.h b/vpx_dsp/psnr.h
new file mode 100644
index 0000000..e25b450
--- /dev/null
+++ b/vpx_dsp/psnr.h
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_PSNR_H_
+#define VPX_DSP_PSNR_H_
+
+#include "./vpx_config.h"
+#include "vpx_scale/yv12config.h"
+
+#define MAX_PSNR 100.0
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+  double psnr[4];       // total/y/u/v
+  uint64_t sse[4];      // total/y/u/v
+  uint32_t samples[4];  // total/y/u/v
+} PSNR_STATS;
+
+// TODO(dkovalev) change vpx_sse_to_psnr signature: double -> int64_t
+
+/*!\brief Converts SSE to PSNR
+ *
+ * Converts sum of squared errors (SSE) to peak signal-to-noise ratio (PSNR).
+ *
+ * \param[in]    samples       Number of samples
+ * \param[in]    peak          Max sample value
+ * \param[in]    sse           Sum of squared errors
+ */
+double vpx_sse_to_psnr(double samples, double peak, double sse);
+int64_t vpx_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                      const YV12_BUFFER_CONFIG *b);
+#if CONFIG_VP9_HIGHBITDEPTH
+int64_t vpx_highbd_get_y_sse(const YV12_BUFFER_CONFIG *a,
+                             const YV12_BUFFER_CONFIG *b);
+void vpx_calc_highbd_psnr(const YV12_BUFFER_CONFIG *a,
+                          const YV12_BUFFER_CONFIG *b,
+                          PSNR_STATS *psnr,
+                          uint32_t bit_depth,
+                          uint32_t in_bit_depth);
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+void vpx_calc_psnr(const YV12_BUFFER_CONFIG *a,
+                   const YV12_BUFFER_CONFIG *b,
+                   PSNR_STATS *psnr);
+
+double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source,
+                   const YV12_BUFFER_CONFIG *dest,
+                   double *phvs_y, double *phvs_u,
+                   double *phvs_v, uint32_t bd, uint32_t in_bd);
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+#endif  // VPX_DSP_PSNR_H_
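
A typical call site for the new header, sketched under the assumption that the two frames were already allocated and filled elsewhere (the wrapper name report_psnr is hypothetical):

    #include <stdio.h>
    #include "vpx_dsp/psnr.h"

    static void report_psnr(const YV12_BUFFER_CONFIG *orig,
                            const YV12_BUFFER_CONFIG *recon) {
      PSNR_STATS stats;
      vpx_calc_psnr(orig, recon, &stats);
      // Index 0 aggregates all planes; indices 1..3 are Y, U and V.
      printf("overall %.2f dB (Y %.2f, U %.2f, V %.2f)\n",
             stats.psnr[0], stats.psnr[1], stats.psnr[2], stats.psnr[3]);
    }
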
diff --git a/vpx_dsp/psnrhvs.c b/vpx_dsp/psnrhvs.c
index 3001705..095ba5d 100644
--- a/vpx_dsp/psnrhvs.c
+++ b/vpx_dsp/psnrhvs.c
@@ -10,6 +10,7 @@
  *  This code was originally written by: Gregory Maxwell, at the Daala
  *  project.
  */
+#include <assert.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>
@@ -18,6 +19,7 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/ssim.h"
 #include "vpx_ports/system_state.h"
+#include "vpx_dsp/psnr.h"
 
 #if !defined(M_PI)
 # define M_PI (3.141592653589793238462643)
@@ -26,14 +28,29 @@
 
 static void od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
                            int xstride) {
+  int i, j;
   (void) xstride;
   vpx_fdct8x8(x, y, ystride);
+  for (i = 0; i < 8; i++)
+    for (j = 0; j < 8; j++)
+      *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
 }
+#if CONFIG_VP9_HIGHBITDEPTH
+static void hbd_od_bin_fdct8x8(tran_low_t *y, int ystride, const int16_t *x,
+                               int xstride) {
+  int i, j;
+  (void) xstride;
+  vpx_highbd_fdct8x8(x, y, ystride);
+  for (i = 0; i < 8; i++)
+    for (j = 0; j < 8; j++)
+      *(y + ystride * i + j) = (*(y + ystride * i + j) + 4) >> 3;
+}
+#endif
 
 /* Normalized inverse quantization matrix for 8x8 DCT at the point of
  * transparency. This is not the JPEG based matrix from the paper,
  this one gives a slightly higher MOS agreement.*/
-static const float csf_y[8][8] = {
+static const double csf_y[8][8] = {
     {1.6193873005, 2.2901594831, 2.08509755623, 1.48366094411, 1.00227514334,
      0.678296995242, 0.466224900598, 0.3265091542},
     {2.2901594831, 1.94321815382, 2.04793073064, 1.68731108984, 1.2305666963,
@@ -50,7 +67,7 @@
      0.283006984131, 0.215017739696, 0.168869545842, 0.136153931001},
     {0.3265091542, 0.436405793551, 0.372504254596, 0.295774038565,
      0.226951348204, 0.17408067321, 0.136153931001, 0.109083846276}};
-static const float csf_cb420[8][8] = {
+static const double csf_cb420[8][8] = {
     {1.91113096927, 2.46074210438, 1.18284184739, 1.14982565193, 1.05017074788,
      0.898018824055, 0.74725392039, 0.615105596242},
     {2.46074210438, 1.58529308355, 1.21363250036, 1.38190029285, 1.33100189972,
@@ -67,7 +84,7 @@
      0.55002013668, 0.454353482512, 0.389234902883, 0.342353999733},
     {0.615105596242, 0.830890433625, 0.731221236837, 0.608694761374,
      0.495804539034, 0.407050308965, 0.342353999733, 0.295530605237}};
-static const float csf_cr420[8][8] = {
+static const double csf_cr420[8][8] = {
     {2.03871978502, 2.62502345193, 1.26180942886, 1.11019789803, 1.01397751469,
      0.867069376285, 0.721500455585, 0.593906509971},
     {2.62502345193, 1.69112867013, 1.17180569821, 1.3342742857, 1.28513006198,
@@ -85,23 +102,38 @@
     {0.593906509971, 0.802254508198, 0.706020324706, 0.587716619023,
      0.478717061273, 0.393021669543, 0.330555063063, 0.285345396658}};
 
-static double convert_score_db(double _score, double _weight) {
-  return 10 * (log10(255 * 255) - log10(_weight * _score));
+static double convert_score_db(double _score, double _weight, int bit_depth) {
+  int16_t pix_max = 255;
+  assert(_score * _weight >= 0.0);
+  if (bit_depth == 10)
+    pix_max = 1023;
+  else if (bit_depth == 12)
+    pix_max = 4095;
+
+  if (_weight * _score < pix_max * pix_max * 1e-10)
+    return MAX_PSNR;
+  return 10 * (log10(pix_max * pix_max) - log10(_weight * _score));
 }
 
-static double calc_psnrhvs(const unsigned char *_src, int _systride,
-                           const unsigned char *_dst, int _dystride,
-                           double _par, int _w, int _h, int _step,
-                           const float _csf[8][8]) {
-  float ret;
+static double calc_psnrhvs(const unsigned char *src, int _systride,
+                           const unsigned char *dst, int _dystride,
+                           double _par, int _w, int _h, int _step,
+                           const double _csf[8][8], uint32_t bit_depth,
+                           uint32_t _shift) {
+  double ret;
+  const uint8_t *_src8 = src;
+  const uint8_t *_dst8 = dst;
+  const uint16_t *_src16 = CONVERT_TO_SHORTPTR(src);
+  const uint16_t *_dst16 = CONVERT_TO_SHORTPTR(dst);
   int16_t dct_s[8 * 8], dct_d[8 * 8];
   tran_low_t dct_s_coef[8 * 8], dct_d_coef[8 * 8];
-  float mask[8][8];
+  double mask[8][8];
   int pixels;
   int x;
   int y;
   (void) _par;
   ret = pixels = 0;
+
   /*In the PSNR-HVS-M paper[1] the authors describe the construction of
    their masking table as "we have used the quantization table for the
    color component Y of JPEG [6] that has been also obtained on the
@@ -126,23 +158,28 @@
     for (x = 0; x < _w - 7; x += _step) {
       int i;
       int j;
-      float s_means[4];
-      float d_means[4];
-      float s_vars[4];
-      float d_vars[4];
-      float s_gmean = 0;
-      float d_gmean = 0;
-      float s_gvar = 0;
-      float d_gvar = 0;
-      float s_mask = 0;
-      float d_mask = 0;
+      double s_means[4];
+      double d_means[4];
+      double s_vars[4];
+      double d_vars[4];
+      double s_gmean = 0;
+      double d_gmean = 0;
+      double s_gvar = 0;
+      double d_gvar = 0;
+      double s_mask = 0;
+      double d_mask = 0;
       for (i = 0; i < 4; i++)
         s_means[i] = d_means[i] = s_vars[i] = d_vars[i] = 0;
       for (i = 0; i < 8; i++) {
         for (j = 0; j < 8; j++) {
           int sub = ((i & 12) >> 2) + ((j & 12) >> 1);
-          dct_s[i * 8 + j] = _src[(y + i) * _systride + (j + x)];
-          dct_d[i * 8 + j] = _dst[(y + i) * _dystride + (j + x)];
+          if (bit_depth == 8 && _shift == 0) {
+            dct_s[i * 8 + j] = _src8[(y + i) * _systride + (j + x)];
+            dct_d[i * 8 + j] = _dst8[(y + i) * _dystride + (j + x)];
+          } else if (bit_depth == 10 || bit_depth == 12) {
+            dct_s[i * 8 + j] = _src16[(y + i) * _systride + (j + x)] >> _shift;
+            dct_d[i * 8 + j] = _dst16[(y + i) * _dystride + (j + x)] >> _shift;
+          }
           s_gmean += dct_s[i * 8 + j];
           d_gmean += dct_d[i * 8 + j];
           s_means[sub] += dct_s[i * 8 + j];
@@ -176,8 +213,16 @@
         s_gvar = (s_vars[0] + s_vars[1] + s_vars[2] + s_vars[3]) / s_gvar;
       if (d_gvar > 0)
         d_gvar = (d_vars[0] + d_vars[1] + d_vars[2] + d_vars[3]) / d_gvar;
-      od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
-      od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+#if CONFIG_VP9_HIGHBITDEPTH
+      if (bit_depth == 10 || bit_depth == 12) {
+        hbd_od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+        hbd_od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+      }
+#endif
+      if (bit_depth == 8) {
+        od_bin_fdct8x8(dct_s_coef, 8, dct_s, 8);
+        od_bin_fdct8x8(dct_d_coef, 8, dct_d, 8);
+      }
       for (i = 0; i < 8; i++)
         for (j = (i == 0); j < 8; j++)
           s_mask += dct_s_coef[i * 8 + j] * dct_s_coef[i * 8 + j] * mask[i][j];
@@ -190,8 +235,8 @@
         s_mask = d_mask;
       for (i = 0; i < 8; i++) {
         for (j = 0; j < 8; j++) {
-          float err;
-          err = fabs((float)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j]));
+          double err;
+          err = fabs((double)(dct_s_coef[i * 8 + j] - dct_d_coef[i * 8 + j]));
           if (i != 0 || j != 0)
             err = err < s_mask / mask[i][j] ? 0 : err - s_mask / mask[i][j];
           ret += (err * _csf[i][j]) * (err * _csf[i][j]);
@@ -203,25 +248,34 @@
   ret /= pixels;
   return ret;
 }
-double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source,
+
+double vpx_psnrhvs(const YV12_BUFFER_CONFIG *src,
                    const YV12_BUFFER_CONFIG *dest, double *y_psnrhvs,
-                   double *u_psnrhvs, double *v_psnrhvs) {
+                   double *u_psnrhvs, double *v_psnrhvs,
+                   uint32_t bd, uint32_t in_bd) {
   double psnrhvs;
   const double par = 1.0;
   const int step = 7;
+  uint32_t bd_shift = 0;
   vpx_clear_system_state();
-  *y_psnrhvs = calc_psnrhvs(source->y_buffer, source->y_stride, dest->y_buffer,
-                            dest->y_stride, par, source->y_crop_width,
-                            source->y_crop_height, step, csf_y);
 
-  *u_psnrhvs = calc_psnrhvs(source->u_buffer, source->uv_stride, dest->u_buffer,
-                            dest->uv_stride, par, source->uv_crop_width,
-                            source->uv_crop_height, step, csf_cb420);
+  assert(bd == 8 || bd == 10 || bd == 12);
+  assert(bd >= in_bd);
 
-  *v_psnrhvs = calc_psnrhvs(source->v_buffer, source->uv_stride, dest->v_buffer,
-                            dest->uv_stride, par, source->uv_crop_width,
-                            source->uv_crop_height, step, csf_cr420);
+  bd_shift = bd - in_bd;
+
+  *y_psnrhvs = calc_psnrhvs(src->y_buffer, src->y_stride, dest->y_buffer,
+                            dest->y_stride, par, src->y_crop_width,
+                            src->y_crop_height, step, csf_y, bd,
+                            bd_shift);
+  *u_psnrhvs = calc_psnrhvs(src->u_buffer, src->uv_stride, dest->u_buffer,
+                            dest->uv_stride, par, src->uv_crop_width,
+                            src->uv_crop_height, step, csf_cb420, bd,
+                            bd_shift);
+  *v_psnrhvs = calc_psnrhvs(src->v_buffer, src->uv_stride, dest->v_buffer,
+                            dest->uv_stride, par, src->uv_crop_width,
+                            src->uv_crop_height, step, csf_cr420, bd,
+                            bd_shift);
   psnrhvs = (*y_psnrhvs) * .8 + .1 * ((*u_psnrhvs) + (*v_psnrhvs));
-
-  return convert_score_db(psnrhvs, 1.0);
+  return convert_score_db(psnrhvs, 1.0, in_bd);
 }
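
The dB conversion above now derives its peak from the input bit depth (255, 1023 or 4095) and clamps vanishing scores to MAX_PSNR. The arithmetic is easy to check by hand; restated outside the library for an 8-bit input:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      const double pix_max = 255.0;  // in_bd == 8
      const double weight = 1.0, score = 6.5025;
      // 255^2 / 6.5025 == 10000 exactly, so this prints 40.0 dB.
      printf("%.1f dB\n",
             10 * (log10(pix_max * pix_max) - log10(weight * score)));
      return 0;
    }
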
diff --git a/vpx_dsp/sad.c b/vpx_dsp/sad.c
index f1f951f..e49148d 100644
--- a/vpx_dsp/sad.c
+++ b/vpx_dsp/sad.c
@@ -67,6 +67,22 @@
     sad_array[i] = vpx_sad##m##x##n##_c(src, src_stride, ref_array[i], ref_stride); \
 }
 
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+// 128x128
+sadMxN(128, 128)
+sadMxNxK(128, 128, 3)
+sadMxNxK(128, 128, 8)
+sadMxNx4D(128, 128)
+
+// 128x64
+sadMxN(128, 64)
+sadMxNx4D(128, 64)
+
+// 64x128
+sadMxN(64, 128)
+sadMxNx4D(64, 128)
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 // 64x64
 sadMxN(64, 64)
 sadMxNxK(64, 64, 3)
@@ -206,6 +222,22 @@
   } \
 }
 
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+// 128x128
+highbd_sadMxN(128, 128)
+highbd_sadMxNxK(128, 128, 3)
+highbd_sadMxNxK(128, 128, 8)
+highbd_sadMxNx4D(128, 128)
+
+// 128x64
+highbd_sadMxN(128, 64)
+highbd_sadMxNx4D(128, 64)
+
+// 64x128
+highbd_sadMxN(64, 128)
+highbd_sadMxNx4D(64, 128)
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 // 64x64
 highbd_sadMxN(64, 64)
 highbd_sadMxNxK(64, 64, 3)
@@ -275,3 +307,204 @@
 highbd_sadMxNx4D(4, 4)
 
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP10 && CONFIG_EXT_INTER
+static INLINE unsigned int masked_sad(const uint8_t *a, int a_stride,
+                                      const uint8_t *b, int b_stride,
+                                      const uint8_t *m, int m_stride,
+                                      int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += m[x] * abs(a[x] - b[x]);
+
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+  sad = (sad + 31) >> 6;
+
+  return sad;
+}
+
+#define MASKSADMxN(m, n) \
+unsigned int vpx_masked_sad##m##x##n##_c(const uint8_t *src, int src_stride, \
+                                         const uint8_t *ref, int ref_stride, \
+                                         const uint8_t *msk, int msk_stride) { \
+  return masked_sad(src, src_stride, ref, ref_stride, msk, msk_stride, m, n); \
+}
+
+#if CONFIG_EXT_PARTITION
+MASKSADMxN(128, 128)
+MASKSADMxN(128, 64)
+MASKSADMxN(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+MASKSADMxN(64, 64)
+MASKSADMxN(64, 32)
+MASKSADMxN(32, 64)
+MASKSADMxN(32, 32)
+MASKSADMxN(32, 16)
+MASKSADMxN(16, 32)
+MASKSADMxN(16, 16)
+MASKSADMxN(16, 8)
+MASKSADMxN(8, 16)
+MASKSADMxN(8, 8)
+MASKSADMxN(8, 4)
+MASKSADMxN(4, 8)
+MASKSADMxN(4, 4)
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE unsigned int highbd_masked_sad(const uint8_t *a8, int a_stride,
+                                             const uint8_t *b8, int b_stride,
+                                             const uint8_t *m, int m_stride,
+                                             int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+  const uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  const uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += m[x] * abs(a[x] - b[x]);
+
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+  sad = (sad + 31) >> 6;
+
+  return sad;
+}
+
+#define HIGHBD_MASKSADMXN(m, n) \
+unsigned int vpx_highbd_masked_sad##m##x##n##_c(const uint8_t *src, \
+                                                int src_stride, \
+                                                const uint8_t *ref, \
+                                                int ref_stride, \
+                                                const uint8_t *msk, \
+                                                int msk_stride) { \
+  return highbd_masked_sad(src, src_stride, ref, ref_stride, \
+                           msk, msk_stride, m, n); \
+}
+
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN(128, 128)
+HIGHBD_MASKSADMXN(128, 64)
+HIGHBD_MASKSADMXN(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN(64, 64)
+HIGHBD_MASKSADMXN(64, 32)
+HIGHBD_MASKSADMXN(32, 64)
+HIGHBD_MASKSADMXN(32, 32)
+HIGHBD_MASKSADMXN(32, 16)
+HIGHBD_MASKSADMXN(16, 32)
+HIGHBD_MASKSADMXN(16, 16)
+HIGHBD_MASKSADMXN(16, 8)
+HIGHBD_MASKSADMXN(8, 16)
+HIGHBD_MASKSADMXN(8, 8)
+HIGHBD_MASKSADMXN(8, 4)
+HIGHBD_MASKSADMXN(4, 8)
+HIGHBD_MASKSADMXN(4, 4)
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_VP10 && CONFIG_EXT_INTER
+
+#if CONFIG_VP10 && CONFIG_OBMC
+// pre: predictor being evaluated
+// wsrc: target weighted prediction (scaled by 4096 to preserve precision)
+// mask: 2d weights (scaled by 4096)
+static INLINE unsigned int obmc_sad(const uint8_t *pre, int pre_stride,
+                                    const int32_t *wsrc,
+                                    const int32_t *mask,
+                                    int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
+
+    pre += pre_stride;
+    wsrc += width;
+    mask += width;
+  }
+
+  return sad;
+}
+
+#define OBMCSADMxN(m, n)                                                      \
+unsigned int vpx_obmc_sad##m##x##n##_c(const uint8_t *ref, int ref_stride,    \
+                                       const int32_t *wsrc,                   \
+                                       const int32_t *mask) {                 \
+  return obmc_sad(ref, ref_stride, wsrc, mask, m, n);                         \
+}
+
+#if CONFIG_EXT_PARTITION
+OBMCSADMxN(128, 128)
+OBMCSADMxN(128, 64)
+OBMCSADMxN(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+OBMCSADMxN(64, 64)
+OBMCSADMxN(64, 32)
+OBMCSADMxN(32, 64)
+OBMCSADMxN(32, 32)
+OBMCSADMxN(32, 16)
+OBMCSADMxN(16, 32)
+OBMCSADMxN(16, 16)
+OBMCSADMxN(16, 8)
+OBMCSADMxN(8, 16)
+OBMCSADMxN(8, 8)
+OBMCSADMxN(8, 4)
+OBMCSADMxN(4, 8)
+OBMCSADMxN(4, 4)
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE unsigned int highbd_obmc_sad(const uint8_t *pre8, int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask,
+                                           int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++)
+      sad += ROUND_POWER_OF_TWO(abs(wsrc[x] - pre[x] * mask[x]), 12);
+
+    pre += pre_stride;
+    wsrc += width;
+    mask += width;
+  }
+
+  return sad;
+}
+
+#define HIGHBD_OBMCSADMXN(m, n)                                               \
+unsigned int vpx_highbd_obmc_sad##m##x##n##_c(const uint8_t *ref,             \
+                                              int ref_stride,                 \
+                                              const int32_t *wsrc,            \
+                                              const int32_t *mask) {          \
+  return highbd_obmc_sad(ref, ref_stride, wsrc, mask, m, n);                  \
+}
+
+#if CONFIG_EXT_PARTITION
+HIGHBD_OBMCSADMXN(128, 128)
+HIGHBD_OBMCSADMXN(128, 64)
+HIGHBD_OBMCSADMXN(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+HIGHBD_OBMCSADMXN(64, 64)
+HIGHBD_OBMCSADMXN(64, 32)
+HIGHBD_OBMCSADMXN(32, 64)
+HIGHBD_OBMCSADMXN(32, 32)
+HIGHBD_OBMCSADMXN(32, 16)
+HIGHBD_OBMCSADMXN(16, 32)
+HIGHBD_OBMCSADMXN(16, 16)
+HIGHBD_OBMCSADMXN(16, 8)
+HIGHBD_OBMCSADMXN(8, 16)
+HIGHBD_OBMCSADMXN(8, 8)
+HIGHBD_OBMCSADMXN(8, 4)
+HIGHBD_OBMCSADMXN(4, 8)
+HIGHBD_OBMCSADMXN(4, 4)
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_VP10 && CONFIG_OBMC
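
Both new SAD families above keep their accumulators in scaled domains: masked SAD weights each absolute difference by a 6-bit mask value (0..64) and renormalizes with (sad + 31) >> 6, while OBMC SAD works against wsrc and mask terms premultiplied by 4096 and rounds back with a 12-bit shift. A tiny standalone check of the masked case (assuming the 0..64 weight convention):

    #include <stdio.h>
    #include <stdlib.h>

    int main(void) {
      // One pixel differs by 2 under full mask weight 64; all others match.
      const unsigned int sad = 64 * abs(7 - 5);  // accumulator = 128
      printf("%u\n", (sad + 31) >> 6);           // prints 2 pixel units
      return 0;
    }
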
diff --git a/vpx_dsp/ssim.c b/vpx_dsp/ssim.c
index cfe5bb3..632e272 100644
--- a/vpx_dsp/ssim.c
+++ b/vpx_dsp/ssim.c
@@ -8,6 +8,7 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include <assert.h>
 #include <math.h>
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/ssim.h"
@@ -66,16 +67,31 @@
 
 static const int64_t cc1 =  26634;  // (64^2*(.01*255)^2
 static const int64_t cc2 = 239708;  // (64^2*(.03*255)^2
+static const int64_t cc1_10 = 428658;  // (64^2*(.01*1023)^2)
+static const int64_t cc2_10 = 3857925;  // (64^2*(.03*1023)^2)
+static const int64_t cc1_12 = 6868593;  // (64^2*(.01*4095)^2)
+static const int64_t cc2_12 = 61817334;  // (64^2*(.03*4095)^2)
 
 static double similarity(uint32_t sum_s, uint32_t sum_r,
                          uint32_t sum_sq_s, uint32_t sum_sq_r,
-                         uint32_t sum_sxr, int count) {
+                         uint32_t sum_sxr, int count,
+                         uint32_t bd) {
   int64_t ssim_n, ssim_d;
   int64_t c1, c2;
-
-  // scale the constants by number of pixels
-  c1 = (cc1 * count * count) >> 12;
-  c2 = (cc2 * count * count) >> 12;
+  if (bd == 8) {
+    // scale the constants by number of pixels
+    c1 = (cc1 * count * count) >> 12;
+    c2 = (cc2 * count * count) >> 12;
+  } else if (bd == 10) {
+    c1 = (cc1_10 * count * count) >> 12;
+    c2 = (cc2_10 * count * count) >> 12;
+  } else if (bd == 12) {
+    c1 = (cc1_12 * count * count) >> 12;
+    c2 = (cc2_12 * count * count) >> 12;
+  } else {
+    c1 = c2 = 0;
+    assert(0);
+  }
 
   ssim_n = (2 * sum_s * sum_r + c1) * ((int64_t) 2 * count * sum_sxr -
                                        (int64_t) 2 * sum_s * sum_r + c2);
@@ -91,22 +107,21 @@
   uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
   vpx_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
                      &sum_sxr);
-  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64);
+  return similarity(sum_s, sum_r, sum_sq_s, sum_sq_r, sum_sxr, 64, 8);
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 static double highbd_ssim_8x8(const uint16_t *s, int sp, const uint16_t *r,
-                              int rp, unsigned int bd) {
+                              int rp, uint32_t bd, uint32_t shift) {
   uint32_t sum_s = 0, sum_r = 0, sum_sq_s = 0, sum_sq_r = 0, sum_sxr = 0;
-  const int oshift = bd - 8;
   vpx_highbd_ssim_parms_8x8(s, sp, r, rp, &sum_s, &sum_r, &sum_sq_s, &sum_sq_r,
                             &sum_sxr);
-  return similarity(sum_s >> oshift,
-                    sum_r >> oshift,
-                    sum_sq_s >> (2 * oshift),
-                    sum_sq_r >> (2 * oshift),
-                    sum_sxr >> (2 * oshift),
-                    64);
+  return similarity(sum_s >> shift,
+                    sum_r >> shift,
+                    sum_sq_s >> (2 * shift),
+                    sum_sq_r >> (2 * shift),
+                    sum_sxr >> (2 * shift),
+                    64, bd);
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
@@ -136,7 +151,7 @@
 #if CONFIG_VP9_HIGHBITDEPTH
 static double vpx_highbd_ssim2(const uint8_t *img1, const uint8_t *img2,
                                int stride_img1, int stride_img2, int width,
-                               int height, unsigned int bd) {
+                               int height, uint32_t bd, uint32_t shift) {
   int i, j;
   int samples = 0;
   double ssim_total = 0;
@@ -147,7 +162,7 @@
     for (j = 0; j <= width - 8; j += 4) {
       double v = highbd_ssim_8x8(CONVERT_TO_SHORTPTR(img1 + j), stride_img1,
                                  CONVERT_TO_SHORTPTR(img2 + j), stride_img2,
-                                 bd);
+                                 bd, shift);
       ssim_total += v;
       samples++;
     }
@@ -182,31 +197,6 @@
   return ssimv;
 }
 
-double vpx_calc_ssimg(const YV12_BUFFER_CONFIG *source,
-                      const YV12_BUFFER_CONFIG *dest,
-                      double *ssim_y, double *ssim_u, double *ssim_v) {
-  double ssim_all = 0;
-  double a, b, c;
-
-  a = vpx_ssim2(source->y_buffer, dest->y_buffer,
-                source->y_stride, dest->y_stride,
-                source->y_crop_width, source->y_crop_height);
-
-  b = vpx_ssim2(source->u_buffer, dest->u_buffer,
-                source->uv_stride, dest->uv_stride,
-                source->uv_crop_width, source->uv_crop_height);
-
-  c = vpx_ssim2(source->v_buffer, dest->v_buffer,
-                source->uv_stride, dest->uv_stride,
-                source->uv_crop_width, source->uv_crop_height);
-  *ssim_y = a;
-  *ssim_u = b;
-  *ssim_v = c;
-  ssim_all = (a * 4 + b + c) / 6;
-
-  return ssim_all;
-}
-
 // traditional ssim as per: http://en.wikipedia.org/wiki/Structural_similarity
 //
 // Re working out the math ->
@@ -455,21 +445,28 @@
 #if CONFIG_VP9_HIGHBITDEPTH
 double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
                             const YV12_BUFFER_CONFIG *dest,
-                            double *weight, unsigned int bd) {
+                            double *weight, uint32_t bd, uint32_t in_bd) {
   double a, b, c;
   double ssimv;
+  uint32_t shift = 0;
+
+  assert(bd >= in_bd);
+  shift = bd - in_bd;
 
   a = vpx_highbd_ssim2(source->y_buffer, dest->y_buffer,
                        source->y_stride, dest->y_stride,
-                       source->y_crop_width, source->y_crop_height, bd);
+                       source->y_crop_width, source->y_crop_height,
+                       in_bd, shift);
 
   b = vpx_highbd_ssim2(source->u_buffer, dest->u_buffer,
                        source->uv_stride, dest->uv_stride,
-                       source->uv_crop_width, source->uv_crop_height, bd);
+                       source->uv_crop_width, source->uv_crop_height,
+                       in_bd, shift);
 
   c = vpx_highbd_ssim2(source->v_buffer, dest->v_buffer,
                        source->uv_stride, dest->uv_stride,
-                       source->uv_crop_width, source->uv_crop_height, bd);
+                       source->uv_crop_width, source->uv_crop_height,
+                       in_bd, shift);
 
   ssimv = a * .8 + .1 * (b + c);
 
@@ -478,28 +475,4 @@
   return ssimv;
 }
 
-double vpx_highbd_calc_ssimg(const YV12_BUFFER_CONFIG *source,
-                             const YV12_BUFFER_CONFIG *dest, double *ssim_y,
-                             double *ssim_u, double *ssim_v, unsigned int bd) {
-  double ssim_all = 0;
-  double a, b, c;
-
-  a = vpx_highbd_ssim2(source->y_buffer, dest->y_buffer,
-                       source->y_stride, dest->y_stride,
-                       source->y_crop_width, source->y_crop_height, bd);
-
-  b = vpx_highbd_ssim2(source->u_buffer, dest->u_buffer,
-                       source->uv_stride, dest->uv_stride,
-                       source->uv_crop_width, source->uv_crop_height, bd);
-
-  c = vpx_highbd_ssim2(source->v_buffer, dest->v_buffer,
-                       source->uv_stride, dest->uv_stride,
-                       source->uv_crop_width, source->uv_crop_height, bd);
-  *ssim_y = a;
-  *ssim_u = b;
-  *ssim_v = c;
-  ssim_all = (a * 4 + b + c) / 6;
-
-  return ssim_all;
-}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
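
The new high-bit-depth SSIM constants follow the same recipe as the existing 8-bit pair: c1 = 64^2 * (0.01 * peak)^2 and c2 = 64^2 * (0.03 * peak)^2 with peak = 255, 1023 or 4095. A standalone check that the hard-coded values are the correctly rounded results:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      const double peaks[3] = { 255.0, 1023.0, 4095.0 };
      int i;
      for (i = 0; i < 3; i++) {
        const double p2 = peaks[i] * peaks[i];
        printf("peak %4.0f: c1 = %.0f, c2 = %.0f\n", peaks[i],
               floor(4096 * 0.01 * 0.01 * p2 + 0.5),
               floor(4096 * 0.03 * 0.03 * p2 + 0.5));
      }
      return 0;  // 26634/239708, 428658/3857925, 6868593/61817334
    }
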
diff --git a/vpx_dsp/ssim.h b/vpx_dsp/ssim.h
index 132f7f9..d4d6b0d 100644
--- a/vpx_dsp/ssim.h
+++ b/vpx_dsp/ssim.h
@@ -11,6 +11,8 @@
 #ifndef VPX_DSP_SSIM_H_
 #define VPX_DSP_SSIM_H_
 
+#define MAX_SSIM_DB 100.0
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -68,30 +70,16 @@
                      const YV12_BUFFER_CONFIG *dest,
                      double *weight);
 
-double vpx_calc_ssimg(const YV12_BUFFER_CONFIG *source,
-                      const YV12_BUFFER_CONFIG *dest,
-                      double *ssim_y, double *ssim_u, double *ssim_v);
-
 double vpx_calc_fastssim(const YV12_BUFFER_CONFIG *source,
                          const YV12_BUFFER_CONFIG *dest,
-                         double *ssim_y, double *ssim_u, double *ssim_v);
-
-double vpx_psnrhvs(const YV12_BUFFER_CONFIG *source,
-                   const YV12_BUFFER_CONFIG *dest,
-                   double *ssim_y, double *ssim_u, double *ssim_v);
+                         double *ssim_y, double *ssim_u,
+                         double *ssim_v, uint32_t bd, uint32_t in_bd);
 
 #if CONFIG_VP9_HIGHBITDEPTH
 double vpx_highbd_calc_ssim(const YV12_BUFFER_CONFIG *source,
                             const YV12_BUFFER_CONFIG *dest,
                             double *weight,
-                            unsigned int bd);
-
-double vpx_highbd_calc_ssimg(const YV12_BUFFER_CONFIG *source,
-                             const YV12_BUFFER_CONFIG *dest,
-                             double *ssim_y,
-                             double *ssim_u,
-                             double *ssim_v,
-                             unsigned int bd);
+                            uint32_t bd, uint32_t in_bd);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #ifdef __cplusplus
diff --git a/vpx_dsp/sum_squares.c b/vpx_dsp/sum_squares.c
new file mode 100644
index 0000000..c72461c
--- /dev/null
+++ b/vpx_dsp/sum_squares.c
@@ -0,0 +1,39 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+
+#include "./vpx_dsp_rtcd.h"
+
+uint64_t vpx_sum_squares_2d_i16_c(const int16_t *src, int src_stride,
+                                  int size) {
+  int r, c;
+  uint64_t ss = 0;
+
+  for (r = 0; r < size; r++) {
+    for (c = 0; c < size; c++) {
+      const int16_t v = src[c];
+      ss += v * v;
+    }
+    src += src_stride;
+  }
+
+  return ss;
+}
+
+uint64_t vpx_sum_squares_i16_c(const int16_t *src, uint32_t n) {
+  uint64_t ss = 0;
+  do {
+    const int16_t v = *src++;
+    ss += v * v;
+  } while (--n);
+
+  return ss;
+}
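
Usage of the new sum-of-squares helpers is direct; a minimal example over a constant 4x4 block, where the expected result is 16 * 3^2 = 144 (this assumes an in-tree build where ./vpx_dsp_rtcd.h declares the C function):

    #include <stdio.h>
    #include "./vpx_dsp_rtcd.h"
    #include "vpx/vpx_integer.h"

    int main(void) {
      int16_t buf[4 * 4];
      int i;
      for (i = 0; i < 16; i++) buf[i] = 3;
      printf("%llu\n", (unsigned long long)vpx_sum_squares_2d_i16_c(buf, 4, 4));
      return 0;  // prints 144
    }
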
diff --git a/vpx_dsp/txfm_common.h b/vpx_dsp/txfm_common.h
index 442e6a5..9b0e990 100644
--- a/vpx_dsp/txfm_common.h
+++ b/vpx_dsp/txfm_common.h
@@ -57,10 +57,13 @@
 static const tran_high_t cospi_30_64 = 1606;
 static const tran_high_t cospi_31_64 = 804;
 
-//  16384 * sqrt(2) * sin(kPi/9) * 2 / 3
+// 16384 * sqrt(2) * sin(kPi/9) * 2 / 3
 static const tran_high_t sinpi_1_9 = 5283;
 static const tran_high_t sinpi_2_9 = 9929;
 static const tran_high_t sinpi_3_9 = 13377;
 static const tran_high_t sinpi_4_9 = 15212;
 
+// 16384 * sqrt(2)
+static const tran_high_t Sqrt2 = 23170;
+
 #endif  // VPX_DSP_TXFM_COMMON_H_
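
Like the sinpi_* constants above it, the new Sqrt2 value is a rounded Q14 fixed-point constant: 16384 * sqrt(2) = 23170.47..., which rounds to 23170. A one-off derivation check (M_PI defined locally, as in psnrhvs.c, since the C standard does not guarantee it):

    #include <math.h>
    #include <stdio.h>

    #if !defined(M_PI)
    # define M_PI (3.141592653589793238462643)
    #endif

    int main(void) {
      printf("%.0f\n", floor(16384 * sqrt(2.0) + 0.5));  // 23170 (Sqrt2)
      printf("%.0f\n",                                   // 5283 (sinpi_1_9)
             floor(16384 * sqrt(2.0) * sin(M_PI / 9) * 2 / 3 + 0.5));
      return 0;
    }
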
diff --git a/vpx_dsp/variance.c b/vpx_dsp/variance.c
index d960c54..3fd80dc 100644
--- a/vpx_dsp/variance.c
+++ b/vpx_dsp/variance.c
@@ -7,6 +7,7 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+#include <stdlib.h>
 
 #include "./vpx_config.h"
 #include "./vpx_dsp_rtcd.h"
@@ -15,17 +16,7 @@
 #include "vpx/vpx_integer.h"
 
 #include "vpx_dsp/variance.h"
-
-static const uint8_t bilinear_filters[8][2] = {
-  { 128,   0  },
-  { 112,  16  },
-  {  96,  32  },
-  {  80,  48  },
-  {  64,  64  },
-  {  48,  80  },
-  {  32,  96  },
-  {  16, 112  },
-};
+#include "vpx_dsp/vpx_filter.h"
 
 uint32_t vpx_get4x4sse_cs_c(const uint8_t *a, int  a_stride,
                             const uint8_t *b, int  b_stride) {
@@ -175,9 +166,9 @@
   uint8_t temp2[H * W]; \
 \
   var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
-                                    bilinear_filters[xoffset]); \
+                                    bilinear_filters_2t[xoffset]); \
   var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                     bilinear_filters[yoffset]); \
+                                     bilinear_filters_2t[yoffset]); \
 \
   return vpx_variance##W##x##H##_c(temp2, W, b, b_stride, sse); \
 }
@@ -195,9 +186,9 @@
   DECLARE_ALIGNED(16, uint8_t, temp3[H * W]); \
 \
   var_filter_block2d_bil_first_pass(a, fdata3, a_stride, 1, H + 1, W, \
-                                    bilinear_filters[xoffset]); \
+                                    bilinear_filters_2t[xoffset]); \
   var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                     bilinear_filters[yoffset]); \
+                                     bilinear_filters_2t[yoffset]); \
 \
   vpx_comp_avg_pred(temp3, second_pred, W, H, temp2, W); \
 \
@@ -234,6 +225,11 @@
     SUBPIX_VAR(W, H) \
     SUBPIX_AVG_VAR(W, H)
 
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+VARIANCES(128, 128)
+VARIANCES(128, 64)
+VARIANCES(64, 128)
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
 VARIANCES(64, 64)
 VARIANCES(64, 32)
 VARIANCES(32, 64)
@@ -272,6 +268,39 @@
   }
 }
 
+// Get pred block from up-sampled reference.
+void vpx_upsampled_pred_c(uint8_t *comp_pred,
+                          int width, int height,
+                          const uint8_t *ref, int ref_stride) {
+  int i, j, k;
+  int stride = ref_stride << 3;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0, k = 0; j < width; j++, k += 8) {
+      comp_pred[j] = ref[k];
+    }
+    comp_pred += width;
+    ref += stride;
+  }
+}
+
+void vpx_comp_avg_upsampled_pred_c(uint8_t *comp_pred, const uint8_t *pred,
+                                   int width, int height,
+                                   const uint8_t *ref, int ref_stride) {
+  int i, j;
+  int stride = ref_stride << 3;
+
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) {
+      const int tmp = ref[(j << 3)] + pred[j];
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += stride;
+  }
+}
+
 #if CONFIG_VP9_HIGHBITDEPTH
 static void highbd_variance64(const uint8_t *a8, int  a_stride,
                               const uint8_t *b8, int  b_stride,
@@ -409,7 +438,7 @@
   return *sse; \
 }
 
-static void highbd_var_filter_block2d_bil_first_pass(
+void vpx_highbd_var_filter_block2d_bil_first_pass(
     const uint8_t *src_ptr8,
     uint16_t *output_ptr,
     unsigned int src_pixels_per_line,
@@ -435,7 +464,7 @@
   }
 }
 
-static void highbd_var_filter_block2d_bil_second_pass(
+void vpx_highbd_var_filter_block2d_bil_second_pass(
     const uint16_t *src_ptr,
     uint16_t *output_ptr,
     unsigned int src_pixels_per_line,
@@ -468,13 +497,14 @@
   uint16_t fdata3[(H + 1) * W]; \
   uint16_t temp2[H * W]; \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), W, dst, \
-                                          dst_stride, sse); \
+                                            dst_stride, sse); \
 } \
 \
 uint32_t vpx_highbd_10_sub_pixel_variance##W##x##H##_c( \
@@ -485,10 +515,11 @@
   uint16_t fdata3[(H + 1) * W]; \
   uint16_t temp2[H * W]; \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                              W, dst, dst_stride, sse); \
@@ -502,10 +533,11 @@
   uint16_t fdata3[(H + 1) * W]; \
   uint16_t temp2[H * W]; \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
   return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
                                              W, dst, dst_stride, sse); \
@@ -522,16 +554,17 @@
   uint16_t temp2[H * W]; \
   DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
-  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
-                           CONVERT_TO_BYTEPTR(temp2), W); \
+  vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+                             CONVERT_TO_BYTEPTR(temp2), W); \
 \
   return vpx_highbd_8_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), W, dst, \
-                                          dst_stride, sse); \
+                                            dst_stride, sse);           \
 } \
 \
 uint32_t vpx_highbd_10_sub_pixel_avg_variance##W##x##H##_c( \
@@ -544,13 +577,14 @@
   uint16_t temp2[H * W]; \
   DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
-  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
-                           CONVERT_TO_BYTEPTR(temp2), W); \
+  vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+                             CONVERT_TO_BYTEPTR(temp2), W); \
 \
   return vpx_highbd_10_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
                                              W, dst, dst_stride, sse); \
@@ -566,13 +600,14 @@
   uint16_t temp2[H * W]; \
   DECLARE_ALIGNED(16, uint16_t, temp3[H * W]); \
 \
-  highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, \
-                                           W, bilinear_filters[xoffset]); \
-  highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
-                                            bilinear_filters[yoffset]); \
+  vpx_highbd_var_filter_block2d_bil_first_pass( \
+      src, fdata3, src_stride, 1, H + 1, \
+      W, bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
 \
-  vpx_highbd_comp_avg_pred(temp3, second_pred, W, H, \
-                           CONVERT_TO_BYTEPTR(temp2), W); \
+  vpx_highbd_comp_avg_pred_c(temp3, second_pred, W, H, \
+                             CONVERT_TO_BYTEPTR(temp2), W); \
 \
   return vpx_highbd_12_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp3), \
                                              W, dst, dst_stride, sse); \
@@ -584,6 +619,11 @@
     HIGHBD_SUBPIX_VAR(W, H) \
     HIGHBD_SUBPIX_AVG_VAR(W, H)
 
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+HIGHBD_VARIANCES(128, 128)
+HIGHBD_VARIANCES(128, 64)
+HIGHBD_VARIANCES(64, 128)
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
 HIGHBD_VARIANCES(64, 64)
 HIGHBD_VARIANCES(64, 32)
 HIGHBD_VARIANCES(32, 64)
@@ -606,9 +646,9 @@
 HIGHBD_MSE(8, 16)
 HIGHBD_MSE(8, 8)
 
-void vpx_highbd_comp_avg_pred(uint16_t *comp_pred, const uint8_t *pred8,
-                              int width, int height, const uint8_t *ref8,
-                              int ref_stride) {
+void vpx_highbd_comp_avg_pred_c(uint16_t *comp_pred, const uint8_t *pred8,
+                                int width, int height, const uint8_t *ref8,
+                                int ref_stride) {
   int i, j;
   uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
   uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
@@ -622,4 +662,672 @@
     ref += ref_stride;
   }
 }
+
+void vpx_highbd_upsampled_pred_c(uint16_t *comp_pred,
+                                 int width, int height,
+                                 const uint8_t *ref8,
+                                 int ref_stride) {
+  int i, j;
+  int stride = ref_stride << 3;
+
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      comp_pred[j] = ref[(j << 3)];
+    }
+    comp_pred += width;
+    ref += stride;
+  }
+}
+
+void vpx_highbd_comp_avg_upsampled_pred_c(uint16_t *comp_pred,
+                                          const uint8_t *pred8,
+                                          int width, int height,
+                                          const uint8_t *ref8,
+                                          int ref_stride) {
+  int i, j;
+  int stride = ref_stride << 3;
+
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+  for (i = 0; i < height; ++i) {
+    for (j = 0; j < width; ++j) {
+      const int tmp = pred[j] + ref[(j << 3)];
+      comp_pred[j] = ROUND_POWER_OF_TWO(tmp, 1);
+    }
+    comp_pred += width;
+    pred += width;
+    ref += stride;
+  }
+}
 #endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#if CONFIG_VP10 && CONFIG_EXT_INTER
+void masked_variance(const uint8_t *a, int  a_stride,
+                     const uint8_t *b, int  b_stride,
+                     const uint8_t *m, int  m_stride,
+                     int  w, int  h, unsigned int *sse, int *sum) {
+  int i, j;
+
+  int64_t sum64 = 0;
+  uint64_t sse64 = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = (a[j] - b[j]) * (m[j]);
+      sum64 += diff;
+      sse64 += diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+  sum64 = (sum64 >= 0) ? sum64 : -sum64;
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 6);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(sse64, 12);
+}
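
Note the asymmetric renormalization in masked_variance() above: each difference carries one factor of the 6-bit mask, so the sum is rounded down by 6 bits, while the sum of squares carries the factor twice and is rounded down by 12. A standalone numeric check (ROUND_POWER_OF_TWO restated locally to match the library macro):

    #include <stdio.h>

    #define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n) - 1))) >> (n))

    int main(void) {
      const long long diff = 3 * 64;  // (a - b) == 3 at full mask weight 64
      printf("sum=%lld sse=%lld\n",
             (long long)ROUND_POWER_OF_TWO(diff, 6),           // 3
             (long long)ROUND_POWER_OF_TWO(diff * diff, 12));  // 9
      return 0;
    }
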
+
+#define MASK_VAR(W, H) \
+unsigned int vpx_masked_variance##W##x##H##_c(const uint8_t *a, int a_stride, \
+                                              const uint8_t *b, int b_stride, \
+                                              const uint8_t *m, int m_stride, \
+                                              unsigned int *sse) { \
+  int sum; \
+  masked_variance(a, a_stride, b, b_stride, m, m_stride, W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+}
+
+#define MASK_SUBPIX_VAR(W, H) \
+unsigned int vpx_masked_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  const uint8_t *msk, int msk_stride, \
+  unsigned int *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint8_t temp2[H * W]; \
+\
+  var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, H + 1, W, \
+                                    bilinear_filters_2t[xoffset]); \
+  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                     bilinear_filters_2t[yoffset]); \
+\
+  return vpx_masked_variance##W##x##H##_c(temp2, W, dst, dst_stride, \
+                                          msk, msk_stride, sse); \
+}
+
+MASK_VAR(4, 4)
+MASK_SUBPIX_VAR(4, 4)
+
+MASK_VAR(4, 8)
+MASK_SUBPIX_VAR(4, 8)
+
+MASK_VAR(8, 4)
+MASK_SUBPIX_VAR(8, 4)
+
+MASK_VAR(8, 8)
+MASK_SUBPIX_VAR(8, 8)
+
+MASK_VAR(8, 16)
+MASK_SUBPIX_VAR(8, 16)
+
+MASK_VAR(16, 8)
+MASK_SUBPIX_VAR(16, 8)
+
+MASK_VAR(16, 16)
+MASK_SUBPIX_VAR(16, 16)
+
+MASK_VAR(16, 32)
+MASK_SUBPIX_VAR(16, 32)
+
+MASK_VAR(32, 16)
+MASK_SUBPIX_VAR(32, 16)
+
+MASK_VAR(32, 32)
+MASK_SUBPIX_VAR(32, 32)
+
+MASK_VAR(32, 64)
+MASK_SUBPIX_VAR(32, 64)
+
+MASK_VAR(64, 32)
+MASK_SUBPIX_VAR(64, 32)
+
+MASK_VAR(64, 64)
+MASK_SUBPIX_VAR(64, 64)
+
+#if CONFIG_EXT_PARTITION
+MASK_VAR(64, 128)
+MASK_SUBPIX_VAR(64, 128)
+
+MASK_VAR(128, 64)
+MASK_SUBPIX_VAR(128, 64)
+
+MASK_VAR(128, 128)
+MASK_SUBPIX_VAR(128, 128)
+#endif  // CONFIG_EXT_PARTITION
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void highbd_masked_variance64(const uint8_t *a8, int  a_stride,
+                              const uint8_t *b8, int  b_stride,
+                              const uint8_t *m, int  m_stride,
+                              int  w, int  h,
+                              uint64_t *sse, int64_t *sum) {
+  int i, j;
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+  *sum = 0;
+  *sse = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      const int diff = (a[j] - b[j]) * (m[j]);
+      *sum += (int64_t)diff;
+      *sse += (int64_t)diff * diff;
+    }
+
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+  *sum = (*sum >= 0) ? *sum : -*sum;
+  *sum = ROUND_POWER_OF_TWO(*sum, 6);
+  *sse = ROUND_POWER_OF_TWO(*sse, 12);
+}
+
+void highbd_masked_variance(const uint8_t *a8, int  a_stride,
+                            const uint8_t *b8, int  b_stride,
+                            const uint8_t *m, int  m_stride,
+                            int  w, int  h,
+                            unsigned int *sse, int *sum) {
+  int64_t sum64;
+  uint64_t sse64;
+  highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride,
+                           w, h, &sse64, &sum64);
+  *sum = (int)sum64;
+  *sse = (unsigned int)sse64;
+}
+
+void highbd_10_masked_variance(const uint8_t *a8, int  a_stride,
+                               const uint8_t *b8, int  b_stride,
+                               const uint8_t *m, int  m_stride,
+                               int  w, int  h,
+                               unsigned int *sse, int *sum) {
+  int64_t sum64;
+  uint64_t sse64;
+  highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride,
+                           w, h, &sse64, &sum64);
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+void highbd_12_masked_variance(const uint8_t *a8, int  a_stride,
+                               const uint8_t *b8, int  b_stride,
+                               const uint8_t *m, int  m_stride,
+                               int  w, int  h,
+                               unsigned int *sse, int *sum) {
+  int64_t sum64;
+  uint64_t sse64;
+  highbd_masked_variance64(a8, a_stride, b8, b_stride, m, m_stride,
+                           w, h, &sse64, &sum64);
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
+
+#define HIGHBD_MASK_VAR(W, H) \
+unsigned int vpx_highbd_masked_variance##W##x##H##_c(const uint8_t *a, \
+                                                     int a_stride, \
+                                                     const uint8_t *b, \
+                                                     int b_stride, \
+                                                     const uint8_t *m, \
+                                                     int m_stride, \
+                                                     unsigned int *sse) { \
+  int sum; \
+  highbd_masked_variance(a, a_stride, b, b_stride, m, m_stride, \
+                         W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+unsigned int vpx_highbd_10_masked_variance##W##x##H##_c(const uint8_t *a, \
+                                                        int a_stride, \
+                                                        const uint8_t *b, \
+                                                        int b_stride, \
+                                                        const uint8_t *m, \
+                                                        int m_stride, \
+                                                        unsigned int *sse) { \
+  int sum; \
+  highbd_10_masked_variance(a, a_stride, b, b_stride, m, m_stride, \
+                            W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+} \
+\
+unsigned int vpx_highbd_12_masked_variance##W##x##H##_c(const uint8_t *a, \
+                                                        int a_stride, \
+                                                        const uint8_t *b, \
+                                                        int b_stride, \
+                                                        const uint8_t *m, \
+                                                        int m_stride, \
+                                                        unsigned int *sse) { \
+  int sum; \
+  highbd_12_masked_variance(a, a_stride, b, b_stride, m, m_stride, \
+                            W, H, sse, &sum); \
+  return *sse - (((int64_t)sum * sum) / (W * H)); \
+}
+
+#define HIGHBD_MASK_SUBPIX_VAR(W, H) \
+unsigned int vpx_highbd_masked_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  const uint8_t *msk, int msk_stride, \
+  unsigned int *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  vpx_highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
+                                               H + 1, W, \
+                                               bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
+\
+  return vpx_highbd_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+                                                 W, dst, dst_stride, \
+                                                 msk, msk_stride, sse); \
+} \
+\
+unsigned int vpx_highbd_10_masked_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  const uint8_t *msk, int msk_stride, \
+  unsigned int *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  vpx_highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
+                                               H + 1, W, \
+                                               bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
+\
+  return vpx_highbd_10_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+                                                    W, dst, dst_stride, \
+                                                    msk, msk_stride, sse); \
+} \
+\
+unsigned int vpx_highbd_12_masked_sub_pixel_variance##W##x##H##_c( \
+  const uint8_t *src, int  src_stride, \
+  int xoffset, int  yoffset, \
+  const uint8_t *dst, int dst_stride, \
+  const uint8_t *msk, int msk_stride, \
+  unsigned int *sse) { \
+  uint16_t fdata3[(H + 1) * W]; \
+  uint16_t temp2[H * W]; \
+\
+  vpx_highbd_var_filter_block2d_bil_first_pass(src, fdata3, src_stride, 1, \
+                                               H + 1, W, \
+                                               bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W, \
+                                                bilinear_filters_2t[yoffset]); \
+\
+  return vpx_highbd_12_masked_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2), \
+                                                    W, dst, dst_stride, \
+                                                    msk, msk_stride, sse); \
+}
+
+HIGHBD_MASK_VAR(4, 4)
+HIGHBD_MASK_SUBPIX_VAR(4, 4)
+
+HIGHBD_MASK_VAR(4, 8)
+HIGHBD_MASK_SUBPIX_VAR(4, 8)
+
+HIGHBD_MASK_VAR(8, 4)
+HIGHBD_MASK_SUBPIX_VAR(8, 4)
+
+HIGHBD_MASK_VAR(8, 8)
+HIGHBD_MASK_SUBPIX_VAR(8, 8)
+
+HIGHBD_MASK_VAR(8, 16)
+HIGHBD_MASK_SUBPIX_VAR(8, 16)
+
+HIGHBD_MASK_VAR(16, 8)
+HIGHBD_MASK_SUBPIX_VAR(16, 8)
+
+HIGHBD_MASK_VAR(16, 16)
+HIGHBD_MASK_SUBPIX_VAR(16, 16)
+
+HIGHBD_MASK_VAR(16, 32)
+HIGHBD_MASK_SUBPIX_VAR(16, 32)
+
+HIGHBD_MASK_VAR(32, 16)
+HIGHBD_MASK_SUBPIX_VAR(32, 16)
+
+HIGHBD_MASK_VAR(32, 32)
+HIGHBD_MASK_SUBPIX_VAR(32, 32)
+
+HIGHBD_MASK_VAR(32, 64)
+HIGHBD_MASK_SUBPIX_VAR(32, 64)
+
+HIGHBD_MASK_VAR(64, 32)
+HIGHBD_MASK_SUBPIX_VAR(64, 32)
+
+HIGHBD_MASK_VAR(64, 64)
+HIGHBD_MASK_SUBPIX_VAR(64, 64)
+
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASK_VAR(64, 128)
+HIGHBD_MASK_SUBPIX_VAR(64, 128)
+
+HIGHBD_MASK_VAR(128, 64)
+HIGHBD_MASK_SUBPIX_VAR(128, 64)
+
+HIGHBD_MASK_VAR(128, 128)
+HIGHBD_MASK_SUBPIX_VAR(128, 128)
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_VP10 && CONFIG_EXT_INTER
+
+#if CONFIG_VP10 && CONFIG_OBMC
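+// Accumulates the sum and sum of squares of the OBMC residual
+// (wsrc[j] - pre[j] * mask[j]), where wsrc and mask carry 12 fractional
+// bits, over a w x h block.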
+static INLINE void obmc_variance(const uint8_t *pre, int  pre_stride,
+                                 const int32_t *wsrc, const int32_t *mask,
+                                 int w, int h, unsigned int *sse, int *sum) {
+  int i, j;
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    pre += pre_stride;
+    wsrc += w;
+    mask += w;
+  }
+}
+
+#define OBMC_VAR(W, H) \
+unsigned int vpx_obmc_variance##W##x##H##_c(const uint8_t *pre,               \
+                                            int pre_stride,                   \
+                                            const int32_t *wsrc,              \
+                                            const int32_t *mask,              \
+                                            unsigned int *sse) {              \
+  int sum;                                                                    \
+  obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);                \
+  return *sse - (((int64_t)sum * sum) / (W * H));                             \
+}
+
+#define OBMC_SUBPIX_VAR(W, H) \
+unsigned int vpx_obmc_sub_pixel_variance##W##x##H##_c(const uint8_t *pre,     \
+                                                      int pre_stride,         \
+                                                      int xoffset,            \
+                                                      int yoffset,            \
+                                                      const int32_t *wsrc,    \
+                                                      const int32_t *mask,    \
+                                                      unsigned int *sse) {    \
+  uint16_t fdata3[(H + 1) * W];                                               \
+  uint8_t temp2[H * W];                                                       \
+                                                                              \
+  var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1, H + 1, W,     \
+                                    bilinear_filters_2t[xoffset]);            \
+  var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,               \
+                                     bilinear_filters_2t[yoffset]);           \
+                                                                              \
+  return vpx_obmc_variance##W##x##H##_c(temp2, W, wsrc, mask, sse);           \
+}
+
+OBMC_VAR(4, 4)
+OBMC_SUBPIX_VAR(4, 4)
+
+OBMC_VAR(4, 8)
+OBMC_SUBPIX_VAR(4, 8)
+
+OBMC_VAR(8, 4)
+OBMC_SUBPIX_VAR(8, 4)
+
+OBMC_VAR(8, 8)
+OBMC_SUBPIX_VAR(8, 8)
+
+OBMC_VAR(8, 16)
+OBMC_SUBPIX_VAR(8, 16)
+
+OBMC_VAR(16, 8)
+OBMC_SUBPIX_VAR(16, 8)
+
+OBMC_VAR(16, 16)
+OBMC_SUBPIX_VAR(16, 16)
+
+OBMC_VAR(16, 32)
+OBMC_SUBPIX_VAR(16, 32)
+
+OBMC_VAR(32, 16)
+OBMC_SUBPIX_VAR(32, 16)
+
+OBMC_VAR(32, 32)
+OBMC_SUBPIX_VAR(32, 32)
+
+OBMC_VAR(32, 64)
+OBMC_SUBPIX_VAR(32, 64)
+
+OBMC_VAR(64, 32)
+OBMC_SUBPIX_VAR(64, 32)
+
+OBMC_VAR(64, 64)
+OBMC_SUBPIX_VAR(64, 64)
+
+#if CONFIG_EXT_PARTITION
+OBMC_VAR(64, 128)
+OBMC_SUBPIX_VAR(64, 128)
+
+OBMC_VAR(128, 64)
+OBMC_SUBPIX_VAR(128, 64)
+
+OBMC_VAR(128, 128)
+OBMC_SUBPIX_VAR(128, 128)
+#endif  // CONFIG_EXT_PARTITION
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void highbd_obmc_variance64(const uint8_t *pre8, int pre_stride,
+                                          const int32_t *wsrc,
+                                          const int32_t *mask,
+                                          int w, int h,
+                                          uint64_t *sse, int64_t *sum) {
+  int i, j;
+  uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+
+  *sse = 0;
+  *sum = 0;
+
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j++) {
+      int diff = ROUND_POWER_OF_TWO_SIGNED(wsrc[j] - pre[j] * mask[j], 12);
+      *sum += diff;
+      *sse += diff * diff;
+    }
+
+    pre += pre_stride;
+    wsrc += w;
+    mask += w;
+  }
+}
+
+static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
+                                        const int32_t *wsrc,
+                                        const int32_t *mask,
+                                        int w, int h,
+                                        unsigned int *sse, int *sum) {
+  int64_t sum64;
+  uint64_t sse64;
+  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
+  *sum = (int)sum64;
+  *sse = (unsigned int)sse64;
+}
+
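+// The 10- and 12-bit variants renormalize the accumulators to 8-bit
+// scale: sum scales by 2^(bd-8) and sse by 2^(2*(bd-8)), hence the
+// shifts of 2/4 bits for 10-bit and 4/8 bits for 12-bit below.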
+static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int  pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask,
+                                           int w, int h,
+                                           unsigned int *sse, int *sum) {
+  int64_t sum64;
+  uint64_t sse64;
+  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int  pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask,
+                                           int w, int h,
+                                           unsigned int *sse, int *sum) {
+  int64_t sum64;
+  uint64_t sse64;
+  highbd_obmc_variance64(pre8, pre_stride, wsrc, mask, w, h, &sse64, &sum64);
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
+
+#define HIGHBD_OBMC_VAR(W, H)                                                 \
+unsigned int vpx_highbd_obmc_variance##W##x##H##_c(const uint8_t *pre,        \
+                                                   int pre_stride,            \
+                                                   const int32_t *wsrc,       \
+                                                   const int32_t *mask,       \
+                                                   unsigned int *sse) {       \
+  int sum;                                                                    \
+  highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);         \
+  return *sse - (((int64_t)sum * sum) / (W * H));                             \
+}                                                                             \
+                                                                              \
+unsigned int vpx_highbd_10_obmc_variance##W##x##H##_c(const uint8_t *pre,     \
+                                                      int pre_stride,         \
+                                                      const int32_t *wsrc,    \
+                                                      const int32_t *mask,    \
+                                                      unsigned int *sse) {    \
+  int sum;                                                                    \
+  highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);      \
+  return *sse - (((int64_t)sum * sum) / (W * H));                             \
+}                                                                             \
+                                                                              \
+unsigned int vpx_highbd_12_obmc_variance##W##x##H##_c(const uint8_t *pre,     \
+                                                      int pre_stride,         \
+                                                      const int32_t *wsrc,    \
+                                                      const int32_t *mask,    \
+                                                      unsigned int *sse) {    \
+  int sum;                                                                    \
+  highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);      \
+  return *sse - (((int64_t)sum * sum) / (W * H));                             \
+}
+
+#define HIGHBD_OBMC_SUBPIX_VAR(W, H)                                          \
+unsigned int vpx_highbd_obmc_sub_pixel_variance##W##x##H##_c(                 \
+                                        const uint8_t *pre, int pre_stride,   \
+                                        int xoffset, int  yoffset,            \
+                                        const int32_t *wsrc,                  \
+                                        const int32_t *mask,                  \
+                                        unsigned int *sse) {                  \
+  uint16_t fdata3[(H + 1) * W];                                               \
+  uint16_t temp2[H * W];                                                      \
+                                                                              \
+  vpx_highbd_var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1,    \
+                                               H + 1, W,                      \
+                                               bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,    \
+                                               bilinear_filters_2t[yoffset]); \
+                                                                              \
+  return vpx_highbd_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),     \
+                                               W, wsrc, mask, sse);           \
+}                                                                             \
+                                                                              \
+unsigned int vpx_highbd_10_obmc_sub_pixel_variance##W##x##H##_c(              \
+                                        const uint8_t *pre, int pre_stride,   \
+                                        int xoffset, int  yoffset,            \
+                                        const int32_t *wsrc,                  \
+                                        const int32_t *mask,                  \
+                                        unsigned int *sse) {                  \
+  uint16_t fdata3[(H + 1) * W];                                               \
+  uint16_t temp2[H * W];                                                      \
+                                                                              \
+  vpx_highbd_var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1,    \
+                                               H + 1, W,                      \
+                                               bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,    \
+                                               bilinear_filters_2t[yoffset]); \
+                                                                              \
+  return vpx_highbd_10_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),  \
+                                                  W, wsrc, mask, sse);        \
+}                                                                             \
+                                                                              \
+unsigned int vpx_highbd_12_obmc_sub_pixel_variance##W##x##H##_c(              \
+                                        const uint8_t *pre, int pre_stride,   \
+                                        int xoffset, int  yoffset,            \
+                                        const int32_t *wsrc,                  \
+                                        const int32_t *mask,                  \
+                                        unsigned int *sse) {                  \
+  uint16_t fdata3[(H + 1) * W];                                               \
+  uint16_t temp2[H * W];                                                      \
+                                                                              \
+  vpx_highbd_var_filter_block2d_bil_first_pass(pre, fdata3, pre_stride, 1,    \
+                                               H + 1, W,                      \
+                                               bilinear_filters_2t[xoffset]); \
+  vpx_highbd_var_filter_block2d_bil_second_pass(fdata3, temp2, W, W, H, W,    \
+                                               bilinear_filters_2t[yoffset]); \
+                                                                              \
+  return vpx_highbd_12_obmc_variance##W##x##H##_c(CONVERT_TO_BYTEPTR(temp2),  \
+                                                  W, wsrc, mask, sse);        \
+}
+
+HIGHBD_OBMC_VAR(4, 4)
+HIGHBD_OBMC_SUBPIX_VAR(4, 4)
+
+HIGHBD_OBMC_VAR(4, 8)
+HIGHBD_OBMC_SUBPIX_VAR(4, 8)
+
+HIGHBD_OBMC_VAR(8, 4)
+HIGHBD_OBMC_SUBPIX_VAR(8, 4)
+
+HIGHBD_OBMC_VAR(8, 8)
+HIGHBD_OBMC_SUBPIX_VAR(8, 8)
+
+HIGHBD_OBMC_VAR(8, 16)
+HIGHBD_OBMC_SUBPIX_VAR(8, 16)
+
+HIGHBD_OBMC_VAR(16, 8)
+HIGHBD_OBMC_SUBPIX_VAR(16, 8)
+
+HIGHBD_OBMC_VAR(16, 16)
+HIGHBD_OBMC_SUBPIX_VAR(16, 16)
+
+HIGHBD_OBMC_VAR(16, 32)
+HIGHBD_OBMC_SUBPIX_VAR(16, 32)
+
+HIGHBD_OBMC_VAR(32, 16)
+HIGHBD_OBMC_SUBPIX_VAR(32, 16)
+
+HIGHBD_OBMC_VAR(32, 32)
+HIGHBD_OBMC_SUBPIX_VAR(32, 32)
+
+HIGHBD_OBMC_VAR(32, 64)
+HIGHBD_OBMC_SUBPIX_VAR(32, 64)
+
+HIGHBD_OBMC_VAR(64, 32)
+HIGHBD_OBMC_SUBPIX_VAR(64, 32)
+
+HIGHBD_OBMC_VAR(64, 64)
+HIGHBD_OBMC_SUBPIX_VAR(64, 64)
+
+#if CONFIG_EXT_PARTITION
+HIGHBD_OBMC_VAR(64, 128)
+HIGHBD_OBMC_SUBPIX_VAR(64, 128)
+
+HIGHBD_OBMC_VAR(128, 64)
+HIGHBD_OBMC_SUBPIX_VAR(128, 64)
+
+HIGHBD_OBMC_VAR(128, 128)
+HIGHBD_OBMC_SUBPIX_VAR(128, 128)
+#endif  // CONFIG_EXT_PARTITION
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+#endif  // CONFIG_VP10 && CONFIG_OBMC
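Note: the masked and OBMC kernels above all reduce to the same identity,
var = sse - sum^2 / (W*H), computed over a filtered residual. A minimal
standalone sketch of the OBMC reduction (plain C; round_pow2_signed,
obmc_var and the toy 4x4 inputs are illustrative, not part of this patch):

    #include <stdint.h>
    #include <stdio.h>

    /* Mirrors ROUND_POWER_OF_TWO_SIGNED: round to nearest after
     * dropping `bits` fractional bits, symmetric around zero. */
    static int round_pow2_signed(int v, int bits) {
      return v < 0 ? -((-v + (1 << (bits - 1))) >> bits)
                   : (v + (1 << (bits - 1))) >> bits;
    }

    /* Variance of a w x h OBMC residual, as in obmc_variance():
     * wsrc and mask carry 12 fractional bits. */
    static unsigned int obmc_var(const uint8_t *pre, int pre_stride,
                                 const int32_t *wsrc, const int32_t *mask,
                                 int w, int h, unsigned int *sse) {
      int sum = 0;
      *sse = 0;
      for (int i = 0; i < h; i++) {
        for (int j = 0; j < w; j++) {
          const int diff = round_pow2_signed(wsrc[j] - pre[j] * mask[j], 12);
          sum += diff;
          *sse += diff * diff;
        }
        pre += pre_stride;
        wsrc += w;
        mask += w;
      }
      return *sse - (unsigned int)(((int64_t)sum * sum) / (w * h));
    }

    int main(void) {
      uint8_t pre[16];
      int32_t wsrc[16], mask[16];
      for (int k = 0; k < 16; k++) {
        pre[k] = 128;                     /* flat prediction */
        mask[k] = 1 << 12;                /* full weight */
        wsrc[k] = (128 + (k & 3)) << 12;  /* source ramps 128..131 */
      }
      unsigned int sse;
      /* diffs per row are 0,1,2,3: sum = 24, sse = 56,
       * var = 56 - 24*24/16 = 20. */
      printf("var = %u\n", obmc_var(pre, 4, wsrc, mask, 4, 4, &sse));
      return 0;
    }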
diff --git a/vpx_dsp/variance.h b/vpx_dsp/variance.h
index cd0fd98..aaef8c0 100644
--- a/vpx_dsp/variance.h
+++ b/vpx_dsp/variance.h
@@ -23,10 +23,10 @@
 #define FILTER_WEIGHT 128
 
 typedef unsigned int(*vpx_sad_fn_t)(const uint8_t *a, int a_stride,
-                                    const uint8_t *b_ptr, int b_stride);
+                                    const uint8_t *b, int b_stride);
 
-typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a_ptr, int a_stride,
-                                        const uint8_t *b_ptr, int b_stride,
+typedef unsigned int(*vpx_sad_avg_fn_t)(const uint8_t *a, int a_stride,
+                                        const uint8_t *b, int b_stride,
                                         const uint8_t *second_pred);
 
 typedef void (*vp8_copy32xn_fn_t)(const uint8_t *a, int a_stride,
@@ -50,10 +50,10 @@
                                                 const uint8_t *b, int b_stride,
                                                 unsigned int *sse);
 
-typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a_ptr,
+typedef unsigned int (*vpx_subp_avg_variance_fn_t)(const uint8_t *a,
                                                    int a_stride,
                                                    int xoffset, int yoffset,
-                                                   const uint8_t *b_ptr,
+                                                   const uint8_t *b,
                                                    int b_stride,
                                                    unsigned int *sse,
                                                    const uint8_t *second_pred);
@@ -74,7 +74,49 @@
 } vp8_variance_fn_ptr_t;
 #endif  // CONFIG_VP8
 
-#if CONFIG_VP9 || CONFIG_VP10
+#if CONFIG_VP10 && CONFIG_EXT_INTER
+typedef unsigned int(*vpx_masked_sad_fn_t)(const uint8_t *src,
+                                           int src_stride,
+                                           const uint8_t *ref,
+                                           int ref_stride,
+                                           const uint8_t *msk_ptr,
+                                           int msk_stride);
+typedef unsigned int (*vpx_masked_variance_fn_t)(const uint8_t *src,
+                                                 int src_stride,
+                                                 const uint8_t *ref,
+                                                 int ref_stride,
+                                                 const uint8_t *msk,
+                                                 int msk_stride,
+                                                 unsigned int *sse);
+typedef unsigned int (*vpx_masked_subpixvariance_fn_t)(const uint8_t *src,
+                                                       int src_stride,
+                                                       int xoffset, int yoffset,
+                                                       const uint8_t *ref,
+                                                       int ref_stride,
+                                                       const uint8_t *msk,
+                                                       int msk_stride,
+                                                       unsigned int *sse);
+#endif  // CONFIG_VP10 && CONFIG_EXT_INTER
+
+#if CONFIG_VP10 && CONFIG_OBMC
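+// OBMC kernels compare a prediction against a weighted source (wsrc)
+// and per-pixel weights (msk), both carrying 12 fractional bits,
+// instead of a plain reference block.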
+typedef unsigned int(*vpx_obmc_sad_fn_t)(const uint8_t *pred,
+                                         int pred_stride,
+                                         const int32_t *wsrc,
+                                         const int32_t *msk);
+typedef unsigned int (*vpx_obmc_variance_fn_t)(const uint8_t *pred,
+                                               int pred_stride,
+                                               const int32_t *wsrc,
+                                               const int32_t *msk,
+                                               unsigned int *sse);
+typedef unsigned int (*vpx_obmc_subpixvariance_fn_t)(const uint8_t *pred,
+                                                     int pred_stride,
+                                                     int xoffset, int yoffset,
+                                                     const int32_t *wsrc,
+                                                     const int32_t *msk,
+                                                     unsigned int *sse);
+#endif  // CONFIG_VP10 && CONFIG_OBMC
+
+#if CONFIG_VP9
 typedef struct vp9_variance_vtable {
   vpx_sad_fn_t               sdf;
   vpx_sad_avg_fn_t           sdaf;
@@ -85,7 +127,48 @@
   vpx_sad_multi_fn_t         sdx8f;
   vpx_sad_multi_d_fn_t       sdx4df;
 } vp9_variance_fn_ptr_t;
-#endif  // CONFIG_VP9 || CONFIG_VP10
+#endif  // CONFIG_VP9
+
+#if CONFIG_VP10
+typedef struct vp10_variance_vtable {
+  vpx_sad_fn_t                   sdf;
+  vpx_sad_avg_fn_t               sdaf;
+  vpx_variance_fn_t              vf;
+  vpx_subpixvariance_fn_t        svf;
+  vpx_subp_avg_variance_fn_t     svaf;
+  vpx_sad_multi_fn_t             sdx3f;
+  vpx_sad_multi_fn_t             sdx8f;
+  vpx_sad_multi_d_fn_t           sdx4df;
+#if CONFIG_EXT_INTER
+  vpx_masked_sad_fn_t            msdf;
+  vpx_masked_variance_fn_t       mvf;
+  vpx_masked_subpixvariance_fn_t msvf;
+#endif  // CONFIG_EXT_INTER
+#if CONFIG_OBMC
+  vpx_obmc_sad_fn_t              osdf;
+  vpx_obmc_variance_fn_t         ovf;
+  vpx_obmc_subpixvariance_fn_t   osvf;
+#endif  // CONFIG_OBMC
+} vp10_variance_fn_ptr_t;
+#endif  // CONFIG_VP10
+
+void vpx_highbd_var_filter_block2d_bil_first_pass(
+    const uint8_t *src_ptr8,
+    uint16_t *output_ptr,
+    unsigned int src_pixels_per_line,
+    int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const uint8_t *filter);
+
+void vpx_highbd_var_filter_block2d_bil_second_pass(
+    const uint16_t *src_ptr,
+    uint16_t *output_ptr,
+    unsigned int src_pixels_per_line,
+    unsigned int pixel_step,
+    unsigned int output_height,
+    unsigned int output_width,
+    const uint8_t *filter);
 
 #ifdef __cplusplus
 }  // extern "C"
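Note: a hedged sketch of how the new vp10_variance_fn_ptr_t is consumed
(the binding below is illustrative and assumes a CONFIG_VP10 build; it is
not code from this patch). Motion search fills one table per block size
and always calls through the pointers:

    #include "./vpx_dsp_rtcd.h"
    #include "vpx_dsp/variance.h"

    /* Bind the 16x16 kernels, then evaluate a variance the way motion
     * search would: through the table, never a direct call. */
    static unsigned int var_16x16(const uint8_t *src, int src_stride,
                                  int xoff, int yoff,
                                  const uint8_t *ref, int ref_stride) {
      vp10_variance_fn_ptr_t fn;
      unsigned int sse;
      fn.vf = vpx_variance16x16;
      fn.svf = vpx_sub_pixel_variance16x16;
      if (xoff == 0 && yoff == 0)  /* full-pel: no interpolation needed */
        return fn.vf(src, src_stride, ref, ref_stride, &sse);
      return fn.svf(src, src_stride, xoff, yoff, ref, ref_stride, &sse);
    }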
diff --git a/vpx_dsp/vpx_convolve.c b/vpx_dsp/vpx_convolve.c
index 2d1c927..59d0488 100644
--- a/vpx_dsp/vpx_convolve.c
+++ b/vpx_dsp/vpx_convolve.c
@@ -130,18 +130,21 @@
   // --Must round-up because block may be located at sub-pixel position.
   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint8_t temp[135 * 64];
+  uint8_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
   int intermediate_height =
           (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
-  assert(w <= 64);
-  assert(h <= 64);
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
+
   assert(y_step_q4 <= 32);
   assert(x_step_q4 <= 32);
 
-  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride, temp, 64,
+  convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+                 temp, MAX_SB_SIZE,
                  x_filters, x0_q4, x_step_q4, w, intermediate_height);
-  convolve_vert(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst, dst_stride,
+  convolve_vert(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
+                dst, dst_stride,
                 y_filters, y0_q4, y_step_q4, w, h);
 }
 
@@ -237,13 +240,14 @@
                          const int16_t *filter_y, int y_step_q4,
                          int w, int h) {
   /* Fixed size intermediate buffer places limits on parameters. */
-  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
-  assert(w <= 64);
-  assert(h <= 64);
+  DECLARE_ALIGNED(16, uint8_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
 
-  vpx_convolve8_c(src, src_stride, temp, 64,
+  vpx_convolve8_c(src, src_stride, temp, MAX_SB_SIZE,
                   filter_x, x_step_q4, filter_y, y_step_q4, w, h);
-  vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
+  vpx_convolve_avg_c(temp, MAX_SB_SIZE, dst, dst_stride,
+                     NULL, 0, NULL, 0, w, h);
 }
 
 void vpx_convolve_copy_c(const uint8_t *src, ptrdiff_t src_stride,
@@ -459,22 +463,23 @@
   // --Must round-up because block may be located at sub-pixel position.
   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
-  uint16_t temp[64 * 135];
+  uint16_t temp[MAX_EXT_SIZE * MAX_SB_SIZE];
   int intermediate_height =
           (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
-  assert(w <= 64);
-  assert(h <= 64);
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
   assert(y_step_q4 <= 32);
   assert(x_step_q4 <= 32);
 
-  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                        src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+  highbd_convolve_horiz(src - src_stride * (SUBPEL_TAPS / 2 - 1), src_stride,
+                        CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
                         x_filters, x0_q4, x_step_q4, w,
                         intermediate_height, bd);
-  highbd_convolve_vert(CONVERT_TO_BYTEPTR(temp) + 64 * (SUBPEL_TAPS / 2 - 1),
-                       64, dst, dst_stride, y_filters, y0_q4, y_step_q4,
-                       w, h, bd);
+  highbd_convolve_vert(
+    CONVERT_TO_BYTEPTR(temp) + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1), MAX_SB_SIZE,
+    dst, dst_stride,
+    y_filters, y0_q4, y_step_q4, w, h, bd);
 }
 
 
@@ -556,13 +561,15 @@
                                 const int16_t *filter_y, int y_step_q4,
                                 int w, int h, int bd) {
   // Fixed size intermediate buffer places limits on parameters.
-  DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
-  assert(w <= 64);
-  assert(h <= 64);
+  DECLARE_ALIGNED(16, uint16_t, temp[MAX_SB_SIZE * MAX_SB_SIZE]);
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
 
-  vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
+  vpx_highbd_convolve8_c(src, src_stride,
+                         CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
                          filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
-  vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
+  vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), MAX_SB_SIZE,
+                            dst, dst_stride,
                             NULL, 0, NULL, 0, w, h, bd);
 }
 
diff --git a/vpx_dsp/vpx_convolve.h b/vpx_dsp/vpx_convolve.h
index 9ed3f17..bd8679d 100644
--- a/vpx_dsp/vpx_convolve.h
+++ b/vpx_dsp/vpx_convolve.h
@@ -17,6 +17,24 @@
 extern "C" {
 #endif
 
+// Note: Fixed size intermediate buffers place limits on the parameters
+// of some functions. 2d filtering proceeds in 2 steps:
+//   (1) Interpolate horizontally into an intermediate buffer, temp.
+//   (2) Interpolate temp vertically to derive the sub-pixel result.
+// Deriving the maximum number of rows in the temp buffer (135):
+// --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
+// --Largest block size is 64x64 pixels.
+// --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
+//   original frame (in 1/16th pixel units).
+// --Must round up because the block may be located at a sub-pixel position.
+// --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
+// --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
+#if CONFIG_VP10 && CONFIG_EXT_PARTITION
+# define MAX_EXT_SIZE 263
+#else
+# define MAX_EXT_SIZE 135
+#endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 typedef void (*convolve_fn_t)(const uint8_t *src, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
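Note: a quick standalone check of the MAX_EXT_SIZE bounds derived in the
comment above (plain C; max_intermediate_rows is an illustrative helper,
not part of this patch):

    #include <assert.h>

    #define SUBPEL_BITS 4
    #define SUBPEL_TAPS 8

    /* Worst-case intermediate rows for an h-row block at the maximum
     * step y_step_q4 = 32 and worst phase y0_q4 = 15, rounding the
     * sub-pixel span up, plus the 8-tap filter tails. */
    static int max_intermediate_rows(int h, int y_step_q4) {
      const int span = (h - 1) * y_step_q4 + 15;  /* in 1/16-pel units */
      return ((span + 15) >> SUBPEL_BITS) + SUBPEL_TAPS;
    }

    int main(void) {
      assert(max_intermediate_rows(64, 32) == 135);   /* default bound */
      assert(max_intermediate_rows(128, 32) == 263);  /* EXT_PARTITION */
      return 0;
    }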
diff --git a/vpx_dsp/vpx_dsp.mk b/vpx_dsp/vpx_dsp.mk
index 0e2b1a8..06b46d3 100644
--- a/vpx_dsp/vpx_dsp.mk
+++ b/vpx_dsp/vpx_dsp.mk
@@ -13,6 +13,8 @@
 
 DSP_SRCS-$(HAVE_MSA)    += mips/macros_msa.h
 
+DSP_SRCS-$(ARCH_X86)$(ARCH_X86_64)   += x86/synonyms.h
+
 # bit reader
 DSP_SRCS-yes += prob.h
 DSP_SRCS-yes += prob.c
@@ -22,6 +24,8 @@
 DSP_SRCS-yes += bitwriter.c
 DSP_SRCS-yes += bitwriter_buffer.c
 DSP_SRCS-yes += bitwriter_buffer.h
+DSP_SRCS-yes += psnr.c
+DSP_SRCS-yes += psnr.h
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.c
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += ssim.h
 DSP_SRCS-$(CONFIG_INTERNAL_STATS) += psnrhvs.c
@@ -68,6 +72,19 @@
 DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.h
 DSP_SRCS-$(HAVE_DSPR2)  += mips/common_dspr2.c
 
+# inter predictions
+
+ifeq ($(CONFIG_VP10),yes)
+DSP_SRCS-yes            += blend.h
+DSP_SRCS-yes            += blend_a64_mask.c
+DSP_SRCS-yes            += blend_a64_hmask.c
+DSP_SRCS-yes            += blend_a64_vmask.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_sse4.h
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_mask_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_hmask_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/blend_a64_vmask_sse4.c
+endif  #CONFIG_VP10
+
 # interpolation filters
 DSP_SRCS-yes += vpx_convolve.c
 DSP_SRCS-yes += vpx_convolve.h
@@ -270,8 +287,19 @@
 endif
 endif
 
+# high bit depth subtract
+ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
+DSP_SRCS-$(HAVE_SSE2)  += x86/highbd_subtract_sse2.c
+endif
+
 endif  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 
+ifeq ($(CONFIG_VP10_ENCODER),yes)
+DSP_SRCS-yes            += sum_squares.c
+
+DSP_SRCS-$(HAVE_SSE2)   += x86/sum_squares_sse2.c
+endif # CONFIG_VP10_ENCODER
+
 ifeq ($(CONFIG_ENCODERS),yes)
 DSP_SRCS-yes            += sad.c
 DSP_SRCS-yes            += subtract.c
@@ -290,6 +318,17 @@
 DSP_SRCS-$(HAVE_AVX2)   += x86/sad4d_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/sad_avx2.c
 
+ifeq ($(CONFIG_VP10_ENCODER),yes)
+ifeq ($(CONFIG_EXT_INTER),yes)
+DSP_SRCS-$(HAVE_SSSE3)  += x86/masked_sad_intrin_ssse3.c
+DSP_SRCS-$(HAVE_SSSE3)  += x86/masked_variance_intrin_ssse3.c
+endif  #CONFIG_EXT_INTER
+ifeq ($(CONFIG_OBMC),yes)
+DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_sad_sse4.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/obmc_variance_sse4.c
+endif  #CONFIG_OBMC
+endif  #CONFIG_VP10_ENCODER
+
 ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE)    += x86/sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE)    += x86/sad_sse2.asm
@@ -302,7 +341,6 @@
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
 endif  # CONFIG_VP9_HIGHBITDEPTH
 endif  # CONFIG_USE_X86INC
-
 endif  # CONFIG_ENCODERS
 
 ifneq ($(filter yes,$(CONFIG_ENCODERS) $(CONFIG_POSTPROC) $(CONFIG_VP9_POSTPROC)),)
@@ -339,6 +377,7 @@
 
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_sse2.c
+DSP_SRCS-$(HAVE_SSE4_1) += x86/highbd_variance_sse4.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_variance_impl_sse2.asm
 ifeq ($(CONFIG_USE_X86INC),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_subpel_variance_impl_sse2.asm
diff --git a/vpx_dsp/vpx_dsp_common.h b/vpx_dsp/vpx_dsp_common.h
index a1d0a51..82bafb5 100644
--- a/vpx_dsp/vpx_dsp_common.h
+++ b/vpx_dsp/vpx_dsp_common.h
@@ -19,9 +19,33 @@
 extern "C" {
 #endif
 
+#ifndef MAX_SB_SIZE
+# if CONFIG_VP10 && CONFIG_EXT_PARTITION
+#   define MAX_SB_SIZE 128
+# else
+#   define MAX_SB_SIZE 64
+# endif  // CONFIG_VP10 && CONFIG_EXT_PARTITION
+#endif  // ndef MAX_SB_SIZE
+
 #define VPXMIN(x, y) (((x) < (y)) ? (x) : (y))
 #define VPXMAX(x, y) (((x) > (y)) ? (x) : (y))
 
+#define IMPLIES(a, b)  (!(a) || (b))  //  Logical 'a implies b' (or 'a -> b')
+
+#define IS_POWER_OF_TWO(x)  (((x) & ((x) - 1)) == 0)
+
+// These can be used to give the compiler a hint about likely branch
+// outcomes. They can have an effect even if the target processor has
+// a good branch predictor, as the hints influence how the compiler
+// orders basic blocks.
+#ifdef __GNUC__
+# define LIKELY(v)    __builtin_expect(v, 1)
+# define UNLIKELY(v)  __builtin_expect(v, 0)
+#else
+# define LIKELY(v)    (v)
+# define UNLIKELY(v)  (v)
+#endif
+
 #if CONFIG_VP9_HIGHBITDEPTH
 // Note:
 // tran_low_t  is the datatype used for final transform coefficients.
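Note: a small usage sketch for the helper macros added above (illustrative
only; validate_block and its policy are hypothetical, not from this patch):

    #include <assert.h>
    #include "vpx_dsp/vpx_dsp_common.h"

    /* Example invariants: a zero stride is only legal for single-row
     * blocks, widths must be powers of two, and the failure path is
     * marked cold so the compiler keeps it out of the hot block order. */
    static int validate_block(int w, int h, int stride) {
      assert(IMPLIES(stride == 0, h == 1));
      if (UNLIKELY(!IS_POWER_OF_TWO(w))) return 0;
      return w <= MAX_SB_SIZE && h <= MAX_SB_SIZE;
    }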
diff --git a/vpx_dsp/vpx_dsp_rtcd_defs.pl b/vpx_dsp/vpx_dsp_rtcd_defs.pl
index a62acb7..a04a684 100644
--- a/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -50,6 +50,19 @@
   $avx2_x86_64 = 'avx2';
 }
 
+if (vpx_config("CONFIG_EXT_PARTITION") eq "yes") {
+  @block_widths = (4, 8, 16, 32, 64, 128);
+} else {
+  @block_widths = (4, 8, 16, 32, 64);
+}
+
+@block_sizes = ();
+foreach $w (@block_widths) {
+  foreach $h (@block_widths) {
+    push @block_sizes, [$w, $h] if ($w <= 2*$h && $h <= 2*$w);
+  }
+}
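+# With EXT_PARTITION this yields the squares 4x4 .. 128x128 plus every
+# 2:1 / 1:2 rectangle (4x8, 8x4, ..., 64x128, 128x64); wider aspect
+# ratios are filtered out by the (w <= 2h && h <= 2w) predicate.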
+
 #
 # Intra prediction
 #
@@ -453,52 +466,44 @@
 #
 # Sub Pixel Filters
 #
-add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_copy neon dspr2 msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_avg neon dspr2 msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8 sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_horiz sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_vert sse2 ssse3 neon dspr2 msa/, "$avx2_ssse3";
-
-add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg sse2 ssse3 neon dspr2 msa/;
-
+add_proto qw/void vpx_convolve_copy/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve_avg/,        "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_horiz/,     "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_vert/,      "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_convolve8_avg/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 neon dspr2 msa/;
+add_proto qw/void vpx_convolve8_avg_vert/,  "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_2d/,           "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_horiz/,        "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_vert/,         "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_2d/,       "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_horiz/,    "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
+add_proto qw/void vpx_scaled_avg_vert/,     "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
 
-add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_vert sse2 ssse3 neon dspr2 msa/;
+specialize qw/vpx_convolve_copy                 /, "$sse2_x86inc";
+specialize qw/vpx_convolve_avg                  /, "$sse2_x86inc";
+specialize qw/vpx_convolve8           sse2 ssse3/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_horiz     sse2 ssse3/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_vert      sse2 ssse3/, "$avx2_ssse3";
+specialize qw/vpx_convolve8_avg       sse2 ssse3/;
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3/;
+specialize qw/vpx_convolve8_avg_vert  sse2 ssse3/;
+specialize qw/vpx_scaled_2d                ssse3/;
 
-add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_2d ssse3/;
-
-add_proto qw/void vpx_scaled_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_horiz/;
-
-add_proto qw/void vpx_scaled_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_vert/;
-
-add_proto qw/void vpx_scaled_avg_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_avg_2d/;
-
-add_proto qw/void vpx_scaled_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_avg_horiz/;
-
-add_proto qw/void vpx_scaled_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h";
-specialize qw/vpx_scaled_avg_vert/;
+# TODO(any): These need to be extended up to 128x128 block sizes.
+if (!(vpx_config("CONFIG_VP10") eq "yes" && vpx_config("CONFIG_EXT_PARTITION") eq "yes")) {
+  specialize qw/vpx_convolve_copy       neon dspr2 msa/;
+  specialize qw/vpx_convolve_avg        neon dspr2 msa/;
+  specialize qw/vpx_convolve8           neon dspr2 msa/;
+  specialize qw/vpx_convolve8_horiz     neon dspr2 msa/;
+  specialize qw/vpx_convolve8_vert      neon dspr2 msa/;
+  specialize qw/vpx_convolve8_avg       neon dspr2 msa/;
+  specialize qw/vpx_convolve8_avg_horiz neon dspr2 msa/;
+  specialize qw/vpx_convolve8_avg_vert  neon dspr2 msa/;
+}
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  #
-  # Sub Pixel Filters
-  #
   add_proto qw/void vpx_highbd_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h, int bps";
   specialize qw/vpx_highbd_convolve_copy/, "$sse2_x86inc";
 
@@ -954,6 +959,27 @@
   }  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 
+if (vpx_config("CONFIG_VP10") eq "yes") {
+  #
+  # Alpha blending with mask
+  #
+  add_proto qw/void vpx_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx";
+  add_proto qw/void vpx_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
+  add_proto qw/void vpx_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w";
+  specialize "vpx_blend_a64_mask", qw/sse4_1/;
+  specialize "vpx_blend_a64_hmask", qw/sse4_1/;
+  specialize "vpx_blend_a64_vmask", qw/sse4_1/;
+
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_highbd_blend_a64_mask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, uint32_t mask_stride, int h, int w, int suby, int subx, int bd";
+    add_proto qw/void vpx_highbd_blend_a64_hmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
+    add_proto qw/void vpx_highbd_blend_a64_vmask/, "uint8_t *dst, uint32_t dst_stride, const uint8_t *src0, uint32_t src0_stride, const uint8_t *src1, uint32_t src1_stride, const uint8_t *mask, int h, int w, int bd";
+    specialize "vpx_highbd_blend_a64_mask", qw/sse4_1/;
+    specialize "vpx_highbd_blend_a64_hmask", qw/sse4_1/;
+    specialize "vpx_highbd_blend_a64_vmask", qw/sse4_1/;
+  }
+}  # CONFIG_VP10
+
 if (vpx_config("CONFIG_ENCODERS") eq "yes") {
 #
 # Block subtraction
@@ -961,14 +987,16 @@
 add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
 specialize qw/vpx_subtract_block neon msa/, "$sse2_x86inc";
 
-#
-# Single block SAD
-#
-add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x64 avx2 neon msa/, "$sse2_x86inc";
+if (vpx_config("CONFIG_VP10_ENCODER") eq "yes") {
+  #
+  # Sum of Squares
+  #
+  add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
+  specialize qw/vpx_sum_squares_2d_i16 sse2/;
 
-add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x32 avx2 msa/, "$sse2_x86inc";
+  add_proto qw/uint64_t vpx_sum_squares_i16/, "const int16_t *src, uint32_t N";
+  specialize qw/vpx_sum_squares_i16 sse2/;
+}
 
 add_proto qw/unsigned int vpx_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vpx_sad32x64 avx2 msa/, "$sse2_x86inc";
@@ -1007,14 +1035,31 @@
 # Avg
 #
 if ((vpx_config("CONFIG_VP9_ENCODER") eq "yes") || (vpx_config("CONFIG_VP10_ENCODER") eq "yes")) {
+  #
+  # Avg
+  #
   add_proto qw/unsigned int vpx_avg_8x8/, "const uint8_t *, int p";
   specialize qw/vpx_avg_8x8 sse2 neon msa/;
-
   add_proto qw/unsigned int vpx_avg_4x4/, "const uint8_t *, int p";
   specialize qw/vpx_avg_4x4 sse2 neon msa/;
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";
+    specialize qw/vpx_highbd_avg_8x8/;
+    add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
+    specialize qw/vpx_highbd_avg_4x4/;
+    add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
+    specialize qw/vpx_highbd_subtract_block sse2/;
+  }
 
+  #
+  # Minmax
+  #
   add_proto qw/void vpx_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
   specialize qw/vpx_minmax_8x8 sse2 neon/;
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
+    specialize qw/vpx_highbd_minmax_8x8/;
+  }
 
   add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, int src_stride, int16_t *coeff";
   specialize qw/vpx_hadamard_8x8 sse2 neon/, "$ssse3_x86_64_x86inc";
@@ -1035,352 +1080,205 @@
   specialize qw/vpx_vector_var neon sse2/;
 }  # CONFIG_VP9_ENCODER || CONFIG_VP10_ENCODER
 
-add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad64x64_avg avx2 msa/, "$sse2_x86inc";
+#
+# Single block SAD / Single block Avg SAD
+#
+foreach (@block_sizes) {
+  ($w, $h) = @$_;
+  add_proto qw/unsigned int/, "vpx_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+  add_proto qw/unsigned int/, "vpx_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+}
 
-add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad64x32_avg avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad128x128                       /, "$sse2_x86inc";
+specialize qw/vpx_sad128x64                        /, "$sse2_x86inc";
+specialize qw/vpx_sad64x128                        /, "$sse2_x86inc";
+specialize qw/vpx_sad64x64      avx2       neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad64x32      avx2            msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x64      avx2            msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x32      avx2       neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x16      avx2            msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x32                      msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x16           media neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x8                  neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x16                  neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x8                   neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x4                        msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x8                        msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x4                   neon msa/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vpx_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x64_avg avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad128x128_avg         /, "$sse2_x86inc";
+specialize qw/vpx_sad128x64_avg          /, "$sse2_x86inc";
+specialize qw/vpx_sad64x128_avg          /, "$sse2_x86inc";
+specialize qw/vpx_sad64x64_avg   avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad64x32_avg   avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x64_avg   avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x32_avg   avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x16_avg   avx2 msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x32_avg        msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x16_avg        msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x8_avg         msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x16_avg         msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x8_avg          msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x4_avg          msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x8_avg          msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x4_avg          msa/, "$sse2_x86inc";
 
-add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x32_avg avx2 msa/, "$sse2_x86inc";
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "vpx_highbd_sad${w}x${h}", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
+    add_proto qw/unsigned int/, "vpx_highbd_sad${w}x${h}_avg", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
+    if ($w != 128 && $h != 128 && $w != 4) {
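+      # The x86inc SAD kernels only cover widths 8..64, so skip the
+      # 128-pixel dimensions and 4-wide blocks here.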
+      specialize "vpx_highbd_sad${w}x${h}", "$sse2_x86inc";
+      specialize "vpx_highbd_sad${w}x${h}_avg", "$sse2_x86inc";
+    }
+  }
+}
 
-add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x16_avg avx2 msa/, "$sse2_x86inc";
+#
+# Masked SAD
+#
+if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "vpx_masked_sad${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
+    specialize "vpx_masked_sad${w}x${h}", qw/ssse3/;
+  }
 
-add_proto qw/unsigned int vpx_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x32_avg msa/, "$sse2_x86inc";
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    foreach (@block_sizes) {
+      ($w, $h) = @$_;
+      add_proto qw/unsigned int/, "vpx_highbd_masked_sad${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride";
+      specialize "vpx_highbd_masked_sad${w}x${h}", qw/ssse3/;
+    }
+  }
+}
 
-add_proto qw/unsigned int vpx_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x16_avg msa/, "$sse2_x86inc";
+#
+# OBMC SAD
+#
+if (vpx_config("CONFIG_OBMC") eq "yes") {
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "vpx_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+    specialize "vpx_obmc_sad${w}x${h}", qw/sse4_1/;
+  }
 
-add_proto qw/unsigned int vpx_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad16x8_avg msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x16_avg msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x8_avg msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad8x4_avg msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x8_avg msa/, "$sse2_x86inc";
-
-add_proto qw/unsigned int vpx_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad4x4_avg msa/, "$sse2_x86inc";
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    foreach (@block_sizes) {
+      ($w, $h) = @$_;
+      add_proto qw/unsigned int/, "vpx_highbd_obmc_sad${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask";
+      specialize "vpx_highbd_obmc_sad${w}x${h}", qw/sse4_1/;
+    }
+  }
+}
 
 #
 # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
 #
 # Blocks of 3
-add_proto qw/void vpx_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x3 msa/;
-
-add_proto qw/void vpx_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x3 msa/;
-
-add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+foreach $s (@block_widths) {
+  add_proto qw/void/, "vpx_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+}
+specialize qw/vpx_sad64x64x3            msa/;
+specialize qw/vpx_sad32x32x3            msa/;
 specialize qw/vpx_sad16x16x3 sse3 ssse3 msa/;
+specialize qw/vpx_sad8x8x3   sse3       msa/;
+specialize qw/vpx_sad4x4x3   sse3       msa/;
 
-add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad16x8x3 sse3 ssse3 msa/;
-
-add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x16x3 sse3 msa/;
 
-add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x3 sse3 msa/;
-
-add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x3 sse3 msa/;
-
 # Blocks of 8
-add_proto qw/void vpx_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x8 msa/;
-
-add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x8 msa/;
-
-add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+foreach $s (@block_widths) {
+  add_proto qw/void/, "vpx_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+}
+specialize qw/vpx_sad64x64x8        msa/;
+specialize qw/vpx_sad32x32x8        msa/;
 specialize qw/vpx_sad16x16x8 sse4_1 msa/;
+specialize qw/vpx_sad8x8x8   sse4_1 msa/;
+specialize qw/vpx_sad4x4x8   sse4_1 msa/;
 
-add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad16x8x8 sse4_1 msa/;
-
-add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x16x8 sse4_1 msa/;
-
-add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x8 sse4_1 msa/;
-
-add_proto qw/void vpx_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad8x4x8 msa/;
-
-add_proto qw/void vpx_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+add_proto qw/void/, "vpx_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
 specialize qw/vpx_sad4x8x8 msa/;
 
-add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x8 sse4_1 msa/;
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  foreach $s (@block_widths) {
+    # Blocks of 3
+    add_proto qw/void/, "vpx_highbd_sad${s}x${s}x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+    # Blocks of 8
+    add_proto qw/void/, "vpx_highbd_sad${s}x${s}x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  }
+  # Blocks of 3
+  add_proto qw/void/, "vpx_highbd_sad16x8x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  add_proto qw/void/, "vpx_highbd_sad8x16x3", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  # Blocks of 8
+  add_proto qw/void/, "vpx_highbd_sad16x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  add_proto qw/void/, "vpx_highbd_sad8x16x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  add_proto qw/void/, "vpx_highbd_sad8x4x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+  add_proto qw/void/, "vpx_highbd_sad4x8x8", "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
+}
 
 #
 # Multi-block SAD, comparing a reference to N independent blocks
 #
-add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x4d avx2 neon msa/, "$sse2_x86inc";
+foreach (@block_sizes) {
+  ($w, $h) = @$_;
+  add_proto qw/void/, "vpx_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+}
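+# (Each loop iteration is equivalent to one of the removed explicit
+# declarations; e.g. the (64, 64) pair expands to the old
+#   add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, ...";
+# line, with the per-size specializations now grouped together below.)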
 
-add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x32x4d msa/, "$sse2_x86inc";
+specialize qw/vpx_sad128x128x4d              /, "$sse2_x86inc";
+specialize qw/vpx_sad128x64x4d               /, "$sse2_x86inc";
+specialize qw/vpx_sad64x128x4d               /, "$sse2_x86inc";
+specialize qw/vpx_sad64x64x4d   avx2 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad64x32x4d             msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x64x4d             msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x32x4d   avx2 neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad32x16x4d             msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x32x4d             msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x16x4d        neon msa/, "$sse2_x86inc";
+specialize qw/vpx_sad16x8x4d              msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x16x4d              msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x8x4d               msa/, "$sse2_x86inc";
+specialize qw/vpx_sad8x4x4d               msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x8x4d               msa/, "$sse2_x86inc";
+specialize qw/vpx_sad4x4x4d               msa/, "$sse2_x86inc";
 
-add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x64x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x4d avx2 neon msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x16x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x32x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x4d neon msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x4x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x8x4d msa/, "$sse2_x86inc";
-
-add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x4d msa/, "$sse2_x86inc";
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  #
+  # Multi-block SAD, comparing a reference to N independent blocks
+  #
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/void/, "vpx_highbd_sad${w}x${h}x4d", "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_ptr[], int ref_stride, uint32_t *sad_array";
+    if ($w != 128 && $h != 128) {
+      specialize "vpx_highbd_sad${w}x${h}x4d", "$sse2_x86inc";
+    }
+  }
+}
 
 #
 # Structured Similarity (SSIM)
 #
 if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
-    add_proto qw/void vpx_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/vpx_ssim_parms_8x8/, "$sse2_x86_64";
+  add_proto qw/void vpx_ssim_parms_8x8/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+  specialize qw/vpx_ssim_parms_8x8/, "$sse2_x86_64";
 
-    add_proto qw/void vpx_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/vpx_ssim_parms_16x16/, "$sse2_x86_64";
-}
+  add_proto qw/void vpx_ssim_parms_16x16/, "const uint8_t *s, int sp, const uint8_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
+  specialize qw/vpx_ssim_parms_16x16/, "$sse2_x86_64";
 
-if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-  #
-  # Block subtraction
-  #
-  add_proto qw/void vpx_highbd_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride, int bd";
-  specialize qw/vpx_highbd_subtract_block/;
-
-  #
-  # Single block SAD
-  #
-  add_proto qw/unsigned int vpx_highbd_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad64x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad64x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x64/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad32x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x32/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad16x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x16/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x8/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad8x4/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad4x8/;
-
-  add_proto qw/unsigned int vpx_highbd_sad4x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-  specialize qw/vpx_highbd_sad4x4/;
-
-  #
-  # Avg
-  #
-  add_proto qw/unsigned int vpx_highbd_avg_8x8/, "const uint8_t *, int p";
-  specialize qw/vpx_highbd_avg_8x8/;
-  add_proto qw/unsigned int vpx_highbd_avg_4x4/, "const uint8_t *, int p";
-  specialize qw/vpx_highbd_avg_4x4/;
-  add_proto qw/void vpx_highbd_minmax_8x8/, "const uint8_t *s, int p, const uint8_t *d, int dp, int *min, int *max";
-  specialize qw/vpx_highbd_minmax_8x8/;
-
-  add_proto qw/unsigned int vpx_highbd_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad64x64_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad64x32_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x64_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x32_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad32x16_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x32_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x16_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad16x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad16x8_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x16_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x8_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad8x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad8x4_avg/, "$sse2_x86inc";
-
-  add_proto qw/unsigned int vpx_highbd_sad4x8_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad4x8_avg/;
-
-  add_proto qw/unsigned int vpx_highbd_sad4x4_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-  specialize qw/vpx_highbd_sad4x4_avg/;
-
-  #
-  # Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
-  #
-  # Blocks of 3
-  add_proto qw/void vpx_highbd_sad64x64x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x64x3/;
-
-  add_proto qw/void vpx_highbd_sad32x32x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x32x3/;
-
-  add_proto qw/void vpx_highbd_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x16x3/;
-
-  add_proto qw/void vpx_highbd_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x8x3/;
-
-  add_proto qw/void vpx_highbd_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x16x3/;
-
-  add_proto qw/void vpx_highbd_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x8x3/;
-
-  add_proto qw/void vpx_highbd_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x4x3/;
-
-  # Blocks of 8
-  add_proto qw/void vpx_highbd_sad64x64x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x64x8/;
-
-  add_proto qw/void vpx_highbd_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x32x8/;
-
-  add_proto qw/void vpx_highbd_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x16x8/;
-
-  add_proto qw/void vpx_highbd_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x8x8/;
-
-  add_proto qw/void vpx_highbd_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x16x8/;
-
-  add_proto qw/void vpx_highbd_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x8x8/;
-
-  add_proto qw/void vpx_highbd_sad8x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x4x8/;
-
-  add_proto qw/void vpx_highbd_sad4x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x8x8/;
-
-  add_proto qw/void vpx_highbd_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x4x8/;
-
-  #
-  # Multi-block SAD, comparing a reference to N independent blocks
-  #
-  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x64x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad64x32x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x64x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x32x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad32x16x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x32x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x16x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad16x8x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x16x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x8x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad8x4x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x8x4d/, "$sse2_x86inc";
-
-  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_ptr[], int ref_stride, uint32_t *sad_array";
-  specialize qw/vpx_highbd_sad4x4x4d/, "$sse2_x86inc";
-
-  #
-  # Structured Similarity (SSIM)
-  #
-  if (vpx_config("CONFIG_INTERNAL_STATS") eq "yes") {
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void vpx_highbd_ssim_parms_8x8/, "const uint16_t *s, int sp, const uint16_t *r, int rp, uint32_t *sum_s, uint32_t *sum_r, uint32_t *sum_sq_s, uint32_t *sum_sq_r, uint32_t *sum_sxr";
-    specialize qw/vpx_highbd_ssim_parms_8x8/;
   }
-}  # CONFIG_VP9_HIGHBITDEPTH
+}
 }  # CONFIG_ENCODERS
 
 if (vpx_config("CONFIG_ENCODERS") eq "yes" || vpx_config("CONFIG_POSTPROC") eq "yes" || vpx_config("CONFIG_VP9_POSTPROC") eq "yes") {
@@ -1431,72 +1329,188 @@
 # Specialty Variance
 #
 add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
 
 add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_get8x8var sse2 neon msa/;
+
+specialize qw/vpx_get16x16var sse2 avx2 neon msa/;
+specialize qw/vpx_get8x8var   sse2      neon msa/;
+
 
 add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x16 sse2 avx2 media neon msa/;
-
 add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x8 sse2 msa/;
-
 add_proto qw/unsigned int vpx_mse8x16/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x16 sse2 msa/;
-
 add_proto qw/unsigned int vpx_mse8x8/, "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
-  specialize qw/vpx_mse8x8 sse2 msa/;
 
+specialize qw/vpx_mse16x16          sse2 avx2 media neon msa/;
+specialize qw/vpx_mse16x8           sse2                 msa/;
+specialize qw/vpx_mse8x16           sse2                 msa/;
+specialize qw/vpx_mse8x8            sse2                 msa/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  foreach $bd (8, 10, 12) {
+    add_proto qw/void/, "vpx_highbd_${bd}_get16x16var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+    add_proto qw/void/, "vpx_highbd_${bd}_get8x8var", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
+
+    add_proto qw/unsigned int/, "vpx_highbd_${bd}_mse16x16", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+    add_proto qw/unsigned int/, "vpx_highbd_${bd}_mse16x8", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+    add_proto qw/unsigned int/, "vpx_highbd_${bd}_mse8x16", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+    add_proto qw/unsigned int/, "vpx_highbd_${bd}_mse8x8", "const uint8_t *src_ptr, int  source_stride, const uint8_t *ref_ptr, int  recon_stride, unsigned int *sse";
+
+    specialize "vpx_highbd_${bd}_mse16x16", qw/sse2/;
+    specialize "vpx_highbd_${bd}_mse8x8", qw/sse2/;
+  }
+}
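+# (For example, the $bd == 10 iteration above declares
+# vpx_highbd_10_mse16x16 and gives it an sse2 specialization.)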
+
+#
+# ...
+#
+add_proto qw/void vpx_upsampled_pred/, "uint8_t *comp_pred, int width, int height, const uint8_t *ref, int ref_stride";
+specialize qw/vpx_upsampled_pred sse2/;
+add_proto qw/void vpx_comp_avg_upsampled_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+specialize qw/vpx_comp_avg_upsampled_pred sse2/;
+
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  add_proto qw/void vpx_highbd_upsampled_pred/, "uint16_t *comp_pred, int width, int height, const uint8_t *ref8, int ref_stride";
+  specialize qw/vpx_highbd_upsampled_pred sse2/;
+  add_proto qw/void vpx_highbd_comp_avg_upsampled_pred/, "uint16_t *comp_pred, const uint8_t *pred8, int width, int height, const uint8_t *ref8, int ref_stride";
+  specialize qw/vpx_highbd_comp_avg_upsampled_pred sse2/;
+}
+
+#
+# ...
+#
 add_proto qw/unsigned int vpx_get_mb_ss/, "const int16_t *";
-  specialize qw/vpx_get_mb_ss sse2 msa/;
+add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int ref_stride";
 
-add_proto qw/unsigned int vpx_get4x4sse_cs/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride";
-  specialize qw/vpx_get4x4sse_cs neon msa/;
-
-add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
+specialize qw/vpx_get_mb_ss sse2 msa/;
+specialize qw/vpx_get4x4sse_cs neon msa/;
 
 #
-# Subpixel Variance
+# Variance / Subpixel Variance / Subpixel Avg Variance
 #
-add_proto qw/uint32_t vpx_sub_pixel_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance64x64 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+foreach (@block_sizes) {
+  ($w, $h) = @$_;
+  add_proto qw/unsigned int/, "vpx_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
+  add_proto qw/uint32_t/, "vpx_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+  add_proto qw/uint32_t/, "vpx_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+}
 
-add_proto qw/uint32_t vpx_sub_pixel_variance64x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance64x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_variance64x64     sse2 avx2       neon msa/;
+specialize qw/vpx_variance64x32     sse2 avx2       neon msa/;
+specialize qw/vpx_variance32x64     sse2            neon msa/;
+specialize qw/vpx_variance32x32     sse2 avx2       neon msa/;
+specialize qw/vpx_variance32x16     sse2 avx2            msa/;
+specialize qw/vpx_variance16x32     sse2                 msa/;
+specialize qw/vpx_variance16x16     sse2 avx2 media neon msa/;
+specialize qw/vpx_variance16x8      sse2            neon msa/;
+specialize qw/vpx_variance8x16      sse2            neon msa/;
+specialize qw/vpx_variance8x8       sse2      media neon msa/;
+specialize qw/vpx_variance8x4       sse2                 msa/;
+specialize qw/vpx_variance4x8       sse2                 msa/;
+specialize qw/vpx_variance4x4       sse2                 msa/;
 
-add_proto qw/uint32_t vpx_sub_pixel_variance32x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x64 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance64x64     avx2       neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance64x32                     msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance32x64                     msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance32x32     avx2       neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance32x16                     msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance16x32                     msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance16x16          media neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance16x8                      msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance8x16                      msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance8x8            media neon msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance8x4                       msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance4x8                       msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_variance4x4                       msa/,                 "$sse2_x86inc", "$ssse3_x86inc";
 
-add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance64x32      msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance32x64      msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance32x32 avx2 msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance32x16      msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance16x32      msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance16x16      msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance16x8       msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance8x16       msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance8x8        msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance8x4        msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance4x8        msa/,                "$sse2_x86inc", "$ssse3_x86inc";
+specialize qw/vpx_sub_pixel_avg_variance4x4        msa/,                "$sse2_x86inc", "$ssse3_x86inc";
 
-add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+  foreach $bd (8, 10, 12) {
+    foreach (@block_sizes) {
+      ($w, $h) = @$_;
+      add_proto qw/unsigned int/, "vpx_highbd_${bd}_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      add_proto qw/uint32_t/, "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
+      add_proto qw/uint32_t/, "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
+      if ($w != 128 && $h != 128 && $w != 4 && $h != 4) {
+        specialize "vpx_highbd_${bd}_variance${w}x${h}", "sse2";
+      }
+      if ($w == 4 && $h == 4) {
+        specialize "vpx_highbd_${bd}_variance${w}x${h}", "sse4_1";
+      }
+      if ($w != 128 && $h != 128 && $w != 4) {
+        specialize "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", $sse2_x86inc;
+        specialize "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", $sse2_x86inc;
+      }
+      if ($w == 4 && $h == 4) {
+        specialize "vpx_highbd_${bd}_sub_pixel_variance${w}x${h}", "sse4_1";
+        specialize "vpx_highbd_${bd}_sub_pixel_avg_variance${w}x${h}", "sse4_1";
+      }
+    }
+  }
+}  # CONFIG_VP9_HIGHBITDEPTH
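+# Reading the conditions above: anything with a 128 dimension stays on
+# the C reference, 4x4 gets dedicated sse4_1 kernels, plain variance
+# skips the 4-pixel-wide and 4-pixel-high rectangles entirely, and the
+# subpel variants additionally require a width of at least 8.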
 
-add_proto qw/uint32_t vpx_sub_pixel_variance16x32/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x32 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+if (vpx_config("CONFIG_EXT_INTER") eq "yes") {
+#
+# Masked Variance / Masked Subpixel Variance
+#
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "vpx_masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
+    add_proto qw/unsigned int/, "vpx_masked_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
+    specialize "vpx_masked_variance${w}x${h}", qw/ssse3/;
+    specialize "vpx_masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+  }
 
-add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x16 media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    foreach $bd ("_", "_10_", "_12_") {
+      foreach (@block_sizes) {
+        ($w, $h) = @$_;
+        add_proto qw/unsigned int/, "vpx_highbd${bd}masked_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *mask, int mask_stride, unsigned int *sse";
+        add_proto qw/unsigned int/, "vpx_highbd${bd}masked_sub_pixel_variance${w}x${h}", "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, const uint8_t *m, int m_stride, unsigned int *sse";
+        specialize "vpx_highbd${bd}masked_variance${w}x${h}", qw/ssse3/;
+        specialize "vpx_highbd${bd}masked_sub_pixel_variance${w}x${h}", qw/ssse3/;
+      }
+    }
+  }
+}
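+# (The "_", "_10_" and "_12_" infixes above expand to
+# vpx_highbd_masked_*, vpx_highbd_10_masked_* and vpx_highbd_12_masked_*
+# respectively.)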
 
-add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+#
+# OBMC Variance / OBMC Subpixel Variance
+#
+if (vpx_config("CONFIG_OBMC") eq "yes") {
+  foreach (@block_sizes) {
+    ($w, $h) = @$_;
+    add_proto qw/unsigned int/, "vpx_obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+    add_proto qw/unsigned int/, "vpx_obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+    specialize "vpx_obmc_variance${w}x${h}", q/sse4_1/;
+    specialize "vpx_obmc_sub_pixel_variance${w}x${h}";
+  }
 
-add_proto qw/uint32_t vpx_sub_pixel_variance8x16/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x16 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x8 media neon msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_variance4x8/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x8 msa/, "$sse2_x86inc", "$ssse3_x86inc";
-
-add_proto qw/uint32_t vpx_sub_pixel_variance4x4/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance4x4 msa/, "$sse2_x86inc", "$ssse3_x86inc";
+  if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
+    foreach $bd ("_", "_10_", "_12_") {
+      foreach (@block_sizes) {
+        ($w, $h) = @$_;
+        add_proto qw/unsigned int/, "vpx_highbd${bd}obmc_variance${w}x${h}", "const uint8_t *pre, int pre_stride, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+        add_proto qw/unsigned int/, "vpx_highbd${bd}obmc_sub_pixel_variance${w}x${h}", "const uint8_t *pre, int pre_stride, int xoffset, int yoffset, const int32_t *wsrc, const int32_t *mask, unsigned int *sse";
+        specialize "vpx_highbd${bd}obmc_variance${w}x${h}", qw/sse4_1/;
+        specialize "vpx_highbd${bd}obmc_sub_pixel_variance${w}x${h}";
+      }
+    }
+  }
+}
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int source_stride, int xoffset, int  yoffset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance64x64 avx2 msa/, "$sse2_x86inc", "$ssse3_x86inc";
@@ -1549,6 +1563,10 @@
 add_proto qw/uint32_t vpx_variance_halfpixvar16x16_hv/, "const unsigned char *src_ptr, int source_stride, const unsigned char *ref_ptr, int  ref_stride, uint32_t *sse";
   specialize qw/vpx_variance_halfpixvar16x16_hv sse2 media/;
 
+#
+# Comp Avg
+#
+add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
   add_proto qw/unsigned int vpx_highbd_12_variance64x64/, "const uint8_t *src_ptr, int source_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_highbd_12_variance64x64 sse2/;
diff --git a/vpx_dsp/vpx_filter.h b/vpx_dsp/vpx_filter.h
index 2617feb..cfe8161 100644
--- a/vpx_dsp/vpx_filter.h
+++ b/vpx_dsp/vpx_filter.h
@@ -27,6 +27,21 @@
 
 typedef int16_t InterpKernel[SUBPEL_TAPS];
 
+#define BIL_SUBPEL_BITS    3
+#define BIL_SUBPEL_SHIFTS  (1 << BIL_SUBPEL_BITS)
+
+// 2 tap bilinear filters
+static const uint8_t bilinear_filters_2t[BIL_SUBPEL_SHIFTS][2] = {
+  { 128,   0  },
+  { 112,  16  },
+  {  96,  32  },
+  {  80,  48  },
+  {  64,  64  },
+  {  48,  80  },
+  {  32,  96  },
+  {  16, 112  },
+};
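+
+// Each row of taps sums to 128, so a 2-tap sample at subpel offset o
+// is (in the usual rounding convention) computed as
+//   out = (a * bilinear_filters_2t[o][0] +
+//          b * bilinear_filters_2t[o][1] + 64) >> 7
+// where a and b are the two neighbouring input pixels.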
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
diff --git a/vpx_dsp/x86/avg_intrin_sse2.c b/vpx_dsp/x86/avg_intrin_sse2.c
index f9af6cf..ecc215a 100644
--- a/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/vpx_dsp/x86/avg_intrin_sse2.c
@@ -10,6 +10,8 @@
 
 #include <emmintrin.h>
 
+#include "vpx_dsp/x86/synonyms.h"
+
 #include "./vpx_dsp_rtcd.h"
 #include "vpx_ports/mem.h"
 
@@ -121,13 +123,14 @@
 unsigned int vpx_avg_4x4_sse2(const uint8_t *s, int p) {
   __m128i s0, s1, u0;
   unsigned int avg = 0;
+
   u0  = _mm_setzero_si128();
-  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
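+// xx_loadl_32 (from synonyms.h) fetches only the 4 bytes each 4-wide
+// row actually needs, where _mm_loadl_epi64 read 8 bytes per row.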
+  s0 = _mm_unpacklo_epi8(xx_loadl_32(s), u0);
+  s1 = _mm_unpacklo_epi8(xx_loadl_32(s + p), u0);
   s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
+  s1 = _mm_unpacklo_epi8(xx_loadl_32(s + 2 * p), u0);
   s0 = _mm_adds_epu16(s0, s1);
-  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
+  s1 = _mm_unpacklo_epi8(xx_loadl_32(s + 3 * p), u0);
   s0 = _mm_adds_epu16(s0, s1);
 
   s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
diff --git a/vpx_dsp/x86/blend_a64_hmask_sse4.c b/vpx_dsp/x86/blend_a64_hmask_sse4.c
new file mode 100644
index 0000000..a10e077
--- /dev/null
+++ b/vpx_dsp/x86/blend_a64_hmask_sse4.c
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "vpx/vpx_integer.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+// To start out, just dispatch to the function using the 2D mask and
+// pass mask stride as 0. This can be improved upon if necessary.
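+// (With mask_stride == 0 the 2D kernel re-reads the same mask row on
+// every line, which is exactly the semantics of a horizontal 1D mask.)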
+
+void vpx_blend_a64_hmask_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  vpx_blend_a64_mask_sse4_1(dst, dst_stride,
+                            src0, src0_stride,
+                            src1, src1_stride,
+                            mask, 0, h, w, 0, 0);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+void vpx_highbd_blend_a64_hmask_sse4_1(
+    uint8_t *dst_8, uint32_t dst_stride,
+    const uint8_t *src0_8, uint32_t src0_stride,
+    const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w,
+    int bd) {
+  vpx_highbd_blend_a64_mask_sse4_1(dst_8, dst_stride,
+                                   src0_8, src0_stride,
+                                   src1_8, src1_stride,
+                                   mask, 0, h, w, 0, 0, bd);
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/blend_a64_mask_sse4.c b/vpx_dsp/x86/blend_a64_mask_sse4.c
new file mode 100644
index 0000000..cdb40c2
--- /dev/null
+++ b/vpx_dsp/x86/blend_a64_mask_sse4.c
@@ -0,0 +1,1007 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>  // SSE4.1
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
+
+#include "vpx_dsp/x86/synonyms.h"
+#include "vpx_dsp/x86/blend_sse4.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
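+//
+// Per pixel, each kernel in this section computes (in effect)
+//   dst = (m * src0 + (64 - m) * src1 + 32) >> 6
+// where m is the mask value and VPX_BLEND_A64_MAX_ALPHA == 64; the
+// w4/w8/w16n variants differ only in how many pixels they handle per row.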
+
+static void blend_a64_mask_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  (void)w;
+
+  do {
+    const __m128i v_m0_b = xx_loadl_32(mask);
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    xx_storel_32(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_a64_mask_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  (void)w;
+
+  do {
+    const __m128i v_m0_b = xx_loadl_64(mask);
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    xx_storel_64(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_a64_mask_w16n_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 16) {
+      const __m128i v_m0l_b = xx_loadl_64(mask + c);
+      const __m128i v_m0h_b = xx_loadl_64(mask + c + 8);
+      const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_m0l_b);
+      const __m128i v_m0h_w = _mm_cvtepu8_epi16(v_m0h_b);
+      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+                                       v_m0l_w, v_m1l_w);
+      const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+                                       v_m0h_w, v_m1h_w);
+
+      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+      xx_storeu_128(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
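+//
+// The mask row here is twice the output width: avg_epu8 against the
+// row shifted by one byte averages each horizontally adjacent pair,
+// and the alternating 0x00/0xff mask then keeps one 16-bit value per
+// output pixel.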
+
+static void blend_a64_mask_sx_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  (void)w;
+
+  do {
+    const __m128i v_r_b = xx_loadl_64(mask);
+    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    xx_storel_32(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_a64_mask_sx_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  (void)w;
+
+  do {
+    const __m128i v_r_b = xx_loadu_128(mask);
+    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    xx_storel_64(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_a64_mask_sx_w16n_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 16) {
+      const __m128i v_rl_b = xx_loadu_128(mask + 2 * c);
+      const __m128i v_rh_b = xx_loadu_128(mask + 2 * c + 16);
+      const __m128i v_al_b = _mm_avg_epu8(v_rl_b, _mm_srli_si128(v_rl_b, 1));
+      const __m128i v_ah_b = _mm_avg_epu8(v_rh_b, _mm_srli_si128(v_rh_b, 1));
+
+      const __m128i v_m0l_w = _mm_and_si128(v_al_b, v_zmask_b);
+      const __m128i v_m0h_w = _mm_and_si128(v_ah_b, v_zmask_b);
+      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+                                       v_m0l_w, v_m1l_w);
+      const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+                                       v_m0h_w, v_m1h_w);
+
+      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+      xx_storeu_128(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
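+//
+// Two consecutive mask rows are averaged with avg_epu8, and the mask
+// pointer advances by 2 * mask_stride per output row.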
+
+static void blend_a64_mask_sy_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  (void)w;
+
+  do {
+    const __m128i v_ra_b = xx_loadl_32(mask);
+    const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    xx_storel_32(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_a64_mask_sy_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  (void)w;
+
+  do {
+    const __m128i v_ra_b = xx_loadl_64(mask);
+    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    xx_storel_64(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_a64_mask_sy_w16n_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zero = _mm_setzero_si128();
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 16) {
+      const __m128i v_ra_b = xx_loadu_128(mask + c);
+      const __m128i v_rb_b = xx_loadu_128(mask + c + mask_stride);
+      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+      const __m128i v_m0l_w = _mm_cvtepu8_epi16(v_a_b);
+      const __m128i v_m0h_w = _mm_unpackhi_epi8(v_a_b, v_zero);
+      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+                                       v_m0l_w, v_m1l_w);
+      const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+                                       v_m0h_w, v_m1h_w);
+
+      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+      xx_storeu_128(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
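+//
+// Full 2x2 subsampling: the two mask rows are summed bytewise (mask
+// values are at most 64, so the byte sums cannot overflow), horizontal
+// pairs are widened to 16 bits and added, and the four-sample sum is
+// rounded to a single mask value with xx_roundn_epu16(sum, 2).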
+
+static void blend_a64_mask_sx_sy_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  (void)w;
+
+  do {
+    const __m128i v_ra_b = xx_loadl_64(mask);
+    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+    const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+                                           v_zmask_b);
+    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    xx_storel_32(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_a64_mask_sx_sy_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  (void)w;
+
+  do {
+    const __m128i v_ra_b = xx_loadu_128(mask);
+    const __m128i v_rb_b = xx_loadu_128(mask + mask_stride);
+    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+    const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+                                           v_zmask_b);
+    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    xx_storel_64(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_a64_mask_sx_sy_w16n_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 16) {
+      const __m128i v_ral_b = xx_loadu_128(mask + 2 * c);
+      const __m128i v_rah_b = xx_loadu_128(mask + 2 * c + 16);
+      const __m128i v_rbl_b = xx_loadu_128(mask + mask_stride + 2 * c);
+      const __m128i v_rbh_b = xx_loadu_128(mask + mask_stride + 2 * c + 16);
+      const __m128i v_rvsl_b = _mm_add_epi8(v_ral_b, v_rbl_b);
+      const __m128i v_rvsh_b = _mm_add_epi8(v_rah_b, v_rbh_b);
+      const __m128i v_rvsal_w = _mm_and_si128(v_rvsl_b, v_zmask_b);
+      const __m128i v_rvsah_w = _mm_and_si128(v_rvsh_b, v_zmask_b);
+      const __m128i v_rvsbl_w = _mm_and_si128(_mm_srli_si128(v_rvsl_b, 1),
+                                              v_zmask_b);
+      const __m128i v_rvsbh_w = _mm_and_si128(_mm_srli_si128(v_rvsh_b, 1),
+                                              v_zmask_b);
+      const __m128i v_rsl_w = _mm_add_epi16(v_rvsal_w, v_rvsbl_w);
+      const __m128i v_rsh_w = _mm_add_epi16(v_rvsah_w, v_rvsbh_w);
+
+      const __m128i v_m0l_w = xx_roundn_epu16(v_rsl_w, 2);
+      const __m128i v_m0h_w = xx_roundn_epu16(v_rsh_w, 2);
+      const __m128i v_m1l_w = _mm_sub_epi16(v_maxval_w, v_m0l_w);
+      const __m128i v_m1h_w = _mm_sub_epi16(v_maxval_w, v_m0h_w);
+
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+                                       v_m0l_w, v_m1l_w);
+      const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+                                       v_m0h_w, v_m1h_w);
+
+      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+      xx_storeu_128(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void vpx_blend_a64_mask_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, int suby, int subx) {
+  typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride,
+                            const uint8_t *src0, uint32_t src0_stride,
+                            const uint8_t *src1, uint32_t src1_stride,
+                            const uint8_t *mask, uint32_t mask_stride,
+                            int h, int w);
+
+  // Dimensions are: width_index X subx X suby
+  static const blend_fn blend[3][2][2] = {
+    {     // w % 16 == 0
+      {blend_a64_mask_w16n_sse4_1, blend_a64_mask_sy_w16n_sse4_1},
+      {blend_a64_mask_sx_w16n_sse4_1, blend_a64_mask_sx_sy_w16n_sse4_1}
+    }, {  // w == 4
+      {blend_a64_mask_w4_sse4_1, blend_a64_mask_sy_w4_sse4_1},
+      {blend_a64_mask_sx_w4_sse4_1, blend_a64_mask_sx_sy_w4_sse4_1}
+    }, {  // w == 8
+      {blend_a64_mask_w8_sse4_1, blend_a64_mask_sy_w8_sse4_1},
+      {blend_a64_mask_sx_w8_sse4_1, blend_a64_mask_sx_sy_w8_sse4_1}
+    }
+  };
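+
+  // Note: (w >> 2) & 3 maps w == 4 to row 1, w == 8 to row 2, and any
+  // w that is a multiple of 16 to row 0, matching the comments above.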
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
+    vpx_blend_a64_mask_c(dst, dst_stride,
+                         src0, src0_stride,
+                         src1, src1_stride,
+                         mask, mask_stride,
+                         h, w, suby, subx);
+  } else {
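+    // (w >> 2) & 3 maps w == 4 -> 1, w == 8 -> 2, and any multiple of
+    // 16 -> 0, matching the rows of the table above.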
+    blend[(w >> 2) & 3][subx != 0][suby != 0](dst, dst_stride,
+                                              src0, src0_stride,
+                                              src1, src1_stride,
+                                              mask, mask_stride,
+                                              h, w);
+  }
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    const __m128i v_m0_b = xx_loadl_32(mask);
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+    xx_storel_64(dst, v_res_w);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_a64_mask_b10_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                              src1_stride, mask, mask_stride, h,
+                              blend_4_b10);
+}
+
+static void blend_a64_mask_b12_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_a64_mask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                              src1_stride, mask, mask_stride, h,
+                              blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 8) {
+      const __m128i v_m0_b = xx_loadl_64(mask + c);
+      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_m0_b);
+      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+      xx_storeu_128(dst + c, v_res_w);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_a64_mask_b10_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, h, w,
+                               blend_8_b10);
+}
+
+static void blend_a64_mask_b12_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_a64_mask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, mask_stride, h, w,
+                               blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sx_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, blend_unit_fn blend) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
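+  // Horizontally subsampled mask: take the rounding average of each pair of
+  // adjacent mask bytes (via a one-byte shift) and keep the even lanes as
+  // 16-bit words.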
+  do {
+    const __m128i v_r_b = xx_loadl_64(mask);
+    const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+    const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+    xx_storel_64(dst, v_res_w);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h,
+                                 blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sx_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_a64_mask_bn_sx_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h,
+                                 blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sx_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, blend_unit_fn blend) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 8) {
+      const __m128i v_r_b = xx_loadu_128(mask + 2 * c);
+      const __m128i v_a_b = _mm_avg_epu8(v_r_b, _mm_srli_si128(v_r_b, 1));
+
+      const __m128i v_m0_w = _mm_and_si128(v_a_b, v_zmask_b);
+      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+      xx_storeu_128(dst + c, v_res_w);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += mask_stride;
+  } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h, w,
+                                  blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sx_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_a64_mask_bn_sx_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h, w,
+                                  blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
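+  // Vertically subsampled mask: take the rounding average of the two mask
+  // rows covering each output row.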
+  do {
+    const __m128i v_ra_b = xx_loadl_32(mask);
+    const __m128i v_rb_b = xx_loadl_32(mask + mask_stride);
+    const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+    const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+    xx_storel_64(dst, v_res_w);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_a64_mask_b10_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h,
+                                 blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_a64_mask_bn_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                 src1_stride, mask, mask_stride, h,
+                                 blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 8) {
+      const __m128i v_ra_b = xx_loadl_64(mask + c);
+      const __m128i v_rb_b = xx_loadl_64(mask + c + mask_stride);
+      const __m128i v_a_b = _mm_avg_epu8(v_ra_b, v_rb_b);
+
+      const __m128i v_m0_w = _mm_cvtepu8_epi16(v_a_b);
+      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+      xx_storeu_128(dst + c, v_res_w);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_a64_mask_b10_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h, w,
+                                  blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_a64_mask_bn_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                  src1_stride, mask, mask_stride, h, w,
+                                  blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Horizontal and Vertical sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_mask_bn_sx_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, blend_unit_fn blend) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
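+  // 2x subsampled in both directions: sum each 2x2 block of mask values
+  // (the byte adds cannot overflow, since masks are at most 64) and round
+  // the sum down to a single alpha with xx_roundn_epu16(sum, 2).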
+  do {
+    const __m128i v_ra_b = xx_loadl_64(mask);
+    const __m128i v_rb_b = xx_loadl_64(mask + mask_stride);
+    const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+    const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+    const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+                                           v_zmask_b);
+    const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+    const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+    xx_storel_64(dst, v_res_w);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                    src1_stride, mask, mask_stride, h,
+                                    blend_4_b10);
+}
+
+static void blend_a64_mask_b12_sx_sy_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  (void)w;
+  blend_a64_mask_bn_sx_sy_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                    src1_stride, mask, mask_stride, h,
+                                    blend_4_b12);
+}
+
+static INLINE void blend_a64_mask_bn_sx_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, blend_unit_fn blend) {
+  const __m128i v_zmask_b = _mm_set_epi8(0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff,
+                                         0, 0xff, 0, 0xff, 0, 0xff, 0, 0xff);
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    int c;
+    for (c = 0; c < w; c += 8) {
+      const __m128i v_ra_b = xx_loadu_128(mask + 2 * c);
+      const __m128i v_rb_b = xx_loadu_128(mask + 2 * c + mask_stride);
+      const __m128i v_rvs_b = _mm_add_epi8(v_ra_b, v_rb_b);
+      const __m128i v_rvsa_w = _mm_and_si128(v_rvs_b, v_zmask_b);
+      const __m128i v_rvsb_w = _mm_and_si128(_mm_srli_si128(v_rvs_b, 1),
+                                             v_zmask_b);
+      const __m128i v_rs_w = _mm_add_epi16(v_rvsa_w, v_rvsb_w);
+
+      const __m128i v_m0_w = xx_roundn_epu16(v_rs_w, 2);
+      const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+      xx_storeu_128(dst + c, v_res_w);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 2 * mask_stride;
+  } while (--h);
+}
+
+static void blend_a64_mask_b10_sx_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                     src1_stride, mask, mask_stride, h, w,
+                                     blend_8_b10);
+}
+
+static void blend_a64_mask_b12_sx_sy_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w) {
+  blend_a64_mask_bn_sx_sy_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                     src1_stride, mask, mask_stride, h, w,
+                                     blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void vpx_highbd_blend_a64_mask_sse4_1(
+    uint8_t *dst_8, uint32_t dst_stride,
+    const uint8_t *src0_8, uint32_t src0_stride,
+    const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, uint32_t mask_stride,
+    int h, int w, int suby, int subx, int bd) {
+  typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
+                           const uint16_t *src0, uint32_t src0_stride,
+                           const uint16_t *src1, uint32_t src1_stride,
+                           const uint8_t *mask, uint32_t mask_stride,
+                           int h, int w);
+
+  // Dimensions are: bd_index X width_index X subx X suby
+  static const blend_fn blend[2][2][2][2] = {
+    {   // bd == 8 or 10
+      {     // w % 8 == 0
+        {blend_a64_mask_b10_w8n_sse4_1, blend_a64_mask_b10_sy_w8n_sse4_1},
+        {blend_a64_mask_b10_sx_w8n_sse4_1, blend_a64_mask_b10_sx_sy_w8n_sse4_1}
+      }, {  // w == 4
+        {blend_a64_mask_b10_w4_sse4_1, blend_a64_mask_b10_sy_w4_sse4_1},
+        {blend_a64_mask_b10_sx_w4_sse4_1, blend_a64_mask_b10_sx_sy_w4_sse4_1}
+      }
+    },
+    {   // bd == 12
+      {     // w % 8 == 0
+        {blend_a64_mask_b12_w8n_sse4_1, blend_a64_mask_b12_sy_w8n_sse4_1},
+        {blend_a64_mask_b12_sx_w8n_sse4_1, blend_a64_mask_b12_sx_sy_w8n_sse4_1}
+      }, {  // w == 4
+        {blend_a64_mask_b12_w4_sse4_1, blend_a64_mask_b12_sy_w4_sse4_1},
+        {blend_a64_mask_b12_sx_w4_sse4_1, blend_a64_mask_b12_sx_sy_w4_sse4_1}
+      }
+    }
+  };
+
+  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
+    vpx_highbd_blend_a64_mask_c(dst_8, dst_stride,
+                                src0_8, src0_stride,
+                                src1_8, src1_stride,
+                                mask, mask_stride,
+                                h, w, suby, subx, bd);
+  } else {
+    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
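+    // (w >> 2) & 1 selects the w == 4 kernels; any other power of two >= 8
+    // is a multiple of 8 and uses the w8n kernels.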
+    blend[bd == 12][(w >> 2) & 1][subx != 0][suby != 0](dst, dst_stride,
+                                                        src0, src0_stride,
+                                                        src1, src1_stride,
+                                                        mask, mask_stride,
+                                                        h, w);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/blend_a64_vmask_sse4.c b/vpx_dsp/x86/blend_a64_vmask_sse4.c
new file mode 100644
index 0000000..4b0f38d
--- /dev/null
+++ b/vpx_dsp/x86/blend_a64_vmask_sse4.c
@@ -0,0 +1,293 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h>  // SSE4.1
+
+#include <assert.h>
+
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/blend.h"
+
+#include "vpx_dsp/x86/synonyms.h"
+#include "vpx_dsp/x86/blend_sse4.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
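+// A vertical mask supplies a single alpha value per row, broadcast across
+// the whole row with _mm_set1_epi16.
+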
+static void blend_a64_vmask_w4_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  (void)w;
+
+  do {
+    const __m128i v_m0_w = _mm_set1_epi16(*mask);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_4(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    xx_storel_32(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 1;
+  } while (--h);
+}
+
+static void blend_a64_vmask_w8_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  (void)w;
+
+  do {
+    const __m128i v_m0_w = _mm_set1_epi16(*mask);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend_8(src0, src1, v_m0_w, v_m1_w);
+
+    const __m128i v_res_b = _mm_packus_epi16(v_res_w, v_res_w);
+
+    xx_storel_64(dst, v_res_b);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 1;
+  } while (--h);
+}
+
+static void blend_a64_vmask_w16n_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    int c;
+    const __m128i v_m0_w = _mm_set1_epi16(*mask);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+    for (c = 0; c < w; c += 16) {
+      const __m128i v_resl_w = blend_8(src0 + c, src1 + c,
+                                       v_m0_w, v_m1_w);
+      const __m128i v_resh_w = blend_8(src0 + c + 8, src1 + c + 8,
+                                       v_m0_w, v_m1_w);
+
+      const __m128i v_res_b = _mm_packus_epi16(v_resl_w, v_resh_w);
+
+      xx_storeu_128(dst + c, v_res_b);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 1;
+  } while (--h);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void vpx_blend_a64_vmask_sse4_1(
+    uint8_t *dst, uint32_t dst_stride,
+    const uint8_t *src0, uint32_t src0_stride,
+    const uint8_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  typedef void (*blend_fn)(uint8_t *dst, uint32_t dst_stride,
+                           const uint8_t *src0, uint32_t src0_stride,
+                           const uint8_t *src1, uint32_t src1_stride,
+                           const uint8_t *mask, int h, int w);
+
+  // Dimension: width_index
+  static const blend_fn blend[9] = {
+    blend_a64_vmask_w16n_sse4_1,  // w % 16 == 0
+    vpx_blend_a64_vmask_c,        // w == 1
+    vpx_blend_a64_vmask_c,        // w == 2
+    NULL,                         // INVALID
+    blend_a64_vmask_w4_sse4_1,    // w == 4
+    NULL,                         // INVALID
+    NULL,                         // INVALID
+    NULL,                         // INVALID
+    blend_a64_vmask_w8_sse4_1,    // w == 8
+  };
+
+  assert(IMPLIES(src0 == dst, src0_stride == dst_stride));
+  assert(IMPLIES(src1 == dst, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
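+  // w is a power of two, so w & 0xf is w itself for w in {1, 2, 4, 8} and
+  // 0 for any multiple of 16.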
+  blend[w & 0xf](dst, dst_stride,
+                 src0, src0_stride,
+                 src1, src1_stride,
+                 mask, h, w);
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+//////////////////////////////////////////////////////////////////////////////
+// Implementation - No sub-sampling
+//////////////////////////////////////////////////////////////////////////////
+
+static INLINE void blend_a64_vmask_bn_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    const __m128i v_m0_w = _mm_set1_epi16(*mask);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+
+    const __m128i v_res_w = blend(src0, src1, v_m0_w, v_m1_w);
+
+    xx_storel_64(dst, v_res_w);
+
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 1;
+  } while (--h);
+}
+
+static void blend_a64_vmask_b10_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  (void)w;
+  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, h,
+                               blend_4_b10);
+}
+
+static void blend_a64_vmask_b12_w4_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  (void)w;
+  blend_a64_vmask_bn_w4_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                               src1_stride, mask, h,
+                               blend_4_b12);
+}
+
+static INLINE void blend_a64_vmask_bn_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, blend_unit_fn blend) {
+  const __m128i v_maxval_w = _mm_set1_epi16(VPX_BLEND_A64_MAX_ALPHA);
+
+  do {
+    int c;
+    const __m128i v_m0_w = _mm_set1_epi16(*mask);
+    const __m128i v_m1_w = _mm_sub_epi16(v_maxval_w, v_m0_w);
+    for (c = 0; c < w; c += 8) {
+      const __m128i v_res_w = blend(src0 + c, src1 + c, v_m0_w, v_m1_w);
+
+      xx_storeu_128(dst + c, v_res_w);
+    }
+    dst += dst_stride;
+    src0 += src0_stride;
+    src1 += src1_stride;
+    mask += 1;
+  } while (--h);
+}
+
+static void blend_a64_vmask_b10_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                src1_stride, mask, h, w,
+                                blend_8_b10);
+}
+
+static void blend_a64_vmask_b12_w8n_sse4_1(
+    uint16_t *dst, uint32_t dst_stride,
+    const uint16_t *src0, uint32_t src0_stride,
+    const uint16_t *src1, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w) {
+  blend_a64_vmask_bn_w8n_sse4_1(dst, dst_stride, src0, src0_stride, src1,
+                                src1_stride, mask, h, w,
+                                blend_8_b12);
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// Dispatch
+//////////////////////////////////////////////////////////////////////////////
+
+void vpx_highbd_blend_a64_vmask_sse4_1(
+    uint8_t *dst_8, uint32_t dst_stride,
+    const uint8_t *src0_8, uint32_t src0_stride,
+    const uint8_t *src1_8, uint32_t src1_stride,
+    const uint8_t *mask, int h, int w, int bd) {
+  typedef void (*blend_fn)(uint16_t *dst, uint32_t dst_stride,
+                           const uint16_t *src0, uint32_t src0_stride,
+                           const uint16_t *src1, uint32_t src1_stride,
+                           const uint8_t *mask, int h, int w);
+
+  // Dimensions are: bd_index X width_index
+  static const blend_fn blend[2][2] = {
+    {     // bd == 8 or 10
+      blend_a64_vmask_b10_w8n_sse4_1,  // w % 8 == 0
+      blend_a64_vmask_b10_w4_sse4_1,   // w == 4
+    }, {  // bd == 12
+      blend_a64_vmask_b12_w8n_sse4_1,  // w % 8 == 0
+      blend_a64_vmask_b12_w4_sse4_1,   // w == 4
+    }
+  };
+
+  assert(IMPLIES(src0_8 == dst_8, src0_stride == dst_stride));
+  assert(IMPLIES(src1_8 == dst_8, src1_stride == dst_stride));
+
+  assert(h >= 1);
+  assert(w >= 1);
+  assert(IS_POWER_OF_TWO(h));
+  assert(IS_POWER_OF_TWO(w));
+
+  assert(bd == 8 || bd == 10 || bd == 12);
+
+  if (UNLIKELY((h | w) & 3)) {  // if (w <= 2 || h <= 2)
+    vpx_highbd_blend_a64_vmask_c(dst_8, dst_stride,
+                                 src0_8, src0_stride,
+                                 src1_8, src1_stride,
+                                 mask, h, w, bd);
+  } else {
+    uint16_t *const dst = CONVERT_TO_SHORTPTR(dst_8);
+    const uint16_t *const src0 = CONVERT_TO_SHORTPTR(src0_8);
+    const uint16_t *const src1 = CONVERT_TO_SHORTPTR(src1_8);
+
+    blend[bd == 12][(w >> 2) & 1](dst, dst_stride,
+                                  src0, src0_stride,
+                                  src1, src1_stride,
+                                  mask, h, w);
+  }
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/blend_sse4.h b/vpx_dsp/x86/blend_sse4.h
new file mode 100644
index 0000000..9b74f90
--- /dev/null
+++ b/vpx_dsp/x86/blend_sse4.h
@@ -0,0 +1,145 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_BLEND_SSE4_H_
+#define VPX_DSP_X86_BLEND_SSE4_H_
+
+#include "vpx_dsp/blend.h"
+#include "vpx_dsp/x86/synonyms.h"
+
+//////////////////////////////////////////////////////////////////////////////
+// Common kernels
+//////////////////////////////////////////////////////////////////////////////
+
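+// Blend 4 pixels: widen both sources to 16 bits, weight them by the two
+// masks (which sum to VPX_BLEND_A64_MAX_ALPHA), and round the weighted sum
+// back down by VPX_BLEND_A64_ROUND_BITS.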
+static INLINE __m128i blend_4(const uint8_t *src0, const uint8_t *src1,
+                              const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_b = xx_loadl_32(src0);
+  const __m128i v_s1_b = xx_loadl_32(src1);
+  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+  return v_res_w;
+}
+
+static INLINE __m128i blend_8(const uint8_t *src0, const uint8_t *src1,
+                              const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_b = xx_loadl_64(src0);
+  const __m128i v_s1_b = xx_loadl_64(src1);
+  const __m128i v_s0_w = _mm_cvtepu8_epi16(v_s0_b);
+  const __m128i v_s1_w = _mm_cvtepu8_epi16(v_s1_b);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+  return v_res_w;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef __m128i (*blend_unit_fn)(const uint16_t *src0, const uint16_t *src1,
+                                 const __m128i v_m0_w, const __m128i v_m1_w);
+
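+// For bit depths up to 10, sample * mask fits in an unsigned 16-bit lane
+// (1023 * 64 == 65472), so plain 16-bit multiplies and adds are exact.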
+static INLINE __m128i blend_4_b10(const uint16_t *src0, const uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = xx_loadl_64(src0);
+  const __m128i v_s1_w = xx_loadl_64(src1);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+  return v_res_w;
+}
+
+static INLINE __m128i blend_8_b10(const uint16_t *src0, const uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = xx_loadu_128(src0);
+  const __m128i v_s1_w = xx_loadu_128(src1);
+
+  const __m128i v_p0_w = _mm_mullo_epi16(v_s0_w, v_m0_w);
+  const __m128i v_p1_w = _mm_mullo_epi16(v_s1_w, v_m1_w);
+
+  const __m128i v_sum_w = _mm_add_epi16(v_p0_w, v_p1_w);
+
+  const __m128i v_res_w = xx_roundn_epu16(v_sum_w, VPX_BLEND_A64_ROUND_BITS);
+
+  return v_res_w;
+}
+
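+// For 12-bit input the products need 32 bits, so interleave samples and
+// masks and use a widening multiply-add. Shifting by all but one of the
+// rounding bits keeps the sums within signed 16-bit range for the pack;
+// xx_round_epu16 then rounds away the remaining bit.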
+static INLINE __m128i blend_4_b12(const uint16_t *src0, const uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = xx_loadl_64(src0);
+  const __m128i v_s1_w = xx_loadl_64(src1);
+
+  // Interleave
+  const __m128i v_m01_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+  const __m128i v_s01_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+
+  // Multiply-Add
+  const __m128i v_sum_d = _mm_madd_epi16(v_s01_w, v_m01_w);
+
+  // Scale
+  const __m128i v_ssum_d = _mm_srli_epi32(v_sum_d,
+                                          VPX_BLEND_A64_ROUND_BITS - 1);
+
+  // Pack
+  const __m128i v_pssum_d = _mm_packs_epi32(v_ssum_d, v_ssum_d);
+
+  // Round
+  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+
+  return v_res_w;
+}
+
+static INLINE __m128i blend_8_b12(const uint16_t *src0, const uint16_t *src1,
+                                  const __m128i v_m0_w, const __m128i v_m1_w) {
+  const __m128i v_s0_w = xx_loadu_128(src0);
+  const __m128i v_s1_w = xx_loadu_128(src1);
+
+  // Interleave
+  const __m128i v_m01l_w = _mm_unpacklo_epi16(v_m0_w, v_m1_w);
+  const __m128i v_m01h_w = _mm_unpackhi_epi16(v_m0_w, v_m1_w);
+  const __m128i v_s01l_w = _mm_unpacklo_epi16(v_s0_w, v_s1_w);
+  const __m128i v_s01h_w = _mm_unpackhi_epi16(v_s0_w, v_s1_w);
+
+  // Multiply-Add
+  const __m128i v_suml_d = _mm_madd_epi16(v_s01l_w, v_m01l_w);
+  const __m128i v_sumh_d = _mm_madd_epi16(v_s01h_w, v_m01h_w);
+
+  // Scale
+  const __m128i v_ssuml_d = _mm_srli_epi32(v_suml_d,
+                                           VPX_BLEND_A64_ROUND_BITS - 1);
+  const __m128i v_ssumh_d = _mm_srli_epi32(v_sumh_d,
+                                           VPX_BLEND_A64_ROUND_BITS - 1);
+
+  // Pack
+  const __m128i v_pssum_d = _mm_packs_epi32(v_ssuml_d, v_ssumh_d);
+
+  // Round
+  const __m128i v_res_w = xx_round_epu16(v_pssum_d);
+
+  return v_res_w;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+#endif  // VPX_DSP_X86_BLEND_SSE4_H_
diff --git a/vpx_dsp/x86/convolve.h b/vpx_dsp/x86/convolve.h
index 7e43eb7..ab387d6 100644
--- a/vpx_dsp/x86/convolve.h
+++ b/vpx_dsp/x86/convolve.h
@@ -15,6 +15,7 @@
 #include "./vpx_config.h"
 #include "vpx/vpx_integer.h"
 #include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_convolve.h"
 
 typedef void filter8_1dfunction (
   const uint8_t *src_ptr,
@@ -98,24 +99,27 @@
                               int w, int h) { \
   assert(filter_x[3] != 128); \
   assert(filter_y[3] != 128); \
-  assert(w <= 64); \
-  assert(h <= 64); \
+  assert(w <= MAX_SB_SIZE); \
+  assert(h <= MAX_SB_SIZE); \
   assert(x_step_q4 == 16); \
   assert(y_step_q4 == 16); \
-  if (filter_x[0] | filter_x[1] | filter_x[2]) { \
-    DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 71]); \
-    vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, fdata2, 64, \
+  if (filter_x[0] || filter_x[1] || filter_x[2] || \
+      filter_y[0] || filter_y[1] || filter_y[2]) { \
+    DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \
+    vpx_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
+                              fdata2, MAX_SB_SIZE, \
                               filter_x, x_step_q4, filter_y, y_step_q4, \
                               w, h + 7); \
-    vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * 64, 64, dst, dst_stride, \
+    vpx_convolve8_##avg##vert_##opt(fdata2 + 3 * MAX_SB_SIZE, MAX_SB_SIZE, \
+                                    dst, dst_stride, \
                                     filter_x, x_step_q4, filter_y, \
                                     y_step_q4, w, h); \
   } else { \
-    DECLARE_ALIGNED(16, uint8_t, fdata2[64 * 65]); \
-    vpx_convolve8_horiz_##opt(src, src_stride, fdata2, 64, \
+    DECLARE_ALIGNED(16, uint8_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \
+    vpx_convolve8_horiz_##opt(src, src_stride, fdata2, MAX_SB_SIZE, \
                               filter_x, x_step_q4, filter_y, y_step_q4, \
                               w, h + 1); \
-    vpx_convolve8_##avg##vert_##opt(fdata2, 64, dst, dst_stride, \
+    vpx_convolve8_##avg##vert_##opt(fdata2, MAX_SB_SIZE, dst, dst_stride, \
                                     filter_x, x_step_q4, filter_y, \
                                     y_step_q4, w, h); \
   } \
@@ -235,30 +239,40 @@
                                      const int16_t *filter_x, int x_step_q4, \
                                      const int16_t *filter_y, int y_step_q4, \
                                      int w, int h, int bd) { \
-  assert(w <= 64); \
-  assert(h <= 64); \
+  assert(w <= MAX_SB_SIZE); \
+  assert(h <= MAX_SB_SIZE); \
   if (x_step_q4 == 16 && y_step_q4 == 16) { \
-    if ((filter_x[0] | filter_x[1] | filter_x[2]) || filter_x[3] == 128) { \
-      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 71]); \
-      vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, src_stride, \
-                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+    if (filter_x[0] || filter_x[1] || filter_x[2] || filter_x[3] == 128 || \
+        filter_y[0] || filter_y[1] || filter_y[2] || filter_y[3] == 128) { \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 7)]); \
+      vpx_highbd_convolve8_horiz_##opt(src - 3 * src_stride, \
+                                       src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), \
+                                       MAX_SB_SIZE, \
                                        filter_x, x_step_q4, \
                                        filter_y, y_step_q4, \
                                        w, h + 7, bd); \
-      vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2) + 192, \
-                                             64, dst, dst_stride, \
-                                             filter_x, x_step_q4, \
-                                             filter_y, y_step_q4, \
-                                             w, h, bd); \
+      vpx_highbd_convolve8_##avg##vert_##opt( \
+        CONVERT_TO_BYTEPTR(fdata2) + 3 * MAX_SB_SIZE, \
+        MAX_SB_SIZE, \
+        dst, \
+        dst_stride, \
+        filter_x, x_step_q4, \
+        filter_y, y_step_q4, \
+        w, h, bd); \
     } else { \
-      DECLARE_ALIGNED(16, uint16_t, fdata2[64 * 65]); \
-      vpx_highbd_convolve8_horiz_##opt(src, src_stride, \
-                                       CONVERT_TO_BYTEPTR(fdata2), 64, \
+      DECLARE_ALIGNED(16, uint16_t, fdata2[MAX_SB_SIZE * (MAX_SB_SIZE + 1)]); \
+      vpx_highbd_convolve8_horiz_##opt(src, \
+                                       src_stride, \
+                                       CONVERT_TO_BYTEPTR(fdata2), \
+                                       MAX_SB_SIZE, \
                                        filter_x, x_step_q4, \
                                        filter_y, y_step_q4, \
                                        w, h + 1, bd); \
-      vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), 64, \
-                                             dst, dst_stride, \
+      vpx_highbd_convolve8_##avg##vert_##opt(CONVERT_TO_BYTEPTR(fdata2), \
+                                             MAX_SB_SIZE, \
+                                             dst, \
+                                             dst_stride, \
                                              filter_x, x_step_q4, \
                                              filter_y, y_step_q4, \
                                              w, h, bd); \
diff --git a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index fd46bef..164ffcf 100644
--- a/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -10,6 +10,7 @@
 
 #include <emmintrin.h>
 
+#include "./vpx_dsp_rtcd.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/mem.h"
diff --git a/vpx_dsp/x86/highbd_subtract_sse2.c b/vpx_dsp/x86/highbd_subtract_sse2.c
new file mode 100644
index 0000000..33e464b
--- /dev/null
+++ b/vpx_dsp/x86/highbd_subtract_sse2.c
@@ -0,0 +1,366 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+#include <stddef.h>
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+typedef void (*SubtractWxHFuncType)(
+    int16_t *diff, ptrdiff_t diff_stride,
+    const uint16_t *src, ptrdiff_t src_stride,
+    const uint16_t *pred, ptrdiff_t pred_stride);
+
+static void subtract_4x4(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+  __m128i x0, x1, x2, x3;
+  int64_t *store_diff = (int64_t *) (diff + 0 * diff_stride);
+
+  u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride));
+  u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride));
+  u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride));
+  u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride));
+
+  v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride));
+  v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride));
+  v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride));
+  v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride));
+
+  x0 = _mm_sub_epi16(u0, v0);
+  x1 = _mm_sub_epi16(u1, v1);
+  x2 = _mm_sub_epi16(u2, v2);
+  x3 = _mm_sub_epi16(u3, v3);
+
+  _mm_storel_epi64((__m128i *)store_diff, x0);
+  store_diff = (int64_t *) (diff + 1 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x1);
+  store_diff = (int64_t *) (diff + 2 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x2);
+  store_diff = (int64_t *) (diff + 3 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x3);
+}
+
+static void subtract_4x8(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+  int64_t *store_diff = (int64_t *) (diff + 0 * diff_stride);
+
+  u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride));
+  u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride));
+  u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride));
+  u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride));
+  u4 = _mm_loadu_si128((__m128i const *) (src + 4 * src_stride));
+  u5 = _mm_loadu_si128((__m128i const *) (src + 5 * src_stride));
+  u6 = _mm_loadu_si128((__m128i const *) (src + 6 * src_stride));
+  u7 = _mm_loadu_si128((__m128i const *) (src + 7 * src_stride));
+
+  v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride));
+  v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride));
+  v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride));
+  v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride));
+  v4 = _mm_loadu_si128((__m128i const *) (pred + 4 * pred_stride));
+  v5 = _mm_loadu_si128((__m128i const *) (pred + 5 * pred_stride));
+  v6 = _mm_loadu_si128((__m128i const *) (pred + 6 * pred_stride));
+  v7 = _mm_loadu_si128((__m128i const *) (pred + 7 * pred_stride));
+
+  x0 = _mm_sub_epi16(u0, v0);
+  x1 = _mm_sub_epi16(u1, v1);
+  x2 = _mm_sub_epi16(u2, v2);
+  x3 = _mm_sub_epi16(u3, v3);
+  x4 = _mm_sub_epi16(u4, v4);
+  x5 = _mm_sub_epi16(u5, v5);
+  x6 = _mm_sub_epi16(u6, v6);
+  x7 = _mm_sub_epi16(u7, v7);
+
+  _mm_storel_epi64((__m128i *)store_diff, x0);
+  store_diff = (int64_t *) (diff + 1 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x1);
+  store_diff = (int64_t *) (diff + 2 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x2);
+  store_diff = (int64_t *) (diff + 3 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x3);
+  store_diff = (int64_t *) (diff + 4 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x4);
+  store_diff = (int64_t *) (diff + 5 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x5);
+  store_diff = (int64_t *) (diff + 6 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x6);
+  store_diff = (int64_t *) (diff + 7 * diff_stride);
+  _mm_storel_epi64((__m128i *)store_diff, x7);
+}
+
+static void subtract_8x4(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i u0, u1, u2, u3;
+  __m128i v0, v1, v2, v3;
+  __m128i x0, x1, x2, x3;
+
+  u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride));
+  u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride));
+  u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride));
+  u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride));
+
+  v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride));
+  v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride));
+  v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride));
+  v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride));
+
+  x0 = _mm_sub_epi16(u0, v0);
+  x1 = _mm_sub_epi16(u1, v1);
+  x2 = _mm_sub_epi16(u2, v2);
+  x3 = _mm_sub_epi16(u3, v3);
+
+  _mm_storeu_si128((__m128i *) (diff + 0 * diff_stride), x0);
+  _mm_storeu_si128((__m128i *) (diff + 1 * diff_stride), x1);
+  _mm_storeu_si128((__m128i *) (diff + 2 * diff_stride), x2);
+  _mm_storeu_si128((__m128i *) (diff + 3 * diff_stride), x3);
+}
+
+static void subtract_8x8(int16_t *diff, ptrdiff_t diff_stride,
+                         const uint16_t *src, ptrdiff_t src_stride,
+                         const uint16_t *pred, ptrdiff_t pred_stride) {
+  __m128i u0, u1, u2, u3, u4, u5, u6, u7;
+  __m128i v0, v1, v2, v3, v4, v5, v6, v7;
+  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
+
+  u0 = _mm_loadu_si128((__m128i const *) (src + 0 * src_stride));
+  u1 = _mm_loadu_si128((__m128i const *) (src + 1 * src_stride));
+  u2 = _mm_loadu_si128((__m128i const *) (src + 2 * src_stride));
+  u3 = _mm_loadu_si128((__m128i const *) (src + 3 * src_stride));
+  u4 = _mm_loadu_si128((__m128i const *) (src + 4 * src_stride));
+  u5 = _mm_loadu_si128((__m128i const *) (src + 5 * src_stride));
+  u6 = _mm_loadu_si128((__m128i const *) (src + 6 * src_stride));
+  u7 = _mm_loadu_si128((__m128i const *) (src + 7 * src_stride));
+
+  v0 = _mm_loadu_si128((__m128i const *) (pred + 0 * pred_stride));
+  v1 = _mm_loadu_si128((__m128i const *) (pred + 1 * pred_stride));
+  v2 = _mm_loadu_si128((__m128i const *) (pred + 2 * pred_stride));
+  v3 = _mm_loadu_si128((__m128i const *) (pred + 3 * pred_stride));
+  v4 = _mm_loadu_si128((__m128i const *) (pred + 4 * pred_stride));
+  v5 = _mm_loadu_si128((__m128i const *) (pred + 5 * pred_stride));
+  v6 = _mm_loadu_si128((__m128i const *) (pred + 6 * pred_stride));
+  v7 = _mm_loadu_si128((__m128i const *) (pred + 7 * pred_stride));
+
+  x0 = _mm_sub_epi16(u0, v0);
+  x1 = _mm_sub_epi16(u1, v1);
+  x2 = _mm_sub_epi16(u2, v2);
+  x3 = _mm_sub_epi16(u3, v3);
+  x4 = _mm_sub_epi16(u4, v4);
+  x5 = _mm_sub_epi16(u5, v5);
+  x6 = _mm_sub_epi16(u6, v6);
+  x7 = _mm_sub_epi16(u7, v7);
+
+  _mm_storeu_si128((__m128i *) (diff + 0 * diff_stride), x0);
+  _mm_storeu_si128((__m128i *) (diff + 1 * diff_stride), x1);
+  _mm_storeu_si128((__m128i *) (diff + 2 * diff_stride), x2);
+  _mm_storeu_si128((__m128i *) (diff + 3 * diff_stride), x3);
+  _mm_storeu_si128((__m128i *) (diff + 4 * diff_stride), x4);
+  _mm_storeu_si128((__m128i *) (diff + 5 * diff_stride), x5);
+  _mm_storeu_si128((__m128i *) (diff + 6 * diff_stride), x6);
+  _mm_storeu_si128((__m128i *) (diff + 7 * diff_stride), x7);
+}
+
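+// Larger block sizes are composed by calling the smaller kernels twice,
+// stepping either down by half the rows or right by half the columns.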
+static void subtract_8x16(int16_t *diff, ptrdiff_t diff_stride,
+                          const uint16_t *src, ptrdiff_t src_stride,
+                          const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 3;
+  src += src_stride << 3;
+  pred += pred_stride << 3;
+  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_16x8(int16_t *diff, ptrdiff_t diff_stride,
+                          const uint16_t *src, ptrdiff_t src_stride,
+                          const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += 8;
+  src += 8;
+  pred += 8;
+  subtract_8x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_16x16(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 3;
+  src += src_stride << 3;
+  pred += pred_stride << 3;
+  subtract_16x8(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_16x32(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 4;
+  src += src_stride << 4;
+  pred += pred_stride << 4;
+  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_32x16(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += 16;
+  src += 16;
+  pred += 16;
+  subtract_16x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_32x32(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 4;
+  src += src_stride << 4;
+  pred += pred_stride << 4;
+  subtract_32x16(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_32x64(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 5;
+  src += src_stride << 5;
+  pred += pred_stride << 5;
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_64x32(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += 32;
+  src += 32;
+  pred += 32;
+  subtract_32x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_64x64(int16_t *diff, ptrdiff_t diff_stride,
+                           const uint16_t *src, ptrdiff_t src_stride,
+                           const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 5;
+  src += src_stride << 5;
+  pred += pred_stride << 5;
+  subtract_64x32(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_64x128(int16_t *diff, ptrdiff_t diff_stride,
+                            const uint16_t *src, ptrdiff_t src_stride,
+                            const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 6;
+  src += src_stride << 6;
+  pred += pred_stride << 6;
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_128x64(int16_t *diff, ptrdiff_t diff_stride,
+                            const uint16_t *src, ptrdiff_t src_stride,
+                            const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += 64;
+  src += 64;
+  pred += 64;
+  subtract_64x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
+static void subtract_128x128(int16_t *diff, ptrdiff_t diff_stride,
+                             const uint16_t *src, ptrdiff_t src_stride,
+                             const uint16_t *pred, ptrdiff_t pred_stride) {
+  subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+  diff += diff_stride << 6;
+  src += src_stride << 6;
+  pred += pred_stride << 6;
+  subtract_128x64(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
+
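+// The kernels above are named subtract_<width>x<height>; note that the
+// arguments here are (rows, cols).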
+static SubtractWxHFuncType getSubtractFunc(int rows, int cols) {
+  SubtractWxHFuncType ret_func_ptr = NULL;
+  if (rows == 4) {
+    if (cols == 4) {
+      ret_func_ptr = subtract_4x4;
+    } else if (cols == 8) {
+      ret_func_ptr = subtract_8x4;
+    }
+  } else if (rows == 8) {
+    if (cols == 4) {
+      ret_func_ptr = subtract_4x8;
+    } else if (cols == 8) {
+      ret_func_ptr = subtract_8x8;
+    } else if (cols == 16) {
+      ret_func_ptr = subtract_16x8;
+    }
+  } else if (rows == 16) {
+    if (cols == 8) {
+      ret_func_ptr = subtract_8x16;
+    } else if (cols == 16) {
+      ret_func_ptr = subtract_16x16;
+    } else if (cols == 32) {
+      ret_func_ptr = subtract_32x16;
+    }
+  } else if (rows == 32) {
+    if (cols == 16) {
+      ret_func_ptr = subtract_16x32;
+    } else if (cols == 32) {
+      ret_func_ptr = subtract_32x32;
+    } else if (cols == 64) {
+      ret_func_ptr = subtract_64x32;
+    }
+  } else if (rows == 64) {
+    if (cols == 32) {
+      ret_func_ptr = subtract_32x64;
+    } else if (cols == 64) {
+      ret_func_ptr = subtract_64x64;
+    } else if (cols == 128) {
+      ret_func_ptr = subtract_128x64;
+    }
+  } else if (rows == 128) {
+    if (cols == 64) {
+      ret_func_ptr = subtract_64x128;
+    } else if (cols == 128) {
+      ret_func_ptr = subtract_128x128;
+    }
+  }
+  if (!ret_func_ptr) {
+    assert(0);
+  }
+  return ret_func_ptr;
+}
+
+void vpx_highbd_subtract_block_sse2(
+    int rows, int cols,
+    int16_t *diff, ptrdiff_t diff_stride,
+    const uint8_t *src8, ptrdiff_t src_stride,
+    const uint8_t *pred8, ptrdiff_t pred_stride,
+    int bd) {
+  uint16_t *src = CONVERT_TO_SHORTPTR(src8);
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  SubtractWxHFuncType func;
+  (void)bd;
+
+  func = getSubtractFunc(rows, cols);
+  func(diff, diff_stride, src, src_stride, pred, pred_stride);
+}
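+
+// For reference, a scalar sketch of what the dispatch above computes
+// (illustrative only; highbd_subtract_block_c_sketch is a hypothetical
+// helper, not part of this change):
+static void highbd_subtract_block_c_sketch(
+    int rows, int cols, int16_t *diff, ptrdiff_t diff_stride,
+    const uint16_t *src, ptrdiff_t src_stride,
+    const uint16_t *pred, ptrdiff_t pred_stride) {
+  int r, c;
+  for (r = 0; r < rows; ++r) {
+    for (c = 0; c < cols; ++c)
+      diff[r * diff_stride + c] =
+          (int16_t)(src[r * src_stride + c] - pred[r * pred_stride + c]);
+  }
+}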
diff --git a/vpx_dsp/x86/highbd_variance_sse2.c b/vpx_dsp/x86/highbd_variance_sse2.c
index 14d029c..7bfa383 100644
--- a/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/vpx_dsp/x86/highbd_variance_sse2.c
@@ -7,7 +7,11 @@
  *  in the file PATENTS.  All contributing project authors may
  *  be found in the AUTHORS file in the root of the source tree.
  */
+
+#include <emmintrin.h>  // SSE2
+
 #include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
 
 #include "vpx_ports/mem.h"
 
@@ -590,3 +594,136 @@
 #undef FNS
 #undef FN
 #endif  // CONFIG_USE_X86INC
+
+void vpx_highbd_upsampled_pred_sse2(uint16_t *comp_pred,
+                                    int width, int height,
+                                    const uint8_t *ref8,
+                                    int ref_stride) {
+  int i, j;
+  int stride = ref_stride << 3;
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  if (width >= 8) {
+    // read 8 points at a time
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j += 8) {
+        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
+        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
+        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
+        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
+        __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
+        __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
+        __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
+        __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
+        __m128i t0, t1, t2, t3;
+
+        t0 = _mm_unpacklo_epi16(s0, s1);
+        t1 = _mm_unpacklo_epi16(s2, s3);
+        t2 = _mm_unpacklo_epi16(s4, s5);
+        t3 = _mm_unpacklo_epi16(s6, s7);
+        t0 = _mm_unpacklo_epi32(t0, t1);
+        t2 = _mm_unpacklo_epi32(t2, t3);
+        t0 = _mm_unpacklo_epi64(t0, t2);
+
+        _mm_storeu_si128((__m128i *)(comp_pred), t0);
+        comp_pred += 8;
+        ref += 64;  // 8 output pixels, 8 samples apart each
+      }
+      ref += stride - (width << 3);
+    }
+  } else {
+    // read 4 points at a time
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j += 4) {
+        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
+        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
+        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
+        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
+        __m128i t0, t1;
+
+        t0 = _mm_unpacklo_epi16(s0, s1);
+        t1 = _mm_unpacklo_epi16(s2, s3);
+        t0 = _mm_unpacklo_epi32(t0, t1);
+
+        _mm_storel_epi64((__m128i *)(comp_pred), t0);
+        comp_pred += 4;
+        ref += 4 * 8;
+      }
+      ref += stride - (width << 3);
+    }
+  }
+}
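+
+// A scalar equivalent of the gather above (illustrative only;
+// highbd_upsampled_pred_c_sketch is a hypothetical helper, not part of this
+// change). The arithmetic matches the SIMD path: each output pixel is
+// ref[8 * (i * ref_stride + j)], i.e. every 8th sample of the 8x upsampled
+// reference.
+static void highbd_upsampled_pred_c_sketch(uint16_t *comp_pred, int width,
+                                           int height, const uint16_t *ref,
+                                           int ref_stride) {
+  int i, j;
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++) comp_pred[j] = ref[j << 3];
+    comp_pred += width;
+    ref += ref_stride << 3;
+  }
+}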
+
+void vpx_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
+                                             const uint8_t *pred8,
+                                             int width, int height,
+                                             const uint8_t *ref8,
+                                             int ref_stride) {
+  const __m128i one = _mm_set1_epi16(1);
+  int i, j;
+  int stride = ref_stride << 3;
+  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
+  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
+
+  if (width >= 8) {
+    // read 8 points at a time
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j += 8) {
+        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
+        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
+        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
+        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
+        __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
+        __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
+        __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
+        __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
+        __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+        __m128i t0, t1, t2, t3;
+
+        t0 = _mm_unpacklo_epi16(s0, s1);
+        t1 = _mm_unpacklo_epi16(s2, s3);
+        t2 = _mm_unpacklo_epi16(s4, s5);
+        t3 = _mm_unpacklo_epi16(s6, s7);
+        t0 = _mm_unpacklo_epi32(t0, t1);
+        t2 = _mm_unpacklo_epi32(t2, t3);
+        t0 = _mm_unpacklo_epi64(t0, t2);
+
+        p0 = _mm_adds_epu16(t0, p0);
+        p0 = _mm_adds_epu16(p0, one);
+        p0 = _mm_srli_epi16(p0, 1);
+
+        _mm_storeu_si128((__m128i *)(comp_pred), p0);
+        comp_pred += 8;
+        pred += 8;
+        ref += 8 * 8;
+      }
+      ref += stride - (width << 3);
+    }
+  } else {
+    // read 4 points at a time
+    for (i = 0; i < height; i++) {
+      for (j = 0; j < width; j += 4) {
+        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
+        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
+        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
+        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
+        __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
+        __m128i t0, t1;
+
+        t0 = _mm_unpacklo_epi16(s0, s1);
+        t1 = _mm_unpacklo_epi16(s2, s3);
+        t0 = _mm_unpacklo_epi32(t0, t1);
+
+        p0 = _mm_adds_epu16(t0, p0);
+        p0 = _mm_adds_epu16(p0, one);
+        p0 = _mm_srli_epi16(p0, 1);
+
+        _mm_storel_epi64((__m128i *)(comp_pred), p0);
+        comp_pred += 4;
+        pred += 4;
+        ref += 4 * 8;
+      }
+      ref += stride - (width << 3);
+    }
+  }
+}
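+
+// Scalar equivalent of the above (illustrative only;
+// highbd_comp_avg_upsampled_pred_c_sketch is a hypothetical helper, not part
+// of this change): the gathered reference point is combined with the existing
+// prediction by rounded halving. The SSE2 path uses saturating adds, which
+// cannot trigger for valid bit depths.
+static void highbd_comp_avg_upsampled_pred_c_sketch(
+    uint16_t *comp_pred, const uint16_t *pred, int width, int height,
+    const uint16_t *ref, int ref_stride) {
+  int i, j;
+  for (i = 0; i < height; i++) {
+    for (j = 0; j < width; j++)
+      comp_pred[j] = (uint16_t)((ref[j << 3] + pred[j] + 1) >> 1);
+    comp_pred += width;
+    pred += width;
+    ref += ref_stride << 3;
+  }
+}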
diff --git a/vpx_dsp/x86/highbd_variance_sse4.c b/vpx_dsp/x86/highbd_variance_sse4.c
new file mode 100644
index 0000000..4d0b75d
--- /dev/null
+++ b/vpx_dsp/x86/highbd_variance_sse4.c
@@ -0,0 +1,246 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <smmintrin.h> /* SSE4.1 */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/variance.h"
+#include "vpx_dsp/vpx_filter.h"
+
+static INLINE void variance4x4_64_sse4_1(const uint8_t *a8, int a_stride,
+                                         const uint8_t *b8, int b_stride,
+                                         uint64_t *sse, int64_t *sum) {
+  __m128i u0, u1, u2, u3;
+  __m128i s0, s1, s2, s3;
+  __m128i t0, t1, x0, y0;
+  __m128i a0, a1, a2, a3;
+  __m128i b0, b1, b2, b3;
+  __m128i k_one_epi16 = _mm_set1_epi16((int16_t)1);
+
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);
+
+  a0 = _mm_loadl_epi64((__m128i const *)(a + 0 * a_stride));
+  a1 = _mm_loadl_epi64((__m128i const *)(a + 1 * a_stride));
+  a2 = _mm_loadl_epi64((__m128i const *)(a + 2 * a_stride));
+  a3 = _mm_loadl_epi64((__m128i const *)(a + 3 * a_stride));
+
+  b0 = _mm_loadl_epi64((__m128i const *)(b + 0 * b_stride));
+  b1 = _mm_loadl_epi64((__m128i const *)(b + 1 * b_stride));
+  b2 = _mm_loadl_epi64((__m128i const *)(b + 2 * b_stride));
+  b3 = _mm_loadl_epi64((__m128i const *)(b + 3 * b_stride));
+
+  u0 = _mm_unpacklo_epi16(a0, a1);
+  u1 = _mm_unpacklo_epi16(a2, a3);
+  u2 = _mm_unpacklo_epi16(b0, b1);
+  u3 = _mm_unpacklo_epi16(b2, b3);
+
+  s0 = _mm_sub_epi16(u0, u2);
+  s1 = _mm_sub_epi16(u1, u3);
+
+  t0 = _mm_madd_epi16(s0, k_one_epi16);
+  t1 = _mm_madd_epi16(s1, k_one_epi16);
+
+  s2 = _mm_hadd_epi32(t0, t1);
+  s3 = _mm_hadd_epi32(s2, s2);
+  y0 = _mm_hadd_epi32(s3, s3);
+
+  t0 = _mm_madd_epi16(s0, s0);
+  t1 = _mm_madd_epi16(s1, s1);
+
+  s2 = _mm_hadd_epi32(t0, t1);
+  s3 = _mm_hadd_epi32(s2, s2);
+  x0 = _mm_hadd_epi32(s3, s3);
+
+  *sse = (uint64_t)_mm_extract_epi32(x0, 0);
+  *sum = (int64_t)_mm_extract_epi32(y0, 0);
+}
+
+uint32_t vpx_highbd_8_variance4x4_sse4_1(const uint8_t *a,
+                                         int a_stride,
+                                         const uint8_t *b,
+                                         int b_stride,
+                                         uint32_t *sse) {
+  int64_t sum;
+  uint64_t local_sse;
+
+  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+  *sse = (uint32_t)local_sse;
+
+  return *sse - (uint32_t)((sum * sum) >> 4);
+}
+
+uint32_t vpx_highbd_10_variance4x4_sse4_1(const uint8_t *a,
+                                          int a_stride,
+                                          const uint8_t *b,
+                                          int b_stride,
+                                          uint32_t *sse) {
+  int64_t sum;
+  uint64_t local_sse;
+
+  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 4);
+  sum = ROUND_POWER_OF_TWO(sum, 2);
+
+  return *sse - (uint32_t)((sum * sum) >> 4);
+}
+
+uint32_t vpx_highbd_12_variance4x4_sse4_1(const uint8_t *a,
+                                          int a_stride,
+                                          const uint8_t *b,
+                                          int b_stride,
+                                          uint32_t *sse) {
+  int64_t sum;
+  uint64_t local_sse;
+
+  variance4x4_64_sse4_1(a, a_stride, b, b_stride, &local_sse, &sum);
+  *sse = (uint32_t)ROUND_POWER_OF_TWO(local_sse, 8);
+  sum = ROUND_POWER_OF_TWO(sum, 4);
+
+  return *sse - (uint32_t)((sum * sum) >> 4);
+}
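+
+// Note on the scaling above: the variance of an N-pixel block is
+//   sse - sum * sum / N    (N = 16 for 4x4, hence the ">> 4").
+// Relative to 8-bit input, 10- and 12-bit inputs scale the SSE by 2^4 and
+// 2^8 and the sum by 2^2 and 2^4, so both are rounded back down before the
+// 8-bit variance formula is applied.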
+
+// Sub-pixel
+uint32_t vpx_highbd_8_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse) {
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
+
+  return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+                                  4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_10_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse) {
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
+
+  return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+                                   4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_12_sub_pixel_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse) {
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
+
+  return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp2),
+                                   4, dst, dst_stride, sse);
+}
+
+// Sub-pixel average
+
+uint32_t vpx_highbd_8_sub_pixel_avg_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse,
+    const uint8_t *second_pred) {
+
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
+
+  vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4,
+                           CONVERT_TO_BYTEPTR(temp2), 4);
+
+  return vpx_highbd_8_variance4x4(CONVERT_TO_BYTEPTR(temp3),
+                                  4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_10_sub_pixel_avg_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse,
+    const uint8_t *second_pred) {
+
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
+
+  vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4,
+                           CONVERT_TO_BYTEPTR(temp2), 4);
+
+  return vpx_highbd_10_variance4x4(CONVERT_TO_BYTEPTR(temp3),
+                                   4, dst, dst_stride, sse);
+}
+
+uint32_t vpx_highbd_12_sub_pixel_avg_variance4x4_sse4_1(
+    const uint8_t *src, int  src_stride,
+    int xoffset, int  yoffset,
+    const uint8_t *dst, int dst_stride,
+    uint32_t *sse,
+    const uint8_t *second_pred) {
+
+  uint16_t fdata3[(4 + 1) * 4];
+  uint16_t temp2[4 * 4];
+  DECLARE_ALIGNED(16, uint16_t, temp3[4 * 4]);
+
+  vpx_highbd_var_filter_block2d_bil_first_pass(
+      src, fdata3, src_stride, 1, 4 + 1,
+      4, bilinear_filters_2t[xoffset]);
+  vpx_highbd_var_filter_block2d_bil_second_pass(
+      fdata3, temp2, 4, 4, 4, 4,
+      bilinear_filters_2t[yoffset]);
+
+  vpx_highbd_comp_avg_pred(temp3, second_pred, 4, 4,
+                           CONVERT_TO_BYTEPTR(temp2), 4);
+
+  return vpx_highbd_12_variance4x4(CONVERT_TO_BYTEPTR(temp3),
+                                   4, dst, dst_stride, sse);
+}
diff --git a/vpx_dsp/x86/masked_sad_intrin_ssse3.c b/vpx_dsp/x86/masked_sad_intrin_ssse3.c
new file mode 100644
index 0000000..8b9ff10
--- /dev/null
+++ b/vpx_dsp/x86/masked_sad_intrin_ssse3.c
@@ -0,0 +1,377 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#include "vpx_ports/mem.h"
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+static INLINE __m128i width8_load_2rows(const uint8_t *ptr, int stride) {
+  __m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr);
+  __m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride));
+  return _mm_unpacklo_epi64(temp1, temp2);
+}
+
+static INLINE __m128i width4_load_4rows(const uint8_t *ptr, int stride) {
+  __m128i temp1 = _mm_cvtsi32_si128(*(const uint32_t*)ptr);
+  __m128i temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride));
+  __m128i temp3 = _mm_unpacklo_epi32(temp1, temp2);
+  temp1 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 2));
+  temp2 = _mm_cvtsi32_si128(*(const uint32_t*)(ptr + stride * 3));
+  temp1 = _mm_unpacklo_epi32(temp1, temp2);
+  return _mm_unpacklo_epi64(temp3, temp1);
+}
+
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
+                                            const uint8_t *b_ptr, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int width, int height);
+
+static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr,
+                                               int a_stride,
+                                               const uint8_t *b_ptr,
+                                               int b_stride,
+                                               const uint8_t *m_ptr,
+                                               int m_stride,
+                                               int height);
+
+static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr,
+                                               int a_stride,
+                                               const uint8_t *b_ptr,
+                                               int b_stride,
+                                               const uint8_t *m_ptr,
+                                               int m_stride,
+                                               int height);
+
+#define MASKSADMXN_SSSE3(m, n) \
+unsigned int vpx_masked_sad##m##x##n##_ssse3(const uint8_t *src, \
+                                             int src_stride, \
+                                             const uint8_t *ref, \
+                                             int ref_stride, \
+                                             const uint8_t *msk, \
+                                             int msk_stride) { \
+  return masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, msk_stride, \
+                          m, n); \
+}
+
+#if CONFIG_EXT_PARTITION
+MASKSADMXN_SSSE3(128, 128)
+MASKSADMXN_SSSE3(128, 64)
+MASKSADMXN_SSSE3(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+MASKSADMXN_SSSE3(64, 64)
+MASKSADMXN_SSSE3(64, 32)
+MASKSADMXN_SSSE3(32, 64)
+MASKSADMXN_SSSE3(32, 32)
+MASKSADMXN_SSSE3(32, 16)
+MASKSADMXN_SSSE3(16, 32)
+MASKSADMXN_SSSE3(16, 16)
+MASKSADMXN_SSSE3(16, 8)
+
+#define MASKSAD8XN_SSSE3(n) \
+unsigned int vpx_masked_sad8x##n##_ssse3(const uint8_t *src, \
+                                         int src_stride, \
+                                         const uint8_t *ref, \
+                                         int ref_stride, \
+                                         const uint8_t *msk, \
+                                         int msk_stride) { \
+  return masked_sad8xh_ssse3(src, src_stride, ref, ref_stride, msk, \
+                             msk_stride, n); \
+}
+
+MASKSAD8XN_SSSE3(16)
+MASKSAD8XN_SSSE3(8)
+MASKSAD8XN_SSSE3(4)
+
+#define MASKSAD4XN_SSSE3(n) \
+unsigned int vpx_masked_sad4x##n##_ssse3(const uint8_t *src, int src_stride, \
+                                         const uint8_t *ref, int ref_stride, \
+                                         const uint8_t *msk, int msk_stride) { \
+  return masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
+                             msk_stride, n); \
+}
+
+MASKSAD4XN_SSSE3(8)
+MASKSAD4XN_SSSE3(4)
+
+// For width a multiple of 16
+// Assumes values in m are <= 64
+static INLINE unsigned int masked_sad_ssse3(const uint8_t *a_ptr, int a_stride,
+                                            const uint8_t *b_ptr, int b_stride,
+                                            const uint8_t *m_ptr, int m_stride,
+                                            int width, int height) {
+  int y, x;
+  __m128i a, b, m, temp1, temp2;
+  __m128i res = _mm_setzero_si128();
+  __m128i one = _mm_set1_epi16(1);
+  // For each row
+  for (y = 0; y < height; y++) {
+    // Covering the full width
+    for (x = 0; x < width; x += 16) {
+      // Load a, b, m in xmm registers
+      a = _mm_loadu_si128((const __m128i*)(a_ptr + x));
+      b = _mm_loadu_si128((const __m128i*)(b_ptr + x));
+      m = _mm_loadu_si128((const __m128i*)(m_ptr + x));
+
+      // Calculate the difference between a & b
+      temp1 = _mm_subs_epu8(a, b);
+      temp2 = _mm_subs_epu8(b, a);
+      temp1 = _mm_or_si128(temp1, temp2);
+
+      // Multiply by m and add together
+      temp2 = _mm_maddubs_epi16(temp1, m);
+      // Pad out row result to 32 bit integers & add to running total
+      res = _mm_add_epi32(res, _mm_madd_epi16(temp2, one));
+    }
+    // Move onto the next row
+    a_ptr += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
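+
+// Scalar sketch of the masked SAD computed above (illustrative only;
+// masked_sad_c_sketch is a hypothetical helper, not part of this change).
+// Mask values are 6-bit weights (at most 64), so the weighted total is
+// rounded back down by 6 bits at the end.
+static unsigned int masked_sad_c_sketch(const uint8_t *a, int a_stride,
+                                        const uint8_t *b, int b_stride,
+                                        const uint8_t *m, int m_stride,
+                                        int width, int height) {
+  int y, x;
+  unsigned int sad = 0;
+  for (y = 0; y < height; y++) {
+    for (x = 0; x < width; x++) sad += m[x] * (unsigned int)abs(a[x] - b[x]);
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+  return (sad + 31) >> 6;
+}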
+
+static INLINE unsigned int masked_sad8xh_ssse3(const uint8_t *a_ptr,
+                                               int a_stride,
+                                               const uint8_t *b_ptr,
+                                               int b_stride,
+                                               const uint8_t *m_ptr,
+                                               int m_stride,
+                                               int height) {
+  int y;
+  __m128i a, b, m, temp1, temp2, row_res;
+  __m128i res = _mm_setzero_si128();
+  __m128i one = _mm_set1_epi16(1);
+  // Add the masked SAD for 2 rows at a time
+  for (y = 0; y < height; y += 2) {
+    // Load a, b, m in xmm registers
+    a = width8_load_2rows(a_ptr, a_stride);
+    b = width8_load_2rows(b_ptr, b_stride);
+    m = width8_load_2rows(m_ptr, m_stride);
+
+    // Calculate the difference between a & b
+    temp1 = _mm_subs_epu8(a, b);
+    temp2 = _mm_subs_epu8(b, a);
+    temp1 = _mm_or_si128(temp1, temp2);
+
+    // Multiply by m and add together
+    row_res = _mm_maddubs_epi16(temp1, m);
+
+    // Pad out row result to 32 bit integers & add to running total
+    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
+
+    // Move onto the next rows
+    a_ptr += a_stride * 2;
+    b_ptr += b_stride * 2;
+    m_ptr += m_stride * 2;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+static INLINE unsigned int masked_sad4xh_ssse3(const uint8_t *a_ptr,
+                                               int a_stride,
+                                               const uint8_t *b_ptr,
+                                               int b_stride,
+                                               const uint8_t *m_ptr,
+                                               int m_stride,
+                                               int height) {
+  int y;
+  __m128i a, b, m, temp1, temp2, row_res;
+  __m128i res = _mm_setzero_si128();
+  __m128i one = _mm_set1_epi16(1);
+  // Add the masked SAD for 4 rows at a time
+  for (y = 0; y < height; y += 4) {
+    // Load a, b, m in xmm registers
+    a = width4_load_4rows(a_ptr, a_stride);
+    b = width4_load_4rows(b_ptr, b_stride);
+    m = width4_load_4rows(m_ptr, m_stride);
+
+    // Calculate the difference between a & b
+    temp1 = _mm_subs_epu8(a, b);
+    temp2 = _mm_subs_epu8(b, a);
+    temp1 = _mm_or_si128(temp1, temp2);
+
+    // Multiply by m and add together
+    row_res = _mm_maddubs_epi16(temp1, m);
+
+    // Pad out row result to 32 bit integers & add to running total
+    res = _mm_add_epi32(res, _mm_madd_epi16(row_res, one));
+
+    // Move onto the next rows
+    a_ptr += a_stride * 4;
+    b_ptr += b_stride * 4;
+    m_ptr += m_stride * 4;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE __m128i highbd_width4_load_2rows(const uint16_t *ptr,
+                                               int stride) {
+  __m128i temp1 = _mm_loadl_epi64((const __m128i*)ptr);
+  __m128i temp2 = _mm_loadl_epi64((const __m128i*)(ptr + stride));
+  return _mm_unpacklo_epi64(temp1, temp2);
+}
+
+static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr,
+                                                   int a_stride,
+                                                   const uint8_t *b8_ptr,
+                                                   int b_stride,
+                                                   const uint8_t *m_ptr,
+                                                   int m_stride,
+                                                   int width, int height);
+
+static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr,
+                                                      int a_stride,
+                                                      const uint8_t *b8_ptr,
+                                                      int b_stride,
+                                                      const uint8_t *m_ptr,
+                                                      int m_stride,
+                                                      int height);
+
+#define HIGHBD_MASKSADMXN_SSSE3(m, n) \
+unsigned int vpx_highbd_masked_sad##m##x##n##_ssse3(const uint8_t *src, \
+                                                    int src_stride, \
+                                                    const uint8_t *ref, \
+                                                    int ref_stride, \
+                                                    const uint8_t *msk, \
+                                                    int msk_stride) { \
+  return highbd_masked_sad_ssse3(src, src_stride, ref, ref_stride, msk, \
+                                 msk_stride, m, n); \
+}
+
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN_SSSE3(128, 128)
+HIGHBD_MASKSADMXN_SSSE3(128, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+HIGHBD_MASKSADMXN_SSSE3(64, 64)
+HIGHBD_MASKSADMXN_SSSE3(64, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 64)
+HIGHBD_MASKSADMXN_SSSE3(32, 32)
+HIGHBD_MASKSADMXN_SSSE3(32, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 32)
+HIGHBD_MASKSADMXN_SSSE3(16, 16)
+HIGHBD_MASKSADMXN_SSSE3(16, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 16)
+HIGHBD_MASKSADMXN_SSSE3(8, 8)
+HIGHBD_MASKSADMXN_SSSE3(8, 4)
+
+#define HIGHBD_MASKSAD4XN_SSSE3(n) \
+unsigned int vpx_highbd_masked_sad4x##n##_ssse3(const uint8_t *src, \
+                                                int src_stride, \
+                                                const uint8_t *ref, \
+                                                int ref_stride, \
+                                                const uint8_t *msk, \
+                                                int msk_stride) { \
+  return highbd_masked_sad4xh_ssse3(src, src_stride, ref, ref_stride, msk, \
+                                    msk_stride, n); \
+}
+
+HIGHBD_MASKSAD4XN_SSSE3(8)
+HIGHBD_MASKSAD4XN_SSSE3(4)
+
+// For width a multiple of 8
+// Assumes values in m are <= 64
+static INLINE unsigned int highbd_masked_sad_ssse3(const uint8_t *a8_ptr,
+                                                   int a_stride,
+                                                   const uint8_t *b8_ptr,
+                                                   int b_stride,
+                                                   const uint8_t *m_ptr,
+                                                   int m_stride,
+                                                   int width, int height) {
+  int y, x;
+  __m128i a, b, m, temp1, temp2;
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
+  __m128i res = _mm_setzero_si128();
+  // For each row
+  for (y = 0; y < height; y++) {
+    // Covering the full width
+    for (x = 0; x < width; x += 8) {
+      // Load a, b, m in xmm registers
+      a = _mm_loadu_si128((const __m128i*)(a_ptr + x));
+      b = _mm_loadu_si128((const __m128i*)(b_ptr + x));
+      m = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(m_ptr + x)),
+                            _mm_setzero_si128());
+
+      // Calculate the difference between a & b
+      temp1 = _mm_subs_epu16(a, b);
+      temp2 = _mm_subs_epu16(b, a);
+      temp1 = _mm_or_si128(temp1, temp2);
+
+      // Multiply by m, add pairs, and accumulate into the running total
+      res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
+    }
+    // Move onto the next row
+    a_ptr += a_stride;
+    b_ptr += b_stride;
+    m_ptr += m_stride;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+
+static INLINE unsigned int highbd_masked_sad4xh_ssse3(const uint8_t *a8_ptr,
+                                                      int a_stride,
+                                                      const uint8_t *b8_ptr,
+                                                      int b_stride,
+                                                      const uint8_t *m_ptr,
+                                                      int m_stride,
+                                                      int height) {
+  int y;
+  __m128i a, b, m, temp1, temp2;
+  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8_ptr);
+  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8_ptr);
+  __m128i res = _mm_setzero_si128();
+  // Add the masked SAD for 2 rows at a time
+  for (y = 0; y < height; y += 2) {
+    // Load a, b, m in xmm registers
+    a = highbd_width4_load_2rows(a_ptr, a_stride);
+    b = highbd_width4_load_2rows(b_ptr, b_stride);
+    temp1 = _mm_loadl_epi64((const __m128i*)m_ptr);
+    temp2 = _mm_loadl_epi64((const __m128i*)(m_ptr + m_stride));
+    m = _mm_unpacklo_epi8(_mm_unpacklo_epi32(temp1, temp2),
+                          _mm_setzero_si128());
+
+    // Calculate the difference between a & b
+    temp1 = _mm_subs_epu16(a, b);
+    temp2 = _mm_subs_epu16(b, a);
+    temp1 = _mm_or_si128(temp1, temp2);
+
+    // Multiply by m and add together
+    res = _mm_add_epi32(res, _mm_madd_epi16(temp1, m));
+
+    // Move onto the next rows
+    a_ptr += a_stride * 2;
+    b_ptr += b_stride * 2;
+    m_ptr += m_stride * 2;
+  }
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  res = _mm_hadd_epi32(res, _mm_setzero_si128());
+  // sad = (sad + 31) >> 6;
+  return (_mm_cvtsi128_si32(res) + 31) >> 6;
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/masked_variance_intrin_ssse3.c b/vpx_dsp/x86/masked_variance_intrin_ssse3.c
new file mode 100644
index 0000000..a0c2b6e
--- /dev/null
+++ b/vpx_dsp/x86/masked_variance_intrin_ssse3.c
@@ -0,0 +1,2051 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <stdlib.h>
+#include <emmintrin.h>
+#include <tmmintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/vpx_filter.h"
+
+// Half pixel shift
+#define HALF_PIXEL_OFFSET (BIL_SUBPEL_SHIFTS / 2)
+
+/*****************************************************************************
+ * Horizontal additions
+ *****************************************************************************/
+
+static INLINE int32_t hsum_epi32_si32(__m128i v_d) {
+  v_d = _mm_hadd_epi32(v_d, v_d);
+  v_d = _mm_hadd_epi32(v_d, v_d);
+  return _mm_cvtsi128_si32(v_d);
+}
+
+static INLINE int64_t hsum_epi64_si64(__m128i v_q) {
+  v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
+#if ARCH_X86_64
+  return _mm_cvtsi128_si64(v_q);
+#else
+  {
+    int64_t tmp;
+    _mm_storel_epi64((__m128i*)&tmp, v_q);
+    return tmp;
+  }
+#endif
+}
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE int64_t hsum_epi32_si64(__m128i v_d) {
+  const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
+  const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
+  const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
+  return hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
+}
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+static INLINE uint32_t calc_masked_variance(__m128i v_sum_d, __m128i v_sse_q,
+                                            uint32_t* sse,
+                                            const int w, const int h) {
+  int64_t sum64;
+  uint64_t sse64;
+
+  // Horizontal sum
+  sum64 = hsum_epi32_si32(v_sum_d);
+  sse64 = hsum_epi64_si64(v_sse_q);
+
+  sum64 = (sum64 >= 0) ? sum64 : -sum64;
+
+  // Round
+  sum64 = ROUND_POWER_OF_TWO(sum64, 6);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 12);
+
+  // Store the SSE
+  *sse = (uint32_t)sse64;
+  // Compute the variance
+  return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
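+
+// Note: the mask holds 6-bit weights (at most 64), so the accumulated sum
+// carries a factor of 2^6 and the accumulated SSE a factor of 2^12; rounding
+// by 6 and 12 bits restores pixel units before applying
+// variance = sse - sum^2 / (w * h). Only sum * sum is used, so the sum is
+// folded to its absolute value before rounding.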
+
+/*****************************************************************************
+ * n*16 Wide versions
+ *****************************************************************************/
+
+static INLINE unsigned int masked_variancewxh_ssse3(
+    const uint8_t *a, int  a_stride,
+    const uint8_t *b, int  b_stride,
+    const uint8_t *m, int  m_stride,
+    int w, int  h,
+    unsigned int *sse) {
+  int ii, jj;
+
+  const __m128i v_zero = _mm_setzero_si128();
+
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+
+  assert((w % 16) == 0);
+
+  for (ii = 0; ii < h; ii++) {
+    for (jj = 0; jj < w; jj += 16) {
+      // Load inputs - 8 bits
+      const __m128i v_a_b = _mm_loadu_si128((const __m128i*)(a + jj));
+      const __m128i v_b_b = _mm_loadu_si128((const __m128i*)(b + jj));
+      const __m128i v_m_b = _mm_loadu_si128((const __m128i*)(m + jj));
+
+      // Unpack to 16 bits - still containing max 8 bits
+      const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero);
+      const __m128i v_b0_w = _mm_unpacklo_epi8(v_b_b, v_zero);
+      const __m128i v_m0_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+      const __m128i v_a1_w = _mm_unpackhi_epi8(v_a_b, v_zero);
+      const __m128i v_b1_w = _mm_unpackhi_epi8(v_b_b, v_zero);
+      const __m128i v_m1_w = _mm_unpackhi_epi8(v_m_b, v_zero);
+
+      // Difference: [-255, 255]
+      const __m128i v_d0_w = _mm_sub_epi16(v_a0_w, v_b0_w);
+      const __m128i v_d1_w = _mm_sub_epi16(v_a1_w, v_b1_w);
+
+      // Error - [-255, 255] * [0, 64] = [-16320, 16320] => fits in 15 bits
+      const __m128i v_e0_w = _mm_mullo_epi16(v_d0_w, v_m0_w);
+      const __m128i v_e0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
+      const __m128i v_e1_w = _mm_mullo_epi16(v_d1_w, v_m1_w);
+      const __m128i v_e1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
+
+      // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
+      const __m128i v_se0_d = _mm_madd_epi16(v_e0_w, v_e0_w);
+      const __m128i v_se1_d = _mm_madd_epi16(v_e1_w, v_e1_w);
+
+      // Sum of v_se{0,1}_d - 31 bits + 31 bits = 32 bits
+      const __m128i v_se_d = _mm_add_epi32(v_se0_d, v_se1_d);
+
+      // Unpack Squared error to 64 bits
+      const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
+      const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
+
+      // Accumulate
+      v_sum_d = _mm_add_epi32(v_sum_d, v_e0_d);
+      v_sum_d = _mm_add_epi32(v_sum_d, v_e1_d);
+      v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q);
+      v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q);
+    }
+
+    // Move on to next row
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
+}
+
+#define MASKED_VARWXH(W, H)                                               \
+unsigned int vpx_masked_variance##W##x##H##_ssse3(                        \
+  const uint8_t *a, int a_stride,                                         \
+  const uint8_t *b, int b_stride,                                         \
+  const uint8_t *m, int m_stride,                                         \
+  unsigned int *sse) {                                                    \
+  return masked_variancewxh_ssse3(a, a_stride,                            \
+                                  b, b_stride,                            \
+                                  m, m_stride,                            \
+                                  W, H, sse);                             \
+}
+
+MASKED_VARWXH(16, 8)
+MASKED_VARWXH(16, 16)
+MASKED_VARWXH(16, 32)
+MASKED_VARWXH(32, 16)
+MASKED_VARWXH(32, 32)
+MASKED_VARWXH(32, 64)
+MASKED_VARWXH(64, 32)
+MASKED_VARWXH(64, 64)
+#if CONFIG_EXT_PARTITION
+MASKED_VARWXH(64, 128)
+MASKED_VARWXH(128, 64)
+MASKED_VARWXH(128, 128)
+#endif  // CONFIG_EXT_PARTITION
+
+/*****************************************************************************
+ * 8 Wide versions
+ *****************************************************************************/
+
+static INLINE unsigned int masked_variance8xh_ssse3(
+    const uint8_t *a, int  a_stride,
+    const uint8_t *b, int  b_stride,
+    const uint8_t *m, int  m_stride,
+    int  h,
+    unsigned int *sse) {
+  int ii;
+
+  const __m128i v_zero = _mm_setzero_si128();
+
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+
+  for (ii = 0; ii < h; ii++) {
+    // Load inputs - 8 bits
+    const __m128i v_a_b = _mm_loadl_epi64((const __m128i*)a);
+    const __m128i v_b_b = _mm_loadl_epi64((const __m128i*)b);
+    const __m128i v_m_b = _mm_loadl_epi64((const __m128i*)m);
+
+    // Unpack to 16 bits - still containing max 8 bits
+    const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero);
+    const __m128i v_b_w = _mm_unpacklo_epi8(v_b_b, v_zero);
+    const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+
+    // Difference: [-255, 255]
+    const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+
+    // Error - [-255, 255] * [0, 64] = [-16320, 16320] => fits in 15 bits
+    const __m128i v_e_w = _mm_mullo_epi16(v_d_w, v_m_w);
+    const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
+
+    // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
+    const __m128i v_se_d = _mm_madd_epi16(v_e_w, v_e_w);
+
+    // Unpack Squared error to 64 bits
+    const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
+    const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
+
+    // Accumulate
+    v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
+    v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q);
+    v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q);
+
+    // Move on to next row
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
+}
+
+#define MASKED_VAR8XH(H)                                                  \
+unsigned int vpx_masked_variance8x##H##_ssse3(                            \
+  const uint8_t *a, int a_stride,                                         \
+  const uint8_t *b, int b_stride,                                         \
+  const uint8_t *m, int m_stride,                                         \
+  unsigned int *sse) {                                                    \
+  return masked_variance8xh_ssse3(a, a_stride,                            \
+                                  b, b_stride,                            \
+                                  m, m_stride,                            \
+                                  H, sse);                                \
+}
+
+MASKED_VAR8XH(4)
+MASKED_VAR8XH(8)
+MASKED_VAR8XH(16)
+
+/*****************************************************************************
+ * 4 Wide versions
+ *****************************************************************************/
+
+static INLINE unsigned int masked_variance4xh_ssse3(
+    const uint8_t *a, int  a_stride,
+    const uint8_t *b, int  b_stride,
+    const uint8_t *m, int  m_stride,
+    int  h,
+    unsigned int *sse) {
+  int ii;
+
+  const __m128i v_zero = _mm_setzero_si128();
+
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+
+  assert((h % 2) == 0);
+
+  for (ii = 0; ii < h / 2; ii++) {
+    // Load 2 input rows - 8 bits
+    const __m128i v_a0_b = _mm_cvtsi32_si128(*(const uint32_t*)a);
+    const __m128i v_b0_b = _mm_cvtsi32_si128(*(const uint32_t*)b);
+    const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t*)m);
+    const __m128i v_a1_b = _mm_cvtsi32_si128(*(const uint32_t*)(a + a_stride));
+    const __m128i v_b1_b = _mm_cvtsi32_si128(*(const uint32_t*)(b + b_stride));
+    const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t*)(m + m_stride));
+
+    // Interleave 2 rows into a single register
+    const __m128i v_a_b = _mm_unpacklo_epi32(v_a0_b, v_a1_b);
+    const __m128i v_b_b = _mm_unpacklo_epi32(v_b0_b, v_b1_b);
+    const __m128i v_m_b = _mm_unpacklo_epi32(v_m0_b, v_m1_b);
+
+    // Unpack to 16 bits - still containing max 8 bits
+    const __m128i v_a_w = _mm_unpacklo_epi8(v_a_b, v_zero);
+    const __m128i v_b_w = _mm_unpacklo_epi8(v_b_b, v_zero);
+    const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+
+    // Difference: [-255, 255]
+    const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+
+    // Error - [-255, 255] * [0, 64] = [-16320, 16320] => fits in 15 bits
+    const __m128i v_e_w = _mm_mullo_epi16(v_d_w, v_m_w);
+    const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
+
+    // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
+    const __m128i v_se_d = _mm_madd_epi16(v_e_w, v_e_w);
+
+    // Unpack Squared error to 64 bits
+    const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
+    const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
+
+    // Accumulate
+    v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
+    v_sse_q = _mm_add_epi64(v_sse_q, v_se_lo_q);
+    v_sse_q = _mm_add_epi64(v_sse_q, v_se_hi_q);
+
+    // Move on to the next 2 rows
+    a += a_stride * 2;
+    b += b_stride * 2;
+    m += m_stride * 2;
+  }
+
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+#define MASKED_VAR4XH(H)                                                  \
+unsigned int vpx_masked_variance4x##H##_ssse3(                            \
+  const uint8_t *a, int a_stride,                                         \
+  const uint8_t *b, int b_stride,                                         \
+  const uint8_t *m, int m_stride,                                         \
+  unsigned int *sse) {                                                    \
+  return masked_variance4xh_ssse3(a, a_stride,                            \
+                                  b, b_stride,                            \
+                                  m, m_stride,                            \
+                                  H, sse);                                \
+}
+
+MASKED_VAR4XH(4)
+MASKED_VAR4XH(8)
+
+#if CONFIG_VP9_HIGHBITDEPTH
+
+// Main calculation for n*8 wide blocks
+static INLINE void highbd_masked_variance64_ssse3(
+    const uint16_t *a, int  a_stride,
+    const uint16_t *b, int  b_stride,
+    const uint8_t *m, int  m_stride,
+    int w, int  h,
+    int64_t *sum, uint64_t *sse) {
+  int ii, jj;
+
+  const __m128i v_zero = _mm_setzero_si128();
+
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+
+  assert((w % 8) == 0);
+
+  for (ii = 0; ii < h; ii++) {
+    for (jj = 0; jj < w; jj += 8) {
+      // Load inputs - a and b are 16-bit samples, m is 8 bits
+      const __m128i v_a_w = _mm_loadu_si128((const __m128i*)(a + jj));
+      const __m128i v_b_w = _mm_loadu_si128((const __m128i*)(b + jj));
+      const __m128i v_m_b = _mm_loadl_epi64((const __m128i*)(m + jj));
+
+      // Unpack m to 16 bits - still containing max 8 bits
+      const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+
+      // Difference: [-4095, 4095]
+      const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+
+      // Error - [-4095, 4095] * [0, 64] => sum of 2 of these fits in 19 bits
+      const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
+
+      // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
+      const __m128i v_absd_w = _mm_abs_epi16(v_d_w);
+      const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero);
+      const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero);
+      const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d);
+      const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero);
+      const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero);
+      const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d);
+      // Square and sum the errors -> 36 bits * 4 = 38 bits
+      __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d;
+      v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d);
+      v_elo1_d = _mm_srli_si128(v_elo_d, 4);
+      v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d);
+      v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q);
+      v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d);
+      v_ehi3_d = _mm_srli_si128(v_ehi_d, 4);
+      v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d);
+      v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q);
+      v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
+
+      // Accumulate
+      v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
+      v_sse_q = _mm_add_epi64(v_sse_q, v_se_q);
+    }
+
+    // Move on to next row
+    a += a_stride;
+    b += b_stride;
+    m += m_stride;
+  }
+
+  // Horizontal sum
+  *sum = hsum_epi32_si64(v_sum_d);
+  *sse = hsum_epi64_si64(v_sse_q);
+
+  // Round
+  *sum = (*sum >= 0) ? *sum : -*sum;
+  *sum = ROUND_POWER_OF_TWO(*sum, 6);
+  *sse = ROUND_POWER_OF_TWO(*sse, 12);
+}
+
+// Main calculation for 4 wide blocks
+static INLINE void highbd_masked_variance64_4wide_ssse3(
+    const uint16_t *a, int  a_stride,
+    const uint16_t *b, int  b_stride,
+    const uint8_t *m, int  m_stride,
+    int  h,
+    int64_t *sum, uint64_t *sse) {
+  int ii;
+
+  const __m128i v_zero = _mm_setzero_si128();
+
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+
+  assert((h % 2) == 0);
+
+  for (ii = 0; ii < h / 2; ii++) {
+    // Load 2 input rows - a and b are 16-bit samples, m is 8 bits
+    const __m128i v_a0_w = _mm_loadl_epi64((const __m128i*)a);
+    const __m128i v_b0_w = _mm_loadl_epi64((const __m128i*)b);
+    const __m128i v_m0_b = _mm_cvtsi32_si128(*(const uint32_t*)m);
+    const __m128i v_a1_w = _mm_loadl_epi64((const __m128i*)(a + a_stride));
+    const __m128i v_b1_w = _mm_loadl_epi64((const __m128i*)(b + b_stride));
+    const __m128i v_m1_b = _mm_cvtsi32_si128(*(const uint32_t*)(m + m_stride));
+
+    // Interleave 2 rows into a single register
+    const __m128i v_a_w = _mm_unpacklo_epi64(v_a0_w, v_a1_w);
+    const __m128i v_b_w = _mm_unpacklo_epi64(v_b0_w, v_b1_w);
+    const __m128i v_m_b = _mm_unpacklo_epi32(v_m0_b, v_m1_b);
+
+    // Unpack m to 16 bits - still containing max 8 bits
+    const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+
+    // Difference: [-4095, 4095]
+    const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+
+    // Error - [-4095, 4095] * [0, 64] => fits in 19 bits (incl. sign bit)
+    const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
+
+    // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
+    const __m128i v_absd_w = _mm_abs_epi16(v_d_w);
+    const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero);
+    const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero);
+    const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d);
+    const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero);
+    const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero);
+    const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d);
+    // Square and sum the errors -> 36 bits * 4 = 38 bits
+    __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d;
+    v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d);
+    v_elo1_d = _mm_srli_si128(v_elo_d, 4);
+    v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d);
+    v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q);
+    v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d);
+    v_ehi3_d = _mm_srli_si128(v_ehi_d, 4);
+    v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d);
+    v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q);
+    v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
+
+    // Accumulate
+    v_sum_d = _mm_add_epi32(v_sum_d, v_e_d);
+    v_sse_q = _mm_add_epi64(v_sse_q, v_se_q);
+
+    // Move on to the next 2 rows
+    a += a_stride * 2;
+    b += b_stride * 2;
+    m += m_stride * 2;
+  }
+
+  // Horizontal sum
+  *sum = hsum_epi32_si32(v_sum_d);
+  *sse = hsum_epi64_si64(v_sse_q);
+
+  // Round
+  *sum = (*sum >= 0) ? *sum : -*sum;
+  *sum = ROUND_POWER_OF_TWO(*sum, 6);
+  *sse = ROUND_POWER_OF_TWO(*sse, 12);
+}
+
+static INLINE unsigned int highbd_masked_variancewxh_ssse3(
+    const uint16_t *a, int  a_stride,
+    const uint16_t *b, int  b_stride,
+    const uint8_t *m, int  m_stride,
+    int w, int  h,
+    unsigned int *sse) {
+  uint64_t sse64;
+  int64_t sum64;
+
+  if (w == 4) {
+    highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride,
+                                         m, m_stride, h, &sum64, &sse64);
+  } else {
+    highbd_masked_variance64_ssse3(a, a_stride, b, b_stride,
+                                   m, m_stride, w, h, &sum64, &sse64);
+  }
+
+  // Store the SSE
+  *sse = (uint32_t)sse64;
+  // Compute and return variance
+  return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
+
+static INLINE unsigned int highbd_10_masked_variancewxh_ssse3(
+    const uint16_t *a, int  a_stride,
+    const uint16_t *b, int  b_stride,
+    const uint8_t *m, int  m_stride,
+    int w, int  h,
+    unsigned int *sse) {
+  uint64_t sse64;
+  int64_t sum64;
+
+  if (w == 4) {
+    highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride,
+                                         m, m_stride, h, &sum64, &sse64);
+  } else {
+    highbd_masked_variance64_ssse3(a, a_stride, b, b_stride,
+                                   m, m_stride, w, h, &sum64, &sse64);
+  }
+
+  // Normalise
+  sum64 = ROUND_POWER_OF_TWO(sum64, 2);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 4);
+
+  // Store the SSE
+  *sse = (uint32_t)sse64;
+  // Compute and return variance
+  return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
+
+static INLINE unsigned int highbd_12_masked_variancewxh_ssse3(
+    const uint16_t *a, int  a_stride,
+    const uint16_t *b, int  b_stride,
+    const uint8_t *m, int  m_stride,
+    int w, int  h,
+    unsigned int *sse) {
+  uint64_t sse64;
+  int64_t sum64;
+
+  if (w == 4) {
+    highbd_masked_variance64_4wide_ssse3(a, a_stride, b, b_stride,
+                                         m, m_stride, h, &sum64, &sse64);
+  } else {
+    highbd_masked_variance64_ssse3(a, a_stride, b, b_stride,
+                                   m, m_stride, w, h, &sum64, &sse64);
+  }
+
+  sum64 = ROUND_POWER_OF_TWO(sum64, 4);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 8);
+
+  // Store the SSE
+  *sse = (uint32_t)sse64;
+  // Compute and return variance
+  return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
+
+#define HIGHBD_MASKED_VARWXH(W, H)                                             \
+unsigned int vpx_highbd_masked_variance##W##x##H##_ssse3(                      \
+  const uint8_t *a8, int a_stride,                                             \
+  const uint8_t *b8, int b_stride,                                             \
+  const uint8_t *m, int m_stride,                                              \
+  unsigned int *sse) {                                                         \
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);                                       \
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);                                       \
+  return highbd_masked_variancewxh_ssse3(a, a_stride,                          \
+                                         b, b_stride,                          \
+                                         m, m_stride,                          \
+                                         W, H, sse);                           \
+}                                                                              \
+                                                                               \
+unsigned int vpx_highbd_10_masked_variance##W##x##H##_ssse3(                   \
+  const uint8_t *a8, int a_stride,                                             \
+  const uint8_t *b8, int b_stride,                                             \
+  const uint8_t *m, int m_stride,                                              \
+  unsigned int *sse) {                                                         \
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);                                       \
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);                                       \
+  return highbd_10_masked_variancewxh_ssse3(a, a_stride,                       \
+                                            b, b_stride,                       \
+                                            m, m_stride,                       \
+                                            W, H, sse);                        \
+}                                                                              \
+                                                                               \
+unsigned int vpx_highbd_12_masked_variance##W##x##H##_ssse3(                   \
+  const uint8_t *a8, int a_stride,                                             \
+  const uint8_t *b8, int b_stride,                                             \
+  const uint8_t *m, int m_stride,                                              \
+  unsigned int *sse) {                                                         \
+  uint16_t *a = CONVERT_TO_SHORTPTR(a8);                                       \
+  uint16_t *b = CONVERT_TO_SHORTPTR(b8);                                       \
+  return highbd_12_masked_variancewxh_ssse3(a, a_stride,                       \
+                                            b, b_stride,                       \
+                                            m, m_stride,                       \
+                                            W, H, sse);                        \
+}
+
+HIGHBD_MASKED_VARWXH(4, 4)
+HIGHBD_MASKED_VARWXH(4, 8)
+HIGHBD_MASKED_VARWXH(8, 4)
+HIGHBD_MASKED_VARWXH(8, 8)
+HIGHBD_MASKED_VARWXH(8, 16)
+HIGHBD_MASKED_VARWXH(16, 8)
+HIGHBD_MASKED_VARWXH(16, 16)
+HIGHBD_MASKED_VARWXH(16, 32)
+HIGHBD_MASKED_VARWXH(32, 16)
+HIGHBD_MASKED_VARWXH(32, 32)
+HIGHBD_MASKED_VARWXH(32, 64)
+HIGHBD_MASKED_VARWXH(64, 32)
+HIGHBD_MASKED_VARWXH(64, 64)
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASKED_VARWXH(64, 128)
+HIGHBD_MASKED_VARWXH(128, 64)
+HIGHBD_MASKED_VARWXH(128, 128)
+#endif  // CONFIG_EXT_PARTITION
+
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+
+//////////////////////////////////////////////////////////////////////////////
+// Sub pixel versions
+//////////////////////////////////////////////////////////////////////////////
+
+typedef __m128i (*filter_fn_t)(__m128i v_a_b, __m128i v_b_b,
+                               __m128i v_filter_b);
+
+static INLINE __m128i apply_filter_avg(const __m128i v_a_b, const __m128i v_b_b,
+                                       const __m128i v_filter_b) {
+  (void) v_filter_b;
+  return _mm_avg_epu8(v_a_b, v_b_b);
+}
+
+static INLINE __m128i apply_filter(const __m128i v_a_b, const __m128i v_b_b,
+                                   const __m128i v_filter_b) {
+  const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1));
+  __m128i v_input_lo_b = _mm_unpacklo_epi8(v_a_b, v_b_b);
+  __m128i v_input_hi_b = _mm_unpackhi_epi8(v_a_b, v_b_b);
+  __m128i v_temp0_w = _mm_maddubs_epi16(v_input_lo_b, v_filter_b);
+  __m128i v_temp1_w = _mm_maddubs_epi16(v_input_hi_b, v_filter_b);
+  __m128i v_res_lo_w = _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w),
+                                      FILTER_BITS);
+  __m128i v_res_hi_w = _mm_srai_epi16(_mm_add_epi16(v_temp1_w, v_rounding_w),
+                                      FILTER_BITS);
+  return _mm_packus_epi16(v_res_lo_w, v_res_hi_w);
+}
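+
+// Scalar sketch of the 2-tap filter above (illustrative only;
+// apply_filter_c_sketch is a hypothetical helper, not part of this change).
+// Per byte lane: out = (a * f0 + b * f1 + (1 << (FILTER_BITS - 1)))
+// >> FILTER_BITS, where f0 + f1 == 1 << FILTER_BITS; apply_filter_avg is the
+// f0 == f1 special case.
+static INLINE uint8_t apply_filter_c_sketch(uint8_t a, uint8_t b,
+                                            const uint8_t *filter) {
+  return (uint8_t)((a * filter[0] + b * filter[1] +
+                    (1 << (FILTER_BITS - 1))) >> FILTER_BITS);
+}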
+
+// Apply the filter to the contents of the lower half of a and b
+static INLINE void apply_filter_lo(const __m128i v_a_lo_b,
+                                   const __m128i v_b_lo_b,
+                                   const __m128i v_filter_b,
+                                   __m128i* v_res_w) {
+  const __m128i v_rounding_w = _mm_set1_epi16(1 << (FILTER_BITS - 1));
+  __m128i v_input_b = _mm_unpacklo_epi8(v_a_lo_b, v_b_lo_b);
+  __m128i v_temp0_w = _mm_maddubs_epi16(v_input_b, v_filter_b);
+  *v_res_w = _mm_srai_epi16(_mm_add_epi16(v_temp0_w, v_rounding_w),
+                              FILTER_BITS);
+}
+
+static void sum_and_sse(const __m128i v_a_b, const __m128i v_b_b,
+                        const __m128i v_m_b, __m128i* v_sum_d,
+                        __m128i* v_sse_q) {
+  const __m128i v_zero = _mm_setzero_si128();
+  // Unpack to 16 bits - still containing max 8 bits
+  const __m128i v_a0_w = _mm_unpacklo_epi8(v_a_b, v_zero);
+  const __m128i v_b0_w = _mm_unpacklo_epi8(v_b_b, v_zero);
+  const __m128i v_m0_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+  const __m128i v_a1_w = _mm_unpackhi_epi8(v_a_b, v_zero);
+  const __m128i v_b1_w = _mm_unpackhi_epi8(v_b_b, v_zero);
+  const __m128i v_m1_w = _mm_unpackhi_epi8(v_m_b, v_zero);
+
+  // Difference: [-255, 255]
+  const __m128i v_d0_w = _mm_sub_epi16(v_a0_w, v_b0_w);
+  const __m128i v_d1_w = _mm_sub_epi16(v_a1_w, v_b1_w);
+
+  // Error: [-255, 255] * [0, 64] = [-16320, 16320] => 15 bits incl. sign
+  const __m128i v_e0_w = _mm_mullo_epi16(v_d0_w, v_m0_w);
+  const __m128i v_e0_d = _mm_madd_epi16(v_d0_w, v_m0_w);
+  const __m128i v_e1_w = _mm_mullo_epi16(v_d1_w, v_m1_w);
+  const __m128i v_e1_d = _mm_madd_epi16(v_d1_w, v_m1_w);
+
+  // Squared error - using madd it's max (15 bits * 15 bits) * 2 = 31 bits
+  const __m128i v_se0_d = _mm_madd_epi16(v_e0_w, v_e0_w);
+  const __m128i v_se1_d = _mm_madd_epi16(v_e1_w, v_e1_w);
+
+  // Sum of v_se{0,1}_d - 31 bits + 31 bits = 32 bits
+  const __m128i v_se_d = _mm_add_epi32(v_se0_d, v_se1_d);
+
+  // Unpack the squared error to 64 bits
+  const __m128i v_se_lo_q = _mm_unpacklo_epi32(v_se_d, v_zero);
+  const __m128i v_se_hi_q = _mm_unpackhi_epi32(v_se_d, v_zero);
+
+  // Accumulate
+  *v_sum_d = _mm_add_epi32(*v_sum_d, v_e0_d);
+  *v_sum_d = _mm_add_epi32(*v_sum_d, v_e1_d);
+  *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_lo_q);
+  *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_hi_q);
+}
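+
+// Worked worst-case bounds for the above (assuming 8-bit pixels and mask
+// values in [0, 64]): |d| * m <= 255 * 64 = 16320, and 16320 * 16320 =
+// 266342400 < 2^28, so the two products summed by each madd lane stay
+// below 2^29 - comfortably inside a 32-bit lane before the result is
+// widened into the 64-bit accumulator.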
+
+// Functions for width (W) >= 16
+unsigned int vpx_masked_subpel_varWxH_xzero(
+        const uint8_t *src, int src_stride, int yoffset,
+        const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+        unsigned int *sse, int w, int h, filter_fn_t filter_fn) {
+  int i, j;
+  __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+  const __m128i v_filter_b = _mm_set1_epi16((
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
+  for (j = 0; j < w; j += 16) {
+    // Load the first row ready
+    v_src0_b = _mm_loadu_si128((const __m128i*)(src + j));
+    // Process 2 rows at a time
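+    // (The rows ping-pong through v_src0_b / v_src1_b so each row is
+    // loaded once but filtered twice: as the bottom input of one output
+    // row and the top input of the next.)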
+    for (i = 0; i < h; i += 2) {
+      // Load the next row and apply the filter
+      v_src1_b = _mm_loadu_si128((const __m128i*)(src + j + src_stride));
+      v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b);
+      // Load the dst and msk for the variance calculation
+      v_dst_b = _mm_loadu_si128((const __m128i*)(dst + j));
+      v_msk_b = _mm_loadu_si128((const __m128i*)(msk + j));
+      sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+
+      // Load the next row and apply the filter
+      v_src0_b = _mm_loadu_si128((const __m128i*)(src + j + src_stride * 2));
+      v_res_b = filter_fn(v_src1_b, v_src0_b, v_filter_b);
+      // Load the dst and msk for the variance calculation
+      v_dst_b = _mm_loadu_si128((const __m128i*)(dst + j + dst_stride));
+      v_msk_b = _mm_loadu_si128((const __m128i*)(msk + j + msk_stride));
+      sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+      // Move onto the next block of rows
+      src += src_stride * 2;
+      dst += dst_stride * 2;
+      msk += msk_stride * 2;
+    }
+    // Reset to the top of the block
+    src -= src_stride * h;
+    dst -= dst_stride * h;
+    msk -= msk_stride * h;
+  }
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
+}
+unsigned int vpx_masked_subpel_varWxH_yzero(
+        const uint8_t *src, int src_stride, int xoffset,
+        const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+        unsigned int *sse, int w, int h, filter_fn_t filter_fn) {
+  int i, j;
+  __m128i v_src0_b, v_src1_b, v_res_b, v_dst_b, v_msk_b;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+  const __m128i v_filter_b = _mm_set1_epi16((
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j += 16) {
+      // Load this row and the same row offset by one pixel & filter them
+      v_src0_b = _mm_loadu_si128((const __m128i*)(src + j));
+      v_src1_b = _mm_loadu_si128((const __m128i*)(src + j + 1));
+      v_res_b = filter_fn(v_src0_b, v_src1_b, v_filter_b);
+
+      // Load the dst and msk for the variance calculation
+      v_dst_b = _mm_loadu_si128((const __m128i*)(dst + j));
+      v_msk_b = _mm_loadu_si128((const __m128i*)(msk + j));
+      sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+    }
+    src += src_stride;
+    dst += dst_stride;
+    msk += msk_stride;
+  }
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
+}
+unsigned int vpx_masked_subpel_varWxH_xnonzero_ynonzero(
+        const uint8_t *src, int src_stride, int xoffset, int yoffset,
+        const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+        unsigned int *sse, int w, int h, filter_fn_t xfilter_fn,
+        filter_fn_t yfilter_fn) {
+  int i, j;
+  __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b;
+  __m128i v_filtered0_b, v_filtered1_b, v_res_b, v_dst_b, v_msk_b;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+  const __m128i v_filterx_b = _mm_set1_epi16((
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
+  const __m128i v_filtery_b = _mm_set1_epi16((
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  for (j = 0; j < w; j += 16) {
+    // Load the first row ready
+    v_src0_b = _mm_loadu_si128((const __m128i*)(src + j));
+    v_src1_b = _mm_loadu_si128((const __m128i*)(src + j + 1));
+    v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b);
+    // Process 2 rows at a time
+    for (i = 0; i < h; i += 2) {
+      // Load the next row & apply the filter
+      v_src2_b = _mm_loadu_si128((const __m128i*)(src + src_stride + j));
+      v_src3_b = _mm_loadu_si128((const __m128i*)(src + src_stride + j + 1));
+      v_filtered1_b = xfilter_fn(v_src2_b, v_src3_b, v_filterx_b);
+      // Load the dst and msk for the variance calculation
+      v_dst_b = _mm_loadu_si128((const __m128i*)(dst + j));
+      v_msk_b = _mm_loadu_si128((const __m128i*)(msk + j));
+      // Complete the calculation for this row and add it to the running total
+      v_res_b = yfilter_fn(v_filtered0_b, v_filtered1_b, v_filtery_b);
+      sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+
+      // Load the next row & apply the filter
+      v_src0_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 2 + j));
+      v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 2 +
+                                                  j + 1));
+      v_filtered0_b = xfilter_fn(v_src0_b, v_src1_b, v_filterx_b);
+      // Load the dst and msk for the variance calculation
+      v_dst_b = _mm_loadu_si128((const __m128i*)(dst + dst_stride + j));
+      v_msk_b = _mm_loadu_si128((const __m128i*)(msk + msk_stride + j));
+      // Complete the calculation for this row and add it to the running total
+      v_res_b = yfilter_fn(v_filtered1_b, v_filtered0_b, v_filtery_b);
+      sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+      // Move onto the next block of rows
+      src += src_stride * 2;
+      dst += dst_stride * 2;
+      msk += msk_stride * 2;
+    }
+    // Reset to the top of the block
+    src -= src_stride * h;
+    dst -= dst_stride * h;
+    msk -= msk_stride * h;
+  }
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, w, h);
+}
+
+// Note the row load order: xmm[127:96] = row 1, xmm[95:64] = row 2,
+// xmm[63:32] = row 3, xmm[31:0] = row 4
+unsigned int vpx_masked_subpel_var4xH_xzero(
+        const uint8_t *src, int src_stride, int yoffset,
+        const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+        unsigned int *sse, int h) {
+  int i;
+  __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered1_w, v_filtered2_w;
+  __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b;
+  __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+  __m128i v_filter_b = _mm_set1_epi16((
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
+  // Load the first row of src data ready
+  v_src0_b = _mm_loadl_epi64((const __m128i*)src);
+  for (i = 0; i < h; i += 4) {
+    // Load the rest of the source data for these rows
+    v_src1_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 1));
+    v_src1_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b);
+    v_src2_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 2));
+    v_src3_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 3));
+    v_src3_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b);
+    v_src0_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 4));
+    // Load the dst data
+    v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 0));
+    v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 1));
+    v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b);
+    v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 2));
+    v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 3));
+    v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b);
+    v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b);
+    // Load the mask data
+    v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 0));
+    v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 1));
+    v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b);
+    v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 2));
+    v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 3));
+    v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
+    v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
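+    // Each xmm register now packs four 4-pixel rows, newest in the low
+    // dword; shifting left by 4 bytes and OR-ing in the row below aligns
+    // every row with its successor, so one filter call covers all four
+    // output rows.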
+    // Apply the y filter
+    if (yoffset == HALF_PIXEL_OFFSET) {
+      v_src1_b = _mm_unpacklo_epi64(v_src3_b, v_src1_b);
+      v_src2_b = _mm_or_si128(_mm_slli_si128(v_src1_b, 4),
+            _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0)));
+      v_res_b = _mm_avg_epu8(v_src1_b, v_src2_b);
+    } else {
+      v_src2_b = _mm_or_si128(_mm_slli_si128(v_src1_b, 4),
+            _mm_and_si128(v_src2_b, _mm_setr_epi32(-1, 0, 0, 0)));
+      apply_filter_lo(v_src1_b, v_src2_b, v_filter_b, &v_filtered1_w);
+      v_src2_b = _mm_or_si128(_mm_slli_si128(v_src3_b, 4),
+            _mm_and_si128(v_src0_b, _mm_setr_epi32(-1, 0, 0, 0)));
+      apply_filter_lo(v_src3_b, v_src2_b, v_filter_b, &v_filtered2_w);
+      v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered1_w);
+    }
+    // Compute the sum and SSE
+    sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q);
+    // Move onto the next set of rows
+    src += src_stride * 4;
+    dst += dst_stride * 4;
+    msk += msk_stride * 4;
+  }
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+// Note the row load order: xmm[127:64] = row 1, xmm[63:0] = row 2
+unsigned int vpx_masked_subpel_var8xH_xzero(
+        const uint8_t *src, int src_stride, int yoffset,
+        const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+        unsigned int *sse, int h) {
+  int i;
+  __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_res_b;
+  __m128i v_dst_b = _mm_setzero_si128();
+  __m128i v_msk_b = _mm_setzero_si128();
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+  __m128i v_filter_b = _mm_set1_epi16((
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
+  // Load the first row of src data ready
+  v_src0_b = _mm_loadl_epi64((const __m128i*)src);
+  for (i = 0; i < h; i += 2) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
+      // Load the rest of the source data for these rows
+      v_src1_b = _mm_or_si128(
+            _mm_slli_si128(v_src0_b, 8),
+            _mm_loadl_epi64((const __m128i*)(src + src_stride * 1)));
+      v_src0_b = _mm_or_si128(
+            _mm_slli_si128(v_src1_b, 8),
+            _mm_loadl_epi64((const __m128i*)(src + src_stride * 2)));
+      // Apply the y filter
+      v_res_b = _mm_avg_epu8(v_src1_b, v_src0_b);
+    } else {
+      // Load the data and apply the y filter
+      v_src1_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 1));
+      apply_filter_lo(v_src0_b, v_src1_b, v_filter_b, &v_filtered0_w);
+      v_src0_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 2));
+      apply_filter_lo(v_src1_b, v_src0_b, v_filter_b, &v_filtered1_w);
+      v_res_b = _mm_packus_epi16(v_filtered1_w, v_filtered0_w);
+    }
+    // Load the dst data
+    v_dst_b = _mm_unpacklo_epi64(
+            _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 1)),
+            _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 0)));
+    // Load the mask data
+    v_msk_b = _mm_unpacklo_epi64(
+            _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 1)),
+            _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 0)));
+    // Compute the sum and SSE
+    sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+    // Move onto the next set of rows
+    src += src_stride * 2;
+    dst += dst_stride * 2;
+    msk += msk_stride * 2;
+  }
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
+}
+
+// Note the row load order: xmm[127:96] = row 1, xmm[95:64] = row 2,
+// xmm[63:32] = row 3, xmm[31:0] = row 4
+unsigned int vpx_masked_subpel_var4xH_yzero(
+        const uint8_t *src, int src_stride, int xoffset,
+        const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+        unsigned int *sse, int h) {
+  int i;
+  __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w;
+  __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b;
+  __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b;
+  __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_res_b;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+  __m128i v_filter_b = _mm_set1_epi16((
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  for (i = 0; i < h; i += 4) {
+    // Load the src data
+    v_src0_b = _mm_loadl_epi64((const __m128i*)src);
+    v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+    v_src1_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 1));
+    v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b);
+    v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+    v_src2_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 2));
+    v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b);
+    v_src2_shift_b = _mm_srli_si128(v_src2_b, 1);
+    v_src3_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 3));
+    v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b);
+    v_src3_shift_b = _mm_srli_si128(v_src3_b, 1);
+    v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b);
+    // Load the dst data
+    v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 0));
+    v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 1));
+    v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b);
+    v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 2));
+    v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 3));
+    v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b);
+    v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b);
+    // Load the mask data
+    v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 0));
+    v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 1));
+    v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b);
+    v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 2));
+    v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 3));
+    v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
+    v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
+    // Apply the x filter
+    if (xoffset == HALF_PIXEL_OFFSET) {
+      v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b);
+      v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b);
+      v_res_b = _mm_avg_epu8(v_src0_b, v_src0_shift_b);
+    } else {
+      apply_filter_lo(v_src0_b, v_src0_shift_b, v_filter_b, &v_filtered0_w);
+      apply_filter_lo(v_src2_b, v_src2_shift_b, v_filter_b, &v_filtered2_w);
+      v_res_b = _mm_packus_epi16(v_filtered2_w, v_filtered0_w);
+    }
+    // Compute the sum and SSE
+    sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q);
+    // Move onto the next set of rows
+    src += src_stride * 4;
+    dst += dst_stride * 4;
+    msk += msk_stride * 4;
+  }
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+unsigned int vpx_masked_subpel_var8xH_yzero(
+        const uint8_t *src, int src_stride, int xoffset,
+        const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+        unsigned int *sse, int h) {
+  int i;
+  __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w;
+  __m128i v_src0_shift_b, v_src1_shift_b, v_res_b, v_dst_b, v_msk_b;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+  __m128i v_filter_b = _mm_set1_epi16((
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  for (i = 0; i < h; i += 2) {
+    // Load the src data
+    v_src0_b = _mm_loadu_si128((const __m128i*)(src));
+    v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+    v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride));
+    v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+    // Apply the x filter
+    if (xoffset == HALF_PIXEL_OFFSET) {
+      v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
+      v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
+      v_res_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
+    } else {
+      apply_filter_lo(v_src0_b, v_src0_shift_b, v_filter_b, &v_filtered0_w);
+      apply_filter_lo(v_src1_b, v_src1_shift_b, v_filter_b, &v_filtered1_w);
+      v_res_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
+    }
+    // Load the dst data
+    v_dst_b = _mm_unpacklo_epi64(
+            _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 0)),
+            _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 1)));
+    // Load the mask data
+    v_msk_b = _mm_unpacklo_epi64(
+            _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 0)),
+            _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 1)));
+    // Compute the sum and SSE
+    sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+    // Move onto the next set of rows
+    src += src_stride * 2;
+    dst += dst_stride * 2;
+    msk += msk_stride * 2;
+  }
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
+}
+
+// Note the row load order: xmm[127:96] = row 1, xmm[95:64] = row 2,
+// xmm[63:32] = row 3, xmm[31:0] = row 4
+unsigned int vpx_masked_subpel_var4xH_xnonzero_ynonzero(
+        const uint8_t *src, int src_stride, int xoffset, int yoffset,
+        const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+        unsigned int *sse, int h) {
+  int i;
+  __m128i v_src0_b, v_src1_b, v_src2_b, v_src3_b, v_filtered0_w, v_filtered2_w;
+  __m128i v_src0_shift_b, v_src1_shift_b, v_src2_shift_b, v_src3_shift_b;
+  __m128i v_dst0_b, v_dst1_b, v_dst2_b, v_dst3_b, v_temp_b;
+  __m128i v_msk0_b, v_msk1_b, v_msk2_b, v_msk3_b, v_extra_row_b, v_res_b;
+  __m128i v_xres_b[2];
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+  __m128i v_filterx_b = _mm_set1_epi16((
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
+  __m128i v_filtery_b = _mm_set1_epi16((
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
+  for (i = 0; i < h; i += 4) {
+    // Load the src data
+    v_src0_b = _mm_loadl_epi64((const __m128i*)src);
+    v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+    v_src1_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 1));
+    v_src0_b = _mm_unpacklo_epi32(v_src1_b, v_src0_b);
+    v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+    v_src2_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 2));
+    v_src0_shift_b = _mm_unpacklo_epi32(v_src1_shift_b, v_src0_shift_b);
+    v_src2_shift_b = _mm_srli_si128(v_src2_b, 1);
+    v_src3_b = _mm_loadl_epi64((const __m128i*)(src + src_stride * 3));
+    v_src2_b = _mm_unpacklo_epi32(v_src3_b, v_src2_b);
+    v_src3_shift_b = _mm_srli_si128(v_src3_b, 1);
+    v_src2_shift_b = _mm_unpacklo_epi32(v_src3_shift_b, v_src2_shift_b);
+    // Apply the x filter
+    if (xoffset == HALF_PIXEL_OFFSET) {
+      v_src0_b = _mm_unpacklo_epi64(v_src2_b, v_src0_b);
+      v_src0_shift_b = _mm_unpacklo_epi64(v_src2_shift_b, v_src0_shift_b);
+      v_xres_b[i == 0 ? 0 : 1] = _mm_avg_epu8(v_src0_b, v_src0_shift_b);
+    } else {
+      apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
+      apply_filter_lo(v_src2_b, v_src2_shift_b, v_filterx_b, &v_filtered2_w);
+      v_xres_b[i == 0 ? 0 : 1] = _mm_packus_epi16(v_filtered2_w, v_filtered0_w);
+    }
+    // Move onto the next set of rows
+    src += src_stride * 4;
+  }
+  // Load one more row to be used in the y filter
+  v_src0_b = _mm_loadl_epi64((const __m128i*)src);
+  v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+  // Apply the x filter
+  if (xoffset == HALF_PIXEL_OFFSET) {
+    v_extra_row_b = _mm_and_si128(
+            _mm_avg_epu8(v_src0_b, v_src0_shift_b),
+            _mm_setr_epi32(-1, 0, 0, 0));
+  } else {
+    apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
+    v_extra_row_b = _mm_and_si128(
+            _mm_packus_epi16(v_filtered0_w, _mm_setzero_si128()),
+            _mm_setr_epi32(-1, 0, 0, 0));
+  }
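+  // v_extra_row_b now holds the x-filtered row just below the block,
+  // masked to the low dword; the y filter needs it to produce the last
+  // output row.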
+
+  for (i = 0; i < h; i += 4) {
+    if (h == 8 && i == 0) {
+      v_temp_b = _mm_or_si128(_mm_slli_si128(v_xres_b[0], 4),
+                              _mm_srli_si128(v_xres_b[1], 12));
+    } else {
+      v_temp_b = _mm_or_si128(_mm_slli_si128(v_xres_b[i == 0 ? 0 : 1], 4),
+                              v_extra_row_b);
+    }
+    // Apply the y filter
+    if (yoffset == HALF_PIXEL_OFFSET) {
+      v_res_b = _mm_avg_epu8(v_xres_b[i == 0 ? 0 : 1], v_temp_b);
+    } else {
+      v_res_b = apply_filter(v_xres_b[i == 0 ? 0 : 1], v_temp_b, v_filtery_b);
+    }
+
+    // Load the dst data
+    v_dst0_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 0));
+    v_dst1_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 1));
+    v_dst0_b = _mm_unpacklo_epi32(v_dst1_b, v_dst0_b);
+    v_dst2_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 2));
+    v_dst3_b = _mm_cvtsi32_si128(*(const uint32_t*)(dst + dst_stride * 3));
+    v_dst2_b = _mm_unpacklo_epi32(v_dst3_b, v_dst2_b);
+    v_dst0_b = _mm_unpacklo_epi64(v_dst2_b, v_dst0_b);
+    // Load the mask data
+    v_msk0_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 0));
+    v_msk1_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 1));
+    v_msk0_b = _mm_unpacklo_epi32(v_msk1_b, v_msk0_b);
+    v_msk2_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 2));
+    v_msk3_b = _mm_cvtsi32_si128(*(const uint32_t*)(msk + msk_stride * 3));
+    v_msk2_b = _mm_unpacklo_epi32(v_msk3_b, v_msk2_b);
+    v_msk0_b = _mm_unpacklo_epi64(v_msk2_b, v_msk0_b);
+    // Compute the sum and SSE
+    sum_and_sse(v_res_b, v_dst0_b, v_msk0_b, &v_sum_d, &v_sse_q);
+    // Move onto the next set of rows
+    dst += dst_stride * 4;
+    msk += msk_stride * 4;
+  }
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+unsigned int vpx_masked_subpel_var8xH_xnonzero_ynonzero(
+        const uint8_t *src, int src_stride, int xoffset, int yoffset,
+        const uint8_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+        unsigned int *sse, int h) {
+  int i;
+  __m128i v_src0_b, v_src1_b, v_filtered0_w, v_filtered1_w, v_dst_b, v_msk_b;
+  __m128i v_src0_shift_b, v_src1_shift_b;
+  __m128i v_xres0_b, v_xres1_b, v_res_b, v_temp_b;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+  __m128i v_filterx_b = _mm_set1_epi16((
+        bilinear_filters_2t[xoffset][1] << 8) +
+        bilinear_filters_2t[xoffset][0]);
+  __m128i v_filtery_b = _mm_set1_epi16((
+        bilinear_filters_2t[yoffset][1] << 8) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
+  // Load the first block of src data
+  v_src0_b = _mm_loadu_si128((const __m128i*)(src));
+  v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+  v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride));
+  v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+  // Apply the x filter
+  if (xoffset == HALF_PIXEL_OFFSET) {
+    v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
+    v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
+    v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
+  } else {
+    apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
+    apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w);
+    v_xres0_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
+  }
+  for (i = 0; i < h; i += 4) {
+    // Load the next block of src data
+    v_src0_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 2));
+    v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+    v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 3));
+    v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+    // Apply the x filter
+    if (xoffset == HALF_PIXEL_OFFSET) {
+      v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
+      v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
+      v_xres1_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
+    } else {
+      apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
+      apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w);
+      v_xres1_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
+    }
+    // Apply the y filter to the previous block
+    v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres0_b, 8),
+                            _mm_slli_si128(v_xres1_b, 8));
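+    // v_xres0_b packs x-filtered rows n (low half) and n+1 (high half),
+    // v_xres1_b rows n+2 and n+3; the shift/OR forms {n+1, n+2} so the
+    // y filter pairs every row with the one below it.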
+    if (yoffset == HALF_PIXEL_OFFSET) {
+      v_res_b = _mm_avg_epu8(v_xres0_b, v_temp_b);
+    } else {
+      v_res_b = apply_filter(v_xres0_b, v_temp_b, v_filtery_b);
+    }
+    // Load the dst data
+    v_dst_b = _mm_unpacklo_epi64(
+            _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
+            _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
+    // Load the mask data
+    v_msk_b = _mm_unpacklo_epi64(
+            _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
+            _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
+    // Compute the sum and SSE
+    sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+
+    // Load the next block of src data
+    v_src0_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 4));
+    v_src0_shift_b = _mm_srli_si128(v_src0_b, 1);
+    v_src1_b = _mm_loadu_si128((const __m128i*)(src + src_stride * 5));
+    v_src1_shift_b = _mm_srli_si128(v_src1_b, 1);
+    // Apply the x filter
+    if (xoffset == HALF_PIXEL_OFFSET) {
+      v_src1_b = _mm_unpacklo_epi64(v_src0_b, v_src1_b);
+      v_src1_shift_b = _mm_unpacklo_epi64(v_src0_shift_b, v_src1_shift_b);
+      v_xres0_b = _mm_avg_epu8(v_src1_b, v_src1_shift_b);
+    } else {
+      apply_filter_lo(v_src0_b, v_src0_shift_b, v_filterx_b, &v_filtered0_w);
+      apply_filter_lo(v_src1_b, v_src1_shift_b, v_filterx_b, &v_filtered1_w);
+      v_xres0_b = _mm_packus_epi16(v_filtered0_w, v_filtered1_w);
+    }
+    // Apply the y filter to the previous block
+    v_temp_b = _mm_or_si128(_mm_srli_si128(v_xres1_b, 8),
+                            _mm_slli_si128(v_xres0_b, 8));
+    if (yoffset == HALF_PIXEL_OFFSET) {
+      v_res_b = _mm_avg_epu8(v_xres1_b, v_temp_b);
+    } else {
+      v_res_b = apply_filter(v_xres1_b, v_temp_b, v_filtery_b);
+    }
+    // Load the dst data
+    v_dst_b = _mm_unpacklo_epi64(
+            _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)),
+            _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3)));
+    // Load the mask data
+    v_msk_b = _mm_unpacklo_epi64(
+            _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)),
+            _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3)));
+    // Compute the sum and SSE
+    sum_and_sse(v_res_b, v_dst_b, v_msk_b, &v_sum_d, &v_sse_q);
+    // Move onto the next set of rows
+    src += src_stride * 4;
+    dst += dst_stride * 4;
+    msk += msk_stride * 4;
+  }
+  return calc_masked_variance(v_sum_d, v_sse_q, sse, 8, h);
+}
+
+// For W >= 16
+#define MASK_SUBPIX_VAR_LARGE(W, H)                                            \
+unsigned int vpx_masked_sub_pixel_variance##W##x##H##_ssse3(                   \
+        const uint8_t *src, int src_stride,                                    \
+        int xoffset, int yoffset,                                              \
+        const uint8_t *dst, int dst_stride,                                    \
+        const uint8_t *msk, int msk_stride,                                    \
+        unsigned int *sse) {                                                   \
+  assert(W % 16 == 0);                                                         \
+  if (xoffset == 0) {                                                          \
+    if (yoffset == 0)                                                          \
+      return vpx_masked_variance##W##x##H##_ssse3(src, src_stride,             \
+                                                  dst, dst_stride,             \
+                                                  msk, msk_stride, sse);       \
+    else if (yoffset == HALF_PIXEL_OFFSET)                                     \
+      return vpx_masked_subpel_varWxH_xzero(src, src_stride,                   \
+                                            HALF_PIXEL_OFFSET,                 \
+                                            dst, dst_stride, msk, msk_stride,  \
+                                            sse, W, H, apply_filter_avg);      \
+    else                                                                       \
+      return vpx_masked_subpel_varWxH_xzero(src, src_stride,                   \
+                                            yoffset,                           \
+                                            dst, dst_stride, msk, msk_stride,  \
+                                            sse, W, H, apply_filter);          \
+  } else if (yoffset == 0) {                                                   \
+    if (xoffset == HALF_PIXEL_OFFSET)                                          \
+      return vpx_masked_subpel_varWxH_yzero(src, src_stride,                   \
+                                            HALF_PIXEL_OFFSET,                 \
+                                            dst, dst_stride, msk, msk_stride,  \
+                                            sse, W, H, apply_filter_avg);      \
+    else                                                                       \
+      return vpx_masked_subpel_varWxH_yzero(src, src_stride,                   \
+                                            xoffset,                           \
+                                            dst, dst_stride, msk, msk_stride,  \
+                                            sse, W, H, apply_filter);          \
+  } else if (xoffset == HALF_PIXEL_OFFSET) {                                   \
+    if (yoffset == HALF_PIXEL_OFFSET)                                          \
+      return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride,       \
+              HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET, dst, dst_stride, msk,      \
+              msk_stride, sse, W, H, apply_filter_avg, apply_filter_avg);      \
+    else                                                                       \
+      return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride,       \
+              HALF_PIXEL_OFFSET, yoffset, dst, dst_stride, msk,                \
+              msk_stride, sse, W, H, apply_filter_avg, apply_filter);          \
+  } else {                                                                     \
+    if (yoffset == HALF_PIXEL_OFFSET)                                          \
+      return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride,       \
+              xoffset, HALF_PIXEL_OFFSET, dst, dst_stride, msk,                \
+              msk_stride, sse, W, H, apply_filter, apply_filter_avg);          \
+    else                                                                       \
+      return vpx_masked_subpel_varWxH_xnonzero_ynonzero(src, src_stride,       \
+              xoffset, yoffset, dst, dst_stride, msk,                          \
+              msk_stride, sse, W, H, apply_filter, apply_filter);              \
+  }                                                                            \
+}
+
+// For W < 16
+#define MASK_SUBPIX_VAR_SMALL(W, H)                                            \
+unsigned int vpx_masked_sub_pixel_variance##W##x##H##_ssse3(                   \
+        const uint8_t *src, int src_stride,                                    \
+        int xoffset, int yoffset,                                              \
+        const uint8_t *dst, int dst_stride,                                    \
+        const uint8_t *msk, int msk_stride,                                    \
+        unsigned int *sse) {                                                   \
+  assert(W == 4 || W == 8);                                                    \
+  if (xoffset == 0 && yoffset == 0)                                            \
+    return vpx_masked_variance##W##x##H##_ssse3(src, src_stride,               \
+                                                dst, dst_stride,               \
+                                                msk, msk_stride, sse);         \
+  else if (xoffset == 0)                                                       \
+    return vpx_masked_subpel_var##W##xH_xzero(src, src_stride, yoffset,        \
+                                              dst, dst_stride,                 \
+                                              msk, msk_stride, sse, H);        \
+  else if (yoffset == 0)                                                       \
+    return vpx_masked_subpel_var##W##xH_yzero(src, src_stride, xoffset,        \
+                                              dst, dst_stride,                 \
+                                              msk, msk_stride, sse, H);        \
+  else                                                                         \
+    return vpx_masked_subpel_var##W##xH_xnonzero_ynonzero(                     \
+          src, src_stride, xoffset, yoffset, dst, dst_stride,                  \
+          msk, msk_stride, sse, H);                                            \
+}
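+
+// The two macros above expand to one dispatch function per block size.
+// A hypothetical call (buffers, strides and offsets are placeholder
+// names) might look like:
+//
+//   unsigned int sse;
+//   unsigned int var = vpx_masked_sub_pixel_variance16x16_ssse3(
+//       src, src_stride, 2 /* xoffset */, 5 /* yoffset */,
+//       dst, dst_stride, msk, msk_stride, &sse);
+//
+// with xoffset and yoffset in [0, BIL_SUBPEL_SHIFTS) selecting the
+// bilinear taps.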
+
+MASK_SUBPIX_VAR_SMALL(4, 4)
+MASK_SUBPIX_VAR_SMALL(4, 8)
+MASK_SUBPIX_VAR_SMALL(8, 4)
+MASK_SUBPIX_VAR_SMALL(8, 8)
+MASK_SUBPIX_VAR_SMALL(8, 16)
+MASK_SUBPIX_VAR_LARGE(16, 8)
+MASK_SUBPIX_VAR_LARGE(16, 16)
+MASK_SUBPIX_VAR_LARGE(16, 32)
+MASK_SUBPIX_VAR_LARGE(32, 16)
+MASK_SUBPIX_VAR_LARGE(32, 32)
+MASK_SUBPIX_VAR_LARGE(32, 64)
+MASK_SUBPIX_VAR_LARGE(64, 32)
+MASK_SUBPIX_VAR_LARGE(64, 64)
+#if CONFIG_EXT_PARTITION
+MASK_SUBPIX_VAR_LARGE(64, 128)
+MASK_SUBPIX_VAR_LARGE(128, 64)
+MASK_SUBPIX_VAR_LARGE(128, 128)
+#endif  // CONFIG_EXT_PARTITION
+
+#if CONFIG_VP9_HIGHBITDEPTH
+typedef uint32_t (*highbd_calc_masked_var_t)(__m128i v_sum_d, __m128i v_sse_q,
+                                             uint32_t *sse,
+                                             const int w, const int h);
+typedef unsigned int (*highbd_variance_fn_t)(
+                      const uint8_t *a8, int a_stride,
+                      const uint8_t *b8, int b_stride,
+                      const uint8_t *m, int m_stride,
+                      unsigned int *sse);
+typedef __m128i (*highbd_filter_fn_t)(__m128i v_a_w, __m128i v_b_w,
+                                      __m128i v_filter_w);
+
+static INLINE __m128i highbd_apply_filter_avg(const __m128i v_a_w,
+                                              const __m128i v_b_w,
+                                              const __m128i v_filter_w) {
+  (void) v_filter_w;
+  return _mm_avg_epu16(v_a_w, v_b_w);
+}
+
+static INLINE __m128i highbd_apply_filter(const __m128i v_a_w,
+                                          const __m128i v_b_w,
+                                          const __m128i v_filter_w) {
+  const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+  __m128i v_input_lo_w = _mm_unpacklo_epi16(v_a_w, v_b_w);
+  __m128i v_input_hi_w = _mm_unpackhi_epi16(v_a_w, v_b_w);
+  __m128i v_temp0_d = _mm_madd_epi16(v_input_lo_w, v_filter_w);
+  __m128i v_temp1_d = _mm_madd_epi16(v_input_hi_w, v_filter_w);
+  __m128i v_res_lo_d = _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d),
+                                      FILTER_BITS);
+  __m128i v_res_hi_d = _mm_srai_epi32(_mm_add_epi32(v_temp1_d, v_rounding_d),
+                                      FILTER_BITS);
+  return _mm_packs_epi32(v_res_lo_d, v_res_hi_d);
+}
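+
+// Unlike the 8-bit path, 16-bit pixels cannot go through
+// _mm_maddubs_epi16, so the taps are packed into 32-bit lanes as
+// (f1 << 16) + f0 and _mm_madd_epi16 on the interleaved (a, b) pairs
+// computes a * f0 + b * f1 with 32-bit headroom.
+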
+// Apply the filter to the contents of the lower half of a and b
+static INLINE void highbd_apply_filter_lo(const __m128i v_a_lo_w,
+                                          const __m128i v_b_lo_w,
+                                          const __m128i v_filter_w,
+                                          __m128i* v_res_d) {
+  const __m128i v_rounding_d = _mm_set1_epi32(1 << (FILTER_BITS - 1));
+  __m128i v_input_w = _mm_unpacklo_epi16(v_a_lo_w, v_b_lo_w);
+  __m128i v_temp0_d = _mm_madd_epi16(v_input_w, v_filter_w);
+  *v_res_d = _mm_srai_epi32(_mm_add_epi32(v_temp0_d, v_rounding_d),
+                            FILTER_BITS);
+}
+
+static void highbd_sum_and_sse(const __m128i v_a_w, const __m128i v_b_w,
+                               const __m128i v_m_b, __m128i* v_sum_d,
+                               __m128i* v_sse_q) {
+  const __m128i v_zero = _mm_setzero_si128();
+  const __m128i v_m_w = _mm_unpacklo_epi8(v_m_b, v_zero);
+
+  // Difference: [-2^12, 2^12] => 13 bits (incl. sign bit)
+  const __m128i v_d_w = _mm_sub_epi16(v_a_w, v_b_w);
+
+  // Error: [-4095, 4095] * [0, 64], summed in pairs by madd => 19 bits + sign
+  const __m128i v_e_d = _mm_madd_epi16(v_d_w, v_m_w);
+
+  // Squared error - max (18 bits * 18 bits) = 36 bits (no sign bit)
+  const __m128i v_absd_w = _mm_abs_epi16(v_d_w);
+  const __m128i v_dlo_d = _mm_unpacklo_epi16(v_absd_w, v_zero);
+  const __m128i v_mlo_d = _mm_unpacklo_epi16(v_m_w, v_zero);
+  const __m128i v_elo_d = _mm_madd_epi16(v_dlo_d, v_mlo_d);
+  const __m128i v_dhi_d = _mm_unpackhi_epi16(v_absd_w, v_zero);
+  const __m128i v_mhi_d = _mm_unpackhi_epi16(v_m_w, v_zero);
+  const __m128i v_ehi_d = _mm_madd_epi16(v_dhi_d, v_mhi_d);
+  // Square and sum the errors -> 36 bits * 4 = 38 bits
+  __m128i v_se0_q, v_se1_q, v_se2_q, v_se3_q, v_se_q, v_elo1_d, v_ehi3_d;
+  v_se0_q = _mm_mul_epu32(v_elo_d, v_elo_d);
+  v_elo1_d = _mm_srli_si128(v_elo_d, 4);
+  v_se1_q = _mm_mul_epu32(v_elo1_d, v_elo1_d);
+  v_se0_q = _mm_add_epi64(v_se0_q, v_se1_q);
+  v_se2_q = _mm_mul_epu32(v_ehi_d, v_ehi_d);
+  v_ehi3_d = _mm_srli_si128(v_ehi_d, 4);
+  v_se3_q = _mm_mul_epu32(v_ehi3_d, v_ehi3_d);
+  v_se1_q = _mm_add_epi64(v_se2_q, v_se3_q);
+  v_se_q = _mm_add_epi64(v_se0_q, v_se1_q);
+
+  // Accumulate
+  *v_sum_d = _mm_add_epi32(*v_sum_d, v_e_d);
+  *v_sse_q = _mm_add_epi64(*v_sse_q, v_se_q);
+}
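+
+// Worked worst-case bounds for the above (a sketch assuming up-to-12-bit
+// pixels and masks in [0, 64]): |d| * m <= 4095 * 64 = 262080 < 2^18, so
+// each squared term is below 2^36 and the four-lane total below 2^38 -
+// hence the _mm_mul_epu32 products are accumulated in 64-bit lanes rather
+// than with a 32-bit madd.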
+
+static INLINE uint32_t highbd_10_calc_masked_variance(__m128i v_sum_d,
+                                                      __m128i v_sse_q,
+                                                      uint32_t* sse,
+                                                      const int w,
+                                                      const int h) {
+  int64_t sum64;
+  uint64_t sse64;
+
+  // Horizontal sum
+  sum64 = hsum_epi32_si32(v_sum_d);
+  sse64 = hsum_epi64_si64(v_sse_q);
+
+  sum64 = (sum64 >= 0) ? sum64 : -sum64;
+
+  // Round
+  sum64 = ROUND_POWER_OF_TWO(sum64, 6);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 12);
+
+  // Normalise
+  sum64 = ROUND_POWER_OF_TWO(sum64, 2);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 4);
+
+  // Store the SSE
+  *sse = (uint32_t)sse64;
+  // Compute the variance
+  return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
+static INLINE uint32_t highbd_12_calc_masked_variance(__m128i v_sum_d,
+                                                      __m128i v_sse_q,
+                                                      uint32_t* sse,
+                                                      const int w,
+                                                      const int h) {
+  int64_t sum64;
+  uint64_t sse64;
+
+  // Horizontal sum
+  sum64 = hsum_epi32_si64(v_sum_d);
+  sse64 = hsum_epi64_si64(v_sse_q);
+
+  sum64 = (sum64 >= 0) ? sum64 : -sum64;
+
+  // Round
+  sum64 = ROUND_POWER_OF_TWO(sum64, 6);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 12);
+
+  // Normalise
+  sum64 = ROUND_POWER_OF_TWO(sum64, 4);
+  sse64 = ROUND_POWER_OF_TWO(sse64, 8);
+
+  // Store the SSE
+  *sse = (uint32_t)sse64;
+  // Compute the variance
+  return *sse - (uint32_t)((sum64 * sum64) / (w * h));
+}
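+
+// In both variants the first ROUND_POWER_OF_TWO undoes the mask scaling
+// (the sum carries a factor of 64 = 2^6 and the SSE a factor of 2^12);
+// the second rescales the 10- or 12-bit values back to the 8-bit domain
+// before the usual variance formula sse - sum^2 / (w * h) is applied.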
+
+// High bit depth functions for width (W) >= 8
+unsigned int vpx_highbd_masked_subpel_varWxH_xzero(
+        const uint16_t *src, int src_stride, int yoffset,
+        const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+        unsigned int *sse, int w, int h, highbd_filter_fn_t filter_fn,
+        highbd_calc_masked_var_t calc_var) {
+  int i, j;
+  __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+  const __m128i v_filter_w = _mm_set1_epi32((
+        bilinear_filters_2t[yoffset][1] << 16) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
+  for (j = 0; j < w; j += 8) {
+    // Load the first row ready
+    v_src0_w = _mm_loadu_si128((const __m128i*)(src + j));
+    // Process 2 rows at a time
+    for (i = 0; i < h; i += 2) {
+      // Load the next row and apply the filter
+      v_src1_w = _mm_loadu_si128((const __m128i*)(src + j + src_stride));
+      v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w);
+      // Load the dst and msk for the variance calculation
+      v_dst_w = _mm_loadu_si128((const __m128i*)(dst + j));
+      v_msk_b = _mm_loadl_epi64((const __m128i*)(msk + j));
+      highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+
+      // Load the next row and apply the filter
+      v_src0_w = _mm_loadu_si128((const __m128i*)(src + j + src_stride * 2));
+      v_res_w = filter_fn(v_src1_w, v_src0_w, v_filter_w);
+      // Load the dst and msk for the variance calculation
+      v_dst_w = _mm_loadu_si128((const __m128i*)(dst + j + dst_stride));
+      v_msk_b = _mm_loadl_epi64((const __m128i*)(msk + j + msk_stride));
+      highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+      // Move onto the next block of rows
+      src += src_stride * 2;
+      dst += dst_stride * 2;
+      msk += msk_stride * 2;
+    }
+    // Reset to the top of the block
+    src -= src_stride * h;
+    dst -= dst_stride * h;
+    msk -= msk_stride * h;
+  }
+  return calc_var(v_sum_d, v_sse_q, sse, w, h);
+}
+unsigned int vpx_highbd_masked_subpel_varWxH_yzero(
+        const uint16_t *src, int src_stride, int xoffset,
+        const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+        unsigned int *sse, int w, int h, highbd_filter_fn_t filter_fn,
+        highbd_calc_masked_var_t calc_var) {
+  int i, j;
+  __m128i v_src0_w, v_src1_w, v_res_w, v_dst_w, v_msk_b;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+  const __m128i v_filter_w = _mm_set1_epi32((
+        bilinear_filters_2t[xoffset][1] << 16) +
+        bilinear_filters_2t[xoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  for (i = 0; i < h; i++) {
+    for (j = 0; j < w; j += 8) {
+      // Load this row and the same row offset by one pixel & filter them
+      v_src0_w = _mm_loadu_si128((const __m128i*)(src + j));
+      v_src1_w = _mm_loadu_si128((const __m128i*)(src + j + 1));
+      v_res_w = filter_fn(v_src0_w, v_src1_w, v_filter_w);
+
+      // Load the dst and msk for the variance calculation
+      v_dst_w = _mm_loadu_si128((const __m128i*)(dst + j));
+      v_msk_b = _mm_loadl_epi64((const __m128i*)(msk + j));
+      highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+    }
+    src += src_stride;
+    dst += dst_stride;
+    msk += msk_stride;
+  }
+  return calc_var(v_sum_d, v_sse_q, sse, w, h);
+}
+
+unsigned int vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero(
+        const uint16_t *src, int src_stride, int xoffset, int yoffset,
+        const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+        unsigned int *sse, int w, int h, highbd_filter_fn_t xfilter_fn,
+        highbd_filter_fn_t yfilter_fn, highbd_calc_masked_var_t calc_var) {
+  int i, j;
+  __m128i v_src0_w, v_src1_w, v_src2_w, v_src3_w;
+  __m128i v_filtered0_w, v_filtered1_w, v_res_w, v_dst_w, v_msk_b;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+  const __m128i v_filterx_w = _mm_set1_epi32((
+        bilinear_filters_2t[xoffset][1] << 16) +
+        bilinear_filters_2t[xoffset][0]);
+  const __m128i v_filtery_w = _mm_set1_epi32((
+        bilinear_filters_2t[yoffset][1] << 16) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
+  for (j = 0; j < w; j += 8) {
+    // Load the first row ready
+    v_src0_w = _mm_loadu_si128((const __m128i*)(src + j));
+    v_src1_w = _mm_loadu_si128((const __m128i*)(src + j + 1));
+    v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w);
+    // Process 2 rows at a time
+    for (i = 0; i < h; i += 2) {
+      // Load the next row & apply the filter
+      v_src2_w = _mm_loadu_si128((const __m128i*)(src + src_stride + j));
+      v_src3_w = _mm_loadu_si128((const __m128i*)(src + src_stride + j + 1));
+      v_filtered1_w = xfilter_fn(v_src2_w, v_src3_w, v_filterx_w);
+      // Load the dst and msk for the variance calculation
+      v_dst_w = _mm_loadu_si128((const __m128i*)(dst + j));
+      v_msk_b = _mm_loadl_epi64((const __m128i*)(msk + j));
+      // Complete the calculation for this row and add it to the running total
+      v_res_w = yfilter_fn(v_filtered0_w, v_filtered1_w, v_filtery_w);
+      highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+
+      // Load the next row & apply the filter
+      v_src0_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 2 + j));
+      v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 2 +
+                                                  j + 1));
+      v_filtered0_w = xfilter_fn(v_src0_w, v_src1_w, v_filterx_w);
+      // Load the dst and msk for the variance calculation
+      v_dst_w = _mm_loadu_si128((const __m128i*)(dst + dst_stride + j));
+      v_msk_b = _mm_loadl_epi64((const __m128i*)(msk + msk_stride + j));
+      // Complete the calculation for this row and add it to the running total
+      v_res_w = yfilter_fn(v_filtered1_w, v_filtered0_w, v_filtery_w);
+      highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+      // Move onto the next block of rows
+      src += src_stride * 2;
+      dst += dst_stride * 2;
+      msk += msk_stride * 2;
+    }
+    // Reset to the top of the block
+    src -= src_stride * h;
+    dst -= dst_stride * h;
+    msk -= msk_stride * h;
+  }
+  return calc_var(v_sum_d, v_sse_q, sse, w, h);
+}
+
+// Note the row load order: xmm[127:64] = row 1, xmm[63:0] = row 2
+unsigned int vpx_highbd_masked_subpel_var4xH_xzero(
+        const uint16_t *src, int src_stride, int yoffset,
+        const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+        unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) {
+  int i;
+  __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_res_w;
+  __m128i v_dst_w, v_msk_b;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+  __m128i v_filter_w = _mm_set1_epi32((
+        bilinear_filters_2t[yoffset][1] << 16) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
+  // Load the first row of src data ready
+  v_src0_w = _mm_loadl_epi64((const __m128i*)src);
+  for (i = 0; i < h; i += 2) {
+    if (yoffset == HALF_PIXEL_OFFSET) {
+      // Load the rest of the source data for these rows
+      v_src1_w = _mm_or_si128(
+            _mm_slli_si128(v_src0_w, 8),
+            _mm_loadl_epi64((const __m128i*)(src + src_stride * 1)));
+      v_src0_w = _mm_or_si128(
+            _mm_slli_si128(v_src1_w, 8),
+            _mm_loadl_epi64((const __m128i*)(src + src_stride * 2)));
+      // Apply the y filter
+      v_res_w = _mm_avg_epu16(v_src1_w, v_src0_w);
+    } else {
+      // Load the data and apply the y filter
+      v_src1_w = _mm_loadl_epi64((const __m128i*)(src + src_stride * 1));
+      highbd_apply_filter_lo(v_src0_w, v_src1_w, v_filter_w, &v_filtered0_d);
+      v_src0_w = _mm_loadl_epi64((const __m128i*)(src + src_stride * 2));
+      highbd_apply_filter_lo(v_src1_w, v_src0_w, v_filter_w, &v_filtered1_d);
+      v_res_w = _mm_packs_epi32(v_filtered1_d, v_filtered0_d);
+    }
+    // Load the dst data
+    v_dst_w = _mm_unpacklo_epi64(
+            _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 1)),
+            _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 0)));
+    // Load the mask data
+    v_msk_b = _mm_unpacklo_epi32(
+            _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 1)),
+            _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 0)));
+    // Compute the sum and SSE
+    highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+    // Move onto the next set of rows
+    src += src_stride * 2;
+    dst += dst_stride * 2;
+    msk += msk_stride * 2;
+  }
+  return calc_var(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+unsigned int vpx_highbd_masked_subpel_var4xH_yzero(
+        const uint16_t *src, int src_stride, int xoffset,
+        const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+        unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) {
+  int i;
+  __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d;
+  __m128i v_src0_shift_w, v_src1_shift_w, v_res_w, v_dst_w, v_msk_b;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+  __m128i v_filter_w = _mm_set1_epi32((
+        bilinear_filters_2t[xoffset][1] << 16) +
+        bilinear_filters_2t[xoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  for (i = 0; i < h; i += 2) {
+    // Load the src data
+    v_src0_w = _mm_loadu_si128((const __m128i*)(src));
+    v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
+    v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride));
+    v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
+    // Apply the x filter
+    if (xoffset == HALF_PIXEL_OFFSET) {
+      v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
+      v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
+      v_res_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
+    } else {
+      highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filter_w,
+                             &v_filtered0_d);
+      highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filter_w,
+                             &v_filtered1_d);
+      v_res_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
+    }
+    // Load the dst data
+    v_dst_w = _mm_unpacklo_epi64(
+            _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 0)),
+            _mm_loadl_epi64((const __m128i*)(dst + dst_stride * 1)));
+    // Load the mask data
+    v_msk_b = _mm_unpacklo_epi32(
+            _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 0)),
+            _mm_loadl_epi64((const __m128i*)(msk + msk_stride * 1)));
+    // Compute the sum and SSE
+    highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+    // Move onto the next set of rows
+    src += src_stride * 2;
+    dst += dst_stride * 2;
+    msk += msk_stride * 2;
+  }
+  return calc_var(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+unsigned int vpx_highbd_masked_subpel_var4xH_xnonzero_ynonzero(
+        const uint16_t *src, int src_stride, int xoffset, int yoffset,
+        const uint16_t *dst, int dst_stride, const uint8_t *msk, int msk_stride,
+        unsigned int *sse, int h, highbd_calc_masked_var_t calc_var) {
+  int i;
+  __m128i v_src0_w, v_src1_w, v_filtered0_d, v_filtered1_d, v_dst_w, v_msk_b;
+  __m128i v_src0_shift_w, v_src1_shift_w;
+  __m128i v_xres0_w, v_xres1_w, v_res_w, v_temp_w;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_q = _mm_setzero_si128();
+  __m128i v_filterx_w = _mm_set1_epi32((
+        bilinear_filters_2t[xoffset][1] << 16) +
+        bilinear_filters_2t[xoffset][0]);
+  __m128i v_filtery_w = _mm_set1_epi32((
+        bilinear_filters_2t[yoffset][1] << 16) +
+        bilinear_filters_2t[yoffset][0]);
+  assert(xoffset < BIL_SUBPEL_SHIFTS);
+  assert(yoffset < BIL_SUBPEL_SHIFTS);
+  // Load the first block of src data
+  v_src0_w = _mm_loadu_si128((const __m128i*)(src));
+  v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
+  v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride));
+  v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
+  // Apply the x filter
+  if (xoffset == HALF_PIXEL_OFFSET) {
+    v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
+    v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
+    v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
+  } else {
+    highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w,
+                           &v_filtered0_d);
+    highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w,
+                           &v_filtered1_d);
+    v_xres0_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
+  }
+  for (i = 0; i < h; i += 4) {
+    // Load the next block of src data
+    v_src0_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 2));
+    v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
+    v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 3));
+    v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
+    // Apply the x filter
+    if (xoffset == HALF_PIXEL_OFFSET) {
+      v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
+      v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
+      v_xres1_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
+    } else {
+      highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w,
+                             &v_filtered0_d);
+      highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w,
+                             &v_filtered1_d);
+      v_xres1_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
+    }
+    // Apply the y filter to the previous block
+    v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres0_w, 8),
+                            _mm_slli_si128(v_xres1_w, 8));
+    if (yoffset == HALF_PIXEL_OFFSET) {
+      v_res_w = _mm_avg_epu16(v_xres0_w, v_temp_w);
+    } else {
+      v_res_w = highbd_apply_filter(v_xres0_w, v_temp_w, v_filtery_w);
+    }
+    // Load the dst data
+    v_dst_w = _mm_unpacklo_epi64(
+            _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 0)),
+            _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 1)));
+    // Load the mask data
+    v_msk_b = _mm_unpacklo_epi32(
+            _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 0)),
+            _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 1)));
+    // Compute the sum and SSE
+    highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+
+    // Load the next block of src data
+    v_src0_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 4));
+    v_src0_shift_w = _mm_srli_si128(v_src0_w, 2);
+    v_src1_w = _mm_loadu_si128((const __m128i*)(src + src_stride * 5));
+    v_src1_shift_w = _mm_srli_si128(v_src1_w, 2);
+    // Apply the x filter
+    if (xoffset == HALF_PIXEL_OFFSET) {
+      v_src1_w = _mm_unpacklo_epi64(v_src0_w, v_src1_w);
+      v_src1_shift_w = _mm_unpacklo_epi64(v_src0_shift_w, v_src1_shift_w);
+      v_xres0_w = _mm_avg_epu16(v_src1_w, v_src1_shift_w);
+    } else {
+      highbd_apply_filter_lo(v_src0_w, v_src0_shift_w, v_filterx_w,
+                             &v_filtered0_d);
+      highbd_apply_filter_lo(v_src1_w, v_src1_shift_w, v_filterx_w,
+                             &v_filtered1_d);
+      v_xres0_w = _mm_packs_epi32(v_filtered0_d, v_filtered1_d);
+    }
+    // Apply the y filter to the previous block
+    v_temp_w = _mm_or_si128(_mm_srli_si128(v_xres1_w, 8),
+                            _mm_slli_si128(v_xres0_w, 8));
+    if (yoffset == HALF_PIXEL_OFFSET) {
+      v_res_w = _mm_avg_epu16(v_xres1_w, v_temp_w);
+    } else {
+      v_res_w = highbd_apply_filter(v_xres1_w, v_temp_w, v_filtery_w);
+    }
+    // Load the dst data
+    v_dst_w = _mm_unpacklo_epi64(
+            _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 2)),
+            _mm_loadl_epi64((const __m128i *)(dst + dst_stride * 3)));
+    // Load the mask data
+    v_msk_b = _mm_unpacklo_epi32(
+            _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 2)),
+            _mm_loadl_epi64((const __m128i *)(msk + msk_stride * 3)));
+    // Compute the sum and SSE
+    highbd_sum_and_sse(v_res_w, v_dst_w, v_msk_b, &v_sum_d, &v_sse_q);
+    // Move onto the next set of rows
+    src += src_stride * 4;
+    dst += dst_stride * 4;
+    msk += msk_stride * 4;
+  }
+  return calc_var(v_sum_d, v_sse_q, sse, 4, h);
+}
+
+// For W >= 8
+#define HIGHBD_MASK_SUBPIX_VAR_LARGE(W, H)                                     \
+unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3(                \
+        const uint8_t *src8, int src_stride,                                   \
+        int xoffset, int yoffset,                                              \
+        const uint8_t *dst8, int dst_stride,                                   \
+        const uint8_t *msk, int msk_stride,                                    \
+        unsigned int *sse,                                                     \
+        highbd_calc_masked_var_t calc_var,                                     \
+        highbd_variance_fn_t full_variance_function) {                         \
+  uint16_t* src = CONVERT_TO_SHORTPTR(src8);                                   \
+  uint16_t* dst = CONVERT_TO_SHORTPTR(dst8);                                   \
+  assert(W % 8 == 0);                                                          \
+  if (xoffset == 0) {                                                          \
+    if (yoffset == 0)                                                          \
+      return full_variance_function(src8, src_stride, dst8, dst_stride,        \
+                                    msk, msk_stride, sse);                     \
+    else if (yoffset == HALF_PIXEL_OFFSET)                                     \
+      return vpx_highbd_masked_subpel_varWxH_xzero(src, src_stride,            \
+                                                   HALF_PIXEL_OFFSET,          \
+                                                   dst, dst_stride,            \
+                                                   msk, msk_stride,            \
+                                                   sse, W, H,                  \
+                                                   highbd_apply_filter_avg,    \
+                                                   calc_var);                  \
+    else                                                                       \
+      return vpx_highbd_masked_subpel_varWxH_xzero(src, src_stride,            \
+                                                   yoffset,                    \
+                                                   dst, dst_stride,            \
+                                                   msk, msk_stride,            \
+                                                   sse, W, H,                  \
+                                                   highbd_apply_filter,        \
+                                                   calc_var);                  \
+  } else if (yoffset == 0) {                                                   \
+    if (xoffset == HALF_PIXEL_OFFSET)                                          \
+      return vpx_highbd_masked_subpel_varWxH_yzero(src, src_stride,            \
+                                                   HALF_PIXEL_OFFSET,          \
+                                                   dst, dst_stride,            \
+                                                   msk, msk_stride,            \
+                                                   sse, W, H,                  \
+                                                   highbd_apply_filter_avg,    \
+                                                   calc_var);                  \
+    else                                                                       \
+      return vpx_highbd_masked_subpel_varWxH_yzero(src, src_stride,            \
+                                                   xoffset,                    \
+                                                   dst, dst_stride,            \
+                                                   msk, msk_stride,            \
+                                                   sse, W, H,                  \
+                                                   highbd_apply_filter,        \
+                                                   calc_var);                  \
+  } else if (xoffset == HALF_PIXEL_OFFSET) {                                   \
+    if (yoffset == HALF_PIXEL_OFFSET)                                          \
+      return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero(                \
+              src, src_stride, HALF_PIXEL_OFFSET, HALF_PIXEL_OFFSET,           \
+              dst, dst_stride, msk, msk_stride, sse, W, H,                     \
+              highbd_apply_filter_avg, highbd_apply_filter_avg, calc_var);     \
+    else                                                                       \
+      return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero(                \
+              src, src_stride, HALF_PIXEL_OFFSET, yoffset, dst, dst_stride,    \
+              msk, msk_stride, sse, W, H, highbd_apply_filter_avg,             \
+              highbd_apply_filter, calc_var);                                  \
+  } else {                                                                     \
+    if (yoffset == HALF_PIXEL_OFFSET)                                          \
+      return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero(                \
+              src, src_stride, xoffset, HALF_PIXEL_OFFSET,                     \
+              dst, dst_stride, msk, msk_stride, sse, W, H,                     \
+              highbd_apply_filter, highbd_apply_filter_avg, calc_var);         \
+    else                                                                       \
+      return vpx_highbd_masked_subpel_varWxH_xnonzero_ynonzero(                \
+              src, src_stride, xoffset, yoffset,                               \
+              dst, dst_stride, msk, msk_stride, sse, W, H,                     \
+              highbd_apply_filter, highbd_apply_filter, calc_var);             \
+  }                                                                            \
+}
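+
+// Note on the dispatch in the macro above: each special case saves real
+// work. A zero offset in either direction skips an entire filter pass, and
+// a half-pel offset replaces the multiply-based two-tap filter with a
+// single average instruction.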
+
+// For W < 8
+#define HIGHBD_MASK_SUBPIX_VAR_SMALL(W, H)                                     \
+unsigned int highbd_masked_sub_pixel_variance##W##x##H##_ssse3(                \
+        const uint8_t *src8, int src_stride,                                   \
+        int xoffset, int yoffset,                                              \
+        const uint8_t *dst8, int dst_stride,                                   \
+        const uint8_t *msk, int msk_stride,                                    \
+        unsigned int *sse,                                                     \
+        highbd_calc_masked_var_t calc_var,                                     \
+        highbd_variance_fn_t full_variance_function) {                         \
+  uint16_t* src = CONVERT_TO_SHORTPTR(src8);                                   \
+  uint16_t* dst = CONVERT_TO_SHORTPTR(dst8);                                   \
+  assert(W == 4);                                                              \
+  if (xoffset == 0 && yoffset == 0)                                            \
+    return full_variance_function(src8, src_stride, dst8, dst_stride,          \
+                                  msk, msk_stride, sse);                       \
+  else if (xoffset == 0)                                                       \
+    return vpx_highbd_masked_subpel_var4xH_xzero(src, src_stride, yoffset,     \
+                                                     dst, dst_stride,          \
+                                                     msk, msk_stride, sse, H,  \
+                                                     calc_var);                \
+  else if (yoffset == 0)                                                       \
+    return vpx_highbd_masked_subpel_var4xH_yzero(src, src_stride, xoffset,     \
+                                                     dst, dst_stride,          \
+                                                     msk, msk_stride, sse, H,  \
+                                                     calc_var);                \
+  else                                                                         \
+    return vpx_highbd_masked_subpel_var4xH_xnonzero_ynonzero(                  \
+          src, src_stride, xoffset, yoffset, dst, dst_stride,                  \
+          msk, msk_stride, sse, H, calc_var);                                  \
+}
+
+#define HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(W, H)                                  \
+unsigned int vpx_highbd_masked_sub_pixel_variance##W##x##H##_ssse3(            \
+        const uint8_t *src8, int src_stride,                                   \
+        int xoffset, int yoffset,                                              \
+        const uint8_t *dst8, int dst_stride,                                   \
+        const uint8_t *msk, int msk_stride,                                    \
+        unsigned int *sse) {                                                   \
+    return highbd_masked_sub_pixel_variance##W##x##H##_ssse3(src8, src_stride, \
+            xoffset, yoffset, dst8, dst_stride, msk, msk_stride, sse,          \
+            calc_masked_variance,                                              \
+            vpx_highbd_masked_variance##W##x##H##_ssse3);                      \
+}                                                                              \
+unsigned int vpx_highbd_10_masked_sub_pixel_variance##W##x##H##_ssse3(         \
+        const uint8_t *src8, int src_stride,                                   \
+        int xoffset, int yoffset,                                              \
+        const uint8_t *dst8, int dst_stride,                                   \
+        const uint8_t *msk, int msk_stride,                                    \
+        unsigned int *sse) {                                                   \
+    return highbd_masked_sub_pixel_variance##W##x##H##_ssse3(src8, src_stride, \
+            xoffset, yoffset, dst8, dst_stride, msk, msk_stride, sse,          \
+            highbd_10_calc_masked_variance,                                    \
+            vpx_highbd_10_masked_variance##W##x##H##_ssse3);                   \
+}                                                                              \
+unsigned int vpx_highbd_12_masked_sub_pixel_variance##W##x##H##_ssse3(         \
+        const uint8_t *src8, int src_stride,                                   \
+        int xoffset, int yoffset,                                              \
+        const uint8_t *dst8, int dst_stride,                                   \
+        const uint8_t *msk, int msk_stride,                                    \
+        unsigned int *sse) {                                                   \
+    return highbd_masked_sub_pixel_variance##W##x##H##_ssse3(src8, src_stride, \
+            xoffset, yoffset, dst8, dst_stride, msk, msk_stride, sse,          \
+            highbd_12_calc_masked_variance,                                    \
+            vpx_highbd_12_masked_variance##W##x##H##_ssse3);                   \
+}
+
+HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 4)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 4)
+HIGHBD_MASK_SUBPIX_VAR_SMALL(4, 8)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(4, 8)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 4)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 4)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 8)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 8)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(8, 16)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(8, 16)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 8)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 8)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 16)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 16)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(16, 32)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(16, 32)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 16)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 16)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 32)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 32)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(32, 64)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(32, 64)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 32)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 32)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 64)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 64)
+#if CONFIG_EXT_PARTITION
+HIGHBD_MASK_SUBPIX_VAR_LARGE(64, 128)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(64, 128)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 64)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 64)
+HIGHBD_MASK_SUBPIX_VAR_LARGE(128, 128)
+HIGHBD_MASK_SUBPIX_VAR_WRAPPERS(128, 128)
+#endif  // CONFIG_EXT_PARTITION
+#endif
diff --git a/vpx_dsp/x86/obmc_sad_sse4.c b/vpx_dsp/x86/obmc_sad_sse4.c
new file mode 100644
index 0000000..e21bb98
--- /dev/null
+++ b/vpx_dsp/x86/obmc_sad_sse4.c
@@ -0,0 +1,267 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/synonyms.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
+                                       const int pre_stride,
+                                       const int32_t *wsrc,
+                                       const int32_t *mask,
+                                       const int height) {
+  const int pre_step = pre_stride - 4;
+  int n = 0;
+  __m128i v_sad_d = _mm_setzero_si128();
+
+  do {
+    const __m128i v_p_b = xx_loadl_32(pre + n);
+    const __m128i v_m_d = xx_load_128(mask + n);
+    const __m128i v_w_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
+
+    // Rounded absolute difference
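+    // (the shift by 12 matches the precision of the OBMC blending weights,
+    // scaling the weighted difference back to pixel units)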
+    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
+
+    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
+
+    n += 4;
+
+    pre += pre_step;  // each iteration covers exactly one 4-pixel row
+  } while (n < 4 * height);
+
+  return xx_hsum_epi32_si32(v_sad_d);
+}
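+
+// For reference, a scalar sketch of what the kernel above computes per
+// pixel (assuming the C fallback follows the same definition):
+//   sad += ROUND_POWER_OF_TWO(abs(wsrc[i] - mask[i] * pre[i]), 12);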
+
+static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre,
+                                        const int pre_stride,
+                                        const int32_t *wsrc,
+                                        const int32_t *mask,
+                                        const int width,
+                                        const int height) {
+  const int pre_step = pre_stride - width;
+  int n = 0;
+  __m128i v_sad_d = _mm_setzero_si128();
+
+  assert(width >= 8);
+  assert(IS_POWER_OF_TWO(width));
+
+  do {
+    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+    const __m128i v_m1_d = xx_load_128(mask + n + 4);
+    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+    const __m128i v_p0_b = xx_loadl_32(pre + n);
+    const __m128i v_m0_d = xx_load_128(mask + n);
+    const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
+    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
+
+    // Rounded absolute difference
+    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
+    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
+
+    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
+    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
+
+    n += 8;
+
+    if (n % width == 0) pre += pre_step;
+  } while (n < width * height);
+
+  return xx_hsum_epi32_si32(v_sad_d);
+}
+
+#define OBMCSADWXH(w, h)                                                      \
+unsigned int vpx_obmc_sad##w##x##h##_sse4_1(const uint8_t *pre,               \
+                                            int pre_stride,                   \
+                                            const int32_t *wsrc,              \
+                                            const int32_t *msk) {             \
+  if (w == 4) {                                                               \
+    return obmc_sad_w4(pre, pre_stride, wsrc, msk, h);                        \
+  } else {                                                                    \
+    return obmc_sad_w8n(pre, pre_stride, wsrc, msk, w, h);                    \
+  }                                                                           \
+}
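+
+// Since w and h are literal constants at every expansion site below, the
+// compiler folds the w == 4 test away and each generated function reduces
+// to a direct call to the appropriate kernel.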
+
+#if CONFIG_EXT_PARTITION
+OBMCSADWXH(128, 128)
+OBMCSADWXH(128, 64)
+OBMCSADWXH(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+OBMCSADWXH(64, 64)
+OBMCSADWXH(64, 32)
+OBMCSADWXH(32, 64)
+OBMCSADWXH(32, 32)
+OBMCSADWXH(32, 16)
+OBMCSADWXH(16, 32)
+OBMCSADWXH(16, 16)
+OBMCSADWXH(16, 8)
+OBMCSADWXH(8, 16)
+OBMCSADWXH(8, 8)
+OBMCSADWXH(8, 4)
+OBMCSADWXH(4, 8)
+OBMCSADWXH(4, 4)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE unsigned int hbd_obmc_sad_w4(const uint8_t *pre8,
+                                           const int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask,
+                                           const int height) {
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  const int pre_step = pre_stride - 4;
+  int n = 0;
+  __m128i v_sad_d = _mm_setzero_si128();
+
+  do {
+    const __m128i v_p_w = xx_loadl_64(pre + n);
+    const __m128i v_m_d = xx_load_128(mask + n);
+    const __m128i v_w_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);
+
+    // Rounded absolute difference
+    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);
+
+    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);
+
+    n += 4;
+
+    pre += pre_step;  // each iteration covers exactly one 4-pixel row
+  } while (n < 4 * height);
+
+  return xx_hsum_epi32_si32(v_sad_d);
+}
+
+static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
+                                            const int pre_stride,
+                                            const int32_t *wsrc,
+                                            const int32_t *mask,
+                                            const int width,
+                                            const int height) {
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  const int pre_step = pre_stride - width;
+  int n = 0;
+  __m128i v_sad_d = _mm_setzero_si128();
+
+  assert(width >= 8);
+  assert(IS_POWER_OF_TWO(width));
+
+  do {
+    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
+    const __m128i v_m1_d = xx_load_128(mask + n + 4);
+    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+    const __m128i v_p0_w = xx_loadl_64(pre + n);
+    const __m128i v_m0_d = xx_load_128(mask + n);
+    const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
+    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
+    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);
+
+    // Rounded absolute difference
+    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
+    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);
+
+    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
+    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);
+
+    n += 8;
+
+    if (n % width == 0) pre += pre_step;
+  } while (n < width * height);
+
+  return xx_hsum_epi32_si32(v_sad_d);
+}
+
+#define HBD_OBMCSADWXH(w, h)                                                  \
+unsigned int vpx_highbd_obmc_sad##w##x##h##_sse4_1(const uint8_t *pre,        \
+                                                   int pre_stride,            \
+                                                   const int32_t *wsrc,       \
+                                                   const int32_t *mask) {     \
+  if (w == 4) {                                                               \
+    return hbd_obmc_sad_w4(pre, pre_stride, wsrc, mask, h);                   \
+  } else {                                                                    \
+    return hbd_obmc_sad_w8n(pre, pre_stride, wsrc, mask, w, h);               \
+  }                                                                           \
+}
+
+#if CONFIG_EXT_PARTITION
+HBD_OBMCSADWXH(128, 128)
+HBD_OBMCSADWXH(128, 64)
+HBD_OBMCSADWXH(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+HBD_OBMCSADWXH(64, 64)
+HBD_OBMCSADWXH(64, 32)
+HBD_OBMCSADWXH(32, 64)
+HBD_OBMCSADWXH(32, 32)
+HBD_OBMCSADWXH(32, 16)
+HBD_OBMCSADWXH(16, 32)
+HBD_OBMCSADWXH(16, 16)
+HBD_OBMCSADWXH(16, 8)
+HBD_OBMCSADWXH(8, 16)
+HBD_OBMCSADWXH(8, 8)
+HBD_OBMCSADWXH(8, 4)
+HBD_OBMCSADWXH(4, 8)
+HBD_OBMCSADWXH(4, 4)
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/obmc_variance_sse4.c b/vpx_dsp/x86/obmc_variance_sse4.c
new file mode 100644
index 0000000..b967c10
--- /dev/null
+++ b/vpx_dsp/x86/obmc_variance_sse4.c
@@ -0,0 +1,379 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <immintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx_ports/mem.h"
+#include "vpx/vpx_integer.h"
+
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_dsp/x86/synonyms.h"
+#include "vpx_dsp/vpx_filter.h"
+
+////////////////////////////////////////////////////////////////////////////////
+// 8 bit
+////////////////////////////////////////////////////////////////////////////////
+
+static INLINE void obmc_variance_w4(const uint8_t *pre,
+                                    const int pre_stride,
+                                    const int32_t *wsrc,
+                                    const int32_t *mask,
+                                    unsigned int *const sse,
+                                    int *const sum,
+                                    const int h) {
+  const int pre_step = pre_stride - 4;
+  int n = 0;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_d = _mm_setzero_si128();
+
+  assert(IS_POWER_OF_TWO(h));
+
+  do {
+    const __m128i v_p_b = xx_loadl_32(pre + n);
+    const __m128i v_m_d = xx_load_128(mask + n);
+    const __m128i v_w_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+    n += 4;
+
+    pre += pre_step;  // each iteration covers exactly one 4-pixel row
+  } while (n < 4 * h);
+
+  *sum = xx_hsum_epi32_si32(v_sum_d);
+  *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+static INLINE void obmc_variance_w8n(const uint8_t *pre,
+                                     const int pre_stride,
+                                     const int32_t *wsrc,
+                                     const int32_t *mask,
+                                     unsigned int *const sse,
+                                     int *const sum,
+                                     const int w,
+                                     const int h) {
+  const int pre_step = pre_stride - w;
+  int n = 0;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_d = _mm_setzero_si128();
+
+  assert(w >= 8);
+  assert(IS_POWER_OF_TWO(w));
+  assert(IS_POWER_OF_TWO(h));
+
+  do {
+    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
+    const __m128i v_m1_d = xx_load_128(mask + n + 4);
+    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+    const __m128i v_p0_b = xx_loadl_32(pre + n);
+    const __m128i v_m0_d = xx_load_128(mask + n);
+    const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
+    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+
+    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
+    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
+    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
+    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
+    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+    n += 8;
+
+    if (n % w == 0) pre += pre_step;
+  } while (n < w * h);
+
+  *sum = xx_hsum_epi32_si32(v_sum_d);
+  *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+#define OBMCVARWXH(W, H)                                                      \
+unsigned int vpx_obmc_variance##W##x##H##_sse4_1(const uint8_t *pre,          \
+                                                 int pre_stride,              \
+                                                 const int32_t *wsrc,         \
+                                                 const int32_t *mask,         \
+                                                 unsigned int *sse) {         \
+  int sum;                                                                    \
+  if (W == 4) {                                                               \
+    obmc_variance_w4(pre, pre_stride, wsrc, mask, sse, &sum, H);              \
+  } else {                                                                    \
+    obmc_variance_w8n(pre, pre_stride, wsrc, mask, sse, &sum, W, H);          \
+  }                                                                           \
+  return *sse - (((int64_t)sum * sum) / (W * H));                             \
+}
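+
+// The return value uses the standard identity variance = SSE - sum^2 / N,
+// with the correction term computed in 64 bits to avoid overflow before
+// the division by W * H.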
+
+#if CONFIG_EXT_PARTITION
+OBMCVARWXH(128, 128)
+OBMCVARWXH(128, 64)
+OBMCVARWXH(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+OBMCVARWXH(64, 64)
+OBMCVARWXH(64, 32)
+OBMCVARWXH(32, 64)
+OBMCVARWXH(32, 32)
+OBMCVARWXH(32, 16)
+OBMCVARWXH(16, 32)
+OBMCVARWXH(16, 16)
+OBMCVARWXH(16, 8)
+OBMCVARWXH(8, 16)
+OBMCVARWXH(8, 8)
+OBMCVARWXH(8, 4)
+OBMCVARWXH(4, 8)
+OBMCVARWXH(4, 4)
+
+////////////////////////////////////////////////////////////////////////////////
+// High bit-depth
+////////////////////////////////////////////////////////////////////////////////
+
+#if CONFIG_VP9_HIGHBITDEPTH
+static INLINE void hbd_obmc_variance_w4(const uint8_t *pre8,
+                                        const int pre_stride,
+                                        const int32_t *wsrc,
+                                        const int32_t *mask,
+                                        uint64_t *const sse,
+                                        int64_t *const sum,
+                                        const int h) {
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  const int pre_step = pre_stride - 4;
+  int n = 0;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_d = _mm_setzero_si128();
+
+  assert(IS_POWER_OF_TWO(h));
+
+  do {
+    const __m128i v_p_w = xx_loadl_64(pre + n);
+    const __m128i v_m_d = xx_load_128(mask + n);
+    const __m128i v_w_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p_d = _mm_cvtepu16_epi32(v_p_w);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);
+
+    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
+    const __m128i v_rdiff_d = xx_roundn_epi32(v_diff_d, 12);
+    const __m128i v_sqrdiff_d = _mm_mullo_epi32(v_rdiff_d, v_rdiff_d);
+
+    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff_d);
+    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+    n += 4;
+
+    pre += pre_step;  // each iteration covers exactly one 4-pixel row
+  } while (n < 4 * h);
+
+  *sum = xx_hsum_epi32_si32(v_sum_d);
+  *sse = xx_hsum_epi32_si32(v_sse_d);
+}
+
+static INLINE void hbd_obmc_variance_w8n(const uint8_t *pre8,
+                                         const int pre_stride,
+                                         const int32_t *wsrc,
+                                         const int32_t *mask,
+                                         uint64_t *const sse,
+                                         int64_t *const sum,
+                                         const int w,
+                                         const int h) {
+  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
+  const int pre_step = pre_stride - w;
+  int n = 0;
+  __m128i v_sum_d = _mm_setzero_si128();
+  __m128i v_sse_d = _mm_setzero_si128();
+
+  assert(w >= 8);
+  assert(IS_POWER_OF_TWO(w));
+  assert(IS_POWER_OF_TWO(h));
+
+  do {
+    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
+    const __m128i v_m1_d = xx_load_128(mask + n + 4);
+    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
+    const __m128i v_p0_w = xx_loadl_64(pre + n);
+    const __m128i v_m0_d = xx_load_128(mask + n);
+    const __m128i v_w0_d = xx_load_128(wsrc + n);
+
+    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
+    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);
+
+    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
+    // boundaries. We use pmaddwd, as it has lower latency on Haswell
+    // than pmulld but produces the same result with these inputs.
+    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
+    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);
+
+    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
+    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
+
+    const __m128i v_rdiff0_d = xx_roundn_epi32(v_diff0_d, 12);
+    const __m128i v_rdiff1_d = xx_roundn_epi32(v_diff1_d, 12);
+    const __m128i v_rdiff01_w = _mm_packs_epi32(v_rdiff0_d, v_rdiff1_d);
+    const __m128i v_sqrdiff_d = _mm_madd_epi16(v_rdiff01_w, v_rdiff01_w);
+
+    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff0_d);
+    v_sum_d = _mm_add_epi32(v_sum_d, v_rdiff1_d);
+    v_sse_d = _mm_add_epi32(v_sse_d, v_sqrdiff_d);
+
+    n += 8;
+
+    if (n % w == 0) pre += pre_step;
+  } while (n < w * h);
+
+  *sum += xx_hsum_epi32_si64(v_sum_d);
+  *sse += xx_hsum_epi32_si64(v_sse_d);
+}
+
+static INLINE void highbd_obmc_variance(const uint8_t *pre8, int pre_stride,
+                                        const int32_t *wsrc,
+                                        const int32_t *mask,
+                                        int w, int h,
+                                        unsigned int *sse, int *sum) {
+  int64_t sum64 = 0;
+  uint64_t sse64 = 0;
+  if (w == 4) {
+    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+  } else {
+    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+  }
+  *sum = (int)sum64;
+  *sse = (unsigned int)sse64;
+}
+
+static INLINE void highbd_10_obmc_variance(const uint8_t *pre8, int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask,
+                                           int w, int h,
+                                           unsigned int *sse, int *sum) {
+  int64_t sum64 = 0;
+  uint64_t sse64 = 0;
+  if (w == 4) {
+    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+  } else {
+    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+  }
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 2);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 4);
+}
+
+static INLINE void highbd_12_obmc_variance(const uint8_t *pre8, int pre_stride,
+                                           const int32_t *wsrc,
+                                           const int32_t *mask,
+                                           int w, int h,
+                                           unsigned int *sse, int *sum) {
+  int64_t sum64 = 0;
+  uint64_t sse64 = 0;
+  if (w == 128) {
+    do {
+      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask,
+                            &sse64, &sum64, 128, 32);
+      pre8 += 32 * pre_stride;
+      wsrc += 32 * 128;
+      mask += 32 * 128;
+      h -= 32;
+    } while (h > 0);
+  } else if (w == 64 && h >= 128) {
+    do {
+      hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask,
+                            &sse64, &sum64, 64, 64);
+      pre8 += 64 * pre_stride;
+      wsrc += 64 * 64;
+      mask += 64 * 64;
+      h -= 64;
+    } while (h > 0);
+  } else if (w == 4) {
+    hbd_obmc_variance_w4(pre8, pre_stride, wsrc, mask, &sse64, &sum64, h);
+  } else {
+    hbd_obmc_variance_w8n(pre8, pre_stride, wsrc, mask, &sse64, &sum64, w, h);
+  }
+  *sum = (int)ROUND_POWER_OF_TWO(sum64, 4);
+  *sse = (unsigned int)ROUND_POWER_OF_TWO(sse64, 8);
+}
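+
+// The strip-mining above (128-wide blocks in 32-row strips, 64x128 in
+// 64-row strips) bounds how much each call accumulates in 32-bit SIMD
+// lanes before flushing into the 64-bit sum64/sse64 totals, which matters
+// for 12-bit input where the squared differences are largest.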
+
+#define HBD_OBMCVARWXH(W, H)                                                  \
+unsigned int vpx_highbd_obmc_variance##W##x##H##_sse4_1(                      \
+    const uint8_t *pre,                                                       \
+    int pre_stride,                                                           \
+    const int32_t *wsrc,                                                      \
+    const int32_t *mask,                                                      \
+    unsigned int *sse) {                                                      \
+  int sum;                                                                    \
+  highbd_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);         \
+  return *sse - (((int64_t)sum * sum) / (W * H));                             \
+}                                                                             \
+                                                                              \
+unsigned int vpx_highbd_10_obmc_variance##W##x##H##_sse4_1(                   \
+    const uint8_t *pre,                                                       \
+    int pre_stride,                                                           \
+    const int32_t *wsrc,                                                      \
+    const int32_t *mask,                                                      \
+    unsigned int *sse) {                                                      \
+  int sum;                                                                    \
+  highbd_10_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);      \
+  return *sse - (((int64_t)sum * sum) / (W * H));                             \
+}                                                                             \
+                                                                              \
+unsigned int vpx_highbd_12_obmc_variance##W##x##H##_sse4_1(                   \
+    const uint8_t *pre,                                                       \
+    int pre_stride,                                                           \
+    const int32_t *wsrc,                                                      \
+    const int32_t *mask,                                                      \
+    unsigned int *sse) {                                                      \
+  int sum;                                                                    \
+  highbd_12_obmc_variance(pre, pre_stride, wsrc, mask, W, H, sse, &sum);      \
+  return *sse - (((int64_t)sum * sum) / (W * H));                             \
+}
+
+#if CONFIG_EXT_PARTITION
+HBD_OBMCVARWXH(128, 128)
+HBD_OBMCVARWXH(128, 64)
+HBD_OBMCVARWXH(64, 128)
+#endif  // CONFIG_EXT_PARTITION
+HBD_OBMCVARWXH(64, 64)
+HBD_OBMCVARWXH(64, 32)
+HBD_OBMCVARWXH(32, 64)
+HBD_OBMCVARWXH(32, 32)
+HBD_OBMCVARWXH(32, 16)
+HBD_OBMCVARWXH(16, 32)
+HBD_OBMCVARWXH(16, 16)
+HBD_OBMCVARWXH(16, 8)
+HBD_OBMCVARWXH(8, 16)
+HBD_OBMCVARWXH(8, 8)
+HBD_OBMCVARWXH(8, 4)
+HBD_OBMCVARWXH(4, 8)
+HBD_OBMCVARWXH(4, 4)
+#endif  // CONFIG_VP9_HIGHBITDEPTH
diff --git a/vpx_dsp/x86/sad4d_sse2.asm b/vpx_dsp/x86/sad4d_sse2.asm
index 3f6e55c..6d49869 100644
--- a/vpx_dsp/x86/sad4d_sse2.asm
+++ b/vpx_dsp/x86/sad4d_sse2.asm
@@ -175,6 +175,12 @@
   PROCESS_32x2x4  0, %4, %5, %4 + 32, %5 + 32, %6
 %endmacro
 
+; PROCESS_128x2x4 first, off_{first,second}_{src,ref}, advance_at_end
+%macro PROCESS_128x2x4 5-6 0
+  PROCESS_64x2x4 %1, %2, %3, %2 + 64, %3 + 64
+  PROCESS_64x2x4  0, %4, %5, %4 + 64, %5 + 64, %6
+%endmacro
+
 ; void vpx_sadNxNx4d_sse2(uint8_t *src,    int src_stride,
 ;                         uint8_t *ref[4], int ref_stride,
 ;                         uint32_t res[4]);
@@ -224,6 +230,11 @@
 %endmacro
 
 INIT_XMM sse2
+%if CONFIG_EXT_PARTITION
+SADNXN4D 128, 128
+SADNXN4D 128, 64
+SADNXN4D 64,  128
+%endif
 SADNXN4D 64, 64
 SADNXN4D 64, 32
 SADNXN4D 32, 64
diff --git a/vpx_dsp/x86/sad_sse2.asm b/vpx_dsp/x86/sad_sse2.asm
index 1ec906c..edef2a7 100644
--- a/vpx_dsp/x86/sad_sse2.asm
+++ b/vpx_dsp/x86/sad_sse2.asm
@@ -44,6 +44,76 @@
 %endif ; %3 == 7
 %endmacro
 
+%if CONFIG_EXT_PARTITION
+; unsigned int vpx_sad128x128_sse2(uint8_t *src, int src_stride,
+;                                  uint8_t *ref, int ref_stride);
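+; Each loop iteration handles one 128-pixel row as eight 16-byte chunks;
+; psadbw leaves two partial sums per register, accumulated into m0.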
+%macro SAD128XN 1-2 0
+  SAD_FN 128, %1, 5, %2
+  mov              n_rowsd, %1
+  pxor                  m0, m0
+
+.loop:
+  movu                  m1, [refq]
+  movu                  m2, [refq+16]
+  movu                  m3, [refq+32]
+  movu                  m4, [refq+48]
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*0]
+  pavgb                 m2, [second_predq+mmsize*1]
+  pavgb                 m3, [second_predq+mmsize*2]
+  pavgb                 m4, [second_predq+mmsize*3]
+%endif
+  psadbw                m1, [srcq]
+  psadbw                m2, [srcq+16]
+  psadbw                m3, [srcq+32]
+  psadbw                m4, [srcq+48]
+
+  paddd                 m1, m2
+  paddd                 m3, m4
+  paddd                 m0, m1
+  paddd                 m0, m3
+
+  movu                  m1, [refq+64]
+  movu                  m2, [refq+80]
+  movu                  m3, [refq+96]
+  movu                  m4, [refq+112]
+%if %2 == 1
+  pavgb                 m1, [second_predq+mmsize*4]
+  pavgb                 m2, [second_predq+mmsize*5]
+  pavgb                 m3, [second_predq+mmsize*6]
+  pavgb                 m4, [second_predq+mmsize*7]
+  lea         second_predq, [second_predq+mmsize*8]
+%endif
+  psadbw                m1, [srcq+64]
+  psadbw                m2, [srcq+80]
+  psadbw                m3, [srcq+96]
+  psadbw                m4, [srcq+112]
+
+  add                 refq, ref_strideq
+  add                 srcq, src_strideq
+
+  paddd                 m1, m2
+  paddd                 m3, m4
+  paddd                 m0, m1
+  paddd                 m0, m3
+
+  sub              n_rowsd, 1
+  jg .loop
+
+  movhlps               m1, m0
+  paddd                 m0, m1
+  movd                 eax, m0
+  RET
+%endmacro
+
+INIT_XMM sse2
+SAD128XN 128     ; sad128x128_sse2
+SAD128XN 128, 1  ; sad128x128_avg_sse2
+SAD128XN 64      ; sad128x64_sse2
+SAD128XN 64, 1   ; sad128x64_avg_sse2
+%endif
+
 ; unsigned int vpx_sad64x64_sse2(uint8_t *src, int src_stride,
 ;                                uint8_t *ref, int ref_stride);
 %macro SAD64XN 1-2 0
@@ -82,6 +152,10 @@
 %endmacro
 
 INIT_XMM sse2
+%if CONFIG_EXT_PARTITION
+SAD64XN 128     ; sad64x128_sse2
+SAD64XN 128, 1  ; sad64x128_avg_sse2
+%endif
 SAD64XN 64 ; sad64x64_sse2
 SAD64XN 32 ; sad64x32_sse2
 SAD64XN 64, 1 ; sad64x64_avg_sse2
diff --git a/vpx_dsp/x86/subtract_sse2.asm b/vpx_dsp/x86/subtract_sse2.asm
index 4273efb..2225b7c 100644
--- a/vpx_dsp/x86/subtract_sse2.asm
+++ b/vpx_dsp/x86/subtract_sse2.asm
@@ -31,6 +31,10 @@
   je .case_16
   cmp                colsd, 32
   je .case_32
+%if CONFIG_EXT_PARTITION
+  cmp                colsd, 64
+  je .case_64
+%endif
 
 %macro loop16 6
   mova                  m0, [srcq+%1]
@@ -55,6 +59,22 @@
   mova [diffq+mmsize*1+%6], m1
 %endmacro
 
+%if CONFIG_EXT_PARTITION
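+; each loop16 invocation subtracts 32 pixels, so four calls cover one
+; 128-wide row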
+  mov             pred_str, pred_stridemp
+.loop_128:
+  loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize,  0*mmsize,  2*mmsize
+  loop16 2*mmsize, 3*mmsize, 2*mmsize, 3*mmsize,  4*mmsize,  6*mmsize
+  loop16 4*mmsize, 5*mmsize, 4*mmsize, 5*mmsize,  8*mmsize, 10*mmsize
+  loop16 6*mmsize, 7*mmsize, 6*mmsize, 7*mmsize, 12*mmsize, 14*mmsize
+  lea                diffq, [diffq+diff_strideq*2]
+  add                predq, pred_str
+  add                 srcq, src_strideq
+  sub                rowsd, 1
+  jnz .loop_128
+  RET
+
+.case_64:
+%endif
   mov             pred_str, pred_stridemp
 .loop_64:
   loop16 0*mmsize, 1*mmsize, 0*mmsize, 1*mmsize, 0*mmsize, 2*mmsize
diff --git a/vpx_dsp/x86/sum_squares_sse2.c b/vpx_dsp/x86/sum_squares_sse2.c
new file mode 100644
index 0000000..5ecd87e
--- /dev/null
+++ b/vpx_dsp/x86/sum_squares_sse2.c
@@ -0,0 +1,195 @@
+/*
+ *  Copyright (c) 2015 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include <emmintrin.h>
+
+#include "vpx_dsp/x86/synonyms.h"
+
+#include "./vpx_dsp_rtcd.h"
+
+static uint64_t vpx_sum_squares_2d_i16_4x4_sse2(const int16_t *src,
+                                                int stride) {
+  const __m128i v_val_0_w = _mm_loadl_epi64((const __m128i*)(src+0*stride));
+  const __m128i v_val_1_w = _mm_loadl_epi64((const __m128i*)(src+1*stride));
+  const __m128i v_val_2_w = _mm_loadl_epi64((const __m128i*)(src+2*stride));
+  const __m128i v_val_3_w = _mm_loadl_epi64((const __m128i*)(src+3*stride));
+
+  const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+  const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+  const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+  const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+
+  const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+  const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+  const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+
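+  // Only the two low 32-bit lanes carry data (the 64-bit loads zero the
+  // upper halves), so one 64-bit shift-and-add completes the reduction.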
+  const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d,
+                                        _mm_srli_epi64(v_sum_0123_d, 32));
+
+  return (uint64_t)_mm_cvtsi128_si32(v_sum_d);
+}
+
+#ifdef __GNUC__
+// This prevents GCC/Clang from inlining this function into
+// vpx_sum_squares_2d_i16_sse2, which in turn saves some stack
+// maintenance instructions in the common case of 4x4.
+__attribute__((noinline))
+#endif
+static uint64_t vpx_sum_squares_2d_i16_nxn_sse2(const int16_t *src,
+                                                int stride,
+                                                int size) {
+  int r, c;
+
+  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+  __m128i v_acc_q = _mm_setzero_si128();
+
+  for (r = 0; r < size; r += 8) {
+    __m128i v_acc_d = _mm_setzero_si128();
+
+    for (c = 0; c < size; c += 8) {
+      const int16_t *b = src+c;
+
+      const __m128i v_val_0_w = _mm_load_si128((const __m128i*)(b+0*stride));
+      const __m128i v_val_1_w = _mm_load_si128((const __m128i*)(b+1*stride));
+      const __m128i v_val_2_w = _mm_load_si128((const __m128i*)(b+2*stride));
+      const __m128i v_val_3_w = _mm_load_si128((const __m128i*)(b+3*stride));
+      const __m128i v_val_4_w = _mm_load_si128((const __m128i*)(b+4*stride));
+      const __m128i v_val_5_w = _mm_load_si128((const __m128i*)(b+5*stride));
+      const __m128i v_val_6_w = _mm_load_si128((const __m128i*)(b+6*stride));
+      const __m128i v_val_7_w = _mm_load_si128((const __m128i*)(b+7*stride));
+
+      const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+      const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+      const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+      const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+      const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
+      const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
+      const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
+      const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
+
+      const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+      const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+      const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
+      const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
+
+      const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+      const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
+
+      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_0123_d);
+      v_acc_d = _mm_add_epi32(v_acc_d, v_sum_4567_d);
+    }
+
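+    // Widen v_acc_d into 64-bit lanes before accumulating: the AND keeps
+    // the even 32-bit lanes and the shift brings down the odd ones, so
+    // v_acc_q cannot overflow no matter how large the plane is.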
+    v_acc_q = _mm_add_epi64(v_acc_q, _mm_and_si128(v_acc_d, v_zext_mask_q));
+    v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_epi64(v_acc_d, 32));
+
+    src += 8*stride;
+  }
+
+  v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8));
+
+#if ARCH_X86_64
+  return (uint64_t)_mm_cvtsi128_si64(v_acc_q);
+#else
+  {
+    uint64_t tmp;
+    _mm_storel_epi64((__m128i*)&tmp, v_acc_q);
+    return tmp;
+  }
+#endif
+}
+
+uint64_t vpx_sum_squares_2d_i16_sse2(const int16_t *src, int stride,
+                                     int size) {
+  // A row of 4 elements fills only half an XMM register, so 4x4 needs its
+  // own code path; it is also the common case, accounting for over 75% of
+  // all calls.
+  if (LIKELY(size == 4)) {
+    return vpx_sum_squares_2d_i16_4x4_sse2(src, stride);
+  } else {
+    // Generic case
+    return vpx_sum_squares_2d_i16_nxn_sse2(src, stride, size);
+  }
+}
+
+//////////////////////////////////////////////////////////////////////////////
+// 1D version
+//////////////////////////////////////////////////////////////////////////////
+
+static uint64_t vpx_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
+  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
+  __m128i v_acc0_q = _mm_setzero_si128();
+  __m128i v_acc1_q = _mm_setzero_si128();
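+  // (two accumulators, merged at the end, presumably to shorten the
+  // dependency chain through the 64-bit adds)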
+
+  const int16_t *const end = src + n;
+
+  assert(n % 64 == 0);
+
+  while (src < end) {
+    const __m128i v_val_0_w = xx_load_128(src);
+    const __m128i v_val_1_w = xx_load_128(src + 8);
+    const __m128i v_val_2_w = xx_load_128(src + 16);
+    const __m128i v_val_3_w = xx_load_128(src + 24);
+    const __m128i v_val_4_w = xx_load_128(src + 32);
+    const __m128i v_val_5_w = xx_load_128(src + 40);
+    const __m128i v_val_6_w = xx_load_128(src + 48);
+    const __m128i v_val_7_w = xx_load_128(src + 56);
+
+    const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
+    const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
+    const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
+    const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
+    const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
+    const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
+    const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
+    const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);
+
+    const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
+    const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
+    const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
+    const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);
+
+    const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
+    const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);
+
+    const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d);
+
+    v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q));
+    v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32));
+
+    src += 64;
+  }
+
+  v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
+  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));
+
+#if ARCH_X86_64
+  return (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
+#else
+  {
+    uint64_t tmp;
+    _mm_storel_epi64((__m128i*)&tmp, v_acc0_q);
+    return tmp;
+  }
+#endif
+}
+
+uint64_t vpx_sum_squares_i16_sse2(const int16_t *src, uint32_t n) {
+  if (n % 64 == 0) {
+    return vpx_sum_squares_i16_64n_sse2(src, n);
+  } else if (n > 64) {
+    const uint32_t k = n & ~63u;
+    return vpx_sum_squares_i16_64n_sse2(src, k) +
+           vpx_sum_squares_i16_c(src + k, n - k);
+  } else {
+    return vpx_sum_squares_i16_c(src, n);
+  }
+}
diff --git a/vpx_dsp/x86/synonyms.h b/vpx_dsp/x86/synonyms.h
new file mode 100644
index 0000000..6708dd1
--- /dev/null
+++ b/vpx_dsp/x86/synonyms.h
@@ -0,0 +1,111 @@
+/*
+ *  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_DSP_X86_SYNONYMS_H_
+#define VPX_DSP_X86_SYNONYMS_H_
+
+#include <immintrin.h>
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+
+/**
+ * Various reusable shorthands for x86 SIMD intrinsics.
+ *
+ * Intrinsics prefixed with xx_ operate on or return 128-bit XMM registers.
+ * Intrinsics prefixed with yy_ operate on or return 256-bit YMM registers.
+ */
+
+// Loads and stores to do away with the tedium of casting the address
+// to the right type.
+static INLINE __m128i xx_loadl_32(const void *a) {
+  return _mm_cvtsi32_si128(*(const uint32_t*)a);
+}
+
+static INLINE __m128i xx_loadl_64(const void *a) {
+  return _mm_loadl_epi64((const __m128i*)a);
+}
+
+static INLINE __m128i xx_load_128(const void *a) {
+  return _mm_load_si128((const __m128i*)a);
+}
+
+static INLINE __m128i xx_loadu_128(const void *a) {
+  return _mm_loadu_si128((const __m128i*)a);
+}
+
+static INLINE void xx_storel_32(void *const a, const __m128i v) {
+  *(uint32_t*)a = _mm_cvtsi128_si32(v);
+}
+
+static INLINE void xx_storel_64(void *const a, const __m128i v) {
+  _mm_storel_epi64((__m128i*)a, v);
+}
+
+static INLINE void xx_store_128(void *const a, const __m128i v) {
+  _mm_store_si128((__m128i*)a, v);
+}
+
+static INLINE void xx_storeu_128(void *const a, const __m128i v) {
+  _mm_storeu_si128((__m128i*)a, v);
+}
+
+static INLINE __m128i xx_round_epu16(__m128i v_val_w) {
+  return _mm_avg_epu16(v_val_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i xx_roundn_epu16(__m128i v_val_w, int bits) {
+  const __m128i v_s_w = _mm_srli_epi16(v_val_w, bits - 1);
+  return _mm_avg_epu16(v_s_w, _mm_setzero_si128());
+}
+
+static INLINE __m128i xx_roundn_epu32(__m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1));
+  const __m128i v_tmp_d = _mm_add_epi32(v_val_d, v_bias_d);
+  return _mm_srli_epi32(v_tmp_d, bits);
+}
+
+static INLINE __m128i xx_roundn_epi32(__m128i v_val_d, int bits) {
+  const __m128i v_bias_d = _mm_set1_epi32(1 << (bits - 1));
+  const __m128i v_sign_d = _mm_srai_epi32(v_val_d, 31);
+  const __m128i v_tmp_d = _mm_add_epi32(_mm_add_epi32(v_val_d, v_bias_d),
+                                        v_sign_d);
+  return _mm_srai_epi32(v_tmp_d, bits);
+}
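+
+// Note: adding the sign term above reduces the bias by one for negative
+// inputs, so halfway cases round away from zero, e.g. with bits == 1:
+//   xx_roundn_epi32(3)  -> (3 + 1 + 0) >> 1 =  2
+//   xx_roundn_epi32(-3) -> (-3 + 1 - 1) >> 1 = -2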
+
+#ifdef __SSSE3__
+static INLINE int32_t xx_hsum_epi32_si32(__m128i v_d) {
+  v_d = _mm_hadd_epi32(v_d, v_d);
+  v_d = _mm_hadd_epi32(v_d, v_d);
+  return _mm_cvtsi128_si32(v_d);
+}
+
+static INLINE int64_t xx_hsum_epi64_si64(__m128i v_q) {
+  v_q = _mm_add_epi64(v_q, _mm_srli_si128(v_q, 8));
+#if ARCH_X86_64
+  return _mm_cvtsi128_si64(v_q);
+#else
+  {
+    int64_t tmp;
+    _mm_storel_epi64((__m128i*)&tmp, v_q);
+    return tmp;
+  }
+#endif
+}
+
+static INLINE int64_t xx_hsum_epi32_si64(__m128i v_d) {
+  const __m128i v_sign_d = _mm_cmplt_epi32(v_d, _mm_setzero_si128());
+  const __m128i v_0_q = _mm_unpacklo_epi32(v_d, v_sign_d);
+  const __m128i v_1_q = _mm_unpackhi_epi32(v_d, v_sign_d);
+  return xx_hsum_epi64_si64(_mm_add_epi64(v_0_q, v_1_q));
+}
+#endif  // __SSSE3__
+
+#endif  // VPX_DSP_X86_SYNONYMS_H_
diff --git a/vpx_dsp/x86/txfm_common_sse2.h b/vpx_dsp/x86/txfm_common_sse2.h
index 536b206..f886d30 100644
--- a/vpx_dsp/x86/txfm_common_sse2.h
+++ b/vpx_dsp/x86/txfm_common_sse2.h
@@ -26,4 +26,11 @@
   _mm_setr_epi16((int16_t)(a), (int16_t)(b), (int16_t)(c), (int16_t)(d), \
                  (int16_t)(e), (int16_t)(f), (int16_t)(g), (int16_t)(h))
 
+// Reverse the eight 16-bit words in a __m128i.
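+// (0x1b reverses the four words within each 64-bit half; 0x4e then swaps
+// the two halves.)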
+static INLINE __m128i mm_reverse_epi16(const __m128i x) {
+  const __m128i a = _mm_shufflelo_epi16(x, 0x1b);
+  const __m128i b = _mm_shufflehi_epi16(a, 0x1b);
+  return _mm_shuffle_epi32(b, 0x4e);
+}
+
 #endif  // VPX_DSP_X86_TXFM_COMMON_SSE2_H_
diff --git a/vpx_dsp/x86/variance_sse2.c b/vpx_dsp/x86/variance_sse2.c
index 6987c2e..c2b55a3 100644
--- a/vpx_dsp/x86/variance_sse2.c
+++ b/vpx_dsp/x86/variance_sse2.c
@@ -475,3 +475,217 @@
 #undef FNS
 #undef FN
 #endif  // CONFIG_USE_X86INC
+
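+// Build a width x height prediction by taking every 8th sample from a
+// reference that has been upsampled by 8 in both dimensions, so upsampled
+// rows lie ref_stride << 3 bytes apart.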
+void vpx_upsampled_pred_sse2(uint8_t *comp_pred,
+                             int width, int height,
+                             const uint8_t *ref, int ref_stride) {
+    int i, j;
+    int stride = ref_stride << 3;
+
+    if (width >= 16) {
+      // Process 16 output pixels at a time.
+      for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j += 16) {
+          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+          __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+          __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
+          __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
+          __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64));
+          __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80));
+          __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96));
+          __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112));
+          __m128i t0, t1, t2, t3;
+
+          t0 = _mm_unpacklo_epi8(s0, s1);
+          s1 = _mm_unpackhi_epi8(s0, s1);
+          t1 = _mm_unpacklo_epi8(s2, s3);
+          s3 = _mm_unpackhi_epi8(s2, s3);
+          t2 = _mm_unpacklo_epi8(s4, s5);
+          s5 = _mm_unpackhi_epi8(s4, s5);
+          t3 = _mm_unpacklo_epi8(s6, s7);
+          s7 = _mm_unpackhi_epi8(s6, s7);
+
+          s0 = _mm_unpacklo_epi8(t0, s1);
+          s2 = _mm_unpacklo_epi8(t1, s3);
+          s4 = _mm_unpacklo_epi8(t2, s5);
+          s6 = _mm_unpacklo_epi8(t3, s7);
+          s0 = _mm_unpacklo_epi32(s0, s2);
+          s4 = _mm_unpacklo_epi32(s4, s6);
+          s0 = _mm_unpacklo_epi64(s0, s4);
+
+          _mm_storeu_si128((__m128i *)(comp_pred), s0);
+          comp_pred += 16;
+          ref += 16 * 8;
+        }
+        ref += stride - (width << 3);
+      }
+    } else if (width >= 8) {
+      // Process 8 output pixels at a time.
+      for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j += 8) {
+          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+          __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+          __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
+          __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
+          __m128i t0, t1;
+
+          t0 = _mm_unpacklo_epi8(s0, s1);
+          s1 = _mm_unpackhi_epi8(s0, s1);
+          t1 = _mm_unpacklo_epi8(s2, s3);
+          s3 = _mm_unpackhi_epi8(s2, s3);
+
+          s0 = _mm_unpacklo_epi8(t0, s1);
+          s2 = _mm_unpacklo_epi8(t1, s3);
+          s0 = _mm_unpacklo_epi32(s0, s2);
+
+          _mm_storel_epi64((__m128i *)(comp_pred), s0);
+          comp_pred += 8;
+          ref += 8 * 8;
+        }
+        ref += stride - (width << 3);
+      }
+    } else {
+      // Process 4 output pixels at a time.
+      for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j += 4) {
+          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+          __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+          __m128i t0;
+
+          t0 = _mm_unpacklo_epi8(s0, s1);
+          s1 = _mm_unpackhi_epi8(s0, s1);
+          s0 = _mm_unpacklo_epi8(t0, s1);
+
+          *(int *)comp_pred = _mm_cvtsi128_si32(s0);
+          comp_pred += 4;
+          ref += 4 * 8;
+        }
+        ref += stride - (width << 3);
+      }
+    }
+}
+
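+// As vpx_upsampled_pred_sse2(), but also average the extracted samples with
+// pred[], rounding: comp_pred[i] = (ref_sample + pred[i] + 1) >> 1.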
+void vpx_comp_avg_upsampled_pred_sse2(uint8_t *comp_pred, const uint8_t *pred,
+                                      int width, int height,
+                                      const uint8_t *ref, int ref_stride) {
+    const __m128i zero = _mm_set1_epi16(0);
+    const __m128i one = _mm_set1_epi16(1);
+    int i, j;
+    int stride = ref_stride << 3;
+
+    if (width >= 16) {
+      // Process 16 output pixels at a time.
+      for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j += 16) {
+          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+          __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+          __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
+          __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
+          __m128i s4 = _mm_loadu_si128((const __m128i *)(ref + 64));
+          __m128i s5 = _mm_loadu_si128((const __m128i *)(ref + 80));
+          __m128i s6 = _mm_loadu_si128((const __m128i *)(ref + 96));
+          __m128i s7 = _mm_loadu_si128((const __m128i *)(ref + 112));
+          __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
+          __m128i p1;
+          __m128i t0, t1, t2, t3;
+
+          t0 = _mm_unpacklo_epi8(s0, s1);
+          s1 = _mm_unpackhi_epi8(s0, s1);
+          t1 = _mm_unpacklo_epi8(s2, s3);
+          s3 = _mm_unpackhi_epi8(s2, s3);
+          t2 = _mm_unpacklo_epi8(s4, s5);
+          s5 = _mm_unpackhi_epi8(s4, s5);
+          t3 = _mm_unpacklo_epi8(s6, s7);
+          s7 = _mm_unpackhi_epi8(s6, s7);
+
+          s0 = _mm_unpacklo_epi8(t0, s1);
+          s2 = _mm_unpacklo_epi8(t1, s3);
+          s4 = _mm_unpacklo_epi8(t2, s5);
+          s6 = _mm_unpacklo_epi8(t3, s7);
+
+          s0 = _mm_unpacklo_epi32(s0, s2);
+          s4 = _mm_unpacklo_epi32(s4, s6);
+          s0 = _mm_unpacklo_epi8(s0, zero);
+          s4 = _mm_unpacklo_epi8(s4, zero);
+
+          p1 = _mm_unpackhi_epi8(p0, zero);
+          p0 = _mm_unpacklo_epi8(p0, zero);
+          p0 = _mm_adds_epu16(s0, p0);
+          p1 = _mm_adds_epu16(s4, p1);
+          p0 = _mm_adds_epu16(p0, one);
+          p1 = _mm_adds_epu16(p1, one);
+
+          p0 = _mm_srli_epi16(p0, 1);
+          p1 = _mm_srli_epi16(p1, 1);
+          p0 = _mm_packus_epi16(p0, p1);
+
+          _mm_storeu_si128((__m128i *)(comp_pred), p0);
+          comp_pred += 16;
+          pred += 16;
+          ref += 16 * 8;
+        }
+        ref += stride - (width << 3);
+      }
+    } else if (width >= 8) {
+      // Process 8 output pixels at a time.
+      for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j += 8) {
+          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+          __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+          __m128i s2 = _mm_loadu_si128((const __m128i *)(ref + 32));
+          __m128i s3 = _mm_loadu_si128((const __m128i *)(ref + 48));
+          __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
+          __m128i t0, t1;
+
+          t0 = _mm_unpacklo_epi8(s0, s1);
+          s1 = _mm_unpackhi_epi8(s0, s1);
+          t1 = _mm_unpacklo_epi8(s2, s3);
+          s3 = _mm_unpackhi_epi8(s2, s3);
+
+          s0 = _mm_unpacklo_epi8(t0, s1);
+          s2 = _mm_unpacklo_epi8(t1, s3);
+          s0 = _mm_unpacklo_epi32(s0, s2);
+          s0 = _mm_unpacklo_epi8(s0, zero);
+
+          p0 = _mm_unpacklo_epi8(p0, zero);
+          p0 = _mm_adds_epu16(s0, p0);
+          p0 = _mm_adds_epu16(p0, one);
+          p0 = _mm_srli_epi16(p0, 1);
+          p0 = _mm_packus_epi16(p0, zero);
+
+          _mm_storel_epi64((__m128i *)(comp_pred), p0);
+          comp_pred += 8;
+          pred += 8;
+          ref += 8 * 8;
+        }
+        ref += stride - (width << 3);
+      }
+    } else {
+      // Process 4 output pixels at a time.
+      for (i = 0; i < height; i++) {
+        for (j = 0; j < width; j += 4) {
+          __m128i s0 = _mm_loadu_si128((const __m128i *)ref);
+          __m128i s1 = _mm_loadu_si128((const __m128i *)(ref + 16));
+          __m128i p0 = _mm_cvtsi32_si128(*(const uint32_t *)pred);
+          __m128i t0;
+
+          t0 = _mm_unpacklo_epi8(s0, s1);
+          s1 = _mm_unpackhi_epi8(s0, s1);
+          s0 = _mm_unpacklo_epi8(t0, s1);
+          s0 = _mm_unpacklo_epi8(s0, zero);
+
+          p0 = _mm_unpacklo_epi8(p0, zero);
+          p0 = _mm_adds_epu16(s0, p0);
+          p0 = _mm_adds_epu16(p0, one);
+          p0 = _mm_srli_epi16(p0, 1);
+          p0 = _mm_packus_epi16(p0, zero);
+
+          *(int *)comp_pred = _mm_cvtsi128_si32(p0);
+          comp_pred += 4;
+          pred += 4;
+          ref += 4 * 8;
+        }
+        ref += stride - (width << 3);
+      }
+    }
+}
diff --git a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
index abc0270..6d43fc1 100644
--- a/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
+++ b/vpx_dsp/x86/vpx_convolve_copy_sse2.asm
@@ -46,6 +46,119 @@
   je .w16
   cmp r4d, 32
   je .w32
+
+%if CONFIG_VP10 && CONFIG_EXT_PARTITION
+  cmp r4d, 64
+  je .w64
+%ifidn %2, highbd
+  cmp r4d, 128
+  je .w128
+
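+; Rows wider than 128 bytes occur only for high bitdepth: copy 256 bytes
+; (128 pixels) per row, in four 64-byte groups per iteration.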
+.w256:
+  mov                    r4d, dword hm
+.loop256:
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+16]
+  movu                    m2, [srcq+32]
+  movu                    m3, [srcq+48]
+%ifidn %1, avg
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq+16]
+  pavg                    m2, [dstq+32]
+  pavg                    m3, [dstq+48]
+%endif
+  mova             [dstq   ], m0
+  mova             [dstq+16], m1
+  mova             [dstq+32], m2
+  mova             [dstq+48], m3
+  movu                    m0, [srcq+64]
+  movu                    m1, [srcq+80]
+  movu                    m2, [srcq+96]
+  movu                    m3, [srcq+112]
+%ifidn %1, avg
+  pavg                    m0, [dstq+64]
+  pavg                    m1, [dstq+80]
+  pavg                    m2, [dstq+96]
+  pavg                    m3, [dstq+112]
+%endif
+  mova             [dstq+64], m0
+  mova             [dstq+80], m1
+  mova             [dstq+96], m2
+  mova            [dstq+112], m3
+  movu                    m0, [srcq+128]
+  movu                    m1, [srcq+128+16]
+  movu                    m2, [srcq+128+32]
+  movu                    m3, [srcq+128+48]
+%ifidn %1, avg
+  pavg                    m0, [dstq+128]
+  pavg                    m1, [dstq+128+16]
+  pavg                    m2, [dstq+128+32]
+  pavg                    m3, [dstq+128+48]
+%endif
+  mova         [dstq+128   ], m0
+  mova         [dstq+128+16], m1
+  mova         [dstq+128+32], m2
+  mova         [dstq+128+48], m3
+  movu                    m0, [srcq+128+64]
+  movu                    m1, [srcq+128+80]
+  movu                    m2, [srcq+128+96]
+  movu                    m3, [srcq+128+112]
+  add                   srcq, src_strideq
+%ifidn %1, avg
+  pavg                    m0, [dstq+128+64]
+  pavg                    m1, [dstq+128+80]
+  pavg                    m2, [dstq+128+96]
+  pavg                    m3, [dstq+128+112]
+%endif
+  mova         [dstq+128+64], m0
+  mova         [dstq+128+80], m1
+  mova         [dstq+128+96], m2
+  mova        [dstq+128+112], m3
+  add                   dstq, dst_strideq
+  sub                    r4d, 1
+  jnz .loop256
+  RET
+%endif
+
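+; Copy 128 bytes per row, in two 64-byte groups per iteration.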
+.w128:
+  mov                    r4d, dword hm
+.loop128:
+  movu                    m0, [srcq]
+  movu                    m1, [srcq+16]
+  movu                    m2, [srcq+32]
+  movu                    m3, [srcq+48]
+%ifidn %1, avg
+  pavg                    m0, [dstq]
+  pavg                    m1, [dstq+16]
+  pavg                    m2, [dstq+32]
+  pavg                    m3, [dstq+48]
+%endif
+  mova             [dstq   ], m0
+  mova             [dstq+16], m1
+  mova             [dstq+32], m2
+  mova             [dstq+48], m3
+  movu                    m0, [srcq+64]
+  movu                    m1, [srcq+80]
+  movu                    m2, [srcq+96]
+  movu                    m3, [srcq+112]
+  add                   srcq, src_strideq
+%ifidn %1, avg
+  pavg                    m0, [dstq+64]
+  pavg                    m1, [dstq+80]
+  pavg                    m2, [dstq+96]
+  pavg                    m3, [dstq+112]
+%endif
+  mova             [dstq+64], m0
+  mova             [dstq+80], m1
+  mova             [dstq+96], m2
+  mova            [dstq+112], m3
+  add                   dstq, dst_strideq
+  sub                    r4d, 1
+  jnz .loop128
+  RET
+
+%else  ; CONFIG_VP10 && CONFIG_EXT_PARTITION
+
 %ifidn %2, highbd
   cmp r4d, 64
   je .w64
@@ -82,10 +195,11 @@
   mova             [dstq+96], m2
   mova            [dstq+112], m3
   add                   dstq, dst_strideq
-  dec                    r4d
+  sub                    r4d, 1
   jnz .loop128
   RET
 %endif
+%endif  ; CONFIG_VP10 && CONFIG_EXT_PARTITION
 
 .w64
   mov                    r4d, dword hm
@@ -106,7 +220,7 @@
   mova             [dstq+32], m2
   mova             [dstq+48], m3
   add                   dstq, dst_strideq
-  dec                    r4d
+  sub                    r4d, 1
   jnz .loop64
   RET
 
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index b718678..cbd22dc 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -8,10 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-// Due to a header conflict between math.h and intrinsics includes with ceil()
-// in certain configurations under vs9 this include needs to precede
-// immintrin.h.
-
 #include <immintrin.h>
 
 #include "./vpx_dsp_rtcd.h"
diff --git a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 6fd5208..c3797ce 100644
--- a/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -8,10 +8,6 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-// Due to a header conflict between math.h and intrinsics includes with ceil()
-// in certain configurations under vs9 this include needs to precede
-// tmmintrin.h.
-
 #include <tmmintrin.h>
 
 #include "./vpx_dsp_rtcd.h"
@@ -844,34 +840,49 @@
   // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
   // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
   // --Require an additional 8 rows for the horiz_w8 transpose tail.
-  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
+  DECLARE_ALIGNED(16, uint8_t, temp[(MAX_EXT_SIZE + 8) * MAX_SB_SIZE]);
   const int intermediate_height =
       (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;
 
-  assert(w <= 64);
-  assert(h <= 64);
+  assert(w <= MAX_SB_SIZE);
+  assert(h <= MAX_SB_SIZE);
   assert(y_step_q4 <= 32);
   assert(x_step_q4 <= 32);
 
   if (w >= 8) {
     scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
+                            src_stride,
+                            temp,
+                            MAX_SB_SIZE,
+                            x_filters, x0_q4, x_step_q4,
                             w, intermediate_height);
   } else {
     scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
-                            src_stride, temp, 64, x_filters, x0_q4, x_step_q4,
+                            src_stride,
+                            temp,
+                            MAX_SB_SIZE,
+                            x_filters, x0_q4, x_step_q4,
                             w, intermediate_height);
   }
 
   if (w >= 16) {
-    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                            dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+    scaledconvolve_vert_w16(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+                            MAX_SB_SIZE,
+                            dst,
+                            dst_stride,
+                            y_filters, y0_q4, y_step_q4, w, h);
   } else if (w == 8) {
-    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+    scaledconvolve_vert_w8(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+                           MAX_SB_SIZE,
+                           dst,
+                           dst_stride,
+                           y_filters, y0_q4, y_step_q4, w, h);
   } else {
-    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
-                           dst_stride, y_filters, y0_q4, y_step_q4, w, h);
+    scaledconvolve_vert_w4(temp + MAX_SB_SIZE * (SUBPEL_TAPS / 2 - 1),
+                           MAX_SB_SIZE,
+                           dst,
+                           dst_stride,
+                           y_filters, y0_q4, y_step_q4, w, h);
   }
 }
 
diff --git a/vpx_ports/bitops.h b/vpx_ports/bitops.h
index 84ff365..19426fa 100644
--- a/vpx_ports/bitops.h
+++ b/vpx_ports/bitops.h
@@ -16,8 +16,7 @@
 #include "vpx_ports/msvc.h"
 
 #ifdef _MSC_VER
-# include <math.h>  // the ceil() definition must precede intrin.h
-# if _MSC_VER > 1310 && (defined(_M_X64) || defined(_M_IX86))
+# if defined(_M_X64) || defined(_M_IX86)
 #  include <intrin.h>
 #  define USE_MSC_INTRINSICS
 # endif
diff --git a/vpx_ports/mem.h b/vpx_ports/mem.h
index 7502f90..48549ce 100644
--- a/vpx_ports/mem.h
+++ b/vpx_ports/mem.h
@@ -38,16 +38,21 @@
 #define __builtin_prefetch(x)
 #endif
 
-/* Shift down with rounding */
+/* Shift down with rounding for use when n >= 0, value >= 0 */
 #define ROUND_POWER_OF_TWO(value, n) \
-    (((value) + (1 << ((n) - 1))) >> (n))
+    (((value) + ((1 << (n)) >> 1)) >> (n))
+
+/* Shift down with rounding for signed integers, for use when n >= 0 */
+#define ROUND_POWER_OF_TWO_SIGNED(value, n) \
+    (((value) < 0) ? -ROUND_POWER_OF_TWO(-(value), (n)) \
+                   : ROUND_POWER_OF_TWO((value), (n)))
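+/* The bias is written as ((1 << (n)) >> 1) rather than (1 << ((n) - 1)) so
+ * that n == 0 stays well defined.  e.g. ROUND_POWER_OF_TWO(6, 2) == 2 and
+ * ROUND_POWER_OF_TWO_SIGNED(-6, 2) == -2: halfway values round away from
+ * zero. */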
 
 #define ALIGN_POWER_OF_TWO(value, n) \
     (((value) + ((1 << (n)) - 1)) & ~((1 << (n)) - 1))
 
+/* High-bitdepth frame buffers are passed around as uint8_t pointers holding
+   the real uint16_t address shifted right by one; these macros convert
+   between the tagged byte pointer and the real short pointer. */
+#define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)(x)) << 1))
 #if CONFIG_VP9_HIGHBITDEPTH
-#define CONVERT_TO_SHORTPTR(x) ((uint16_t*)(((uintptr_t)x) << 1))
-#define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)x) >> 1))
+#define CONVERT_TO_BYTEPTR(x) ((uint8_t*)(((uintptr_t)(x)) >> 1))
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
 #endif  // VPX_PORTS_MEM_H_
diff --git a/vpx_ports/msvc.h b/vpx_ports/msvc.h
index cab7740..d6b8503 100644
--- a/vpx_ports/msvc.h
+++ b/vpx_ports/msvc.h
@@ -26,6 +26,20 @@
   else
     return floor(x + 0.5);
 }
+
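+// Round-half-away-from-zero replacements for C99 functions that MSVC lacks
+// before VS2013 (_MSC_VER < 1800).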
+static INLINE float roundf(float x) {
+  if (x < 0)
+    return (float)ceil(x - 0.5f);
+  else
+    return (float)floor(x + 0.5f);
+}
+
+static INLINE long lroundf(float x) {
+  if (x < 0)
+    return (long)(x - 0.5f);
+  else
+    return (long)(x + 0.5f);
+}
 #endif  // _MSC_VER < 1800
 
 #endif  // _MSC_VER
diff --git a/vpx_scale/generic/yv12extend.c b/vpx_scale/generic/yv12extend.c
index 3eba563..9dd508d 100644
--- a/vpx_scale/generic/yv12extend.c
+++ b/vpx_scale/generic/yv12extend.c
@@ -210,6 +210,31 @@
   extend_frame(ybf, inner_bw);
 }
 
+// Extend the borders of the luma (Y) plane only; the chroma planes are not
+// touched.
+void vpx_extend_frame_borders_y_c(YV12_BUFFER_CONFIG *ybf) {
+  int ext_size = ybf->border;
+  assert(ybf->y_height - ybf->y_crop_height < 16);
+  assert(ybf->y_width - ybf->y_crop_width < 16);
+  assert(ybf->y_height - ybf->y_crop_height >= 0);
+  assert(ybf->y_width - ybf->y_crop_width >= 0);
+
+#if CONFIG_VP9_HIGHBITDEPTH
+  if (ybf->flags & YV12_FLAG_HIGHBITDEPTH) {
+    extend_plane_high(ybf->y_buffer, ybf->y_stride,
+                      ybf->y_crop_width, ybf->y_crop_height,
+                      ext_size, ext_size,
+                      ext_size + ybf->y_height - ybf->y_crop_height,
+                      ext_size + ybf->y_width - ybf->y_crop_width);
+    return;
+  }
+#endif
+  extend_plane(ybf->y_buffer, ybf->y_stride,
+               ybf->y_crop_width, ybf->y_crop_height,
+               ext_size, ext_size,
+               ext_size + ybf->y_height - ybf->y_crop_height,
+               ext_size + ybf->y_width - ybf->y_crop_width);
+}
+#endif  // CONFIG_VP9 || CONFIG_VP10
+
 #if CONFIG_VP9_HIGHBITDEPTH
 static void memcpy_short_addr(uint8_t *dst8, const uint8_t *src8, int num) {
   uint16_t *dst = CONVERT_TO_SHORTPTR(dst8);
@@ -217,7 +242,6 @@
   memcpy(dst, src, num * sizeof(uint16_t));
 }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
-#endif  // CONFIG_VP9 || CONFIG_VP10
 
 // Copies the source image into the destination image and updates the
 // destination's UMV borders.
diff --git a/vpx_scale/vpx_scale_rtcd.pl b/vpx_scale/vpx_scale_rtcd.pl
index 56b952b..68a1a3e 100644
--- a/vpx_scale/vpx_scale_rtcd.pl
+++ b/vpx_scale/vpx_scale_rtcd.pl
@@ -28,5 +28,8 @@
 
     add_proto qw/void vpx_extend_frame_inner_borders/, "struct yv12_buffer_config *ybf";
     specialize qw/vpx_extend_frame_inner_borders dspr2/;
+
+    add_proto qw/void vpx_extend_frame_borders_y/, "struct yv12_buffer_config *ybf";
+    specialize qw/vpx_extend_frame_borders_y/;
 }
 1;
diff --git a/vpx_scale/yv12config.h b/vpx_scale/yv12config.h
index 37b255d..04467d0 100644
--- a/vpx_scale/yv12config.h
+++ b/vpx_scale/yv12config.h
@@ -21,10 +21,14 @@
 #include "vpx/vpx_integer.h"
 
 #define VP8BORDERINPIXELS           32
-#define VP9INNERBORDERINPIXELS      96
+#if CONFIG_EXT_PARTITION
+# define VP9INNERBORDERINPIXELS     160
+#else
+# define VP9INNERBORDERINPIXELS     96
+#endif  // CONFIG_EXT_PARTITION
 #define VP9_INTERP_EXTEND           4
 #define VP9_ENC_BORDER_IN_PIXELS    160
-#define VP9_DEC_BORDER_IN_PIXELS    32
+#define VP9_DEC_BORDER_IN_PIXELS    160
 
 typedef struct yv12_buffer_config {
   int   y_width;
diff --git a/vpxdec.c b/vpxdec.c
index 1bef4bd..d96b39c 100644
--- a/vpxdec.c
+++ b/vpxdec.c
@@ -93,6 +93,14 @@
 static const arg_def_t outbitdeptharg = ARG_DEF(
     NULL, "output-bit-depth", 1, "Output bit-depth for decoded frames");
 #endif
+#if CONFIG_EXT_TILE
+static const arg_def_t tiler = ARG_DEF(
+    NULL, "tile-row", 1, "Row index of tile to decode "
+                          "(-1 for all rows)");
+static const arg_def_t tilec = ARG_DEF(
+    NULL, "tile-column", 1, "Column index of tile to decode "
+                            "(-1 for all columns)");
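+// Usage sketch: "vpxdec --tile-row=-1 --tile-column=0 ..." decodes the
+// first tile column across all tile rows.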
+#endif  // CONFIG_EXT_TILE
 
 static const arg_def_t *all_args[] = {
   &codecarg, &use_yv12, &use_i420, &flipuvarg, &rawvideo, &noblitarg,
@@ -102,6 +110,9 @@
 #if CONFIG_VP9_HIGHBITDEPTH
   &outbitdeptharg,
 #endif
+#if CONFIG_EXT_TILE
+  &tiler, &tilec,
+#endif  // CONFIG_EXT_TILE
   NULL
 };
 
@@ -563,6 +574,10 @@
 #if CONFIG_VP9_HIGHBITDEPTH
   unsigned int            output_bit_depth = 0;
 #endif
+#if CONFIG_EXT_TILE
+  int                     tile_row = -1;
+  int                     tile_col = -1;
+#endif  // CONFIG_EXT_TILE
 #if CONFIG_VP8_DECODER
   vp8_postproc_cfg_t      vp8_pp_cfg = {0};
   int                     vp8_dbg_color_ref_frame = 0;
@@ -658,6 +673,12 @@
       output_bit_depth = arg_parse_uint(&arg);
     }
 #endif
+#if CONFIG_EXT_TILE
+    else if (arg_match(&arg, &tiler, argi))
+      tile_row = arg_parse_int(&arg);
+    else if (arg_match(&arg, &tilec, argi))
+      tile_col = arg_parse_int(&arg);
+#endif  // CONFIG_EXT_TILE
 #if CONFIG_VP8_DECODER
     else if (arg_match(&arg, &addnoise_level, argi)) {
       postproc = 1;
@@ -850,6 +871,21 @@
   }
 #endif
 
+#if CONFIG_VP10_DECODER && CONFIG_EXT_TILE
+  if (strncmp(decoder.name, "WebM Project VP10", 17) == 0) {
+    if (vpx_codec_control(&decoder, VP10_SET_DECODE_TILE_ROW, tile_row)) {
+      fprintf(stderr, "Failed to set decode_tile_row: %s\n",
+              vpx_codec_error(&decoder));
+      return EXIT_FAILURE;
+    }
+
+    if (vpx_codec_control(&decoder, VP10_SET_DECODE_TILE_COL, tile_col)) {
+      fprintf(stderr, "Failed to set decode_tile_col: %s\n",
+              vpx_codec_error(&decoder));
+      return EXIT_FAILURE;
+    }
+  }
+#endif
 
   if (arg_skip)
     fprintf(stderr, "Skipping first %d frames.\n", arg_skip);
@@ -1015,6 +1051,11 @@
       }
 #endif
 
+#if CONFIG_EXT_TILE
+      vpx_input_ctx.width = img->d_w;
+      vpx_input_ctx.height = img->d_h;
+#endif  // CONFIG_EXT_TILE
+
       if (single_file) {
         if (use_y4m) {
           char buf[Y4M_BUFFER_SIZE] = {0};
diff --git a/vpxenc.c b/vpxenc.c
index f2def54..6463334 100644
--- a/vpxenc.c
+++ b/vpxenc.c
@@ -481,6 +481,17 @@
 #endif
 
 #if CONFIG_VP10_ENCODER
+#if CONFIG_EXT_PARTITION
+static const struct arg_enum_list superblock_size_enum[] = {
+  {"dynamic", VPX_SUPERBLOCK_SIZE_DYNAMIC},
+  {"64", VPX_SUPERBLOCK_SIZE_64X64},
+  {"128", VPX_SUPERBLOCK_SIZE_128X128},
+  {NULL, 0}
+};
+static const arg_def_t superblock_size = ARG_DEF_ENUM(
+    NULL, "sb-size", 1, "Superblock size to use", superblock_size_enum);
+#endif  // CONFIG_EXT_PARTITION
+
 static const arg_def_t *vp10_args[] = {
   &cpu_used_vp9, &auto_altref, &sharpness, &static_thresh,
   &tile_cols, &tile_rows, &arnr_maxframes, &arnr_strength, &arnr_type,
@@ -489,6 +500,9 @@
   &frame_parallel_decoding, &aq_mode, &frame_periodic_boost,
   &noise_sens, &tune_content, &input_color_space,
   &min_gf_interval, &max_gf_interval,
+#if CONFIG_EXT_PARTITION
+  &superblock_size,
+#endif  // CONFIG_EXT_PARTITION
 #if CONFIG_VP9_HIGHBITDEPTH
   &bitdeptharg, &inbitdeptharg,
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -505,6 +519,9 @@
   VP9E_SET_FRAME_PERIODIC_BOOST, VP9E_SET_NOISE_SENSITIVITY,
   VP9E_SET_TUNE_CONTENT, VP9E_SET_COLOR_SPACE,
   VP9E_SET_MIN_GF_INTERVAL, VP9E_SET_MAX_GF_INTERVAL,
+#if CONFIG_EXT_PARTITION
+  VP10E_SET_SUPERBLOCK_SIZE,
+#endif  // CONFIG_EXT_PARTITION
   0
 };
 #endif
@@ -1568,7 +1585,18 @@
 #if CONFIG_DECODERS
   if (global->test_decode != TEST_DECODE_OFF) {
     const VpxInterface *decoder = get_vpx_decoder_by_name(global->codec->name);
-    vpx_codec_dec_init(&stream->decoder, decoder->codec_interface(), NULL, 0);
+    vpx_codec_dec_cfg_t cfg = { 0, 0, 0 };
+    vpx_codec_dec_init(&stream->decoder, decoder->codec_interface(), &cfg, 0);
+
+#if CONFIG_VP10_DECODER && CONFIG_EXT_TILE
+    if (strcmp(global->codec->name, "vp10") == 0) {
+      vpx_codec_control(&stream->decoder, VP10_SET_DECODE_TILE_ROW, -1);
+      ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_row");
+
+      vpx_codec_control(&stream->decoder, VP10_SET_DECODE_TILE_COL, -1);
+      ctx_exit_on_error(&stream->decoder, "Failed to set decode_tile_col");
+    }
+#endif
   }
 #endif
 }
@@ -1833,26 +1861,25 @@
     vpx_codec_control(&stream->encoder, VP8_COPY_REFERENCE, &ref_enc);
     vpx_codec_control(&stream->decoder, VP8_COPY_REFERENCE, &ref_dec);
   } else {
-    struct vp9_ref_frame ref_enc, ref_dec;
+    vpx_codec_control(&stream->encoder, VP10_GET_NEW_FRAME_IMAGE, &enc_img);
+    vpx_codec_control(&stream->decoder, VP10_GET_NEW_FRAME_IMAGE, &dec_img);
 
-    ref_enc.idx = 0;
-    ref_dec.idx = 0;
-    vpx_codec_control(&stream->encoder, VP9_GET_REFERENCE, &ref_enc);
-    enc_img = ref_enc.img;
-    vpx_codec_control(&stream->decoder, VP9_GET_REFERENCE, &ref_dec);
-    dec_img = ref_dec.img;
 #if CONFIG_VP9_HIGHBITDEPTH
     if ((enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) !=
         (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH)) {
       if (enc_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
-        vpx_img_alloc(&enc_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+        vpx_image_t enc_hbd_img;
+        vpx_img_alloc(&enc_hbd_img, enc_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
                       enc_img.d_w, enc_img.d_h, 16);
-        vpx_img_truncate_16_to_8(&enc_img, &ref_enc.img);
+        vpx_img_truncate_16_to_8(&enc_hbd_img, &enc_img);
+        enc_img = enc_hbd_img;
       }
       if (dec_img.fmt & VPX_IMG_FMT_HIGHBITDEPTH) {
-        vpx_img_alloc(&dec_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
+        vpx_image_t dec_hbd_img;
+        vpx_img_alloc(&dec_hbd_img, dec_img.fmt - VPX_IMG_FMT_HIGHBITDEPTH,
                       dec_img.d_w, dec_img.d_h, 16);
-        vpx_img_truncate_16_to_8(&dec_img, &ref_dec.img);
+        vpx_img_truncate_16_to_8(&dec_hbd_img, &dec_img);
+        dec_img = dec_hbd_img;
       }
     }
 #endif
@@ -2260,7 +2287,8 @@
     }
 
     if (global.show_psnr) {
-      if (global.codec->fourcc == VP9_FOURCC) {
+      if (global.codec->fourcc == VP9_FOURCC ||
+          global.codec->fourcc == VP10_FOURCC) {
         FOREACH_STREAM(
             show_psnr(stream, (1 << stream->config.cfg.g_input_bit_depth) - 1));
       } else {